From 0f74226649fb2875a91b68f3750f55220aa73425 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 13 Feb 2020 09:14:09 -0600 Subject: [PATCH 01/65] kernel: module: Replace zero-length array with flexible-array member The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Jessica Yu --- kernel/module.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/module.c b/kernel/module.c index 33569a01d6e1..b88ec9cd2a7f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1515,7 +1515,7 @@ struct module_sect_attr { struct module_sect_attrs { struct attribute_group grp; unsigned int nsections; - struct module_sect_attr attrs[0]; + struct module_sect_attr attrs[]; }; static ssize_t module_sect_show(struct module_attribute *mattr, @@ -1608,7 +1608,7 @@ static void remove_sect_attrs(struct module *mod) struct module_notes_attrs { struct kobject *dir; unsigned int notes; - struct bin_attribute attrs[0]; + struct bin_attribute attrs[]; }; static ssize_t module_notes_read(struct file *filp, struct kobject *kobj, From ab70a73aa45beccab01cc0bb1d920361e3642393 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Thu, 30 Jan 2020 17:04:19 +0000 Subject: [PATCH 02/65] riscv: Use flush_icache_mm for flush_icache_user_range The only call path is: __access_remote_vm -> copy_to_user_page -> flush_icache_user_range Seems it's ok to use flush_icache_mm instead of flush_icache_all and it could reduce flush_icache_all called on other harts. Signed-off-by: Guo Ren [Palmer: git-am wouldn't apply the patch, I did so manually] Fixes: 08f051eda33b ("RISC-V: Flush I$ when making a dirty page executable") Reviewed-by: Palmer Dabbelt Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/cacheflush.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h index 555b20b11dc3..c8677c75f82c 100644 --- a/arch/riscv/include/asm/cacheflush.h +++ b/arch/riscv/include/asm/cacheflush.h @@ -85,7 +85,7 @@ static inline void flush_dcache_page(struct page *page) * so instead we just flush the whole thing. 
*/ #define flush_icache_range(start, end) flush_icache_all() -#define flush_icache_user_range(vma, pg, addr, len) flush_icache_all() +#define flush_icache_user_range(vma, pg, addr, len) flush_icache_mm(vma->vm_mm, 0) #ifndef CONFIG_SMP From 2fab7a15604cfe3605775c5d146a7dfcf97412bb Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sun, 5 Jan 2020 12:10:20 -0800 Subject: [PATCH 03/65] riscv: Delete CONFIG_SYSFS_SYSCALL from defconfigs According to init/Kconfig: "sys_sysfs is an obsolete system call no longer supported in libc. Note that disabling this option is more secure but might break compatibility with some systems." This syscall is not required for new architectures. Since the config defaults to 'y', set this to 'n' explicitly. Signed-off-by: Deepa Dinamani Signed-off-by: Palmer Dabbelt --- arch/riscv/configs/defconfig | 1 + arch/riscv/configs/rv32_defconfig | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index e2ff95cb3390..58f97b3cb24c 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -125,3 +125,4 @@ CONFIG_DEBUG_BLOCK_EXT_DEVT=y # CONFIG_FTRACE is not set # CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_MEMTEST=y +# CONFIG_SYSFS_SYSCALL is not set diff --git a/arch/riscv/configs/rv32_defconfig b/arch/riscv/configs/rv32_defconfig index eb519407c841..f4076b6ac063 100644 --- a/arch/riscv/configs/rv32_defconfig +++ b/arch/riscv/configs/rv32_defconfig @@ -122,3 +122,4 @@ CONFIG_DEBUG_BLOCK_EXT_DEVT=y # CONFIG_FTRACE is not set # CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_MEMTEST=y +# CONFIG_SYSFS_SYSCALL is not set From aff7783392e0c0152ee01653ce9c61abb4928910 Mon Sep 17 00:00:00 2001 From: Zong Li Date: Tue, 4 Feb 2020 19:19:47 +0800 Subject: [PATCH 04/65] riscv: force hart_lottery to put in .sdata section In PIC code model, the zero initialized data always be put in .bss section, so when building kernel as PIE, the hart_lottery won't present in small data section, 
and it causes more than one harts to get the lottery, because the main hart clears the content of .bss section immediately after it getting the lottery. Signed-off-by: Zong Li Reviewed-by: Anup Patel [Palmer: added a comment] Reviewed-by: Palmer Dabbelt Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/setup.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 0a6d415b0a5a..cb836fcc6118 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -39,8 +39,12 @@ struct screen_info screen_info = { }; #endif -/* The lucky hart to first increment this variable will boot the other cores */ -atomic_t hart_lottery; +/* + * The lucky hart to first increment this variable will boot the other cores. + * This is used before the kernel initializes the BSS so it can't be in the + * BSS. + */ +atomic_t hart_lottery __section(.sdata); unsigned long boot_cpu_hartid; void __init parse_dtb(void) From 064223b947a8c3d0b35a4ac9ae6e31e3f77657fd Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Tue, 18 Feb 2020 13:17:06 -0800 Subject: [PATCH 05/65] RISC-V: Stop putting .sbss in .sdata I don't know why we were doing this, as it's been there since the beginning. After d841f729e655 ("riscv: force hart_lottery to put in .sdata section") my guess would be that it made the kernel boot and we forgot to fix it more cleanly. The default .bss segment already contains the .sbss section, so we don't need to do anything additional to ensure the symbols in .sbss continue to work. 
Tested-by: Zong Li Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/vmlinux.lds.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S index 1e0193ded420..a8fb52a00295 100644 --- a/arch/riscv/kernel/vmlinux.lds.S +++ b/arch/riscv/kernel/vmlinux.lds.S @@ -64,7 +64,6 @@ SECTIONS *(.sdata*) /* End of data section */ _edata = .; - *(.sbss*) } BSS_SECTION(PAGE_SIZE, PAGE_SIZE, 0) From 52e7c52d2ded5908e6a4f8a7248e5fa6e0d6809a Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Thu, 27 Feb 2020 11:07:28 -0800 Subject: [PATCH 06/65] RISC-V: Stop relying on GCC's register allocator's heuristics GCC allows users to hint to the register allocator that a variable should be placed in a register by using a syntax along the lines of function(...) { register long in_REG __asm__("REG"); } We've abused this a bit throughout the RISC-V port to access fixed registers directly as C variables. In practice it's never going to blow up because GCC isn't going to allocate these registers, but it's not a well defined syntax so we really shouldn't be relying upon this. Luckily there is a very similar but well defined syntax that allows us to still access these registers directly as C variables, which is to simply declare the register variables globally. For fixed variables this doesn't change the ABI. LLVM disallows this ambiguous syntax, so this isn't just strictly a formatting change. 
Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/current.h | 5 +++-- arch/riscv/kernel/process.c | 5 +++-- arch/riscv/kernel/stacktrace.c | 7 ++++--- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/riscv/include/asm/current.h b/arch/riscv/include/asm/current.h index dd973efe5d7c..1de233d8e8de 100644 --- a/arch/riscv/include/asm/current.h +++ b/arch/riscv/include/asm/current.h @@ -17,6 +17,8 @@ struct task_struct; +register struct task_struct *riscv_current_is_tp __asm__("tp"); + /* * This only works because "struct thread_info" is at offset 0 from "struct * task_struct". This constraint seems to be necessary on other architectures @@ -26,8 +28,7 @@ struct task_struct; */ static __always_inline struct task_struct *get_current(void) { - register struct task_struct *tp __asm__("tp"); - return tp; + return riscv_current_is_tp; } #define current get_current() diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 817cf7b0974c..610c11e91606 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -22,6 +22,8 @@ #include #include +register unsigned long gp_in_global __asm__("gp"); + extern asmlinkage void ret_from_fork(void); extern asmlinkage void ret_from_kernel_thread(void); @@ -107,9 +109,8 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long usp, /* p->thread holds context to be restored by __switch_to() */ if (unlikely(p->flags & PF_KTHREAD)) { /* Kernel thread */ - const register unsigned long gp __asm__ ("gp"); memset(childregs, 0, sizeof(struct pt_regs)); - childregs->gp = gp; + childregs->gp = gp_in_global; /* Supervisor/Machine, irqs on: */ childregs->status = SR_PP | SR_PIE; diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 0940681d2f68..02087fe539c6 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -19,6 +19,8 @@ struct stackframe { unsigned long ra; }; +register unsigned long sp_in_global __asm__("sp"); + void notrace 
walk_stackframe(struct task_struct *task, struct pt_regs *regs, bool (*fn)(unsigned long, void *), void *arg) { @@ -29,7 +31,7 @@ void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs, sp = user_stack_pointer(regs); pc = instruction_pointer(regs); } else if (task == NULL || task == current) { - const register unsigned long current_sp __asm__ ("sp"); + const register unsigned long current_sp = sp_in_global; fp = (unsigned long)__builtin_frame_address(0); sp = current_sp; pc = (unsigned long)walk_stackframe; @@ -73,8 +75,7 @@ static void notrace walk_stackframe(struct task_struct *task, sp = user_stack_pointer(regs); pc = instruction_pointer(regs); } else if (task == NULL || task == current) { - const register unsigned long current_sp __asm__ ("sp"); - sp = current_sp; + sp = sp_in_global; pc = (unsigned long)walk_stackframe; } else { /* task blocked in __switch_to */ From fdff9911f266951b14b20e25557278b5b3f0d90d Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Thu, 27 Feb 2020 11:15:03 -0800 Subject: [PATCH 07/65] RISC-V: Inline the assembly register save/restore macros These are only used once, and when reading the code I've always found them to be more of a headache than a benefit. While they were never worth removing before, LLVM's integrated assembler doesn't support LOCAL so rather that trying to figure out how to refactor the macros it seems saner to just inline them. Reviewed-by: Nick Desaulniers Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/entry.S | 143 ++++++++++++++++---------------------- 1 file changed, 61 insertions(+), 82 deletions(-) diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index bad4d85b5e91..f2e8e7c8089d 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -13,17 +13,11 @@ #include #include - .text - .altmacro - -/* - * Prepares to enter a system call or exception by saving all registers to the - * stack. 
- */ - .macro SAVE_ALL - LOCAL _restore_kernel_tpsp - LOCAL _save_context +#if !IS_ENABLED(CONFIG_PREEMPTION) +.set resume_kernel, restore_all +#endif +ENTRY(handle_exception) /* * If coming from userspace, preserve the user thread pointer and load * the kernel thread pointer. If we came from the kernel, the scratch @@ -90,77 +84,6 @@ _save_context: REG_S s3, PT_BADADDR(sp) REG_S s4, PT_CAUSE(sp) REG_S s5, PT_TP(sp) - .endm - -/* - * Prepares to return from a system call or exception by restoring all - * registers from the stack. - */ - .macro RESTORE_ALL - REG_L a0, PT_STATUS(sp) - /* - * The current load reservation is effectively part of the processor's - * state, in the sense that load reservations cannot be shared between - * different hart contexts. We can't actually save and restore a load - * reservation, so instead here we clear any existing reservation -- - * it's always legal for implementations to clear load reservations at - * any point (as long as the forward progress guarantee is kept, but - * we'll ignore that here). - * - * Dangling load reservations can be the result of taking a trap in the - * middle of an LR/SC sequence, but can also be the result of a taken - * forward branch around an SC -- which is how we implement CAS. As a - * result we need to clear reservations between the last CAS and the - * jump back to the new context. While it is unlikely the store - * completes, implementations are allowed to expand reservations to be - * arbitrarily large. 
- */ - REG_L a2, PT_EPC(sp) - REG_SC x0, a2, PT_EPC(sp) - - csrw CSR_STATUS, a0 - csrw CSR_EPC, a2 - - REG_L x1, PT_RA(sp) - REG_L x3, PT_GP(sp) - REG_L x4, PT_TP(sp) - REG_L x5, PT_T0(sp) - REG_L x6, PT_T1(sp) - REG_L x7, PT_T2(sp) - REG_L x8, PT_S0(sp) - REG_L x9, PT_S1(sp) - REG_L x10, PT_A0(sp) - REG_L x11, PT_A1(sp) - REG_L x12, PT_A2(sp) - REG_L x13, PT_A3(sp) - REG_L x14, PT_A4(sp) - REG_L x15, PT_A5(sp) - REG_L x16, PT_A6(sp) - REG_L x17, PT_A7(sp) - REG_L x18, PT_S2(sp) - REG_L x19, PT_S3(sp) - REG_L x20, PT_S4(sp) - REG_L x21, PT_S5(sp) - REG_L x22, PT_S6(sp) - REG_L x23, PT_S7(sp) - REG_L x24, PT_S8(sp) - REG_L x25, PT_S9(sp) - REG_L x26, PT_S10(sp) - REG_L x27, PT_S11(sp) - REG_L x28, PT_T3(sp) - REG_L x29, PT_T4(sp) - REG_L x30, PT_T5(sp) - REG_L x31, PT_T6(sp) - - REG_L x2, PT_SP(sp) - .endm - -#if !IS_ENABLED(CONFIG_PREEMPTION) -.set resume_kernel, restore_all -#endif - -ENTRY(handle_exception) - SAVE_ALL /* * Set the scratch register to 0, so that if a recursive exception @@ -298,7 +221,63 @@ resume_userspace: csrw CSR_SCRATCH, tp restore_all: - RESTORE_ALL + REG_L a0, PT_STATUS(sp) + /* + * The current load reservation is effectively part of the processor's + * state, in the sense that load reservations cannot be shared between + * different hart contexts. We can't actually save and restore a load + * reservation, so instead here we clear any existing reservation -- + * it's always legal for implementations to clear load reservations at + * any point (as long as the forward progress guarantee is kept, but + * we'll ignore that here). + * + * Dangling load reservations can be the result of taking a trap in the + * middle of an LR/SC sequence, but can also be the result of a taken + * forward branch around an SC -- which is how we implement CAS. As a + * result we need to clear reservations between the last CAS and the + * jump back to the new context. 
While it is unlikely the store + * completes, implementations are allowed to expand reservations to be + * arbitrarily large. + */ + REG_L a2, PT_EPC(sp) + REG_SC x0, a2, PT_EPC(sp) + + csrw CSR_STATUS, a0 + csrw CSR_EPC, a2 + + REG_L x1, PT_RA(sp) + REG_L x3, PT_GP(sp) + REG_L x4, PT_TP(sp) + REG_L x5, PT_T0(sp) + REG_L x6, PT_T1(sp) + REG_L x7, PT_T2(sp) + REG_L x8, PT_S0(sp) + REG_L x9, PT_S1(sp) + REG_L x10, PT_A0(sp) + REG_L x11, PT_A1(sp) + REG_L x12, PT_A2(sp) + REG_L x13, PT_A3(sp) + REG_L x14, PT_A4(sp) + REG_L x15, PT_A5(sp) + REG_L x16, PT_A6(sp) + REG_L x17, PT_A7(sp) + REG_L x18, PT_S2(sp) + REG_L x19, PT_S3(sp) + REG_L x20, PT_S4(sp) + REG_L x21, PT_S5(sp) + REG_L x22, PT_S6(sp) + REG_L x23, PT_S7(sp) + REG_L x24, PT_S8(sp) + REG_L x25, PT_S9(sp) + REG_L x26, PT_S10(sp) + REG_L x27, PT_S11(sp) + REG_L x28, PT_T3(sp) + REG_L x29, PT_T4(sp) + REG_L x30, PT_T5(sp) + REG_L x31, PT_T6(sp) + + REG_L x2, PT_SP(sp) + #ifdef CONFIG_RISCV_M_MODE mret #else From abc71bf0a70311ab294f97a7f16e8de03718c05a Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Thu, 27 Feb 2020 11:16:28 -0800 Subject: [PATCH 08/65] RISC-V: Stop using LOCAL for the uaccess fixups LLVM's integrated assembler doesn't support the LOCAL directive, which we're using when generating our uaccess fixup tables. Luckily the table fragment is small enough that there's only one internal symbol, so using a relative symbol reference doesn't really complicate anything. 
Signed-off-by: Palmer Dabbelt --- arch/riscv/lib/uaccess.S | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/riscv/lib/uaccess.S b/arch/riscv/lib/uaccess.S index f29d2ba2c0a6..fceaeb18cc64 100644 --- a/arch/riscv/lib/uaccess.S +++ b/arch/riscv/lib/uaccess.S @@ -3,14 +3,12 @@ #include #include - .altmacro .macro fixup op reg addr lbl - LOCAL _epc -_epc: +100: \op \reg, \addr .section __ex_table,"a" .balign RISCV_SZPTR - RISCV_PTR _epc, \lbl + RISCV_PTR 100b, \lbl .previous .endm From 3133287b53ee88444e2294e601d2bad54a798f59 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Thu, 20 Feb 2020 01:10:23 -0500 Subject: [PATCH 09/65] riscv: Use p*d_leaf macros to define p*d_huge The newly introduced p*d_leaf macros allow to check if an entry of the page table map to a physical page instead of the next level. To avoid duplication of code, use those macros to determine if a page table entry points to a hugepage. Suggested-by: Paul Walmsley Signed-off-by: Alexandre Ghiti Reviewed-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/hugetlbpage.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c index 0d4747e9d5b5..a6189ed36c5f 100644 --- a/arch/riscv/mm/hugetlbpage.c +++ b/arch/riscv/mm/hugetlbpage.c @@ -4,14 +4,12 @@ int pud_huge(pud_t pud) { - return pud_present(pud) && - (pud_val(pud) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)); + return pud_leaf(pud); } int pmd_huge(pmd_t pmd) { - return pmd_present(pmd) && - (pmd_val(pmd) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)); + return pmd_leaf(pmd); } static __init int setup_hugepagesz(char *opt) From 9f40b6e77d2f888a8c0608036eb124cedb6d2434 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Mon, 24 Feb 2020 11:34:36 -0800 Subject: [PATCH 10/65] RISC-V: Move all address space definition macros to one place MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If both CONFIG_KASAN 
and CONFIG_SPARSEMEM_VMEMMAP are set, we get the following compilation error. --------------------------------------------------------------- ./arch/riscv/include/asm/pgtable-64.h: In function ‘pud_page’: ./include/asm-generic/memory_model.h:54:29: error: ‘vmemmap’ undeclared (first use in this function); did you mean ‘mem_map’? #define __pfn_to_page(pfn) (vmemmap + (pfn)) ^~~~~~~ ./include/asm-generic/memory_model.h:82:21: note: in expansion of macro ‘__pfn_to_page’ #define pfn_to_page __pfn_to_page ^~~~~~~~~~~~~ ./arch/riscv/include/asm/pgtable-64.h:70:9: note: in expansion of macro ‘pfn_to_page’ return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT); --------------------------------------------------------------- Fix the compilation errors by moving all the address space definition macros before including pgtable-64.h. Fixes: 8ad8b72721d0 (riscv: Add KASAN support) Signed-off-by: Atish Patra Reviewed-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/pgtable.h | 78 +++++++++++++++++--------------- 1 file changed, 41 insertions(+), 37 deletions(-) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index e43041519edd..393f2014dfee 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -19,6 +19,47 @@ #include #include +#ifdef CONFIG_MMU + +#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) +#define VMALLOC_END (PAGE_OFFSET - 1) +#define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) + +#define BPF_JIT_REGION_SIZE (SZ_128M) +#define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE) +#define BPF_JIT_REGION_END (VMALLOC_END) + +/* + * Roughly size the vmemmap space to be large enough to fit enough + * struct pages to map half the virtual address space. Then + * position vmemmap directly below the VMALLOC region. 
+ */ +#define VMEMMAP_SHIFT \ + (CONFIG_VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT) +#define VMEMMAP_SIZE BIT(VMEMMAP_SHIFT) +#define VMEMMAP_END (VMALLOC_START - 1) +#define VMEMMAP_START (VMALLOC_START - VMEMMAP_SIZE) + +/* + * Define vmemmap for pfn_to_page & page_to_pfn calls. Needed if kernel + * is configured with CONFIG_SPARSEMEM_VMEMMAP enabled. + */ +#define vmemmap ((struct page *)VMEMMAP_START) + +#define PCI_IO_SIZE SZ_16M +#define PCI_IO_END VMEMMAP_START +#define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) + +#define FIXADDR_TOP PCI_IO_START +#ifdef CONFIG_64BIT +#define FIXADDR_SIZE PMD_SIZE +#else +#define FIXADDR_SIZE PGDIR_SIZE +#endif +#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) + +#endif + #ifdef CONFIG_64BIT #include #else @@ -90,31 +131,6 @@ extern pgd_t swapper_pg_dir[]; #define __S110 PAGE_SHARED_EXEC #define __S111 PAGE_SHARED_EXEC -#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) -#define VMALLOC_END (PAGE_OFFSET - 1) -#define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) - -#define BPF_JIT_REGION_SIZE (SZ_128M) -#define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE) -#define BPF_JIT_REGION_END (VMALLOC_END) - -/* - * Roughly size the vmemmap space to be large enough to fit enough - * struct pages to map half the virtual address space. Then - * position vmemmap directly below the VMALLOC region. - */ -#define VMEMMAP_SHIFT \ - (CONFIG_VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT) -#define VMEMMAP_SIZE BIT(VMEMMAP_SHIFT) -#define VMEMMAP_END (VMALLOC_START - 1) -#define VMEMMAP_START (VMALLOC_START - VMEMMAP_SIZE) - -/* - * Define vmemmap for pfn_to_page & page_to_pfn calls. Needed if kernel - * is configured with CONFIG_SPARSEMEM_VMEMMAP enabled. 
- */ -#define vmemmap ((struct page *)VMEMMAP_START) - static inline int pmd_present(pmd_t pmd) { return (pmd_val(pmd) & (_PAGE_PRESENT | _PAGE_PROT_NONE)); @@ -432,18 +448,6 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define PCI_IO_SIZE SZ_16M -#define PCI_IO_END VMEMMAP_START -#define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) - -#define FIXADDR_TOP PCI_IO_START -#ifdef CONFIG_64BIT -#define FIXADDR_SIZE PMD_SIZE -#else -#define FIXADDR_SIZE PGDIR_SIZE -#endif -#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) - /* * Task size is 0x4000000000 for RV64 or 0x9fc00000 for RV32. * Note that PGDIR_SIZE must evenly divide TASK_SIZE. From ba000760eb0f182e6ef04faca70bb9737a9674b4 Mon Sep 17 00:00:00 2001 From: afzal mohammed Date: Sun, 1 Mar 2020 06:56:55 +0530 Subject: [PATCH 11/65] m68k: Replace setup_irq() by request_irq() request_irq() is preferred over setup_irq(). Invocations of setup_irq() occur after memory allocators are ready. Per tglx[1], setup_irq() existed in olden days when allocators were not ready by the time early interrupts were initialized. Hence replace setup_irq() by request_irq(). 
[1] https://lkml.kernel.org/r/alpine.DEB.2.20.1710191609480.1971@nanos Signed-off-by: afzal mohammed Tested-by: Greg Ungerer Signed-off-by: Greg Ungerer --- arch/m68k/68000/timers.c | 16 +++++++--------- arch/m68k/coldfire/pit.c | 16 +++++++--------- arch/m68k/coldfire/sltimers.c | 29 +++++++++++++++-------------- arch/m68k/coldfire/timers.c | 31 +++++++++++++++---------------- 4 files changed, 44 insertions(+), 48 deletions(-) diff --git a/arch/m68k/68000/timers.c b/arch/m68k/68000/timers.c index 71ddb4c98726..1c8e8a83c325 100644 --- a/arch/m68k/68000/timers.c +++ b/arch/m68k/68000/timers.c @@ -68,14 +68,6 @@ static irqreturn_t hw_tick(int irq, void *dummy) /***************************************************************************/ -static struct irqaction m68328_timer_irq = { - .name = "timer", - .flags = IRQF_TIMER, - .handler = hw_tick, -}; - -/***************************************************************************/ - static u64 m68328_read_clk(struct clocksource *cs) { unsigned long flags; @@ -102,11 +94,17 @@ static struct clocksource m68328_clk = { void hw_timer_init(irq_handler_t handler) { + int ret; + /* disable timer 1 */ TCTL = 0; /* set ISR */ - setup_irq(TMR_IRQ_NUM, &m68328_timer_irq); + ret = request_irq(TMR_IRQ_NUM, hw_tick, IRQF_TIMER, "timer", NULL); + if (ret) { + pr_err("Failed to request irq %d (timer): %pe\n", TMR_IRQ_NUM, + ERR_PTR(ret)); + } /* Restart mode, Enable int, Set clock source */ TCTL = TCTL_OM | TCTL_IRQEN | CLOCK_SOURCE; diff --git a/arch/m68k/coldfire/pit.c b/arch/m68k/coldfire/pit.c index eb6f16b0e2e6..fd1d9c915daa 100644 --- a/arch/m68k/coldfire/pit.c +++ b/arch/m68k/coldfire/pit.c @@ -111,14 +111,6 @@ static irqreturn_t pit_tick(int irq, void *dummy) /***************************************************************************/ -static struct irqaction pit_irq = { - .name = "timer", - .flags = IRQF_TIMER, - .handler = pit_tick, -}; - -/***************************************************************************/ - static 
u64 pit_read_clk(struct clocksource *cs) { unsigned long flags; @@ -146,6 +138,8 @@ static struct clocksource pit_clk = { void hw_timer_init(irq_handler_t handler) { + int ret; + cf_pit_clockevent.cpumask = cpumask_of(smp_processor_id()); cf_pit_clockevent.mult = div_sc(FREQ, NSEC_PER_SEC, 32); cf_pit_clockevent.max_delta_ns = @@ -156,7 +150,11 @@ void hw_timer_init(irq_handler_t handler) cf_pit_clockevent.min_delta_ticks = 0x3f; clockevents_register_device(&cf_pit_clockevent); - setup_irq(MCF_IRQ_PIT1, &pit_irq); + ret = request_irq(MCF_IRQ_PIT1, pit_tick, IRQF_TIMER, "timer", NULL); + if (ret) { + pr_err("Failed to request irq %d (timer): %pe\n", MCF_IRQ_PIT1, + ERR_PTR(ret)); + } clocksource_register_hz(&pit_clk, FREQ); } diff --git a/arch/m68k/coldfire/sltimers.c b/arch/m68k/coldfire/sltimers.c index 1b11e7bacab3..5ab81c9c552d 100644 --- a/arch/m68k/coldfire/sltimers.c +++ b/arch/m68k/coldfire/sltimers.c @@ -50,18 +50,19 @@ irqreturn_t mcfslt_profile_tick(int irq, void *dummy) return IRQ_HANDLED; } -static struct irqaction mcfslt_profile_irq = { - .name = "profile timer", - .flags = IRQF_TIMER, - .handler = mcfslt_profile_tick, -}; - void mcfslt_profile_init(void) { + int ret; + printk(KERN_INFO "PROFILE: lodging TIMER 1 @ %dHz as profile timer\n", PROFILEHZ); - setup_irq(MCF_IRQ_PROFILER, &mcfslt_profile_irq); + ret = request_irq(MCF_IRQ_PROFILER, mcfslt_profile_tick, IRQF_TIMER, + "profile timer", NULL); + if (ret) { + pr_err("Failed to request irq %d (profile timer): %pe\n", + MCF_IRQ_PROFILER, ERR_PTR(ret)); + } /* Set up TIMER 2 as high speed profile clock */ __raw_writel(MCF_BUSCLK / PROFILEHZ - 1, PA(MCFSLT_STCNT)); @@ -92,12 +93,6 @@ static irqreturn_t mcfslt_tick(int irq, void *dummy) return timer_interrupt(irq, dummy); } -static struct irqaction mcfslt_timer_irq = { - .name = "timer", - .flags = IRQF_TIMER, - .handler = mcfslt_tick, -}; - static u64 mcfslt_read_clk(struct clocksource *cs) { unsigned long flags; @@ -126,6 +121,8 @@ static struct 
clocksource mcfslt_clk = { void hw_timer_init(irq_handler_t handler) { + int r; + mcfslt_cycles_per_jiffy = MCF_BUSCLK / HZ; /* * The coldfire slice timer (SLT) runs from STCNT to 0 included, @@ -140,7 +137,11 @@ void hw_timer_init(irq_handler_t handler) mcfslt_cnt = mcfslt_cycles_per_jiffy; timer_interrupt = handler; - setup_irq(MCF_IRQ_TIMER, &mcfslt_timer_irq); + r = request_irq(MCF_IRQ_TIMER, mcfslt_tick, IRQF_TIMER, "timer", NULL); + if (r) { + pr_err("Failed to request irq %d (timer): %pe\n", MCF_IRQ_TIMER, + ERR_PTR(r)); + } clocksource_register_hz(&mcfslt_clk, MCF_BUSCLK); diff --git a/arch/m68k/coldfire/timers.c b/arch/m68k/coldfire/timers.c index 227aa5d13709..b8301fddf901 100644 --- a/arch/m68k/coldfire/timers.c +++ b/arch/m68k/coldfire/timers.c @@ -82,14 +82,6 @@ static irqreturn_t mcftmr_tick(int irq, void *dummy) /***************************************************************************/ -static struct irqaction mcftmr_timer_irq = { - .name = "timer", - .flags = IRQF_TIMER, - .handler = mcftmr_tick, -}; - -/***************************************************************************/ - static u64 mcftmr_read_clk(struct clocksource *cs) { unsigned long flags; @@ -118,6 +110,8 @@ static struct clocksource mcftmr_clk = { void hw_timer_init(irq_handler_t handler) { + int r; + __raw_writew(MCFTIMER_TMR_DISABLE, TA(MCFTIMER_TMR)); mcftmr_cycles_per_jiffy = FREQ / HZ; /* @@ -134,7 +128,11 @@ void hw_timer_init(irq_handler_t handler) timer_interrupt = handler; init_timer_irq(); - setup_irq(MCF_IRQ_TIMER, &mcftmr_timer_irq); + r = request_irq(MCF_IRQ_TIMER, mcftmr_tick, IRQF_TIMER, "timer", NULL); + if (r) { + pr_err("Failed to request irq %d (timer): %pe\n", MCF_IRQ_TIMER, + ERR_PTR(r)); + } #ifdef CONFIG_HIGHPROFILE coldfire_profile_init(); @@ -170,14 +168,10 @@ irqreturn_t coldfire_profile_tick(int irq, void *dummy) /***************************************************************************/ -static struct irqaction coldfire_profile_irq = { - .name = 
"profile timer", - .flags = IRQF_TIMER, - .handler = coldfire_profile_tick, -}; - void coldfire_profile_init(void) { + int ret; + printk(KERN_INFO "PROFILE: lodging TIMER2 @ %dHz as profile timer\n", PROFILEHZ); @@ -188,7 +182,12 @@ void coldfire_profile_init(void) __raw_writew(MCFTIMER_TMR_ENORI | MCFTIMER_TMR_CLK16 | MCFTIMER_TMR_RESTART | MCFTIMER_TMR_ENABLE, PA(MCFTIMER_TMR)); - setup_irq(MCF_IRQ_PROFILER, &coldfire_profile_irq); + ret = request_irq(MCF_IRQ_PROFILER, coldfire_profile_tick, IRQF_TIMER, + "profile timer", NULL); + if (ret) { + pr_err("Failed to request irq %d (profile timer): %pe\n", + MCF_IRQ_PROFILER, ERR_PTR(ret)); + } } /***************************************************************************/ From d3ab332a5021235a74fd832a49c6a99404920d88 Mon Sep 17 00:00:00 2001 From: Zong Li Date: Tue, 10 Mar 2020 00:55:36 +0800 Subject: [PATCH 12/65] riscv: add ARCH_HAS_SET_MEMORY support Add set_memory_ro/rw/x/nx architecture hooks to change the page attribution. Use own set_memory.h rather than generic set_memory.h (i.e. include/asm-generic/set_memory.h), because we want to add other function prototypes here. 
Signed-off-by: Zong Li Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/set_memory.h | 24 +++++ arch/riscv/mm/Makefile | 2 +- arch/riscv/mm/pageattr.c | 150 ++++++++++++++++++++++++++++ 4 files changed, 176 insertions(+), 1 deletion(-) create mode 100644 arch/riscv/include/asm/set_memory.h create mode 100644 arch/riscv/mm/pageattr.c diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 73f029eae0cc..3da5c2004191 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -59,6 +59,7 @@ config RISCV select HAVE_EBPF_JIT if 64BIT select EDAC_SUPPORT select ARCH_HAS_GIGANTIC_PAGE + select ARCH_HAS_SET_MEMORY select ARCH_WANT_HUGE_PMD_SHARE if 64BIT select SPARSEMEM_STATIC if 32BIT select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h new file mode 100644 index 000000000000..79a810f0f38b --- /dev/null +++ b/arch/riscv/include/asm/set_memory.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2019 SiFive + */ + +#ifndef _ASM_RISCV_SET_MEMORY_H +#define _ASM_RISCV_SET_MEMORY_H + +/* + * Functions to change memory attributes. 
+ */ +#ifdef CONFIG_MMU +int set_memory_ro(unsigned long addr, int numpages); +int set_memory_rw(unsigned long addr, int numpages); +int set_memory_x(unsigned long addr, int numpages); +int set_memory_nx(unsigned long addr, int numpages); +#else +static inline int set_memory_ro(unsigned long addr, int numpages) { return 0; } +static inline int set_memory_rw(unsigned long addr, int numpages) { return 0; } +static inline int set_memory_x(unsigned long addr, int numpages) { return 0; } +static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } +#endif + +#endif /* _ASM_RISCV_SET_MEMORY_H */ diff --git a/arch/riscv/mm/Makefile b/arch/riscv/mm/Makefile index 50b7af58c566..e0e10b618273 100644 --- a/arch/riscv/mm/Makefile +++ b/arch/riscv/mm/Makefile @@ -7,7 +7,7 @@ endif obj-y += init.o obj-y += extable.o -obj-$(CONFIG_MMU) += fault.o +obj-$(CONFIG_MMU) += fault.o pageattr.o obj-y += cacheflush.o obj-y += context.o diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c new file mode 100644 index 000000000000..fcd59ef2835b --- /dev/null +++ b/arch/riscv/mm/pageattr.c @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2019 SiFive + */ + +#include +#include +#include +#include + +struct pageattr_masks { + pgprot_t set_mask; + pgprot_t clear_mask; +}; + +static unsigned long set_pageattr_masks(unsigned long val, struct mm_walk *walk) +{ + struct pageattr_masks *masks = walk->private; + unsigned long new_val = val; + + new_val &= ~(pgprot_val(masks->clear_mask)); + new_val |= (pgprot_val(masks->set_mask)); + + return new_val; +} + +static int pageattr_pgd_entry(pgd_t *pgd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pgd_t val = READ_ONCE(*pgd); + + if (pgd_leaf(val)) { + val = __pgd(set_pageattr_masks(pgd_val(val), walk)); + set_pgd(pgd, val); + } + + return 0; +} + +static int pageattr_p4d_entry(p4d_t *p4d, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + p4d_t val = 
READ_ONCE(*p4d); + + if (p4d_leaf(val)) { + val = __p4d(set_pageattr_masks(p4d_val(val), walk)); + set_p4d(p4d, val); + } + + return 0; +} + +static int pageattr_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pud_t val = READ_ONCE(*pud); + + if (pud_leaf(val)) { + val = __pud(set_pageattr_masks(pud_val(val), walk)); + set_pud(pud, val); + } + + return 0; +} + +static int pageattr_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pmd_t val = READ_ONCE(*pmd); + + if (pmd_leaf(val)) { + val = __pmd(set_pageattr_masks(pmd_val(val), walk)); + set_pmd(pmd, val); + } + + return 0; +} + +static int pageattr_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t val = READ_ONCE(*pte); + + val = __pte(set_pageattr_masks(pte_val(val), walk)); + set_pte(pte, val); + + return 0; +} + +static int pageattr_pte_hole(unsigned long addr, unsigned long next, + int depth, struct mm_walk *walk) +{ + /* Nothing to do here */ + return 0; +} + +const static struct mm_walk_ops pageattr_ops = { + .pgd_entry = pageattr_pgd_entry, + .p4d_entry = pageattr_p4d_entry, + .pud_entry = pageattr_pud_entry, + .pmd_entry = pageattr_pmd_entry, + .pte_entry = pageattr_pte_entry, + .pte_hole = pageattr_pte_hole, +}; + +static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask, + pgprot_t clear_mask) +{ + int ret; + unsigned long start = addr; + unsigned long end = start + PAGE_SIZE * numpages; + struct pageattr_masks masks = { + .set_mask = set_mask, + .clear_mask = clear_mask + }; + + if (!numpages) + return 0; + + down_read(&init_mm.mmap_sem); + ret = walk_page_range_novma(&init_mm, start, end, &pageattr_ops, NULL, + &masks); + up_read(&init_mm.mmap_sem); + + flush_tlb_kernel_range(start, end); + + return ret; +} + +int set_memory_ro(unsigned long addr, int numpages) +{ + return __set_memory(addr, numpages, __pgprot(_PAGE_READ), + __pgprot(_PAGE_WRITE)); +} + 
+int set_memory_rw(unsigned long addr, int numpages) +{ + return __set_memory(addr, numpages, __pgprot(_PAGE_READ | _PAGE_WRITE), + __pgprot(0)); +} + +int set_memory_x(unsigned long addr, int numpages) +{ + return __set_memory(addr, numpages, __pgprot(_PAGE_EXEC), __pgprot(0)); +} + +int set_memory_nx(unsigned long addr, int numpages) +{ + return __set_memory(addr, numpages, __pgprot(0), __pgprot(_PAGE_EXEC)); +} From 395a21ff859c9c2471ea62d7d56af8a85ec333f7 Mon Sep 17 00:00:00 2001 From: Zong Li Date: Tue, 10 Mar 2020 00:55:37 +0800 Subject: [PATCH 13/65] riscv: add ARCH_HAS_SET_DIRECT_MAP support Add set_direct_map_*() functions for setting the direct map alias for the page to its default permissions and to an invalid state that cannot be cached in a TLB. (See d253ca0c ("x86/mm/cpa: Add set_direct_map_*() functions")) Add a similar implementation for RISC-V. Signed-off-by: Zong Li Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/set_memory.h | 3 +++ arch/riscv/mm/pageattr.c | 24 ++++++++++++++++++++++++ 3 files changed, 28 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 3da5c2004191..34fc7450b9dd 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -59,6 +59,7 @@ config RISCV select HAVE_EBPF_JIT if 64BIT select EDAC_SUPPORT select ARCH_HAS_GIGANTIC_PAGE + select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_SET_MEMORY select ARCH_WANT_HUGE_PMD_SHARE if 64BIT select SPARSEMEM_STATIC if 32BIT diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h index 79a810f0f38b..620d81c372d9 100644 --- a/arch/riscv/include/asm/set_memory.h +++ b/arch/riscv/include/asm/set_memory.h @@ -21,4 +21,7 @@ static inline int set_memory_x(unsigned long addr, int numpages) { return 0; } static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } #endif +int set_direct_map_invalid_noflush(struct page *page); +int set_direct_map_default_noflush(struct page *page); + #endif /* 
_ASM_RISCV_SET_MEMORY_H */ diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c index fcd59ef2835b..7be6cd67e2ef 100644 --- a/arch/riscv/mm/pageattr.c +++ b/arch/riscv/mm/pageattr.c @@ -148,3 +148,27 @@ int set_memory_nx(unsigned long addr, int numpages) { return __set_memory(addr, numpages, __pgprot(0), __pgprot(_PAGE_EXEC)); } + +int set_direct_map_invalid_noflush(struct page *page) +{ + unsigned long start = (unsigned long)page_address(page); + unsigned long end = start + PAGE_SIZE; + struct pageattr_masks masks = { + .set_mask = __pgprot(0), + .clear_mask = __pgprot(_PAGE_PRESENT) + }; + + return walk_page_range(&init_mm, start, end, &pageattr_ops, &masks); +} + +int set_direct_map_default_noflush(struct page *page) +{ + unsigned long start = (unsigned long)page_address(page); + unsigned long end = start + PAGE_SIZE; + struct pageattr_masks masks = { + .set_mask = PAGE_KERNEL, + .clear_mask = __pgprot(0) + }; + + return walk_page_range(&init_mm, start, end, &pageattr_ops, &masks); +} From 5fde3db5eb028b95aeefa1ab192d36800414e8b8 Mon Sep 17 00:00:00 2001 From: Zong Li Date: Tue, 10 Mar 2020 00:55:38 +0800 Subject: [PATCH 14/65] riscv: add ARCH_SUPPORTS_DEBUG_PAGEALLOC support ARCH_SUPPORTS_DEBUG_PAGEALLOC provides a hook to map and unmap pages for debugging purposes. Implement the __kernel_map_pages functions to fill the poison pattern. 
Signed-off-by: Zong Li Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 3 +++ arch/riscv/mm/pageattr.c | 13 +++++++++++++ 2 files changed, 16 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 34fc7450b9dd..31fbc6ef7a6d 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -131,6 +131,9 @@ config ARCH_SELECT_MEMORY_MODEL config ARCH_WANT_GENERAL_HUGETLB def_bool y +config ARCH_SUPPORTS_DEBUG_PAGEALLOC + def_bool y + config SYS_SUPPORTS_HUGETLBFS def_bool y diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c index 7be6cd67e2ef..728759eb530a 100644 --- a/arch/riscv/mm/pageattr.c +++ b/arch/riscv/mm/pageattr.c @@ -172,3 +172,16 @@ int set_direct_map_default_noflush(struct page *page) return walk_page_range(&init_mm, start, end, &pageattr_ops, &masks); } + +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + if (!debug_pagealloc_enabled()) + return; + + if (enable) + __set_memory((unsigned long)page_address(page), numpages, + __pgprot(_PAGE_PRESENT), __pgprot(0)); + else + __set_memory((unsigned long)page_address(page), numpages, + __pgprot(0), __pgprot(_PAGE_PRESENT)); +} From bd3d914d16aaf82412771a2d673299d4b5e3aeda Mon Sep 17 00:00:00 2001 From: Zong Li Date: Tue, 10 Mar 2020 00:55:39 +0800 Subject: [PATCH 15/65] riscv: move exception table immediately after RO_DATA Move EXCEPTION_TABLE immediately after RO_DATA. Make it easy to set the attribution of the sections which should be read-only at a time. Add _data to specify the start of data section with write permission. This patch is prepared for STRICT_KERNEL_RWX support. 
Signed-off-by: Zong Li Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/vmlinux.lds.S | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S index a8fb52a00295..9e8adca9b20f 100644 --- a/arch/riscv/kernel/vmlinux.lds.S +++ b/arch/riscv/kernel/vmlinux.lds.S @@ -58,6 +58,10 @@ SECTIONS *(.srodata*) } + EXCEPTION_TABLE(0x10) + + _data = .; + RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) .sdata : { __global_pointer$ = . + 0x800; @@ -68,8 +72,6 @@ SECTIONS BSS_SECTION(PAGE_SIZE, PAGE_SIZE, 0) - EXCEPTION_TABLE(0x10) - .rel.dyn : { *(.rel.dyn*) } From 00cb41d5ad3189f52a59f42766918557693f94fa Mon Sep 17 00:00:00 2001 From: Zong Li Date: Tue, 10 Mar 2020 00:55:40 +0800 Subject: [PATCH 16/65] riscv: add alignment for text, rodata and data sections The kernel mapping tries to optimize its mapping by using a bigger mapping size. In rv64, it tries to use PMD_SIZE, and in rv32 it tries to use PGDIR_SIZE. To ensure that the start address of these sections fits the mapping entry size, align them to the biggest alignment. Define a macro SECTION_ALIGN because HPAGE_SIZE, PMD_SIZE, etc., are not visible in the linker script. This patch is prepared for STRICT_KERNEL_RWX support. Signed-off-by: Zong Li Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/set_memory.h | 13 +++++++++++++ arch/riscv/kernel/vmlinux.lds.S | 5 ++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h index 620d81c372d9..4c5bae7ca01c 100644 --- a/arch/riscv/include/asm/set_memory.h +++ b/arch/riscv/include/asm/set_memory.h @@ -6,6 +6,7 @@ #ifndef _ASM_RISCV_SET_MEMORY_H #define _ASM_RISCV_SET_MEMORY_H +#ifndef __ASSEMBLY__ /* * Functions to change memory attributes.
*/ @@ -24,4 +25,16 @@ static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); +#endif /* __ASSEMBLY__ */ + +#ifdef CONFIG_ARCH_HAS_STRICT_KERNEL_RWX +#ifdef CONFIG_64BIT +#define SECTION_ALIGN (1 << 21) +#else +#define SECTION_ALIGN (1 << 22) +#endif +#else /* !CONFIG_ARCH_HAS_STRICT_KERNEL_RWX */ +#define SECTION_ALIGN L1_CACHE_BYTES +#endif /* CONFIG_ARCH_HAS_STRICT_KERNEL_RWX */ + #endif /* _ASM_RISCV_SET_MEMORY_H */ diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S index 9e8adca9b20f..4ad3c8eb241d 100644 --- a/arch/riscv/kernel/vmlinux.lds.S +++ b/arch/riscv/kernel/vmlinux.lds.S @@ -9,6 +9,7 @@ #include #include #include +#include OUTPUT_ARCH(riscv) ENTRY(_start) @@ -36,6 +37,7 @@ SECTIONS PERCPU_SECTION(L1_CACHE_BYTES) __init_end = .; + . = ALIGN(SECTION_ALIGN); .text : { _text = .; _stext = .; @@ -53,13 +55,14 @@ SECTIONS /* Start of data section */ _sdata = .; - RO_DATA(L1_CACHE_BYTES) + RO_DATA(SECTION_ALIGN) .srodata : { *(.srodata*) } EXCEPTION_TABLE(0x10) + . = ALIGN(SECTION_ALIGN); _data = .; RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) From d27c3c90817e4c5ea655714065a725b4abd576f9 Mon Sep 17 00:00:00 2001 From: Zong Li Date: Tue, 10 Mar 2020 00:55:41 +0800 Subject: [PATCH 17/65] riscv: add STRICT_KERNEL_RWX support This commit makes the text section non-writable, the rodata section read-only, and the data section non-executable. The init section is also changed to non-executable.
Signed-off-by: Zong Li Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/set_memory.h | 8 ++++++ arch/riscv/mm/init.c | 44 +++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 31fbc6ef7a6d..dfc9c3ea9f7d 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -61,6 +61,7 @@ config RISCV select ARCH_HAS_GIGANTIC_PAGE select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_SET_MEMORY + select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_WANT_HUGE_PMD_SHARE if 64BIT select SPARSEMEM_STATIC if 32BIT select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h index 4c5bae7ca01c..c38df4771c09 100644 --- a/arch/riscv/include/asm/set_memory.h +++ b/arch/riscv/include/asm/set_memory.h @@ -22,6 +22,14 @@ static inline int set_memory_x(unsigned long addr, int numpages) { return 0; } static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } #endif +#ifdef CONFIG_STRICT_KERNEL_RWX +void set_kernel_text_ro(void); +void set_kernel_text_rw(void); +#else +static inline void set_kernel_text_ro(void) { } +static inline void set_kernel_text_rw(void) { } +#endif + int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 965a8cf4829c..0c625a5e98db 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -477,6 +478,17 @@ static void __init setup_vm_final(void) csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE); local_flush_tlb_all(); } + +void free_initmem(void) +{ + unsigned long init_begin = (unsigned long)__init_begin; + unsigned long init_end = (unsigned long)__init_end; + + /* Make the region as non-execuatble. 
*/ + set_memory_nx(init_begin, (init_end - init_begin) >> PAGE_SHIFT); + free_initmem_default(POISON_FREE_INITMEM); +} + #else asmlinkage void __init setup_vm(uintptr_t dtb_pa) { @@ -488,6 +500,38 @@ static inline void setup_vm_final(void) } #endif /* CONFIG_MMU */ +#ifdef CONFIG_STRICT_KERNEL_RWX +void set_kernel_text_rw(void) +{ + unsigned long text_start = (unsigned long)_text; + unsigned long text_end = (unsigned long)_etext; + + set_memory_rw(text_start, (text_end - text_start) >> PAGE_SHIFT); +} + +void set_kernel_text_ro(void) +{ + unsigned long text_start = (unsigned long)_text; + unsigned long text_end = (unsigned long)_etext; + + set_memory_ro(text_start, (text_end - text_start) >> PAGE_SHIFT); +} + +void mark_rodata_ro(void) +{ + unsigned long text_start = (unsigned long)_text; + unsigned long text_end = (unsigned long)_etext; + unsigned long rodata_start = (unsigned long)__start_rodata; + unsigned long data_start = (unsigned long)_data; + unsigned long max_low = (unsigned long)(__va(PFN_PHYS(max_low_pfn))); + + set_memory_ro(text_start, (text_end - text_start) >> PAGE_SHIFT); + set_memory_ro(rodata_start, (data_start - rodata_start) >> PAGE_SHIFT); + set_memory_nx(rodata_start, (data_start - rodata_start) >> PAGE_SHIFT); + set_memory_nx(data_start, (max_low - data_start) >> PAGE_SHIFT); +} +#endif + void __init paging_init(void) { setup_vm_final(); From b42d763a2d412d6ef7c29cb2f1b3e9985e2b1e38 Mon Sep 17 00:00:00 2001 From: Zong Li Date: Tue, 10 Mar 2020 00:55:42 +0800 Subject: [PATCH 18/65] riscv: add macro to get instruction length Extract the calculation of instruction length for common use. 
Signed-off-by: Zong Li Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/bug.h | 8 ++++++++ arch/riscv/kernel/traps.c | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/bug.h b/arch/riscv/include/asm/bug.h index 75604fec1b1b..d6f1ec08d97b 100644 --- a/arch/riscv/include/asm/bug.h +++ b/arch/riscv/include/asm/bug.h @@ -19,6 +19,14 @@ #define __BUG_INSN_32 _UL(0x00100073) /* ebreak */ #define __BUG_INSN_16 _UL(0x9002) /* c.ebreak */ +#define GET_INSN_LENGTH(insn) \ +({ \ + unsigned long __len; \ + __len = ((insn & __INSN_LENGTH_MASK) == __INSN_LENGTH_32) ? \ + 4UL : 2UL; \ + __len; \ +}) + typedef u32 bug_insn_t; #ifdef CONFIG_GENERIC_BUG_RELATIVE_POINTERS diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index ffb3d94bf0cc..a4d136355f78 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -118,7 +118,8 @@ static inline unsigned long get_break_insn_length(unsigned long pc) if (probe_kernel_address((bug_insn_t *)pc, insn)) return 0; - return (((insn & __INSN_LENGTH_MASK) == __INSN_LENGTH_32) ? 4UL : 2UL); + + return GET_INSN_LENGTH(insn); } asmlinkage __visible void do_trap_break(struct pt_regs *regs) From 043cb41a85de1c0e944da61ad7a264960e22c865 Mon Sep 17 00:00:00 2001 From: Zong Li Date: Tue, 10 Mar 2020 00:55:43 +0800 Subject: [PATCH 19/65] riscv: introduce interfaces to patch kernel code With strict kernel memory permissions, we cannot patch code without write permission. Reserve two slots in the fixmap area, so we can temporarily map the kernel code into the fixmap area and then patch the instructions. We need two pages here because we support compressed instructions, so an instruction might be aligned to only 2 bytes. When patching a 32-bit instruction that is only 2-byte aligned, the instruction may span two pages. Introduce two interfaces to patch kernel code: riscv_patch_text_nosync: - patch code without synchronization; it is the caller's responsibility to synchronize all CPUs if needed.
riscv_patch_text: - patch code and always synchronize with stop_machine() Signed-off-by: Zong Li Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/fixmap.h | 2 + arch/riscv/include/asm/patch.h | 12 ++++ arch/riscv/kernel/Makefile | 4 +- arch/riscv/kernel/patch.c | 120 ++++++++++++++++++++++++++++++++ 4 files changed, 137 insertions(+), 1 deletion(-) create mode 100644 arch/riscv/include/asm/patch.h create mode 100644 arch/riscv/kernel/patch.c diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h index 42d2c42f3cc9..2368d49eb4ef 100644 --- a/arch/riscv/include/asm/fixmap.h +++ b/arch/riscv/include/asm/fixmap.h @@ -27,6 +27,8 @@ enum fixed_addresses { FIX_FDT = FIX_FDT_END + FIX_FDT_SIZE / PAGE_SIZE - 1, FIX_PTE, FIX_PMD, + FIX_TEXT_POKE1, + FIX_TEXT_POKE0, FIX_EARLYCON_MEM_BASE, __end_of_fixed_addresses }; diff --git a/arch/riscv/include/asm/patch.h b/arch/riscv/include/asm/patch.h new file mode 100644 index 000000000000..b5918a6e0615 --- /dev/null +++ b/arch/riscv/include/asm/patch.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020 SiFive + */ + +#ifndef _ASM_RISCV_PATCH_H +#define _ASM_RISCV_PATCH_H + +int riscv_patch_text_nosync(void *addr, const void *insns, size_t len); +int riscv_patch_text(void *addr, u32 insn); + +#endif /* _ASM_RISCV_PATCH_H */ diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index f40205cb9a22..d189bd3d8501 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -4,7 +4,8 @@ # ifdef CONFIG_FTRACE -CFLAGS_REMOVE_ftrace.o = -pg +CFLAGS_REMOVE_ftrace.o = -pg +CFLAGS_REMOVE_patch.o = -pg endif extra-y += head.o @@ -26,6 +27,7 @@ obj-y += traps.o obj-y += riscv_ksyms.o obj-y += stacktrace.o obj-y += cacheinfo.o +obj-y += patch.o obj-$(CONFIG_MMU) += vdso.o vdso/ obj-$(CONFIG_RISCV_M_MODE) += clint.o diff --git a/arch/riscv/kernel/patch.c b/arch/riscv/kernel/patch.c new file mode 100644 index 000000000000..8a4fc65ee022 --- /dev/null 
+++ b/arch/riscv/kernel/patch.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 SiFive + */ + +#include +#include +#include +#include +#include +#include +#include + +struct riscv_insn_patch { + void *addr; + u32 insn; + atomic_t cpu_count; +}; + +#ifdef CONFIG_MMU +static DEFINE_RAW_SPINLOCK(patch_lock); + +static void __kprobes *patch_map(void *addr, int fixmap) +{ + uintptr_t uintaddr = (uintptr_t) addr; + struct page *page; + + if (core_kernel_text(uintaddr)) + page = phys_to_page(__pa_symbol(addr)); + else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + page = vmalloc_to_page(addr); + else + return addr; + + BUG_ON(!page); + + return (void *)set_fixmap_offset(fixmap, page_to_phys(page) + + (uintaddr & ~PAGE_MASK)); +} + +static void __kprobes patch_unmap(int fixmap) +{ + clear_fixmap(fixmap); +} + +static int __kprobes riscv_insn_write(void *addr, const void *insn, size_t len) +{ + void *waddr = addr; + bool across_pages = (((uintptr_t) addr & ~PAGE_MASK) + len) > PAGE_SIZE; + unsigned long flags = 0; + int ret; + + raw_spin_lock_irqsave(&patch_lock, flags); + + if (across_pages) + patch_map(addr + len, FIX_TEXT_POKE1); + + waddr = patch_map(addr, FIX_TEXT_POKE0); + + ret = probe_kernel_write(waddr, insn, len); + + patch_unmap(FIX_TEXT_POKE0); + + if (across_pages) + patch_unmap(FIX_TEXT_POKE1); + + raw_spin_unlock_irqrestore(&patch_lock, flags); + + return ret; +} +#else +static int __kprobes riscv_insn_write(void *addr, const void *insn, size_t len) +{ + return probe_kernel_write(addr, insn, len); +} +#endif /* CONFIG_MMU */ + +int __kprobes riscv_patch_text_nosync(void *addr, const void *insns, size_t len) +{ + u32 *tp = addr; + int ret; + + ret = riscv_insn_write(tp, insns, len); + + if (!ret) + flush_icache_range((uintptr_t) tp, (uintptr_t) tp + len); + + return ret; +} + +static int __kprobes riscv_patch_text_cb(void *data) +{ + struct riscv_insn_patch *patch = data; + int ret = 0; + + if 
(atomic_inc_return(&patch->cpu_count) == 1) { + ret = + riscv_patch_text_nosync(patch->addr, &patch->insn, + GET_INSN_LENGTH(patch->insn)); + atomic_inc(&patch->cpu_count); + } else { + while (atomic_read(&patch->cpu_count) <= num_online_cpus()) + cpu_relax(); + smp_mb(); + } + + return ret; +} + +int __kprobes riscv_patch_text(void *addr, u32 insn) +{ + struct riscv_insn_patch patch = { + .addr = addr, + .insn = insn, + .cpu_count = ATOMIC_INIT(0), + }; + + return stop_machine_cpuslocked(riscv_patch_text_cb, + &patch, cpu_online_mask); +} From 8fdddb2eae731dd6beb89f9b812e5915d1beb744 Mon Sep 17 00:00:00 2001 From: Zong Li Date: Tue, 10 Mar 2020 00:55:44 +0800 Subject: [PATCH 20/65] riscv: patch code by fixmap mapping With strict kernel memory permissions, ftrace has to change the permissions of the text in order to dynamically patch the instructions. Use riscv_patch_text_nosync() to patch code instead of probe_kernel_write(). Signed-off-by: Zong Li Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/ftrace.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c index c40fdcdeb950..ce69b34ff55d 100644 --- a/arch/riscv/kernel/ftrace.c +++ b/arch/riscv/kernel/ftrace.c @@ -8,6 +8,7 @@ #include #include #include +#include #ifdef CONFIG_DYNAMIC_FTRACE static int ftrace_check_current_call(unsigned long hook_pos, @@ -46,20 +47,14 @@ static int __ftrace_modify_call(unsigned long hook_pos, unsigned long target, { unsigned int call[2]; unsigned int nops[2] = {NOP4, NOP4}; - int ret = 0; make_call(hook_pos, target, call); - /* replace the auipc-jalr pair at once */ - ret = probe_kernel_write((void *)hook_pos, enable ? call : nops, - MCOUNT_INSN_SIZE); - /* return must be -EPERM on write error */ - if (ret) + /* Replace the auipc-jalr pair at once. Return -EPERM on write error. */ + if (riscv_patch_text_nosync + ((void *)hook_pos, enable ?
call : nops, MCOUNT_INSN_SIZE)) return -EPERM; - smp_mb(); - flush_icache_range((void *)hook_pos, (void *)hook_pos + MCOUNT_INSN_SIZE); - return 0; } From 59c4da8640ccf4721d54d36835706f3eefb521a4 Mon Sep 17 00:00:00 2001 From: Zong Li Date: Thu, 12 Mar 2020 10:58:35 +0800 Subject: [PATCH 21/65] riscv: Add support to dump the kernel page tables In a similar manner to arm64, x86, powerpc, etc., it can traverse all page tables, and dump the page table layout with the memory types and permissions. Add a debugfs file at /sys/kernel/debug/kernel_page_tables to export the page table layout to userspace. Signed-off-by: Zong Li Tested-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/pgtable.h | 10 + arch/riscv/include/asm/ptdump.h | 11 ++ arch/riscv/mm/Makefile | 1 + arch/riscv/mm/ptdump.c | 317 +++++++++++++++++++++++++++++++ 5 files changed, 340 insertions(+) create mode 100644 arch/riscv/include/asm/ptdump.h create mode 100644 arch/riscv/mm/ptdump.c diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index dfc9c3ea9f7d..b2764a0a0c8c 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -29,6 +29,7 @@ config RISCV select GENERIC_SMP_IDLE_THREAD select GENERIC_ATOMIC64 if !64BIT select GENERIC_IOREMAP + select GENERIC_PTDUMP if MMU select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_SECCOMP_FILTER select HAVE_ASM_MODVERSIONS diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 393f2014dfee..9c188ad2e52d 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -448,6 +448,16 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +/* + * In the RV64 Linux scheme, we give the user half of the virtual-address space + * and give the kernel the other (upper) half. 
+ */ +#ifdef CONFIG_64BIT +#define KERN_VIRT_START (-(BIT(CONFIG_VA_BITS)) + TASK_SIZE) +#else +#define KERN_VIRT_START FIXADDR_START +#endif + /* * Task size is 0x4000000000 for RV64 or 0x9fc00000 for RV32. * Note that PGDIR_SIZE must evenly divide TASK_SIZE. diff --git a/arch/riscv/include/asm/ptdump.h b/arch/riscv/include/asm/ptdump.h new file mode 100644 index 000000000000..e29af7191909 --- /dev/null +++ b/arch/riscv/include/asm/ptdump.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 SiFive + */ + +#ifndef _ASM_RISCV_PTDUMP_H +#define _ASM_RISCV_PTDUMP_H + +void ptdump_check_wx(void); + +#endif /* _ASM_RISCV_PTDUMP_H */ diff --git a/arch/riscv/mm/Makefile b/arch/riscv/mm/Makefile index e0e10b618273..363ef01c30b1 100644 --- a/arch/riscv/mm/Makefile +++ b/arch/riscv/mm/Makefile @@ -15,6 +15,7 @@ ifeq ($(CONFIG_MMU),y) obj-$(CONFIG_SMP) += tlbflush.o endif obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_KASAN) += kasan_init.o ifdef CONFIG_KASAN diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c new file mode 100644 index 000000000000..7eab76a93106 --- /dev/null +++ b/arch/riscv/mm/ptdump.c @@ -0,0 +1,317 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2019 SiFive + */ + +#include +#include +#include +#include + +#include +#include +#include + +#define pt_dump_seq_printf(m, fmt, args...) \ +({ \ + if (m) \ + seq_printf(m, fmt, ##args); \ +}) + +#define pt_dump_seq_puts(m, fmt) \ +({ \ + if (m) \ + seq_printf(m, fmt); \ +}) + +/* + * The page dumper groups page table entries of the same type into a single + * description. It uses pg_state to track the range information while + * iterating over the pte entries. When the continuity is broken it then + * dumps out a description of the range. 
+ */ +struct pg_state { + struct ptdump_state ptdump; + struct seq_file *seq; + const struct addr_marker *marker; + unsigned long start_address; + unsigned long start_pa; + unsigned long last_pa; + int level; + u64 current_prot; + bool check_wx; + unsigned long wx_pages; +}; + +/* Address marker */ +struct addr_marker { + unsigned long start_address; + const char *name; +}; + +static struct addr_marker address_markers[] = { +#ifdef CONFIG_KASAN + {KASAN_SHADOW_START, "Kasan shadow start"}, + {KASAN_SHADOW_END, "Kasan shadow end"}, +#endif + {FIXADDR_START, "Fixmap start"}, + {FIXADDR_TOP, "Fixmap end"}, + {PCI_IO_START, "PCI I/O start"}, + {PCI_IO_END, "PCI I/O end"}, +#ifdef CONFIG_SPARSEMEM_VMEMMAP + {VMEMMAP_START, "vmemmap start"}, + {VMEMMAP_END, "vmemmap end"}, +#endif + {VMALLOC_START, "vmalloc() area"}, + {VMALLOC_END, "vmalloc() end"}, + {PAGE_OFFSET, "Linear mapping"}, + {-1, NULL}, +}; + +/* Page Table Entry */ +struct prot_bits { + u64 mask; + u64 val; + const char *set; + const char *clear; +}; + +static const struct prot_bits pte_bits[] = { + { + .mask = _PAGE_SOFT, + .val = _PAGE_SOFT, + .set = "RSW", + .clear = " ", + }, { + .mask = _PAGE_DIRTY, + .val = _PAGE_DIRTY, + .set = "D", + .clear = ".", + }, { + .mask = _PAGE_ACCESSED, + .val = _PAGE_ACCESSED, + .set = "A", + .clear = ".", + }, { + .mask = _PAGE_GLOBAL, + .val = _PAGE_GLOBAL, + .set = "G", + .clear = ".", + }, { + .mask = _PAGE_USER, + .val = _PAGE_USER, + .set = "U", + .clear = ".", + }, { + .mask = _PAGE_EXEC, + .val = _PAGE_EXEC, + .set = "X", + .clear = ".", + }, { + .mask = _PAGE_WRITE, + .val = _PAGE_WRITE, + .set = "W", + .clear = ".", + }, { + .mask = _PAGE_READ, + .val = _PAGE_READ, + .set = "R", + .clear = ".", + }, { + .mask = _PAGE_PRESENT, + .val = _PAGE_PRESENT, + .set = "V", + .clear = ".", + } +}; + +/* Page Level */ +struct pg_level { + const char *name; + u64 mask; +}; + +static struct pg_level pg_level[] = { + { /* pgd */ + .name = "PGD", + }, { /* p4d */ + .name = 
(CONFIG_PGTABLE_LEVELS > 4) ? "P4D" : "PGD", + }, { /* pud */ + .name = (CONFIG_PGTABLE_LEVELS > 3) ? "PUD" : "PGD", + }, { /* pmd */ + .name = (CONFIG_PGTABLE_LEVELS > 2) ? "PMD" : "PGD", + }, { /* pte */ + .name = "PTE", + }, +}; + +static void dump_prot(struct pg_state *st) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(pte_bits); i++) { + const char *s; + + if ((st->current_prot & pte_bits[i].mask) == pte_bits[i].val) + s = pte_bits[i].set; + else + s = pte_bits[i].clear; + + if (s) + pt_dump_seq_printf(st->seq, " %s", s); + } +} + +#ifdef CONFIG_64BIT +#define ADDR_FORMAT "0x%016lx" +#else +#define ADDR_FORMAT "0x%08lx" +#endif +static void dump_addr(struct pg_state *st, unsigned long addr) +{ + static const char units[] = "KMGTPE"; + const char *unit = units; + unsigned long delta; + + pt_dump_seq_printf(st->seq, ADDR_FORMAT "-" ADDR_FORMAT " ", + st->start_address, addr); + + pt_dump_seq_printf(st->seq, " " ADDR_FORMAT " ", st->start_pa); + delta = (addr - st->start_address) >> 10; + + while (!(delta & 1023) && unit[1]) { + delta >>= 10; + unit++; + } + + pt_dump_seq_printf(st->seq, "%9lu%c %s", delta, *unit, + pg_level[st->level].name); +} + +static void note_prot_wx(struct pg_state *st, unsigned long addr) +{ + if (!st->check_wx) + return; + + if ((st->current_prot & (_PAGE_WRITE | _PAGE_EXEC)) != + (_PAGE_WRITE | _PAGE_EXEC)) + return; + + WARN_ONCE(1, "riscv/mm: Found insecure W+X mapping at address %p/%pS\n", + (void *)st->start_address, (void *)st->start_address); + + st->wx_pages += (addr - st->start_address) / PAGE_SIZE; +} + +static void note_page(struct ptdump_state *pt_st, unsigned long addr, + int level, unsigned long val) +{ + struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); + u64 pa = PFN_PHYS(pte_pfn(__pte(val))); + u64 prot = 0; + + if (level >= 0) + prot = val & pg_level[level].mask; + + if (st->level == -1) { + st->level = level; + st->current_prot = prot; + st->start_address = addr; + st->start_pa = pa; + 
st->last_pa = pa; + pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); + } else if (prot != st->current_prot || + level != st->level || addr >= st->marker[1].start_address) { + if (st->current_prot) { + note_prot_wx(st, addr); + dump_addr(st, addr); + dump_prot(st); + pt_dump_seq_puts(st->seq, "\n"); + } + + while (addr >= st->marker[1].start_address) { + st->marker++; + pt_dump_seq_printf(st->seq, "---[ %s ]---\n", + st->marker->name); + } + + st->start_address = addr; + st->start_pa = pa; + st->last_pa = pa; + st->current_prot = prot; + st->level = level; + } else { + st->last_pa = pa; + } +} + +static void ptdump_walk(struct seq_file *s) +{ + struct pg_state st = { + .seq = s, + .marker = address_markers, + .level = -1, + .ptdump = { + .note_page = note_page, + .range = (struct ptdump_range[]) { + {KERN_VIRT_START, ULONG_MAX}, + {0, 0} + } + } + }; + + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); +} + +void ptdump_check_wx(void) +{ + struct pg_state st = { + .seq = NULL, + .marker = (struct addr_marker[]) { + {0, NULL}, + {-1, NULL}, + }, + .level = -1, + .check_wx = true, + .ptdump = { + .note_page = note_page, + .range = (struct ptdump_range[]) { + {KERN_VIRT_START, ULONG_MAX}, + {0, 0} + } + } + }; + + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); + + if (st.wx_pages) + pr_warn("Checked W+X mappings: failed, %lu W+X pages found\n", + st.wx_pages); + else + pr_info("Checked W+X mappings: passed, no W+X pages found\n"); +} + +static int ptdump_show(struct seq_file *m, void *v) +{ + ptdump_walk(m); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(ptdump); + +static int ptdump_init(void) +{ + unsigned int i, j; + + for (i = 0; i < ARRAY_SIZE(pg_level); i++) + for (j = 0; j < ARRAY_SIZE(pte_bits); j++) + pg_level[i].mask |= pte_bits[j].mask; + + debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, + &ptdump_fops); + + return 0; +} + +device_initcall(ptdump_init); From 88d110382555ac2aef3bca8e4f4c6e5602a22faf Mon Sep 17 00:00:00 2001 From: Zong Li 
Date: Thu, 12 Mar 2020 10:58:36 +0800 Subject: [PATCH 22/65] riscv: Use macro definition instead of magic number The KERN_VIRT_START defines the start virtual address of kernel space. Use this macro instead of a magic number. Signed-off-by: Zong Li Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/kasan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h index eee6e6588b12..b47045cb85ce 100644 --- a/arch/riscv/include/asm/kasan.h +++ b/arch/riscv/include/asm/kasan.h @@ -13,7 +13,7 @@ #define KASAN_SHADOW_SCALE_SHIFT 3 #define KASAN_SHADOW_SIZE (UL(1) << (38 - KASAN_SHADOW_SCALE_SHIFT)) -#define KASAN_SHADOW_START 0xffffffc000000000 /* 2^64 - 2^38 */ +#define KASAN_SHADOW_START KERN_VIRT_START /* 2^64 - 2^38 */ #define KASAN_SHADOW_END (KASAN_SHADOW_START + KASAN_SHADOW_SIZE) #define KASAN_SHADOW_OFFSET (KASAN_SHADOW_END - (1ULL << \ From 8446923ae4d776f42bf088ab99b1f91141ab6370 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Tue, 17 Mar 2020 18:11:34 -0700 Subject: [PATCH 23/65] RISC-V: Mark existing SBI as 0.1 SBI. As per the new SBI specification, current SBI implementation version is defined as 0.1 and will be removed/replaced in future. Each of the function calls in 0.1 is defined as a separate extension, which makes it easier to replace them one at a time. Rename existing implementation to reflect that. This patch is just a preparatory patch for SBI v0.2 and doesn't introduce any functional changes.
Signed-off-by: Atish Patra Reviewed-by: Anup Patel Reviewed-by: Palmer Dabbelt Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/sbi.h | 41 +++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 2570c1e683d3..2a637ebd7a22 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2015 Regents of the University of California + * Copyright (c) 2020 Western Digital Corporation or its affiliates. */ #ifndef _ASM_RISCV_SBI_H @@ -9,15 +10,15 @@ #include #ifdef CONFIG_RISCV_SBI -#define SBI_SET_TIMER 0 -#define SBI_CONSOLE_PUTCHAR 1 -#define SBI_CONSOLE_GETCHAR 2 -#define SBI_CLEAR_IPI 3 -#define SBI_SEND_IPI 4 -#define SBI_REMOTE_FENCE_I 5 -#define SBI_REMOTE_SFENCE_VMA 6 -#define SBI_REMOTE_SFENCE_VMA_ASID 7 -#define SBI_SHUTDOWN 8 +#define SBI_EXT_0_1_SET_TIMER 0x0 +#define SBI_EXT_0_1_CONSOLE_PUTCHAR 0x1 +#define SBI_EXT_0_1_CONSOLE_GETCHAR 0x2 +#define SBI_EXT_0_1_CLEAR_IPI 0x3 +#define SBI_EXT_0_1_SEND_IPI 0x4 +#define SBI_EXT_0_1_REMOTE_FENCE_I 0x5 +#define SBI_EXT_0_1_REMOTE_SFENCE_VMA 0x6 +#define SBI_EXT_0_1_REMOTE_SFENCE_VMA_ASID 0x7 +#define SBI_EXT_0_1_SHUTDOWN 0x8 #define SBI_CALL(which, arg0, arg1, arg2, arg3) ({ \ register uintptr_t a0 asm ("a0") = (uintptr_t)(arg0); \ @@ -43,48 +44,49 @@ static inline void sbi_console_putchar(int ch) { - SBI_CALL_1(SBI_CONSOLE_PUTCHAR, ch); + SBI_CALL_1(SBI_EXT_0_1_CONSOLE_PUTCHAR, ch); } static inline int sbi_console_getchar(void) { - return SBI_CALL_0(SBI_CONSOLE_GETCHAR); + return SBI_CALL_0(SBI_EXT_0_1_CONSOLE_GETCHAR); } static inline void sbi_set_timer(uint64_t stime_value) { #if __riscv_xlen == 32 - SBI_CALL_2(SBI_SET_TIMER, stime_value, stime_value >> 32); + SBI_CALL_2(SBI_EXT_0_1_SET_TIMER, stime_value, + stime_value >> 32); #else - SBI_CALL_1(SBI_SET_TIMER, stime_value); + 
SBI_CALL_1(SBI_EXT_0_1_SET_TIMER, stime_value); #endif } static inline void sbi_shutdown(void) { - SBI_CALL_0(SBI_SHUTDOWN); + SBI_CALL_0(SBI_EXT_0_1_SHUTDOWN); } static inline void sbi_clear_ipi(void) { - SBI_CALL_0(SBI_CLEAR_IPI); + SBI_CALL_0(SBI_EXT_0_1_CLEAR_IPI); } static inline void sbi_send_ipi(const unsigned long *hart_mask) { - SBI_CALL_1(SBI_SEND_IPI, hart_mask); + SBI_CALL_1(SBI_EXT_0_1_SEND_IPI, hart_mask); } static inline void sbi_remote_fence_i(const unsigned long *hart_mask) { - SBI_CALL_1(SBI_REMOTE_FENCE_I, hart_mask); + SBI_CALL_1(SBI_EXT_0_1_REMOTE_FENCE_I, hart_mask); } static inline void sbi_remote_sfence_vma(const unsigned long *hart_mask, unsigned long start, unsigned long size) { - SBI_CALL_3(SBI_REMOTE_SFENCE_VMA, hart_mask, start, size); + SBI_CALL_3(SBI_EXT_0_1_REMOTE_SFENCE_VMA, hart_mask, start, size); } static inline void sbi_remote_sfence_vma_asid(const unsigned long *hart_mask, @@ -92,7 +94,8 @@ static inline void sbi_remote_sfence_vma_asid(const unsigned long *hart_mask, unsigned long size, unsigned long asid) { - SBI_CALL_4(SBI_REMOTE_SFENCE_VMA_ASID, hart_mask, start, size, asid); + SBI_CALL_4(SBI_EXT_0_1_REMOTE_SFENCE_VMA_ASID, hart_mask, + start, size, asid); } #else /* CONFIG_RISCV_SBI */ /* stubs for code that is only reachable under IS_ENABLED(CONFIG_RISCV_SBI): */ From b9dcd9e415872ae29f87667d23c8a8b946d24611 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Tue, 17 Mar 2020 18:11:35 -0700 Subject: [PATCH 24/65] RISC-V: Add basic support for SBI v0.2 The SBI v0.2 introduces a base extension which is backward compatible with v0.1. Implement all helper functions and minimum required SBI calls from v0.2 for now. All other base extension function will be added later as per need. As v0.2 calling convention is backward compatible with v0.1, remove the v0.1 helper functions and just use v0.2 calling convention. 
Signed-off-by: Atish Patra Reviewed-by: Anup Patel Reviewed-by: Palmer Dabbelt Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/sbi.h | 149 +++++++++++---------- arch/riscv/kernel/sbi.c | 243 ++++++++++++++++++++++++++++++++++- arch/riscv/kernel/setup.c | 5 + 3 files changed, 319 insertions(+), 78 deletions(-) diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 2a637ebd7a22..5b7b91c7e7e6 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -10,92 +10,88 @@ #include #ifdef CONFIG_RISCV_SBI -#define SBI_EXT_0_1_SET_TIMER 0x0 -#define SBI_EXT_0_1_CONSOLE_PUTCHAR 0x1 -#define SBI_EXT_0_1_CONSOLE_GETCHAR 0x2 -#define SBI_EXT_0_1_CLEAR_IPI 0x3 -#define SBI_EXT_0_1_SEND_IPI 0x4 -#define SBI_EXT_0_1_REMOTE_FENCE_I 0x5 -#define SBI_EXT_0_1_REMOTE_SFENCE_VMA 0x6 -#define SBI_EXT_0_1_REMOTE_SFENCE_VMA_ASID 0x7 -#define SBI_EXT_0_1_SHUTDOWN 0x8 +enum sbi_ext_id { + SBI_EXT_0_1_SET_TIMER = 0x0, + SBI_EXT_0_1_CONSOLE_PUTCHAR = 0x1, + SBI_EXT_0_1_CONSOLE_GETCHAR = 0x2, + SBI_EXT_0_1_CLEAR_IPI = 0x3, + SBI_EXT_0_1_SEND_IPI = 0x4, + SBI_EXT_0_1_REMOTE_FENCE_I = 0x5, + SBI_EXT_0_1_REMOTE_SFENCE_VMA = 0x6, + SBI_EXT_0_1_REMOTE_SFENCE_VMA_ASID = 0x7, + SBI_EXT_0_1_SHUTDOWN = 0x8, + SBI_EXT_BASE = 0x10, +}; -#define SBI_CALL(which, arg0, arg1, arg2, arg3) ({ \ - register uintptr_t a0 asm ("a0") = (uintptr_t)(arg0); \ - register uintptr_t a1 asm ("a1") = (uintptr_t)(arg1); \ - register uintptr_t a2 asm ("a2") = (uintptr_t)(arg2); \ - register uintptr_t a3 asm ("a3") = (uintptr_t)(arg3); \ - register uintptr_t a7 asm ("a7") = (uintptr_t)(which); \ - asm volatile ("ecall" \ - : "+r" (a0) \ - : "r" (a1), "r" (a2), "r" (a3), "r" (a7) \ - : "memory"); \ - a0; \ -}) +enum sbi_ext_base_fid { + SBI_EXT_BASE_GET_SPEC_VERSION = 0, + SBI_EXT_BASE_GET_IMP_ID, + SBI_EXT_BASE_GET_IMP_VERSION, + SBI_EXT_BASE_PROBE_EXT, + SBI_EXT_BASE_GET_MVENDORID, + SBI_EXT_BASE_GET_MARCHID, + SBI_EXT_BASE_GET_MIMPID, +}; -/* Lazy implementations until SBI 
is finalized */ -#define SBI_CALL_0(which) SBI_CALL(which, 0, 0, 0, 0) -#define SBI_CALL_1(which, arg0) SBI_CALL(which, arg0, 0, 0, 0) -#define SBI_CALL_2(which, arg0, arg1) SBI_CALL(which, arg0, arg1, 0, 0) -#define SBI_CALL_3(which, arg0, arg1, arg2) \ - SBI_CALL(which, arg0, arg1, arg2, 0) -#define SBI_CALL_4(which, arg0, arg1, arg2, arg3) \ - SBI_CALL(which, arg0, arg1, arg2, arg3) +#define SBI_SPEC_VERSION_DEFAULT 0x1 +#define SBI_SPEC_VERSION_MAJOR_SHIFT 24 +#define SBI_SPEC_VERSION_MAJOR_MASK 0x7f +#define SBI_SPEC_VERSION_MINOR_MASK 0xffffff -static inline void sbi_console_putchar(int ch) +/* SBI return error codes */ +#define SBI_SUCCESS 0 +#define SBI_ERR_FAILURE -1 +#define SBI_ERR_NOT_SUPPORTED -2 +#define SBI_ERR_INVALID_PARAM -3 +#define SBI_ERR_DENIED -4 +#define SBI_ERR_INVALID_ADDRESS -5 + +extern unsigned long sbi_spec_version; +struct sbiret { + long error; + long value; +}; + +int sbi_init(void); +struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0, + unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, + unsigned long arg5); + +void sbi_console_putchar(int ch); +int sbi_console_getchar(void); +void sbi_set_timer(uint64_t stime_value); +void sbi_shutdown(void); +void sbi_clear_ipi(void); +void sbi_send_ipi(const unsigned long *hart_mask); +void sbi_remote_fence_i(const unsigned long *hart_mask); +void sbi_remote_sfence_vma(const unsigned long *hart_mask, + unsigned long start, + unsigned long size); + +void sbi_remote_sfence_vma_asid(const unsigned long *hart_mask, + unsigned long start, + unsigned long size, + unsigned long asid); +int sbi_probe_extension(int ext); + +/* Check if current SBI specification version is 0.1 or not */ +static inline int sbi_spec_is_0_1(void) { - SBI_CALL_1(SBI_EXT_0_1_CONSOLE_PUTCHAR, ch); + return (sbi_spec_version == SBI_SPEC_VERSION_DEFAULT) ? 
1 : 0; } -static inline int sbi_console_getchar(void) +/* Get the major version of SBI */ +static inline unsigned long sbi_major_version(void) { - return SBI_CALL_0(SBI_EXT_0_1_CONSOLE_GETCHAR); + return (sbi_spec_version >> SBI_SPEC_VERSION_MAJOR_SHIFT) & + SBI_SPEC_VERSION_MAJOR_MASK; } -static inline void sbi_set_timer(uint64_t stime_value) +/* Get the minor version of SBI */ +static inline unsigned long sbi_minor_version(void) { -#if __riscv_xlen == 32 - SBI_CALL_2(SBI_EXT_0_1_SET_TIMER, stime_value, - stime_value >> 32); -#else - SBI_CALL_1(SBI_EXT_0_1_SET_TIMER, stime_value); -#endif -} - -static inline void sbi_shutdown(void) -{ - SBI_CALL_0(SBI_EXT_0_1_SHUTDOWN); -} - -static inline void sbi_clear_ipi(void) -{ - SBI_CALL_0(SBI_EXT_0_1_CLEAR_IPI); -} - -static inline void sbi_send_ipi(const unsigned long *hart_mask) -{ - SBI_CALL_1(SBI_EXT_0_1_SEND_IPI, hart_mask); -} - -static inline void sbi_remote_fence_i(const unsigned long *hart_mask) -{ - SBI_CALL_1(SBI_EXT_0_1_REMOTE_FENCE_I, hart_mask); -} - -static inline void sbi_remote_sfence_vma(const unsigned long *hart_mask, - unsigned long start, - unsigned long size) -{ - SBI_CALL_3(SBI_EXT_0_1_REMOTE_SFENCE_VMA, hart_mask, start, size); -} - -static inline void sbi_remote_sfence_vma_asid(const unsigned long *hart_mask, - unsigned long start, - unsigned long size, - unsigned long asid) -{ - SBI_CALL_4(SBI_EXT_0_1_REMOTE_SFENCE_VMA_ASID, hart_mask, - start, size, asid); + return sbi_spec_version & SBI_SPEC_VERSION_MINOR_MASK; } #else /* CONFIG_RISCV_SBI */ /* stubs for code that is only reachable under IS_ENABLED(CONFIG_RISCV_SBI): */ @@ -103,5 +99,6 @@ void sbi_set_timer(uint64_t stime_value); void sbi_clear_ipi(void); void sbi_send_ipi(const unsigned long *hart_mask); void sbi_remote_fence_i(const unsigned long *hart_mask); +void sbi_init(void); #endif /* CONFIG_RISCV_SBI */ #endif /* _ASM_RISCV_SBI_H */ diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index f6c7c3e82d28..4aee0b49df3c 100644 
--- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -1,17 +1,256 @@ // SPDX-License-Identifier: GPL-2.0-only +/* + * SBI initialization and all extension implementation. + * + * Copyright (c) 2020 Western Digital Corporation or its affiliates. + */ #include #include #include +/* default SBI version is 0.1 */ +unsigned long sbi_spec_version = SBI_SPEC_VERSION_DEFAULT; +EXPORT_SYMBOL(sbi_spec_version); + +struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0, + unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, + unsigned long arg5) +{ + struct sbiret ret; + + register uintptr_t a0 asm ("a0") = (uintptr_t)(arg0); + register uintptr_t a1 asm ("a1") = (uintptr_t)(arg1); + register uintptr_t a2 asm ("a2") = (uintptr_t)(arg2); + register uintptr_t a3 asm ("a3") = (uintptr_t)(arg3); + register uintptr_t a4 asm ("a4") = (uintptr_t)(arg4); + register uintptr_t a5 asm ("a5") = (uintptr_t)(arg5); + register uintptr_t a6 asm ("a6") = (uintptr_t)(fid); + register uintptr_t a7 asm ("a7") = (uintptr_t)(ext); + asm volatile ("ecall" + : "+r" (a0), "+r" (a1) + : "r" (a2), "r" (a3), "r" (a4), "r" (a5), "r" (a6), "r" (a7) + : "memory"); + ret.error = a0; + ret.value = a1; + + return ret; +} +EXPORT_SYMBOL(sbi_ecall); + +static int sbi_err_map_linux_errno(int err) +{ + switch (err) { + case SBI_SUCCESS: + return 0; + case SBI_ERR_DENIED: + return -EPERM; + case SBI_ERR_INVALID_PARAM: + return -EINVAL; + case SBI_ERR_INVALID_ADDRESS: + return -EFAULT; + case SBI_ERR_NOT_SUPPORTED: + case SBI_ERR_FAILURE: + default: + return -ENOTSUPP; + }; +} + +/** + * sbi_console_putchar() - Writes given character to the console device. + * @ch: The data to be written to the console. + * + * Return: None + */ +void sbi_console_putchar(int ch) +{ + sbi_ecall(SBI_EXT_0_1_CONSOLE_PUTCHAR, 0, ch, 0, 0, 0, 0, 0); +} +EXPORT_SYMBOL(sbi_console_putchar); + +/** + * sbi_console_getchar() - Reads a byte from console device.
+ * + * Returns the value read from console. + */ +int sbi_console_getchar(void) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_0_1_CONSOLE_GETCHAR, 0, 0, 0, 0, 0, 0, 0); + + return ret.error; +} +EXPORT_SYMBOL(sbi_console_getchar); + +/** + * sbi_set_timer() - Program the timer for next timer event. + * @stime_value: The value after which next timer event should fire. + * + * Return: None + */ +void sbi_set_timer(uint64_t stime_value) +{ +#if __riscv_xlen == 32 + sbi_ecall(SBI_EXT_0_1_SET_TIMER, 0, stime_value, + stime_value >> 32, 0, 0, 0, 0); +#else + sbi_ecall(SBI_EXT_0_1_SET_TIMER, 0, stime_value, 0, 0, 0, 0, 0); +#endif +} +EXPORT_SYMBOL(sbi_set_timer); + +/** + * sbi_shutdown() - Remove all the harts from executing supervisor code. + * + * Return: None + */ +void sbi_shutdown(void) +{ + sbi_ecall(SBI_EXT_0_1_SHUTDOWN, 0, 0, 0, 0, 0, 0, 0); +} +EXPORT_SYMBOL(sbi_shutdown); + +/** + * sbi_clear_ipi() - Clear any pending IPIs for the calling hart. + * + * Return: None + */ +void sbi_clear_ipi(void) +{ + sbi_ecall(SBI_EXT_0_1_CLEAR_IPI, 0, 0, 0, 0, 0, 0, 0); +} + +/** + * sbi_send_ipi() - Send an IPI to any hart. + * @hart_mask: A cpu mask containing all the target harts. + * + * Return: None + */ +void sbi_send_ipi(const unsigned long *hart_mask) +{ + sbi_ecall(SBI_EXT_0_1_SEND_IPI, 0, (unsigned long)hart_mask, + 0, 0, 0, 0, 0); +} +EXPORT_SYMBOL(sbi_send_ipi); + +/** + * sbi_remote_fence_i() - Execute FENCE.I instruction on given remote harts. + * @hart_mask: A cpu mask containing all the target harts. + * + * Return: None + */ +void sbi_remote_fence_i(const unsigned long *hart_mask) +{ + sbi_ecall(SBI_EXT_0_1_REMOTE_FENCE_I, 0, (unsigned long)hart_mask, + 0, 0, 0, 0, 0); +} +EXPORT_SYMBOL(sbi_remote_fence_i); + +/** + * sbi_remote_sfence_vma() - Execute SFENCE.VMA instructions on given remote + * harts for the specified virtual address range. + * @hart_mask: A cpu mask containing all the target harts. 
+ * @start: Start of the virtual address + * @size: Total size of the virtual address range. + * + * Return: None + */ +void sbi_remote_sfence_vma(const unsigned long *hart_mask, + unsigned long start, + unsigned long size) +{ + sbi_ecall(SBI_EXT_0_1_REMOTE_SFENCE_VMA, 0, + (unsigned long)hart_mask, start, size, 0, 0, 0); +} +EXPORT_SYMBOL(sbi_remote_sfence_vma); + +/** + * sbi_remote_sfence_vma_asid() - Execute SFENCE.VMA instructions on given + * remote harts for a virtual address range belonging to a specific ASID. + * + * @hart_mask: A cpu mask containing all the target harts. + * @start: Start of the virtual address + * @size: Total size of the virtual address range. + * @asid: The value of address space identifier (ASID). + * + * Return: None + */ +void sbi_remote_sfence_vma_asid(const unsigned long *hart_mask, + unsigned long start, + unsigned long size, + unsigned long asid) +{ + sbi_ecall(SBI_EXT_0_1_REMOTE_SFENCE_VMA_ASID, 0, + (unsigned long)hart_mask, start, size, asid, 0, 0); +} +EXPORT_SYMBOL(sbi_remote_sfence_vma_asid); + +/** + * sbi_probe_extension() - Check if an SBI extension ID is supported or not. + * @extid: The extension ID to be probed. + * + * Return: Extension specific nonzero value if yes, -ENOTSUPP otherwise.
+ */ +int sbi_probe_extension(int extid) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_BASE, SBI_EXT_BASE_PROBE_EXT, extid, + 0, 0, 0, 0, 0); + if (!ret.error) + if (ret.value) + return ret.value; + + return -ENOTSUPP; +} +EXPORT_SYMBOL(sbi_probe_extension); + +static long __sbi_base_ecall(int fid) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_BASE, fid, 0, 0, 0, 0, 0, 0); + if (!ret.error) + return ret.value; + else + return sbi_err_map_linux_errno(ret.error); +} + +static inline long sbi_get_spec_version(void) +{ + return __sbi_base_ecall(SBI_EXT_BASE_GET_SPEC_VERSION); +} + +static inline long sbi_get_firmware_id(void) +{ + return __sbi_base_ecall(SBI_EXT_BASE_GET_IMP_ID); +} + +static inline long sbi_get_firmware_version(void) +{ + return __sbi_base_ecall(SBI_EXT_BASE_GET_IMP_VERSION); +} + static void sbi_power_off(void) { sbi_shutdown(); } -static int __init sbi_init(void) +int __init sbi_init(void) { + int ret; + pm_power_off = sbi_power_off; + ret = sbi_get_spec_version(); + if (ret > 0) + sbi_spec_version = ret; + + pr_info("SBI specification v%lu.%lu detected\n", + sbi_major_version(), sbi_minor_version()); + if (!sbi_spec_is_0_1()) + pr_info("SBI implementation ID=0x%lx Version=0x%lx\n", + sbi_get_firmware_id(), sbi_get_firmware_version()); return 0; } -early_initcall(sbi_init); diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index cb836fcc6118..07f4e7503223 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -83,6 +84,10 @@ void __init setup_arch(char **cmdline_p) kasan_init(); #endif +#if IS_ENABLED(CONFIG_RISCV_SBI) + sbi_init(); +#endif + #ifdef CONFIG_SMP setup_smp(); #endif From ecbacc2a3efd90cae34790379cc3e1b4932889d0 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Tue, 17 Mar 2020 18:11:36 -0700 Subject: [PATCH 25/65] RISC-V: Add SBI v0.2 extension definitions Few v0.1 SBI calls are being replaced by new SBI 
calls that follows v0.2 calling convention. This patch just defines these new extensions. Signed-off-by: Atish Patra Reviewed-by: Anup Patel Reviewed-by: Palmer Dabbelt Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/sbi.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 5b7b91c7e7e6..f68b6ed10a18 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -21,6 +21,9 @@ enum sbi_ext_id { SBI_EXT_0_1_REMOTE_SFENCE_VMA_ASID = 0x7, SBI_EXT_0_1_SHUTDOWN = 0x8, SBI_EXT_BASE = 0x10, + SBI_EXT_TIME = 0x54494D45, + SBI_EXT_IPI = 0x735049, + SBI_EXT_RFENCE = 0x52464E43, }; enum sbi_ext_base_fid { @@ -33,6 +36,24 @@ enum sbi_ext_base_fid { SBI_EXT_BASE_GET_MIMPID, }; +enum sbi_ext_time_fid { + SBI_EXT_TIME_SET_TIMER = 0, +}; + +enum sbi_ext_ipi_fid { + SBI_EXT_IPI_SEND_IPI = 0, +}; + +enum sbi_ext_rfence_fid { + SBI_EXT_RFENCE_REMOTE_FENCE_I = 0, + SBI_EXT_RFENCE_REMOTE_SFENCE_VMA, + SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID, + SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA, + SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA_VMID, + SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA, + SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID, +}; + #define SBI_SPEC_VERSION_DEFAULT 0x1 #define SBI_SPEC_VERSION_MAJOR_SHIFT 24 #define SBI_SPEC_VERSION_MAJOR_MASK 0x7f From efca13989250c3edebaf8fcaa8ca7c966739c65a Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Tue, 17 Mar 2020 18:11:37 -0700 Subject: [PATCH 26/65] RISC-V: Introduce a new config for SBI v0.1 We now have SBI v0.2 which is more scalable and extendable to handle future needs for RISC-V supervisor interfaces. Introduce a new config and move all SBI v0.1 code under that config. This allows to implement the new replacement SBI extensions cleanly and remove v0.1 extensions easily in future. Currently, the config is enabled by default. Once all M-mode software, with v0.1, is no longer in use, this config option and all relevant code can be easily removed. 
Signed-off-by: Atish Patra Reviewed-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 7 ++ arch/riscv/include/asm/sbi.h | 2 + arch/riscv/kernel/sbi.c | 140 ++++++++++++++++++++++++++++------- 3 files changed, 122 insertions(+), 27 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index b2764a0a0c8c..cac23a5c1104 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -314,6 +314,13 @@ config SECCOMP and the task is only allowed to execute a few safe syscalls defined by each seccomp mode. +config RISCV_SBI_V01 + bool "SBI v0.1 support" + default y + depends on RISCV_SBI + help + This config allows kernel to use SBI v0.1 APIs. This will be + deprecated in future once legacy M-mode software are no longer in use. endmenu menu "Boot options" diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index f68b6ed10a18..d712b61f8dbc 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -11,6 +11,7 @@ #ifdef CONFIG_RISCV_SBI enum sbi_ext_id { +#ifdef CONFIG_RISCV_SBI_V01 SBI_EXT_0_1_SET_TIMER = 0x0, SBI_EXT_0_1_CONSOLE_PUTCHAR = 0x1, SBI_EXT_0_1_CONSOLE_GETCHAR = 0x2, @@ -20,6 +21,7 @@ enum sbi_ext_id { SBI_EXT_0_1_REMOTE_SFENCE_VMA = 0x6, SBI_EXT_0_1_REMOTE_SFENCE_VMA_ASID = 0x7, SBI_EXT_0_1_SHUTDOWN = 0x8, +#endif SBI_EXT_BASE = 0x10, SBI_EXT_TIME = 0x54494D45, SBI_EXT_IPI = 0x735049, diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index 4aee0b49df3c..1368da62ec82 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -13,6 +13,12 @@ unsigned long sbi_spec_version = SBI_SPEC_VERSION_DEFAULT; EXPORT_SYMBOL(sbi_spec_version); +static void (*__sbi_set_timer)(uint64_t stime); +static int (*__sbi_send_ipi)(const unsigned long *hart_mask); +static int (*__sbi_rfence)(int fid, const unsigned long *hart_mask, + unsigned long start, unsigned long size, + unsigned long arg4, unsigned long arg5); + struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0, unsigned long 
arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, @@ -57,6 +63,7 @@ static int sbi_err_map_linux_errno(int err) }; } +#ifdef CONFIG_RISCV_SBI_V01 /** * sbi_console_putchar() - Writes given character to the console device. * @ch: The data to be written to the console. @@ -84,23 +91,6 @@ int sbi_console_getchar(void) } EXPORT_SYMBOL(sbi_console_getchar); -/** - * sbi_set_timer() - Program the timer for next timer event. - * @stime_value: The value after which next timer event should fire. - * - * Return: None - */ -void sbi_set_timer(uint64_t stime_value) -{ -#if __riscv_xlen == 32 - sbi_ecall(SBI_EXT_0_1_SET_TIMER, 0, stime_value, - stime_value >> 32, 0, 0, 0, 0); -#else - sbi_ecall(SBI_EXT_0_1_SET_TIMER, 0, stime_value, 0, 0, 0, 0, 0); -#endif -} -EXPORT_SYMBOL(sbi_set_timer); - /** * sbi_shutdown() - Remove all the harts from executing supervisor code. * @@ -110,7 +100,7 @@ void sbi_shutdown(void) { sbi_ecall(SBI_EXT_0_1_SHUTDOWN, 0, 0, 0, 0, 0, 0, 0); } -EXPORT_SYMBOL(sbi_shutdown); +EXPORT_SYMBOL(sbi_set_timer); /** * sbi_clear_ipi() - Clear any pending IPIs for the calling hart. @@ -121,6 +111,96 @@ void sbi_clear_ipi(void) { sbi_ecall(SBI_EXT_0_1_CLEAR_IPI, 0, 0, 0, 0, 0, 0, 0); } +EXPORT_SYMBOL(sbi_shutdown); + +/** + * sbi_set_timer_v01() - Program the timer for next timer event. + * @stime_value: The value after which next timer event should fire. 
+ * + * Return: None + */ +static void __sbi_set_timer_v01(uint64_t stime_value) +{ +#if __riscv_xlen == 32 + sbi_ecall(SBI_EXT_0_1_SET_TIMER, 0, stime_value, + stime_value >> 32, 0, 0, 0, 0); +#else + sbi_ecall(SBI_EXT_0_1_SET_TIMER, 0, stime_value, 0, 0, 0, 0, 0); +#endif +} + +static int __sbi_send_ipi_v01(const unsigned long *hart_mask) +{ + sbi_ecall(SBI_EXT_0_1_SEND_IPI, 0, (unsigned long)hart_mask, + 0, 0, 0, 0, 0); + return 0; +} + +static int __sbi_rfence_v01(int fid, const unsigned long *hart_mask, + unsigned long start, unsigned long size, + unsigned long arg4, unsigned long arg5) +{ + int result = 0; + + /* v0.2 function IDs are equivalent to v0.1 extension IDs */ + switch (fid) { + case SBI_EXT_RFENCE_REMOTE_FENCE_I: + sbi_ecall(SBI_EXT_0_1_REMOTE_FENCE_I, 0, + (unsigned long)hart_mask, 0, 0, 0, 0, 0); + break; + case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA: + sbi_ecall(SBI_EXT_0_1_REMOTE_SFENCE_VMA, 0, + (unsigned long)hart_mask, start, size, + 0, 0, 0); + break; + case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID: + sbi_ecall(SBI_EXT_0_1_REMOTE_SFENCE_VMA_ASID, 0, + (unsigned long)hart_mask, start, size, + arg4, 0, 0); + break; + default: + pr_err("SBI call [%d]not supported in SBI v0.1\n", fid); + result = -EINVAL; + } + + return result; +} +#else +static void __sbi_set_timer_v01(uint64_t stime_value) +{ + pr_warn("Timer extension is not available in SBI v%lu.%lu\n", + sbi_major_version(), sbi_minor_version()); +} + +static int __sbi_send_ipi_v01(const unsigned long *hart_mask) +{ + pr_warn("IPI extension is not available in SBI v%lu.%lu\n", + sbi_major_version(), sbi_minor_version()); + + return 0; +} + +static int __sbi_rfence_v01(int fid, const unsigned long *hart_mask, + unsigned long start, unsigned long size, + unsigned long arg4, unsigned long arg5) +{ + pr_warn("remote fence extension is not available in SBI v%lu.%lu\n", + sbi_major_version(), sbi_minor_version()); + + return 0; +} +#endif /* CONFIG_RISCV_SBI_V01 */ + +/** + * sbi_set_timer() - Program 
the timer for next timer event. + * @stime_value: The value after which next timer event should fire. + * + * Return: None + */ +void sbi_set_timer(uint64_t stime_value) +{ + __sbi_set_timer(stime_value); +} /** * sbi_send_ipi() - Send an IPI to any hart. @@ -130,8 +210,7 @@ void sbi_clear_ipi(void) */ void sbi_send_ipi(const unsigned long *hart_mask) { - sbi_ecall(SBI_EXT_0_1_SEND_IPI, 0, (unsigned long)hart_mask, - 0, 0, 0, 0, 0); + __sbi_send_ipi(hart_mask); } EXPORT_SYMBOL(sbi_send_ipi); @@ -143,8 +222,8 @@ EXPORT_SYMBOL(sbi_send_ipi); */ void sbi_remote_fence_i(const unsigned long *hart_mask) { - sbi_ecall(SBI_EXT_0_1_REMOTE_FENCE_I, 0, (unsigned long)hart_mask, - 0, 0, 0, 0, 0); + __sbi_rfence(SBI_EXT_RFENCE_REMOTE_FENCE_I, + hart_mask, 0, 0, 0, 0); } EXPORT_SYMBOL(sbi_remote_fence_i); @@ -161,8 +240,8 @@ void sbi_remote_sfence_vma(const unsigned long *hart_mask, unsigned long start, unsigned long size) { - sbi_ecall(SBI_EXT_0_1_REMOTE_SFENCE_VMA, 0, - (unsigned long)hart_mask, start, size, 0, 0, 0); + __sbi_rfence(SBI_EXT_RFENCE_REMOTE_SFENCE_VMA, + hart_mask, start, size, 0, 0); } EXPORT_SYMBOL(sbi_remote_sfence_vma); @@ -182,8 +261,8 @@ void sbi_remote_sfence_vma_asid(const unsigned long *hart_mask, unsigned long size, unsigned long asid) { - sbi_ecall(SBI_EXT_0_1_REMOTE_SFENCE_VMA_ASID, 0, - (unsigned long)hart_mask, start, size, asid, 0, 0); + __sbi_rfence(SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID, + hart_mask, start, size, asid, 0); } EXPORT_SYMBOL(sbi_remote_sfence_vma_asid); @@ -249,8 +328,15 @@ int __init sbi_init(void) pr_info("SBI specification v%lu.%lu detected\n", sbi_major_version(), sbi_minor_version()); - if (!sbi_spec_is_0_1()) + + if (!sbi_spec_is_0_1()) { pr_info("SBI implementation ID=0x%lx Version=0x%lx\n", sbi_get_firmware_id(), sbi_get_firmware_version()); + } + + __sbi_set_timer = __sbi_set_timer_v01; + __sbi_send_ipi = __sbi_send_ipi_v01; + __sbi_rfence = __sbi_rfence_v01; + return 0; } From 1ef46c231df4b856559ec0234bfcbb41a1180b97 Mon 
Sep 17 00:00:00 2001 From: Atish Patra Date: Tue, 17 Mar 2020 18:11:38 -0700 Subject: [PATCH 27/65] RISC-V: Implement new SBI v0.2 extensions Few v0.1 SBI calls are being replaced by new SBI calls that follows v0.2 calling convention. Implement the replacement extensions and few additional new SBI function calls that makes way for a better SBI interface in future. Signed-off-by: Atish Patra Reviewed-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/sbi.h | 14 ++ arch/riscv/include/asm/smp.h | 7 + arch/riscv/kernel/sbi.c | 253 ++++++++++++++++++++++++++++++++++- 3 files changed, 270 insertions(+), 4 deletions(-) diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index d712b61f8dbc..29ce2c494386 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -96,6 +96,20 @@ void sbi_remote_sfence_vma_asid(const unsigned long *hart_mask, unsigned long start, unsigned long size, unsigned long asid); +int sbi_remote_hfence_gvma(const unsigned long *hart_mask, + unsigned long start, + unsigned long size); +int sbi_remote_hfence_gvma_vmid(const unsigned long *hart_mask, + unsigned long start, + unsigned long size, + unsigned long vmid); +int sbi_remote_hfence_vvma(const unsigned long *hart_mask, + unsigned long start, + unsigned long size); +int sbi_remote_hfence_vvma_asid(const unsigned long *hart_mask, + unsigned long start, + unsigned long size, + unsigned long asid); int sbi_probe_extension(int ext); /* Check if current SBI specification version is 0.1 or not */ diff --git a/arch/riscv/include/asm/smp.h b/arch/riscv/include/asm/smp.h index a83451d73a4e..023f74fb8b3b 100644 --- a/arch/riscv/include/asm/smp.h +++ b/arch/riscv/include/asm/smp.h @@ -61,5 +61,12 @@ static inline unsigned long cpuid_to_hartid_map(int cpu) return boot_cpu_hartid; } +static inline void riscv_cpuid_to_hartid_mask(const struct cpumask *in, + struct cpumask *out) +{ + cpumask_clear(out); + cpumask_set_cpu(boot_cpu_hartid, out); +} + 
#endif /* CONFIG_SMP */ #endif /* _ASM_RISCV_SMP_H */ diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index 1368da62ec82..1cc0052e1b63 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -8,6 +8,7 @@ #include #include #include +#include /* default SBI version is 0.1 */ unsigned long sbi_spec_version = SBI_SPEC_VERSION_DEFAULT; @@ -191,6 +192,153 @@ static int __sbi_rfence_v01(int fid, const unsigned long *hart_mask, } #endif /* CONFIG_RISCV_SBI_V01 */ +static void __sbi_set_timer_v02(uint64_t stime_value) +{ +#if __riscv_xlen == 32 + sbi_ecall(SBI_EXT_TIME, SBI_EXT_TIME_SET_TIMER, stime_value, + stime_value >> 32, 0, 0, 0, 0); +#else + sbi_ecall(SBI_EXT_TIME, SBI_EXT_TIME_SET_TIMER, stime_value, 0, + 0, 0, 0, 0); +#endif +} + +static int __sbi_send_ipi_v02(const unsigned long *hart_mask) +{ + unsigned long hartid, hmask_val, hbase; + struct cpumask tmask; + struct sbiret ret = {0}; + int result; + + if (!hart_mask || !(*hart_mask)) { + riscv_cpuid_to_hartid_mask(cpu_online_mask, &tmask); + hart_mask = cpumask_bits(&tmask); + } + + hmask_val = 0; + hbase = 0; + for_each_set_bit(hartid, hart_mask, NR_CPUS) { + if (hmask_val && ((hbase + BITS_PER_LONG) <= hartid)) { + ret = sbi_ecall(SBI_EXT_IPI, SBI_EXT_IPI_SEND_IPI, + hmask_val, hbase, 0, 0, 0, 0); + if (ret.error) + goto ecall_failed; + hmask_val = 0; + hbase = 0; + } + if (!hmask_val) + hbase = hartid; + hmask_val |= 1UL << (hartid - hbase); + } + + if (hmask_val) { + ret = sbi_ecall(SBI_EXT_IPI, SBI_EXT_IPI_SEND_IPI, + hmask_val, hbase, 0, 0, 0, 0); + if (ret.error) + goto ecall_failed; + } + + return 0; + +ecall_failed: + result = sbi_err_map_linux_errno(ret.error); + pr_err("%s: hbase = [%lu] hmask = [0x%lx] failed (error [%d])\n", + __func__, hbase, hmask_val, result); + return result; +} + +static int __sbi_rfence_v02_call(unsigned long fid, unsigned long hmask_val, + unsigned long hbase, unsigned long start, + unsigned long size, unsigned long arg4, + unsigned long arg5) +{ + 
struct sbiret ret = {0}; + int ext = SBI_EXT_RFENCE; + int result = 0; + + switch (fid) { + case SBI_EXT_RFENCE_REMOTE_FENCE_I: + ret = sbi_ecall(ext, fid, hmask_val, hbase, 0, 0, 0, 0); + break; + case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA: + ret = sbi_ecall(ext, fid, hmask_val, hbase, start, + size, 0, 0); + break; + case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID: + ret = sbi_ecall(ext, fid, hmask_val, hbase, start, + size, arg4, 0); + break; + + case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA: + ret = sbi_ecall(ext, fid, hmask_val, hbase, start, + size, 0, 0); + break; + case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA_VMID: + ret = sbi_ecall(ext, fid, hmask_val, hbase, start, + size, arg4, 0); + break; + case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA: + ret = sbi_ecall(ext, fid, hmask_val, hbase, start, + size, 0, 0); + break; + case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID: + ret = sbi_ecall(ext, fid, hmask_val, hbase, start, + size, arg4, 0); + break; + default: + pr_err("unknown function ID [%lu] for SBI extension [%d]\n", + fid, ext); + result = -EINVAL; + } + + if (ret.error) { + result = sbi_err_map_linux_errno(ret.error); + pr_err("%s: hbase = [%lu] hmask = [0x%lx] failed (error [%d])\n", + __func__, hbase, hmask_val, result); + } + + return result; +} + +static int __sbi_rfence_v02(int fid, const unsigned long *hart_mask, + unsigned long start, unsigned long size, + unsigned long arg4, unsigned long arg5) +{ + unsigned long hmask_val, hartid, hbase; + struct cpumask tmask; + int result; + + if (!hart_mask || !(*hart_mask)) { + riscv_cpuid_to_hartid_mask(cpu_online_mask, &tmask); + hart_mask = cpumask_bits(&tmask); + } + + hmask_val = 0; + hbase = 0; + for_each_set_bit(hartid, hart_mask, NR_CPUS) { + if (hmask_val && ((hbase + BITS_PER_LONG) <= hartid)) { + result = __sbi_rfence_v02_call(fid, hmask_val, hbase, + start, size, arg4, arg5); + if (result) + return result; + hmask_val = 0; + hbase = 0; + } + if (!hmask_val) + hbase = hartid; + hmask_val |= 1UL << (hartid - hbase); + } + + if 
(hmask_val) { + result = __sbi_rfence_v02_call(fid, hmask_val, hbase, + start, size, arg4, arg5); + if (result) + return result; + } + + return 0; +} + /** * sbi_set_timer() - Program the timer for next timer event. * @stime_value: The value after which next timer event should fire. @@ -266,6 +414,85 @@ void sbi_remote_sfence_vma_asid(const unsigned long *hart_mask, } EXPORT_SYMBOL(sbi_remote_sfence_vma_asid); +/** + * sbi_remote_hfence_gvma() - Execute HFENCE.GVMA instructions on given remote + * harts for the specified guest physical address range. + * @hart_mask: A cpu mask containing all the target harts. + * @start: Start of the guest physical address + * @size: Total size of the guest physical address range. + * + * Return: 0 if success, Error otherwise. + */ +int sbi_remote_hfence_gvma(const unsigned long *hart_mask, + unsigned long start, + unsigned long size) +{ + return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA, + hart_mask, start, size, 0, 0); +} +EXPORT_SYMBOL_GPL(sbi_remote_hfence_gvma); + +/** + * sbi_remote_hfence_gvma_vmid() - Execute HFENCE.GVMA instructions on given + * remote harts for a guest physical address range belonging to a specific VMID. + * + * @hart_mask: A cpu mask containing all the target harts. + * @start: Start of the guest physical address + * @size: Total size of the guest physical address range. + * @vmid: The value of guest ID (VMID). + * + * Return: 0 if success, Error otherwise. + */ +int sbi_remote_hfence_gvma_vmid(const unsigned long *hart_mask, + unsigned long start, + unsigned long size, + unsigned long vmid) +{ + return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA_VMID, + hart_mask, start, size, vmid, 0); +} +EXPORT_SYMBOL(sbi_remote_hfence_gvma_vmid); + +/** + * sbi_remote_hfence_vvma() - Execute HFENCE.VVMA instructions on given remote + * harts for the current guest virtual address range. + * @hart_mask: A cpu mask containing all the target harts.
+ * @start: Start of the current guest virtual address + * @size: Total size of the current guest virtual address range. + * + * Return: 0 if success, Error otherwise. + */ +int sbi_remote_hfence_vvma(const unsigned long *hart_mask, + unsigned long start, + unsigned long size) +{ + return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA, + hart_mask, start, size, 0, 0); +} +EXPORT_SYMBOL(sbi_remote_hfence_vvma); + +/** + * sbi_remote_hfence_vvma_asid() - Execute HFENCE.VVMA instructions on given + * remote harts for current guest virtual address range belonging to a specific + * ASID. + * + * @hart_mask: A cpu mask containing all the target harts. + * @start: Start of the current guest virtual address + * @size: Total size of the current guest virtual address range. + * @asid: The value of address space identifier (ASID). + * + * Return: 0 if success, Error otherwise. + */ +int sbi_remote_hfence_vvma_asid(const unsigned long *hart_mask, + unsigned long start, + unsigned long size, + unsigned long asid) +{ + return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID, + hart_mask, start, size, asid, 0); +} +EXPORT_SYMBOL(sbi_remote_hfence_vvma_asid); + /** * sbi_probe_extension() - Check if an SBI extension ID is supported or not. * @extid: The extension ID to be probed.
@@ -332,11 +559,29 @@ int __init sbi_init(void) if (!sbi_spec_is_0_1()) { pr_info("SBI implementation ID=0x%lx Version=0x%lx\n", sbi_get_firmware_id(), sbi_get_firmware_version()); + if (sbi_probe_extension(SBI_EXT_TIME) > 0) { + __sbi_set_timer = __sbi_set_timer_v02; + pr_info("SBI v0.2 TIME extension detected\n"); + } else { + __sbi_set_timer = __sbi_set_timer_v01; + } + if (sbi_probe_extension(SBI_EXT_IPI) > 0) { + __sbi_send_ipi = __sbi_send_ipi_v02; + pr_info("SBI v0.2 IPI extension detected\n"); + } else { + __sbi_send_ipi = __sbi_send_ipi_v01; + } + if (sbi_probe_extension(SBI_EXT_RFENCE) > 0) { + __sbi_rfence = __sbi_rfence_v02; + pr_info("SBI v0.2 RFENCE extension detected\n"); + } else { + __sbi_rfence = __sbi_rfence_v01; + } + } else { + __sbi_set_timer = __sbi_set_timer_v01; + __sbi_send_ipi = __sbi_send_ipi_v01; + __sbi_rfence = __sbi_rfence_v01; } - __sbi_set_timer = __sbi_set_timer_v01; - __sbi_send_ipi = __sbi_send_ipi_v01; - __sbi_rfence = __sbi_rfence_v01; - return 0; } From e011995e826f85fbe55dc7d4ce649461163d1052 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Tue, 17 Mar 2020 18:11:39 -0700 Subject: [PATCH 28/65] RISC-V: Move relocate and few other functions out of __init The secondary hart booting and relocation code are under .init section. As a result, it will be freed once kernel booting is done. However, ordered booting protocol and CPU hotplug always requires these functions to be present to bringup harts after initial kernel boot. Move the required functions to a different section and make sure that they are in memory within first 2MB offset as trampoline page directory only maps first 2MB. 
Signed-off-by: Atish Patra Reviewed-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/head.S | 153 +++++++++++++++++--------------- arch/riscv/kernel/vmlinux.lds.S | 5 +- 2 files changed, 86 insertions(+), 72 deletions(-) diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index 85f2073e7fe4..173507395a6b 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -14,7 +14,7 @@ #include #include -__INIT +__HEAD ENTRY(_start) /* * Image header expected by Linux boot-loaders. The image header data @@ -45,8 +45,85 @@ ENTRY(_start) .ascii RISCV_IMAGE_MAGIC2 .word 0 -.global _start_kernel -_start_kernel: +.align 2 +#ifdef CONFIG_MMU +relocate: + /* Relocate return address */ + li a1, PAGE_OFFSET + la a2, _start + sub a1, a1, a2 + add ra, ra, a1 + + /* Point stvec to virtual address of intruction after satp write */ + la a2, 1f + add a2, a2, a1 + csrw CSR_TVEC, a2 + + /* Compute satp for kernel page tables, but don't load it yet */ + srl a2, a0, PAGE_SHIFT + li a1, SATP_MODE + or a2, a2, a1 + + /* + * Load trampoline page directory, which will cause us to trap to + * stvec if VA != PA, or simply fall through if VA == PA. We need a + * full fence here because setup_vm() just wrote these PTEs and we need + * to ensure the new translations are in use. + */ + la a0, trampoline_pg_dir + srl a0, a0, PAGE_SHIFT + or a0, a0, a1 + sfence.vma + csrw CSR_SATP, a0 +.align 2 +1: + /* Set trap vector to spin forever to help debug */ + la a0, .Lsecondary_park + csrw CSR_TVEC, a0 + + /* Reload the global pointer */ +.option push +.option norelax + la gp, __global_pointer$ +.option pop + + /* + * Switch to kernel page tables. A full fence is necessary in order to + * avoid using the trampoline translations, which are only correct for + * the first superpage. Fetching the fence is guarnteed to work + * because that first superpage is translated the same way. 
+ */ + csrw CSR_SATP, a2 + sfence.vma + + ret +#endif /* CONFIG_MMU */ +#ifdef CONFIG_SMP + /* Set trap vector to spin forever to help debug */ + la a3, .Lsecondary_park + csrw CSR_TVEC, a3 + + slli a3, a0, LGREG + .global secondary_start_common +secondary_start_common: + +#ifdef CONFIG_MMU + /* Enable virtual memory and relocate to virtual address */ + la a0, swapper_pg_dir + call relocate +#endif + tail smp_callin +#endif /* CONFIG_SMP */ + +.Lsecondary_park: + /* We lack SMP support or have too many harts, so park this hart */ + wfi + j .Lsecondary_park + +END(_start) + + __INIT +ENTRY(_start_kernel) /* Mask all interrupts */ csrw CSR_IE, zero csrw CSR_IP, zero @@ -134,59 +211,6 @@ clear_bss_done: call parse_dtb tail start_kernel -#ifdef CONFIG_MMU -relocate: - /* Relocate return address */ - li a1, PAGE_OFFSET - la a2, _start - sub a1, a1, a2 - add ra, ra, a1 - - /* Point stvec to virtual address of intruction after satp write */ - la a2, 1f - add a2, a2, a1 - csrw CSR_TVEC, a2 - - /* Compute satp for kernel page tables, but don't load it yet */ - srl a2, a0, PAGE_SHIFT - li a1, SATP_MODE - or a2, a2, a1 - - /* - * Load trampoline page directory, which will cause us to trap to - * stvec if VA != PA, or simply fall through if VA == PA. We need a - * full fence here because setup_vm() just wrote these PTEs and we need - * to ensure the new translations are in use. - */ - la a0, trampoline_pg_dir - srl a0, a0, PAGE_SHIFT - or a0, a0, a1 - sfence.vma - csrw CSR_SATP, a0 -.align 2 -1: - /* Set trap vector to spin forever to help debug */ - la a0, .Lsecondary_park - csrw CSR_TVEC, a0 - - /* Reload the global pointer */ -.option push -.option norelax - la gp, __global_pointer$ -.option pop - - /* - * Switch to kernel page tables. A full fence is necessary in order to - * avoid using the trampoline translations, which are only correct for - * the first superpage. Fetching the fence is guarnteed to work - * because that first superpage is translated the same way. 
- */ - csrw CSR_SATP, a2 - sfence.vma - - ret -#endif /* CONFIG_MMU */ - .Lsecondary_start: #ifdef CONFIG_SMP /* Set trap vector to spin forever to help debug */ @@ -211,16 +235,10 @@ relocate: beqz tp, .Lwait_for_cpu_up fence -#ifdef CONFIG_MMU - /* Enable virtual memory and relocate to virtual address */ - la a0, swapper_pg_dir - call relocate + tail secondary_start_common #endif - tail smp_callin -#endif - -END(_start) +END(_start_kernel) #ifdef CONFIG_RISCV_M_MODE ENTRY(reset_regs) @@ -301,13 +319,6 @@ ENTRY(reset_regs) END(reset_regs) #endif /* CONFIG_RISCV_M_MODE */ -.section ".text", "ax",@progbits -.align 2 -.Lsecondary_park: - /* We lack SMP support or have too many harts, so park this hart */ - wfi - j .Lsecondary_park - __PAGE_ALIGNED_BSS /* Empty zero page */ .balign PAGE_SIZE diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S index 4ad3c8eb241d..435cd60dca04 100644 --- a/arch/riscv/kernel/vmlinux.lds.S +++ b/arch/riscv/kernel/vmlinux.lds.S @@ -11,6 +11,7 @@ #include #include +#include OUTPUT_ARCH(riscv) ENTRY(_start) @@ -21,8 +22,10 @@ SECTIONS /* Beginning of code and text segment */ . = LOAD_OFFSET; _start = .; - __init_begin = .; HEAD_TEXT_SECTION + . = ALIGN(PAGE_SIZE); + + __init_begin = .; INIT_TEXT_SECTION(PAGE_SIZE) INIT_DATA_SECTION(16) /* we have to discard exit text and such at runtime, not link time */ From 2875fe0561569f82d0e63658ccf0d11ce7da8922 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Tue, 17 Mar 2020 18:11:40 -0700 Subject: [PATCH 29/65] RISC-V: Add cpu_ops and modify default booting method Currently, all non-booting harts start booting after the booting hart updates the per-hart stack pointer. This is done in a way that, it's difficult to implement any other booting method without breaking the backward compatibility. Define a cpu_ops method that allows to introduce other booting methods in future. Modify the current booting method to be compatible with cpu_ops. 
Signed-off-by: Atish Patra Reviewed-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/cpu_ops.h | 34 +++++++++++++++++++ arch/riscv/kernel/Makefile | 2 ++ arch/riscv/kernel/cpu_ops.c | 38 +++++++++++++++++++++ arch/riscv/kernel/cpu_ops_spinwait.c | 43 +++++++++++++++++++++++ arch/riscv/kernel/smpboot.c | 51 ++++++++++++++++------------ 5 files changed, 147 insertions(+), 21 deletions(-) create mode 100644 arch/riscv/include/asm/cpu_ops.h create mode 100644 arch/riscv/kernel/cpu_ops.c create mode 100644 arch/riscv/kernel/cpu_ops_spinwait.c diff --git a/arch/riscv/include/asm/cpu_ops.h b/arch/riscv/include/asm/cpu_ops.h new file mode 100644 index 000000000000..5ce81a28e1d9 --- /dev/null +++ b/arch/riscv/include/asm/cpu_ops.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2020 Western Digital Corporation or its affiliates. + * Based on arch/arm64/include/asm/cpu_ops.h + */ +#ifndef __ASM_CPU_OPS_H +#define __ASM_CPU_OPS_H + +#include +#include +#include + +/** + * struct cpu_operations - Callback operations for hotplugging CPUs. + * + * @name: Name of the boot protocol. + * @cpu_prepare: Early one-time preparation step for a cpu. If there + * is a mechanism for doing so, tests whether it is + * possible to boot the given HART. + * @cpu_start: Boots a cpu into the kernel. 
+ */ +struct cpu_operations { + const char *name; + int (*cpu_prepare)(unsigned int cpu); + int (*cpu_start)(unsigned int cpu, + struct task_struct *tidle); +}; + +extern const struct cpu_operations *cpu_ops[NR_CPUS]; +void __init cpu_set_ops(int cpu); +void cpu_update_secondary_bootdata(unsigned int cpuid, + struct task_struct *tidle); + +#endif /* ifndef __ASM_CPU_OPS_H */ diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index d189bd3d8501..43d49ea36a3f 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -34,6 +34,8 @@ obj-$(CONFIG_RISCV_M_MODE) += clint.o obj-$(CONFIG_FPU) += fpu.o obj-$(CONFIG_SMP) += smpboot.o obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_SMP) += cpu_ops.o +obj-$(CONFIG_SMP) += cpu_ops_spinwait.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULE_SECTIONS) += module-sections.o diff --git a/arch/riscv/kernel/cpu_ops.c b/arch/riscv/kernel/cpu_ops.c new file mode 100644 index 000000000000..62705908eee5 --- /dev/null +++ b/arch/riscv/kernel/cpu_ops.c @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2020 Western Digital Corporation or its affiliates. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +const struct cpu_operations *cpu_ops[NR_CPUS] __ro_after_init; + +void *__cpu_up_stack_pointer[NR_CPUS]; +void *__cpu_up_task_pointer[NR_CPUS]; + +extern const struct cpu_operations cpu_ops_spinwait; + +void cpu_update_secondary_bootdata(unsigned int cpuid, + struct task_struct *tidle) +{ + int hartid = cpuid_to_hartid_map(cpuid); + + /* Make sure tidle is updated */ + smp_mb(); + WRITE_ONCE(__cpu_up_stack_pointer[hartid], + task_stack_page(tidle) + THREAD_SIZE); + WRITE_ONCE(__cpu_up_task_pointer[hartid], tidle); +} + +void __init cpu_set_ops(int cpuid) +{ + cpu_ops[cpuid] = &cpu_ops_spinwait; +} diff --git a/arch/riscv/kernel/cpu_ops_spinwait.c b/arch/riscv/kernel/cpu_ops_spinwait.c new file mode 100644 index 000000000000..b2c957bb68c1 --- /dev/null +++ b/arch/riscv/kernel/cpu_ops_spinwait.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2020 Western Digital Corporation or its affiliates. + */ + +#include +#include +#include +#include +#include +#include + +const struct cpu_operations cpu_ops_spinwait; + +static int spinwait_cpu_prepare(unsigned int cpuid) +{ + if (!cpu_ops_spinwait.cpu_start) { + pr_err("cpu start method not defined for CPU [%d]\n", cpuid); + return -ENODEV; + } + return 0; +} + +static int spinwait_cpu_start(unsigned int cpuid, struct task_struct *tidle) +{ + /* + * In this protocol, all cpus boot on their own accord. _start + * selects the first cpu to boot the kernel and causes the remainder + * of the cpus to spin in a loop waiting for their stack pointer to be + * setup by that main cpu. Writing to bootdata + * (i.e __cpu_up_stack_pointer) signals to the spinning cpus that they + * can continue the boot process. 
+ */ + cpu_update_secondary_bootdata(cpuid, tidle); + + return 0; +} + +const struct cpu_operations cpu_ops_spinwait = { + .name = "spinwait", + .cpu_prepare = spinwait_cpu_prepare, + .cpu_start = spinwait_cpu_start, +}; diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c index 8bc01f0ca73b..e89396a2a1af 100644 --- a/arch/riscv/kernel/smpboot.c +++ b/arch/riscv/kernel/smpboot.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -34,8 +35,6 @@ #include "head.h" -void *__cpu_up_stack_pointer[NR_CPUS]; -void *__cpu_up_task_pointer[NR_CPUS]; static DECLARE_COMPLETION(cpu_running); void __init smp_prepare_boot_cpu(void) @@ -46,6 +45,7 @@ void __init smp_prepare_boot_cpu(void) void __init smp_prepare_cpus(unsigned int max_cpus) { int cpuid; + int ret; /* This covers non-smp usecase mandated by "nosmp" option */ if (max_cpus == 0) @@ -54,6 +54,11 @@ void __init smp_prepare_cpus(unsigned int max_cpus) for_each_possible_cpu(cpuid) { if (cpuid == smp_processor_id()) continue; + if (cpu_ops[cpuid]->cpu_prepare) { + ret = cpu_ops[cpuid]->cpu_prepare(cpuid); + if (ret) + continue; + } set_cpu_present(cpuid, true); } } @@ -65,6 +70,8 @@ void __init setup_smp(void) bool found_boot_cpu = false; int cpuid = 1; + cpu_set_ops(0); + for_each_of_cpu_node(dn) { hart = riscv_of_processor_hartid(dn); if (hart < 0) @@ -92,36 +99,38 @@ void __init setup_smp(void) cpuid, nr_cpu_ids); for (cpuid = 1; cpuid < nr_cpu_ids; cpuid++) { - if (cpuid_to_hartid_map(cpuid) != INVALID_HARTID) + if (cpuid_to_hartid_map(cpuid) != INVALID_HARTID) { + cpu_set_ops(cpuid); set_cpu_possible(cpuid, true); + } } } +int start_secondary_cpu(int cpu, struct task_struct *tidle) +{ + if (cpu_ops[cpu]->cpu_start) + return cpu_ops[cpu]->cpu_start(cpu, tidle); + + return -EOPNOTSUPP; +} + int __cpu_up(unsigned int cpu, struct task_struct *tidle) { int ret = 0; - int hartid = cpuid_to_hartid_map(cpu); tidle->thread_info.cpu = cpu; - /* - * On RISC-V systems, all harts 
boot on their own accord. Our _start - * selects the first hart to boot the kernel and causes the remainder - * of the harts to spin in a loop waiting for their stack pointer to be - * setup by that main hart. Writing __cpu_up_stack_pointer signals to - * the spinning harts that they can continue the boot process. - */ - smp_mb(); - WRITE_ONCE(__cpu_up_stack_pointer[hartid], - task_stack_page(tidle) + THREAD_SIZE); - WRITE_ONCE(__cpu_up_task_pointer[hartid], tidle); - - lockdep_assert_held(&cpu_running); - wait_for_completion_timeout(&cpu_running, + ret = start_secondary_cpu(cpu, tidle); + if (!ret) { + lockdep_assert_held(&cpu_running); + wait_for_completion_timeout(&cpu_running, msecs_to_jiffies(1000)); - if (!cpu_online(cpu)) { - pr_crit("CPU%u: failed to come online\n", cpu); - ret = -EIO; + if (!cpu_online(cpu)) { + pr_crit("CPU%u: failed to come online\n", cpu); + ret = -EIO; + } + } else { + pr_crit("CPU%u: failed to start\n", cpu); } return ret; From f90b43ce176c129a84237c9d57fae51aeff3e6ec Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Tue, 17 Mar 2020 18:11:41 -0700 Subject: [PATCH 30/65] RISC-V: Export SBI error to linux error mapping function Not all SBI related extensions will be implemented in sbi.c, to avoid bloating it. Thus, sbi_err_map_linux_errno() will be used in other files implementing a specific extension. Export the function so that it can be used later.
Signed-off-by: Atish Patra Reviewed-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/sbi.h | 2 ++ arch/riscv/kernel/sbi.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 29ce2c494386..2bbfd6bada93 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -130,6 +130,8 @@ static inline unsigned long sbi_minor_version(void) { return sbi_spec_version & SBI_SPEC_VERSION_MINOR_MASK; } + +int sbi_err_map_linux_errno(int err); #else /* CONFIG_RISCV_SBI */ /* stubs for code that is only reachable under IS_ENABLED(CONFIG_RISCV_SBI): */ void sbi_set_timer(uint64_t stime_value); diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index 1cc0052e1b63..7c24da59bccf 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -46,7 +46,7 @@ struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0, } EXPORT_SYMBOL(sbi_ecall); -static int sbi_err_map_linux_errno(int err) +int sbi_err_map_linux_errno(int err) { switch (err) { case SBI_SUCCESS: @@ -63,6 +63,7 @@ static int sbi_err_map_linux_errno(int err) return -ENOTSUPP; }; } +EXPORT_SYMBOL(sbi_err_map_linux_errno); #ifdef CONFIG_RISCV_SBI_V01 /** From db5a79460315bd12dedee5f964cd72f3a534ecb2 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Tue, 17 Mar 2020 18:11:42 -0700 Subject: [PATCH 31/65] RISC-V: Add SBI HSM extension definitions SBI specification defines HSM extension that allows to start/stop a hart by a supervisor anytime. The specification is available at https://github.com/riscv/riscv-sbi-doc/blob/master/riscv-sbi.adoc Add those definitions here. 
Signed-off-by: Atish Patra Reviewed-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/sbi.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 2bbfd6bada93..653edb25d495 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -26,6 +26,7 @@ enum sbi_ext_id { SBI_EXT_TIME = 0x54494D45, SBI_EXT_IPI = 0x735049, SBI_EXT_RFENCE = 0x52464E43, + SBI_EXT_HSM = 0x48534D, }; enum sbi_ext_base_fid { @@ -56,6 +57,19 @@ enum sbi_ext_rfence_fid { SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID, }; +enum sbi_ext_hsm_fid { + SBI_EXT_HSM_HART_START = 0, + SBI_EXT_HSM_HART_STOP, + SBI_EXT_HSM_HART_STATUS, +}; + +enum sbi_hsm_hart_status { + SBI_HSM_HART_STATUS_STARTED = 0, + SBI_HSM_HART_STATUS_STOPPED, + SBI_HSM_HART_STATUS_START_PENDING, + SBI_HSM_HART_STATUS_STOP_PENDING, +}; + #define SBI_SPEC_VERSION_DEFAULT 0x1 #define SBI_SPEC_VERSION_MAJOR_SHIFT 24 #define SBI_SPEC_VERSION_MAJOR_MASK 0x7f From cfafe260137418d0265d0df3bb18dc494af2b43e Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Tue, 17 Mar 2020 18:11:43 -0700 Subject: [PATCH 32/65] RISC-V: Add support for ordered booting method using HSM Currently, all harts have to jump to Linux in RISC-V. This complicates the multi-stage boot process as every transient stage also has to ensure all harts enter that stage and jump to Linux afterwards. It also obstructs a clean Kexec implementation. SBI HSM extension provides alternate solutions where only a single hart needs to boot and enter Linux. The booting hart can bring up secondary harts one by one afterwards. Add SBI HSM based cpu_ops that implements an ordered booting method in RISC-V. This change is also backward compatible with older firmware not implementing HSM extension. If the latest kernel is used with older firmware, it will continue to use the default spinning booting method.
Signed-off-by: Atish Patra Reviewed-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/Makefile | 3 ++ arch/riscv/kernel/cpu_ops.c | 10 +++- arch/riscv/kernel/cpu_ops_sbi.c | 81 +++++++++++++++++++++++++++++++++ arch/riscv/kernel/head.S | 26 +++++++++++ arch/riscv/kernel/smpboot.c | 2 +- arch/riscv/kernel/traps.c | 2 +- 6 files changed, 121 insertions(+), 3 deletions(-) create mode 100644 arch/riscv/kernel/cpu_ops_sbi.c diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 43d49ea36a3f..674a23cc4223 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -46,5 +46,8 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_PERF_EVENTS) += perf_callchain.o obj-$(CONFIG_HAVE_PERF_REGS) += perf_regs.o obj-$(CONFIG_RISCV_SBI) += sbi.o +ifeq ($(CONFIG_RISCV_SBI), y) +obj-$(CONFIG_SMP) += cpu_ops_sbi.o +endif clean: diff --git a/arch/riscv/kernel/cpu_ops.c b/arch/riscv/kernel/cpu_ops.c index 62705908eee5..c4c33bf02369 100644 --- a/arch/riscv/kernel/cpu_ops.c +++ b/arch/riscv/kernel/cpu_ops.c @@ -18,6 +18,7 @@ const struct cpu_operations *cpu_ops[NR_CPUS] __ro_after_init; void *__cpu_up_stack_pointer[NR_CPUS]; void *__cpu_up_task_pointer[NR_CPUS]; +extern const struct cpu_operations cpu_ops_sbi; extern const struct cpu_operations cpu_ops_spinwait; void cpu_update_secondary_bootdata(unsigned int cpuid, @@ -34,5 +35,12 @@ void cpu_update_secondary_bootdata(unsigned int cpuid, void __init cpu_set_ops(int cpuid) { - cpu_ops[cpuid] = &cpu_ops_spinwait; +#if IS_ENABLED(CONFIG_RISCV_SBI) + if (sbi_probe_extension(SBI_EXT_HSM) > 0) { + if (!cpuid) + pr_info("SBI v0.2 HSM extension detected\n"); + cpu_ops[cpuid] = &cpu_ops_sbi; + } else +#endif + cpu_ops[cpuid] = &cpu_ops_spinwait; } diff --git a/arch/riscv/kernel/cpu_ops_sbi.c b/arch/riscv/kernel/cpu_ops_sbi.c new file mode 100644 index 000000000000..66f3cded91f5 --- /dev/null +++ b/arch/riscv/kernel/cpu_ops_sbi.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0-only 
+/* + * HSM extension and cpu_ops implementation. + * + * Copyright (c) 2020 Western Digital Corporation or its affiliates. + */ + +#include +#include +#include +#include +#include + +extern char secondary_start_sbi[]; +const struct cpu_operations cpu_ops_sbi; + +static int sbi_hsm_hart_start(unsigned long hartid, unsigned long saddr, + unsigned long priv) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_HSM, SBI_EXT_HSM_HART_START, + hartid, saddr, priv, 0, 0, 0); + if (ret.error) + return sbi_err_map_linux_errno(ret.error); + else + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +static int sbi_hsm_hart_stop(void) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_HSM, SBI_EXT_HSM_HART_STOP, 0, 0, 0, 0, 0, 0); + + if (ret.error) + return sbi_err_map_linux_errno(ret.error); + else + return 0; +} + +static int sbi_hsm_hart_get_status(unsigned long hartid) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_HSM, SBI_EXT_HSM_HART_STATUS, + hartid, 0, 0, 0, 0, 0); + if (ret.error) + return sbi_err_map_linux_errno(ret.error); + else + return ret.value; +} +#endif + +static int sbi_cpu_start(unsigned int cpuid, struct task_struct *tidle) +{ + int rc; + unsigned long boot_addr = __pa_symbol(secondary_start_sbi); + int hartid = cpuid_to_hartid_map(cpuid); + + cpu_update_secondary_bootdata(cpuid, tidle); + rc = sbi_hsm_hart_start(hartid, boot_addr, 0); + + return rc; +} + +static int sbi_cpu_prepare(unsigned int cpuid) +{ + if (!cpu_ops_sbi.cpu_start) { + pr_err("cpu start method not defined for CPU [%d]\n", cpuid); + return -ENODEV; + } + return 0; +} + +const struct cpu_operations cpu_ops_sbi = { + .name = "sbi", + .cpu_prepare = sbi_cpu_prepare, + .cpu_start = sbi_cpu_start, +}; diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index 173507395a6b..e5115d5e0b3a 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -99,11 +99,37 @@ relocate: ret #endif /* CONFIG_MMU */ #ifdef CONFIG_SMP + .global secondary_start_sbi +secondary_start_sbi: + 
/* Mask all interrupts */ + csrw CSR_IE, zero + csrw CSR_IP, zero + + /* Load the global pointer */ + .option push + .option norelax + la gp, __global_pointer$ + .option pop + + /* + * Disable FPU to detect illegal usage of + * floating point in kernel space + */ + li t0, SR_FS + csrc CSR_STATUS, t0 + /* Set trap vector to spin forever to help debug */ la a3, .Lsecondary_park csrw CSR_TVEC, a3 slli a3, a0, LGREG + la a4, __cpu_up_stack_pointer + la a5, __cpu_up_task_pointer + add a4, a3, a4 + add a5, a3, a5 + REG_L sp, (a4) + REG_L tp, (a5) + .global secondary_start_common secondary_start_common: diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c index e89396a2a1af..4e9922790f6e 100644 --- a/arch/riscv/kernel/smpboot.c +++ b/arch/riscv/kernel/smpboot.c @@ -143,7 +143,7 @@ void __init smp_cpus_done(unsigned int max_cpus) /* * C entry point for a secondary processor. */ -asmlinkage __visible void __init smp_callin(void) +asmlinkage __visible void smp_callin(void) { struct mm_struct *mm = &init_mm; diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index a4d136355f78..23a57b92cd1d 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -148,7 +148,7 @@ int is_valid_bugaddr(unsigned long pc) } #endif /* CONFIG_GENERIC_BUG */ -void __init trap_init(void) +void trap_init(void) { /* * Set sup0 scratch register to 0, indicating to exception vector From f1e58583b9c7ceae7f11646e9edf2561d67f29c9 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Tue, 17 Mar 2020 18:11:44 -0700 Subject: [PATCH 33/65] RISC-V: Support cpu hotplug This patch enable support for cpu hotplug in RISC-V. It uses SBI HSM extension to online/offline any hart. As a result, the harts are returned to firmware once they are offline. If the harts are brought online afterwards, they re-enter Linux kernel as if a secondary hart booted for the first time. All booting requirements are honored during this process. 
Tested on both QEMU and the HiFive Unleashed board. Test results follow. --------------------------------------------------- Offline cpu 2 --------------------------------------------------- $ echo 0 > /sys/devices/system/cpu/cpu2/online [ 32.828684] CPU2: off $ cat /proc/cpuinfo processor : 0 hart : 0 isa : rv64imafdcsu mmu : sv48 processor : 1 hart : 1 isa : rv64imafdcsu mmu : sv48 processor : 3 hart : 3 isa : rv64imafdcsu mmu : sv48 processor : 4 hart : 4 isa : rv64imafdcsu mmu : sv48 processor : 5 hart : 5 isa : rv64imafdcsu mmu : sv48 processor : 6 hart : 6 isa : rv64imafdcsu mmu : sv48 processor : 7 hart : 7 isa : rv64imafdcsu mmu : sv48 --------------------------------------------------- online cpu 2 --------------------------------------------------- $ echo 1 > /sys/devices/system/cpu/cpu2/online $ cat /proc/cpuinfo processor : 0 hart : 0 isa : rv64imafdcsu mmu : sv48 processor : 1 hart : 1 isa : rv64imafdcsu mmu : sv48 processor : 2 hart : 2 isa : rv64imafdcsu mmu : sv48 processor : 3 hart : 3 isa : rv64imafdcsu mmu : sv48 processor : 4 hart : 4 isa : rv64imafdcsu mmu : sv48 processor : 5 hart : 5 isa : rv64imafdcsu mmu : sv48 processor : 6 hart : 6 isa : rv64imafdcsu mmu : sv48 processor : 7 hart : 7 isa : rv64imafdcsu mmu : sv48 Signed-off-by: Atish Patra Reviewed-by: Anup Patel --- arch/riscv/Kconfig | 12 ++++- arch/riscv/include/asm/cpu_ops.h | 12 +++++ arch/riscv/include/asm/smp.h | 17 +++++++ arch/riscv/kernel/Makefile | 1 + arch/riscv/kernel/cpu-hotplug.c | 87 ++++++++++++++++++++++++++++++++ arch/riscv/kernel/cpu_ops_sbi.c | 34 +++++++++++++ arch/riscv/kernel/setup.c | 19 ++++++- 7 files changed, 180 insertions(+), 2 deletions(-) create mode 100644 arch/riscv/kernel/cpu-hotplug.c diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index cac23a5c1104..bc713666f00a 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -20,7 +20,6 @@ config RISCV select CLONE_BACKWARDS select COMMON_CLK select GENERIC_CLOCKEVENTS - select GENERIC_CPU_DEVICES
select GENERIC_IRQ_SHOW select GENERIC_PCI_IOMAP select GENERIC_SCHED_CLOCK @@ -254,6 +253,17 @@ config NR_CPUS depends on SMP default "8" +config HOTPLUG_CPU + bool "Support for hot-pluggable CPUs" + depends on SMP + select GENERIC_IRQ_MIGRATION + help + + Say Y here to experiment with turning CPUs off and on. CPUs + can be controlled through /sys/devices/system/cpu. + + Say N if you want to disable CPU hotplug. + choice prompt "CPU Tuning" default TUNE_GENERIC diff --git a/arch/riscv/include/asm/cpu_ops.h b/arch/riscv/include/asm/cpu_ops.h index 5ce81a28e1d9..a8ec3c5c1bd2 100644 --- a/arch/riscv/include/asm/cpu_ops.h +++ b/arch/riscv/include/asm/cpu_ops.h @@ -18,12 +18,24 @@ * is a mechanism for doing so, tests whether it is * possible to boot the given HART. * @cpu_start: Boots a cpu into the kernel. + * @cpu_disable: Prepares a cpu to die. May fail for some + * mechanism-specific reason, which will cause the hot + * unplug to be aborted. Called from the cpu to be killed. + * @cpu_stop: Makes a cpu leave the kernel. Must not fail. Called from + * the cpu being stopped. + * @cpu_is_stopped: Ensures a cpu has left the kernel. Called from another + * cpu. 
*/ struct cpu_operations { const char *name; int (*cpu_prepare)(unsigned int cpu); int (*cpu_start)(unsigned int cpu, struct task_struct *tidle); +#ifdef CONFIG_HOTPLUG_CPU + int (*cpu_disable)(unsigned int cpu); + void (*cpu_stop)(void); + int (*cpu_is_stopped)(unsigned int cpu); +#endif }; extern const struct cpu_operations *cpu_ops[NR_CPUS]; diff --git a/arch/riscv/include/asm/smp.h b/arch/riscv/include/asm/smp.h index 023f74fb8b3b..f4c7cfda6b7f 100644 --- a/arch/riscv/include/asm/smp.h +++ b/arch/riscv/include/asm/smp.h @@ -43,6 +43,13 @@ void riscv_cpuid_to_hartid_mask(const struct cpumask *in, struct cpumask *out); */ #define raw_smp_processor_id() (current_thread_info()->cpu) +#if defined CONFIG_HOTPLUG_CPU +int __cpu_disable(void); +void __cpu_die(unsigned int cpu); +void cpu_stop(void); +#else +#endif /* CONFIG_HOTPLUG_CPU */ + #else static inline void show_ipi_stats(struct seq_file *p, int prec) @@ -69,4 +76,14 @@ static inline void riscv_cpuid_to_hartid_mask(const struct cpumask *in, } #endif /* CONFIG_SMP */ + +#if defined(CONFIG_HOTPLUG_CPU) && (CONFIG_SMP) +bool cpu_has_hotplug(unsigned int cpu); +#else +static inline bool cpu_has_hotplug(unsigned int cpu) +{ + return false; +} +#endif + #endif /* _ASM_RISCV_SMP_H */ diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 674a23cc4223..c121cc491eb8 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -49,5 +49,6 @@ obj-$(CONFIG_RISCV_SBI) += sbi.o ifeq ($(CONFIG_RISCV_SBI), y) obj-$(CONFIG_SMP) += cpu_ops_sbi.o endif +obj-$(CONFIG_HOTPLUG_CPU) += cpu-hotplug.o clean: diff --git a/arch/riscv/kernel/cpu-hotplug.c b/arch/riscv/kernel/cpu-hotplug.c new file mode 100644 index 000000000000..df84e0c13db1 --- /dev/null +++ b/arch/riscv/kernel/cpu-hotplug.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 Western Digital Corporation or its affiliates. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void cpu_stop(void); +void arch_cpu_idle_dead(void) +{ + cpu_stop(); +} + +bool cpu_has_hotplug(unsigned int cpu) +{ + if (cpu_ops[cpu]->cpu_stop) + return true; + + return false; +} + +/* + * __cpu_disable runs on the processor to be shutdown. + */ +int __cpu_disable(void) +{ + int ret = 0; + unsigned int cpu = smp_processor_id(); + + if (!cpu_ops[cpu] || !cpu_ops[cpu]->cpu_stop) + return -EOPNOTSUPP; + + if (cpu_ops[cpu]->cpu_disable) + ret = cpu_ops[cpu]->cpu_disable(cpu); + + if (ret) + return ret; + + remove_cpu_topology(cpu); + set_cpu_online(cpu, false); + irq_migrate_all_off_this_cpu(); + + return ret; +} + +/* + * Called on the thread which is asking for a CPU to be shutdown. + */ +void __cpu_die(unsigned int cpu) +{ + int ret = 0; + + if (!cpu_wait_death(cpu, 5)) { + pr_err("CPU %u: didn't die\n", cpu); + return; + } + pr_notice("CPU%u: off\n", cpu); + + /* Verify from the firmware if the cpu is really stopped*/ + if (cpu_ops[cpu]->cpu_is_stopped) + ret = cpu_ops[cpu]->cpu_is_stopped(cpu); + if (ret) + pr_warn("CPU%d may not have stopped: %d\n", cpu, ret); +} + +/* + * Called from the idle thread for the CPU which has been shutdown. 
+ */ +void cpu_stop(void) +{ + idle_task_exit(); + + (void)cpu_report_death(); + + cpu_ops[smp_processor_id()]->cpu_stop(); + /* It should never reach here */ + BUG(); +} diff --git a/arch/riscv/kernel/cpu_ops_sbi.c b/arch/riscv/kernel/cpu_ops_sbi.c index 66f3cded91f5..685fae72b7f5 100644 --- a/arch/riscv/kernel/cpu_ops_sbi.c +++ b/arch/riscv/kernel/cpu_ops_sbi.c @@ -74,8 +74,42 @@ static int sbi_cpu_prepare(unsigned int cpuid) return 0; } +#ifdef CONFIG_HOTPLUG_CPU +static int sbi_cpu_disable(unsigned int cpuid) +{ + if (!cpu_ops_sbi.cpu_stop) + return -EOPNOTSUPP; + return 0; +} + +static void sbi_cpu_stop(void) +{ + int ret; + + ret = sbi_hsm_hart_stop(); + pr_crit("Unable to stop the cpu %u (%d)\n", smp_processor_id(), ret); +} + +static int sbi_cpu_is_stopped(unsigned int cpuid) +{ + int rc; + int hartid = cpuid_to_hartid_map(cpuid); + + rc = sbi_hsm_hart_get_status(hartid); + + if (rc == SBI_HSM_HART_STATUS_STOPPED) + return 0; + return rc; +} +#endif + const struct cpu_operations cpu_ops_sbi = { .name = "sbi", .cpu_prepare = sbi_cpu_prepare, .cpu_start = sbi_cpu_start, +#ifdef CONFIG_HOTPLUG_CPU + .cpu_disable = sbi_cpu_disable, + .cpu_stop = sbi_cpu_stop, + .cpu_is_stopped = sbi_cpu_is_stopped, +#endif }; diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 07f4e7503223..145128a7e560 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -16,12 +16,13 @@ #include #include #include +#include #include +#include #include #include #include -#include #include #include #include @@ -47,6 +48,7 @@ struct screen_info screen_info = { */ atomic_t hart_lottery __section(.sdata); unsigned long boot_cpu_hartid; +static DEFINE_PER_CPU(struct cpu, cpu_devices); void __init parse_dtb(void) { @@ -94,3 +96,18 @@ void __init setup_arch(char **cmdline_p) riscv_fill_hwcap(); } + +static int __init topology_init(void) +{ + int i; + + for_each_possible_cpu(i) { + struct cpu *cpu = &per_cpu(cpu_devices, i); + + cpu->hotpluggable = 
cpu_has_hotplug(i); + register_cpu(cpu, i); + } + + return 0; +} +subsys_initcall(topology_init); From 9553d16fa671b9621c5e2847d08bd90d3be3349c Mon Sep 17 00:00:00 2001 From: Amit Daniel Kachhap Date: Mon, 30 Mar 2020 17:11:38 +0530 Subject: [PATCH 34/65] init/kconfig: Add LD_VERSION Kconfig This option can be used in Kconfig files to compare the ld version and enable/disable incompatible config options if required. This option is used in the subsequent patch along with GCC_VERSION to filter out an incompatible feature. Signed-off-by: Amit Daniel Kachhap Signed-off-by: Catalin Marinas --- init/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index 452bc1835cd4..68ddbcd974c7 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -17,6 +17,10 @@ config GCC_VERSION default $(shell,$(srctree)/scripts/gcc-version.sh $(CC)) if CC_IS_GCC default 0 +config LD_VERSION + int + default $(shell,$(LD) --version | $(srctree)/scripts/ld-version.sh) + config CC_IS_CLANG def_bool $(success,$(CC) --version | head -n 1 | grep -q clang) From 15cd0e675f3f76b4d21c313795fe0c23df0ee20f Mon Sep 17 00:00:00 2001 From: Amit Daniel Kachhap Date: Mon, 30 Mar 2020 17:11:39 +0530 Subject: [PATCH 35/65] arm64: Kconfig: ptrauth: Add binutils version check to fix mismatch Recent addition of ARM64_PTR_AUTH exposed a mismatch issue with binutils. 9.1+ versions of gcc inserts a section note .note.gnu.property but this can be used properly by binutils version greater than 2.33.1. If older binutils are used then the following warnings are generated, aarch64-linux-ld: warning: arch/arm64/kernel/vdso/vgettimeofday.o: unsupported GNU_PROPERTY_TYPE (5) type: 0xc0000000 aarch64-linux-objdump: warning: arch/arm64/lib/csum.o: unsupported GNU_PROPERTY_TYPE (5) type: 0xc0000000 aarch64-linux-nm: warning: .tmp_vmlinux1: unsupported GNU_PROPERTY_TYPE (5) type: 0xc0000000 This patch enables ARM64_PTR_AUTH when gcc and binutils versions are compatible with each other. 
Older gcc versions, which do not insert such a section, continue to work as before. This scenario may not occur with clang, as a recent commit 3b446c7d27ddd06 ("arm64: Kconfig: verify binutils support for ARM64_PTR_AUTH") masks binutils versions lower than 2.34. Reported-by: kbuild test robot Suggested-by: Vincenzo Frascino Signed-off-by: Amit Daniel Kachhap [catalin.marinas@arm.com: slight adjustment to the comment] Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e6712b6818fa..9ba3287bb29d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1503,7 +1503,10 @@ config ARM64_PTR_AUTH default y depends on !KVM || ARM64_VHE depends on (CC_HAS_SIGN_RETURN_ADDRESS || CC_HAS_BRANCH_PROT_PAC_RET) && AS_HAS_PAC - depends on CC_IS_GCC || (CC_IS_CLANG && AS_HAS_CFI_NEGATE_RA_STATE) + # GCC 9.1 and later inserts a .note.gnu.property section note for PAC + # which is only understood by binutils starting with version 2.33.1. + depends on !CC_IS_GCC || GCC_VERSION < 90100 || LD_VERSION >= 233010000 + depends on !CC_IS_CLANG || AS_HAS_CFI_NEGATE_RA_STATE depends on (!FUNCTION_GRAPH_TRACER || DYNAMIC_FTRACE_WITH_REGS) help Pointer authentication (part of the ARMv8.3 Extensions) provides From b8fdef311a0bd9223f10754f94fdcf1a594a3457 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 31 Mar 2020 20:44:59 +0100 Subject: [PATCH 36/65] arm64: Always force a branch protection mode when the compiler has one Compilers with branch protection support can be configured to enable it by default, it is likely that distributions will do this as part of deploying branch protection system wide. As well as the slight overhead from having some extra NOPs for unused branch protection features this can cause more serious problems when the kernel is providing pointer authentication to userspace but not built for pointer authentication itself. 
In that case our switching of keys for userspace can affect the kernel unexpectedly, causing pointer authentication instructions in the kernel to corrupt addresses. To ensure that we get consistent and reliable behaviour always explicitly initialise the branch protection mode, ensuring that the kernel is built the same way regardless of the compiler defaults. Fixes: 7503197562567 (arm64: add basic pointer authentication support) Reported-by: Szabolcs Nagy Signed-off-by: Mark Brown Cc: stable@vger.kernel.org [catalin.marinas@arm.com: remove Kconfig option in favour of Makefile check] Signed-off-by: Catalin Marinas --- arch/arm64/Makefile | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index f15f92ba53e6..85e4149cc5d5 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -65,6 +65,10 @@ stack_protector_prepare: prepare0 include/generated/asm-offsets.h)) endif +# Ensure that if the compiler supports branch protection we default it +# off, this will be overridden if we are using branch protection. +branch-prot-flags-y += $(call cc-option,-mbranch-protection=none) + ifeq ($(CONFIG_ARM64_PTR_AUTH),y) branch-prot-flags-$(CONFIG_CC_HAS_SIGN_RETURN_ADDRESS) := -msign-return-address=all branch-prot-flags-$(CONFIG_CC_HAS_BRANCH_PROT_PAC_RET) := -mbranch-protection=pac-ret+leaf @@ -73,9 +77,10 @@ branch-prot-flags-$(CONFIG_CC_HAS_BRANCH_PROT_PAC_RET) := -mbranch-protection=pa # we pass it only to the assembler. This option is utilized only in case of non # integrated assemblers. 
branch-prot-flags-$(CONFIG_AS_HAS_PAC) += -Wa,-march=armv8.3-a -KBUILD_CFLAGS += $(branch-prot-flags-y) endif +KBUILD_CFLAGS += $(branch-prot-flags-y) + ifeq ($(CONFIG_CPU_BIG_ENDIAN), y) KBUILD_CPPFLAGS += -mbig-endian CHECKFLAGS += -D__AARCH64EB__ From e16e65a02913d29a7b27c4e3a415ceec967b0629 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 29 Mar 2020 16:12:58 +0200 Subject: [PATCH 37/65] arm64: remove CONFIG_DEBUG_ALIGN_RODATA feature When CONFIG_DEBUG_ALIGN_RODATA is enabled, kernel segments mapped with different permissions (r-x for .text, r-- for .rodata, rw- for .data, etc) are rounded up to 2 MiB so they can be mapped more efficiently. In particular, it permits the segments to be mapped using level 2 block entries when using 4k pages, which is expected to result in less TLB pressure. However, the mappings for the bulk of the kernel will use level 2 entries anyway, and the misaligned fringes are organized such that they can take advantage of the contiguous bit, and use far fewer level 3 entries than would be needed otherwise. This makes the value of this feature dubious at best, and since it is not enabled in defconfig or in the distro configs, it does not appear to be in wide use either. So let's just remove it. Signed-off-by: Ard Biesheuvel Acked-by: Mark Rutland Acked-by: Will Deacon Acked-by: Laura Abbott Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig.debug | 13 ------------- arch/arm64/include/asm/memory.h | 12 +----------- drivers/firmware/efi/libstub/arm64-stub.c | 8 +++----- 3 files changed, 4 insertions(+), 29 deletions(-) diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug index 1c906d932d6b..a1efa246c9ed 100644 --- a/arch/arm64/Kconfig.debug +++ b/arch/arm64/Kconfig.debug @@ -52,19 +52,6 @@ config DEBUG_WX If in doubt, say "Y". 
-config DEBUG_ALIGN_RODATA - depends on STRICT_KERNEL_RWX - bool "Align linker sections up to SECTION_SIZE" - help - If this option is enabled, sections that may potentially be marked as - read only or non-executable will be aligned up to the section size of - the kernel. This prevents sections from being split into pages and - avoids a potential TLB penalty. The downside is an increase in - alignment and potentially wasted space. Turn on this option if - performance is more important than memory pressure. - - If in doubt, say N. - config DEBUG_EFI depends on EFI && DEBUG_INFO bool "UEFI debugging" diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 2be67b232499..a1871bb32bb1 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -120,22 +120,12 @@ /* * Alignment of kernel segments (e.g. .text, .data). - */ -#if defined(CONFIG_DEBUG_ALIGN_RODATA) -/* - * 4 KB granule: 1 level 2 entry - * 16 KB granule: 128 level 3 entries, with contiguous bit - * 64 KB granule: 32 level 3 entries, with contiguous bit - */ -#define SEGMENT_ALIGN SZ_2M -#else -/* + * * 4 KB granule: 16 level 3 entries, with contiguous bit * 16 KB granule: 4 level 3 entries, without contiguous bit * 64 KB granule: 1 level 3 entry */ #define SEGMENT_ALIGN SZ_64K -#endif /* * Memory types available. 
diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index 2915b44132e6..2bba0c9c3664 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -82,14 +82,12 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && phys_seed != 0) { /* - * If CONFIG_DEBUG_ALIGN_RODATA is not set, produce a - * displacement in the interval [0, MIN_KIMG_ALIGN) that - * doesn't violate this kernel's de-facto alignment + * Produce a displacement in the interval [0, MIN_KIMG_ALIGN) + * that doesn't violate this kernel's de-facto alignment * constraints. */ u32 mask = (MIN_KIMG_ALIGN - 1) & ~(EFI_KIMG_ALIGN - 1); - u32 offset = !IS_ENABLED(CONFIG_DEBUG_ALIGN_RODATA) ? - (phys_seed >> 32) & mask : TEXT_OFFSET; + u32 offset = (phys_seed >> 32) & mask; /* * With CONFIG_RANDOMIZE_TEXT_OFFSET=y, TEXT_OFFSET may not From 0c89649a70bed679fd408c1eb82fa99dbe1354a0 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 2 Apr 2020 22:12:12 +1000 Subject: [PATCH 38/65] powerpc/64s: Fix doorbell wakeup msgclr optimisation Commit 3282a3da25bd ("powerpc/64: Implement soft interrupt replay in C") broke the doorbell wakeup optimisation introduced by commit a9af97aa0a12 ("powerpc/64s: msgclr when handling doorbell exceptions from system reset"). This patch restores the msgclr, in C code. It's now done in the system reset wakeup path rather than doorbell interrupt replay where it used to be, because it is always the right thing to do in the wakeup case, but it may be rarely of use in other interrupt replay situations in which case it's wasted work - we would have to run measurements to see if that was a worthwhile optimisation, and I suspect it would not be. 
The results are similar to those in the original commit, test on POWER8 of context_switch selftests benchmark with polling idle disabled (e.g., always nap, giving cross-CPU IPIs) gives the following results: broken patched Different threads, same core: 317k/s 375k/s +18.7% Different cores: 280k/s 282k/s +1.0% Fixes: 3282a3da25bd ("powerpc/64: Implement soft interrupt replay in C") Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200402121212.1118218-1-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 19 ------------------- arch/powerpc/kernel/irq.c | 13 +++++++++++++ 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 18bbce143084..728ccb0f560c 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -3121,22 +3121,3 @@ handle_dabr_fault: li r5,SIGSEGV bl bad_page_fault b interrupt_return - -/* - * When doorbell is triggered from system reset wakeup, the message is - * not cleared, so it would fire again when EE is enabled. - * - * When coming from local_irq_enable, there may be the same problem if - * we were hard disabled. - * - * Execute msgclr to clear pending exceptions before handling it. 
- */ -h_doorbell_common_msgclr: - LOAD_REG_IMMEDIATE(r3, PPC_DBELL_MSGTYPE << (63-36)) - PPC_MSGCLR(3) - b h_doorbell_common_virt - -doorbell_super_common_msgclr: - LOAD_REG_IMMEDIATE(r3, PPC_DBELL_MSGTYPE << (63-36)) - PPC_MSGCLRP(3) - b doorbell_super_common_virt diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index a25ed47087ee..1f1169856dc8 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -527,6 +527,19 @@ void irq_set_pending_from_srr1(unsigned long srr1) return; } + if (reason == PACA_IRQ_DBELL) { + /* + * When doorbell triggers a system reset wakeup, the message + * is not cleared, so if the doorbell interrupt is replayed + * and the IPI handled, the doorbell interrupt would still + * fire when EE is enabled. + * + * To avoid taking the superfluous doorbell interrupt, + * execute a msgclr here before the interrupt is replayed. + */ + ppc_msgclr(PPC_DBELL_MSGTYPE); + } + /* * The 0 index (SRR1[42:45]=b0000) must always evaluate to 0, * so this can be called unconditionally with the SRR1 wake From bbe9064f30f06e7791d04f9f61a842226a6a44fe Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 26 Mar 2020 17:11:44 +1100 Subject: [PATCH 39/65] selftests/eeh: Skip ahci adapters The ahci driver doesn't support error recovery, and if your root filesystem is attached to it the eeh-basic.sh test will likely kill your machine. So skip any device we see using the ahci driver. 
Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200326061144.2006522-1-mpe@ellerman.id.au --- tools/testing/selftests/powerpc/eeh/eeh-basic.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/powerpc/eeh/eeh-basic.sh b/tools/testing/selftests/powerpc/eeh/eeh-basic.sh index f988d2f42e8f..8a8d0f456946 100755 --- a/tools/testing/selftests/powerpc/eeh/eeh-basic.sh +++ b/tools/testing/selftests/powerpc/eeh/eeh-basic.sh @@ -41,6 +41,11 @@ for dev in `ls -1 /sys/bus/pci/devices/ | grep '\.0$'` ; do continue; fi + if [ "ahci" = "$(basename $(realpath /sys/bus/pci/devices/$dev/driver))" ] ; then + echo "$dev, Skipped: ahci doesn't support recovery" + continue + fi + # Don't inject errosr into an already-frozen PE. This happens with # PEs that contain multiple PCI devices (e.g. multi-function cards) # and injecting new errors during the recovery process will probably From a95a0a1654f16366360399574e10efd87e867b39 Mon Sep 17 00:00:00 2001 From: Ganesh Goudar Date: Fri, 20 Mar 2020 16:31:19 +0530 Subject: [PATCH 40/65] powerpc/pseries: Fix MCE handling on pseries MCE handling on pSeries platform fails as recent rework to use common code for pSeries and PowerNV in machine check error handling tries to access per-cpu variables in realmode. The per-cpu variables may be outside the RMO region on pSeries platform and need translation to be enabled for access. Just moving these per-cpu variables into the RMO region didn't help because we queue some work to workqueues in real mode, which again tries to touch per-cpu variables. Also fwnmi_release_errinfo() cannot be called when translation is not enabled. This patch fixes this by enabling translation in the exception handler when all required real mode handling is done. This change only affects the pSeries platform. 
Without this fix below kernel crash is seen on injecting SLB multihit: BUG: Unable to handle kernel data access on read at 0xc00000027b205950 Faulting instruction address: 0xc00000000003b7e0 Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries Modules linked in: mcetest_slb(OE+) af_packet(E) xt_tcpudp(E) ip6t_rpfilter(E) ip6t_REJECT(E) ipt_REJECT(E) xt_conntrack(E) ip_set(E) nfnetlink(E) ebtable_nat(E) ebtable_broute(E) ip6table_nat(E) ip6table_mangle(E) ip6table_raw(E) ip6table_security(E) iptable_nat(E) nf_nat(E) nf_conntrack(E) nf_defrag_ipv6(E) nf_defrag_ipv4(E) iptable_mangle(E) iptable_raw(E) iptable_security(E) ebtable_filter(E) ebtables(E) ip6table_filter(E) ip6_tables(E) iptable_filter(E) ip_tables(E) x_tables(E) xfs(E) ibmveth(E) vmx_crypto(E) gf128mul(E) uio_pdrv_genirq(E) uio(E) crct10dif_vpmsum(E) rtc_generic(E) btrfs(E) libcrc32c(E) xor(E) zstd_decompress(E) zstd_compress(E) raid6_pq(E) sr_mod(E) sd_mod(E) cdrom(E) ibmvscsi(E) scsi_transport_srp(E) crc32c_vpmsum(E) dm_mod(E) sg(E) scsi_mod(E) CPU: 34 PID: 8154 Comm: insmod Kdump: loaded Tainted: G OE 5.5.0-mahesh #1 NIP: c00000000003b7e0 LR: c0000000000f2218 CTR: 0000000000000000 REGS: c000000007dcb960 TRAP: 0300 Tainted: G OE (5.5.0-mahesh) MSR: 8000000000001003 CR: 28002428 XER: 20040000 CFAR: c0000000000f2214 DAR: c00000027b205950 DSISR: 40000000 IRQMASK: 0 GPR00: c0000000000f2218 c000000007dcbbf0 c000000001544800 c000000007dcbd70 GPR04: 0000000000000001 c000000007dcbc98 c008000000d00258 c0080000011c0000 GPR08: 0000000000000000 0000000300000003 c000000001035950 0000000003000048 GPR12: 000000027a1d0000 c000000007f9c000 0000000000000558 0000000000000000 GPR16: 0000000000000540 c008000001110000 c008000001110540 0000000000000000 GPR20: c00000000022af10 c00000025480fd70 c008000001280000 c00000004bfbb300 GPR24: c000000001442330 c00800000800000d c008000008000000 4009287a77000510 GPR28: 0000000000000000 0000000000000002 c000000001033d30 0000000000000001 NIP 
[c00000000003b7e0] save_mce_event+0x30/0x240 LR [c0000000000f2218] pseries_machine_check_realmode+0x2c8/0x4f0 Call Trace: Instruction dump: 3c4c0151 38429050 7c0802a6 60000000 fbc1fff0 fbe1fff8 f821ffd1 3d42ffaf 3fc2ffaf e98d0030 394a1150 3bdef530 <7d6a62aa> 1d2b0048 2f8b0063 380b0001 ---[ end trace 46fd63f36bbdd940 ]--- Fixes: 9ca766f9891d ("powerpc/64s/pseries: machine check convert to use common event code") Reviewed-by: Mahesh Salgaonkar Reviewed-by: Nicholas Piggin Signed-off-by: Ganesh Goudar Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200320110119.10207-1-ganeshgr@linux.ibm.com --- arch/powerpc/platforms/pseries/ras.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index aa6208c8d4f0..1d1da639b8b7 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -686,6 +686,17 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) #endif out: + /* + * Enable translation as we will be accessing per-cpu variables + * in save_mce_event() which may fall outside RMO region, also + * leave it enabled because subsequently we will be queuing work + * to workqueues where again per-cpu variables accessed, besides + * fwnmi_release_errinfo() crashes when called in realmode on + * pseries. + * Note: All the realmode handling like flushing SLB entries for + * SLB multihit is done by now. + */ + mtmsr(mfmsr() | MSR_IR | MSR_DR); save_mce_event(regs, disposition == RTAS_DISP_FULLY_RECOVERED, &mce_err, regs->nip, eaddr, paddr); From a36e8ba60b991d563677227f172db69e030797e6 Mon Sep 17 00:00:00 2001 From: Anju T Sudhakar Date: Fri, 13 Mar 2020 11:22:37 +0530 Subject: [PATCH 41/65] powerpc/perf: Implement a global lock to avoid races between trace, core and thread imc events. 
IMC(In-memory Collection Counters) does performance monitoring in two different modes, i.e accumulation mode(core-imc and thread-imc events), and trace mode(trace-imc events). A cpu thread can either be in accumulation-mode or trace-mode at a time and this is done via the LDBAR register in POWER architecture. The current design does not address the races between thread-imc and trace-imc events. Patch implements a global id and lock to avoid the races between core, trace and thread imc events. With this global id-lock implementation, the system can either run core, thread or trace imc events at a time. i.e. to run any core-imc events, thread/trace imc events should not be enabled/monitored. Signed-off-by: Anju T Sudhakar Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200313055238.8656-1-anju@linux.vnet.ibm.com --- arch/powerpc/perf/imc-pmu.c | 173 +++++++++++++++++++++++++++++++----- 1 file changed, 149 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index cb50a9e1fd2d..eb82dda884e5 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -44,6 +44,16 @@ static DEFINE_PER_CPU(u64 *, trace_imc_mem); static struct imc_pmu_ref *trace_imc_refc; static int trace_imc_mem_size; +/* + * Global data structure used to avoid races between thread, + * core and trace-imc + */ +static struct imc_pmu_ref imc_global_refc = { + .lock = __MUTEX_INITIALIZER(imc_global_refc.lock), + .id = 0, + .refc = 0, +}; + static struct imc_pmu *imc_event_to_pmu(struct perf_event *event) { return container_of(event->pmu, struct imc_pmu, pmu); @@ -698,6 +708,16 @@ static int ppc_core_imc_cpu_offline(unsigned int cpu) return -EINVAL; ref->refc = 0; + /* + * Reduce the global reference count, if this is the + * last cpu in this core and core-imc event running + * in this cpu. 
+ */ + mutex_lock(&imc_global_refc.lock); + if (imc_global_refc.id == IMC_DOMAIN_CORE) + imc_global_refc.refc--; + + mutex_unlock(&imc_global_refc.lock); } return 0; } @@ -710,6 +730,23 @@ static int core_imc_pmu_cpumask_init(void) ppc_core_imc_cpu_offline); } +static void reset_global_refc(struct perf_event *event) +{ + mutex_lock(&imc_global_refc.lock); + imc_global_refc.refc--; + + /* + * If no other thread is running any + * event for this domain(thread/core/trace), + * set the global id to zero. + */ + if (imc_global_refc.refc <= 0) { + imc_global_refc.refc = 0; + imc_global_refc.id = 0; + } + mutex_unlock(&imc_global_refc.lock); +} + static void core_imc_counters_release(struct perf_event *event) { int rc, core_id; @@ -759,6 +796,8 @@ static void core_imc_counters_release(struct perf_event *event) ref->refc = 0; } mutex_unlock(&ref->lock); + + reset_global_refc(event); } static int core_imc_event_init(struct perf_event *event) @@ -819,6 +858,29 @@ static int core_imc_event_init(struct perf_event *event) ++ref->refc; mutex_unlock(&ref->lock); + /* + * Since the system can run either in accumulation or trace-mode + * of IMC at a time, core-imc events are allowed only if no other + * trace/thread imc events are enabled/monitored. + * + * Take the global lock, and check the refc.id + * to know whether any other trace/thread imc + * events are running. + */ + mutex_lock(&imc_global_refc.lock); + if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_CORE) { + /* + * No other trace/thread imc events are running in + * the system, so set the refc.id to core-imc. 
+ */ + imc_global_refc.id = IMC_DOMAIN_CORE; + imc_global_refc.refc++; + } else { + mutex_unlock(&imc_global_refc.lock); + return -EBUSY; + } + mutex_unlock(&imc_global_refc.lock); + event->hw.event_base = (u64)pcmi->vbase + (config & IMC_EVENT_OFFSET_MASK); event->destroy = core_imc_counters_release; return 0; @@ -877,7 +939,23 @@ static int ppc_thread_imc_cpu_online(unsigned int cpu) static int ppc_thread_imc_cpu_offline(unsigned int cpu) { - mtspr(SPRN_LDBAR, 0); + /* + * Set the bit 0 of LDBAR to zero. + * + * If bit 0 of LDBAR is unset, it will stop posting + * the counter data to memory. + * For thread-imc, bit 0 of LDBAR will be set to 1 in the + * event_add function. So reset this bit here, to stop the updates + * to memory in the cpu_offline path. + */ + mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) & (~(1UL << 63)))); + + /* Reduce the refc if thread-imc event running on this cpu */ + mutex_lock(&imc_global_refc.lock); + if (imc_global_refc.id == IMC_DOMAIN_THREAD) + imc_global_refc.refc--; + mutex_unlock(&imc_global_refc.lock); + return 0; } @@ -916,7 +994,22 @@ static int thread_imc_event_init(struct perf_event *event) if (!target) return -EINVAL; + mutex_lock(&imc_global_refc.lock); + /* + * Check if any other trace/core imc events are running in the + * system, if not set the global id to thread-imc. 
+ */ + if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_THREAD) { + imc_global_refc.id = IMC_DOMAIN_THREAD; + imc_global_refc.refc++; + } else { + mutex_unlock(&imc_global_refc.lock); + return -EBUSY; + } + mutex_unlock(&imc_global_refc.lock); + event->pmu->task_ctx_nr = perf_sw_context; + event->destroy = reset_global_refc; return 0; } @@ -1063,10 +1156,12 @@ static void thread_imc_event_del(struct perf_event *event, int flags) int core_id; struct imc_pmu_ref *ref; - mtspr(SPRN_LDBAR, 0); - core_id = smp_processor_id() / threads_per_core; ref = &core_imc_refc[core_id]; + if (!ref) { + pr_debug("imc: Failed to get event reference count\n"); + return; + } mutex_lock(&ref->lock); ref->refc--; @@ -1082,6 +1177,10 @@ static void thread_imc_event_del(struct perf_event *event, int flags) ref->refc = 0; } mutex_unlock(&ref->lock); + + /* Set bit 0 of LDBAR to zero, to stop posting updates to memory */ + mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) & (~(1UL << 63)))); + /* * Take a snapshot and calculate the delta and update * the event counter values. @@ -1133,7 +1232,18 @@ static int ppc_trace_imc_cpu_online(unsigned int cpu) static int ppc_trace_imc_cpu_offline(unsigned int cpu) { - mtspr(SPRN_LDBAR, 0); + /* + * No need to set bit 0 of LDBAR to zero, as + * it is set to zero for imc trace-mode + * + * Reduce the refc if any trace-imc event running + * on this cpu. 
+ */ + mutex_lock(&imc_global_refc.lock); + if (imc_global_refc.id == IMC_DOMAIN_TRACE) + imc_global_refc.refc--; + mutex_unlock(&imc_global_refc.lock); + return 0; } @@ -1226,15 +1336,14 @@ static int trace_imc_event_add(struct perf_event *event, int flags) local_mem = get_trace_imc_event_base_addr(); ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | TRACE_IMC_ENABLE; - if (core_imc_refc) - ref = &core_imc_refc[core_id]; + /* trace-imc reference count */ + if (trace_imc_refc) + ref = &trace_imc_refc[core_id]; if (!ref) { - /* If core-imc is not enabled, use trace-imc reference count */ - if (trace_imc_refc) - ref = &trace_imc_refc[core_id]; - if (!ref) - return -EINVAL; + pr_debug("imc: Failed to get the event reference count\n"); + return -EINVAL; } + mtspr(SPRN_LDBAR, ldbar_value); mutex_lock(&ref->lock); if (ref->refc == 0) { @@ -1242,13 +1351,11 @@ static int trace_imc_event_add(struct perf_event *event, int flags) get_hard_smp_processor_id(smp_processor_id()))) { mutex_unlock(&ref->lock); pr_err("trace-imc: Unable to start the counters for core %d\n", core_id); - mtspr(SPRN_LDBAR, 0); return -EINVAL; } } ++ref->refc; mutex_unlock(&ref->lock); - return 0; } @@ -1274,16 +1381,13 @@ static void trace_imc_event_del(struct perf_event *event, int flags) int core_id = smp_processor_id() / threads_per_core; struct imc_pmu_ref *ref = NULL; - if (core_imc_refc) - ref = &core_imc_refc[core_id]; + if (trace_imc_refc) + ref = &trace_imc_refc[core_id]; if (!ref) { - /* If core-imc is not enabled, use trace-imc reference count */ - if (trace_imc_refc) - ref = &trace_imc_refc[core_id]; - if (!ref) - return; + pr_debug("imc: Failed to get event reference count\n"); + return; } - mtspr(SPRN_LDBAR, 0); + mutex_lock(&ref->lock); ref->refc--; if (ref->refc == 0) { @@ -1297,6 +1401,7 @@ static void trace_imc_event_del(struct perf_event *event, int flags) ref->refc = 0; } mutex_unlock(&ref->lock); + trace_imc_event_stop(event, flags); } @@ -1314,10 +1419,30 @@ static int 
trace_imc_event_init(struct perf_event *event) if (event->attr.sample_period == 0) return -ENOENT; + /* + * Take the global lock, and make sure + * no other thread is running any core/thread imc + * events + */ + mutex_lock(&imc_global_refc.lock); + if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_TRACE) { + /* + * No core/thread imc events are running in the + * system, so set the refc.id to trace-imc. + */ + imc_global_refc.id = IMC_DOMAIN_TRACE; + imc_global_refc.refc++; + } else { + mutex_unlock(&imc_global_refc.lock); + return -EBUSY; + } + mutex_unlock(&imc_global_refc.lock); + event->hw.idx = -1; target = event->hw.target; event->pmu->task_ctx_nr = perf_hw_context; + event->destroy = reset_global_refc; return 0; } @@ -1429,10 +1554,10 @@ static void cleanup_all_core_imc_memory(void) static void thread_imc_ldbar_disable(void *dummy) { /* - * By Zeroing LDBAR, we disable thread-imc - * updates. + * By setting 0th bit of LDBAR to zero, we disable thread-imc + * updates to memory. */ - mtspr(SPRN_LDBAR, 0); + mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) & (~(1UL << 63)))); } void thread_imc_disable(void) From 4bdd39460b5f57d4f165c4a05b22e66eb3a490ae Mon Sep 17 00:00:00 2001 From: Anju T Sudhakar Date: Fri, 13 Mar 2020 11:22:38 +0530 Subject: [PATCH 42/65] powerpc/powernv: Re-enable imc trace-mode in kernel Commit 249fad734a25 ("powerpc/perf: Disable trace_imc pmu") disables IMC (In-Memory Collection) trace-mode in the kernel, since frequent mode switching between accumulation mode and trace mode via the SPR LDBAR in the hardware can trigger a checkstop (system crash). This patch re-enables IMC trace-mode in the kernel. The previous patch (1/2) in this series addresses the mode-switching issue by implementing a global lock, which restricts usage to either accumulation mode or trace mode at any one time. 
Signed-off-by: Anju T Sudhakar Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200313055238.8656-2-anju@linux.vnet.ibm.com --- arch/powerpc/platforms/powernv/opal-imc.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 968b9a4d1cd9..7824cc364bc4 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -268,14 +268,7 @@ static int opal_imc_counters_probe(struct platform_device *pdev) domain = IMC_DOMAIN_THREAD; break; case IMC_TYPE_TRACE: - /* - * FIXME. Using trace_imc events to monitor application - * or KVM thread performance can cause a checkstop - * (system crash). - * Disable it for now. - */ - pr_info_once("IMC: disabling trace_imc PMU\n"); - domain = -1; + domain = IMC_DOMAIN_TRACE; break; default: pr_warn("IMC Unknown Device type \n"); From 7ee417497a29028502cf952f419ab2635f563d51 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Fri, 27 Mar 2020 20:26:23 +0000 Subject: [PATCH 43/65] powerpc/ps3: Remove duplicate error message Remove a duplicate memory allocation failure error message. Signed-off-by: Markus Elfring Signed-off-by: Geoff Levand Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1bc5a16a22c487c478a204ebb7b80a22d2ad9cd0.1585340156.git.geoff@infradead.org --- arch/powerpc/platforms/ps3/os-area.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/ps3/os-area.c b/arch/powerpc/platforms/ps3/os-area.c index cbddd63caf2d..e8530371aed6 100644 --- a/arch/powerpc/platforms/ps3/os-area.c +++ b/arch/powerpc/platforms/ps3/os-area.c @@ -613,10 +613,8 @@ static int update_flash_db(void) /* Read in header and db from flash. 
*/ header = kmalloc(buf_len, GFP_KERNEL); - if (!header) { - pr_debug("%s: kmalloc failed\n", __func__); + if (!header) return -ENOMEM; - } count = os_area_flash_read(header, buf_len, 0); if (count < 0) { From 96efbab92cccbe3434501e5a77cbaa01c5bc2767 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 27 Mar 2020 20:26:23 +0000 Subject: [PATCH 44/65] powerpc/ps3: Remove an unneeded NULL check Static checkers don't like the inconsistent NULL checking on "ops". This function is only called once and "ops" isn't NULL so the check can be removed. Signed-off-by: Dan Carpenter Signed-off-by: Geoff Levand Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/ddc3513dc54d15456692c80df49287fe3babe40a.1585340156.git.geoff@infradead.org --- drivers/ps3/sys-manager-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ps3/sys-manager-core.c b/drivers/ps3/sys-manager-core.c index 24709c572c0c..e061b7d0632b 100644 --- a/drivers/ps3/sys-manager-core.c +++ b/drivers/ps3/sys-manager-core.c @@ -31,7 +31,7 @@ void ps3_sys_manager_register_ops(const struct ps3_sys_manager_ops *ops) { BUG_ON(!ops); BUG_ON(!ops->dev); - ps3_sys_manager_ops = ops ? *ops : ps3_sys_manager_ops; + ps3_sys_manager_ops = *ops; } EXPORT_SYMBOL_GPL(ps3_sys_manager_register_ops); From d3883fa0784832bb65df019ae6a291a87d146fb1 Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Fri, 27 Mar 2020 20:26:23 +0000 Subject: [PATCH 45/65] powerpc/ps3: Set CONFIG_UEVENT_HELPER=y in ps3_defconfig Set CONFIG_UEVENT_HELPER=y in ps3_defconfig. commit 1be01d4a57142ded23bdb9e0c8d9369e693b26cc (driver: base: Disable CONFIG_UEVENT_HELPER by default) disabled the CONFIG_UEVENT_HELPER option that is needed for hotplug and module loading by most older 32bit powerpc distributions that users typically install on the PS3. 
Signed-off-by: Geoff Levand Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/410cda9aa1a6e04434dfe1f9aa2103d0694f706c.1585340156.git.geoff@infradead.org --- arch/powerpc/configs/ps3_defconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/configs/ps3_defconfig b/arch/powerpc/configs/ps3_defconfig index 4db51719342a..81b55c880fc3 100644 --- a/arch/powerpc/configs/ps3_defconfig +++ b/arch/powerpc/configs/ps3_defconfig @@ -60,6 +60,8 @@ CONFIG_CFG80211=m CONFIG_CFG80211_WEXT=y CONFIG_MAC80211=m # CONFIG_MAC80211_RC_MINSTREL is not set +CONFIG_UEVENT_HELPER=y +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=65535 From 9e62ccec3ba0a17c8050ea78500dfdd0e4c5c0cc Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Fri, 20 Mar 2020 11:20:12 +0100 Subject: [PATCH 46/65] powerpc: Add back __ARCH_WANT_SYS_LLSEEK macro This partially reverts commit caf6f9c8a326 ("asm-generic: Remove unneeded __ARCH_WANT_SYS_LLSEEK macro") When CONFIG_COMPAT is disabled on ppc64 the kernel does not build. There is resistance to both removing the llseek syscall from the 64bit syscall tables and building the llseek interface unconditionally. 
Signed-off-by: Michal Suchanek Reviewed-by: Arnd Bergmann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/lkml/20190828151552.GA16855@infradead.org/ Link: https://lore.kernel.org/lkml/20190829214319.498c7de2@naga/ Link: https://lore.kernel.org/r/dd4575c51e31766e87f7e7fa121d099ab78d3290.1584699455.git.msuchanek@suse.de --- arch/powerpc/include/asm/unistd.h | 1 + fs/read_write.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index b0720c7c3fcf..700fcdac2e3c 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -31,6 +31,7 @@ #define __ARCH_WANT_SYS_SOCKETCALL #define __ARCH_WANT_SYS_FADVISE64 #define __ARCH_WANT_SYS_GETPGRP +#define __ARCH_WANT_SYS_LLSEEK #define __ARCH_WANT_SYS_NICE #define __ARCH_WANT_SYS_OLD_GETRLIMIT #define __ARCH_WANT_SYS_OLD_UNAME diff --git a/fs/read_write.c b/fs/read_write.c index 59d819c5b92e..bbfa9b12b15e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -331,7 +331,8 @@ COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned i } #endif -#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) +#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \ + defined(__ARCH_WANT_SYS_LLSEEK) SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, unsigned long, offset_low, loff_t __user *, result, unsigned int, whence) From 3dd4eb83a9c08ed6af482a5417323a6c8f4fc7a7 Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Fri, 20 Mar 2020 11:20:13 +0100 Subject: [PATCH 47/65] powerpc: move common register copy functions from signal_32.c to signal.c These functions are required for 64bit as well. 
Signed-off-by: Michal Suchanek Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/9fd6d9b7c5e91fab21159fe23534a2f16b4962d3.1584699455.git.msuchanek@suse.de --- arch/powerpc/kernel/signal.c | 141 ++++++++++++++++++++++++++++++++ arch/powerpc/kernel/signal_32.c | 140 ------------------------------- 2 files changed, 141 insertions(+), 140 deletions(-) diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index d215f9554553..4b0152108f61 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -18,12 +18,153 @@ #include #include #include +#include #include #include #include #include "signal.h" +#ifdef CONFIG_VSX +unsigned long copy_fpr_to_user(void __user *to, + struct task_struct *task) +{ + u64 buf[ELF_NFPREG]; + int i; + + /* save FPR copy to local buffer then write to the thread_struct */ + for (i = 0; i < (ELF_NFPREG - 1) ; i++) + buf[i] = task->thread.TS_FPR(i); + buf[i] = task->thread.fp_state.fpscr; + return __copy_to_user(to, buf, ELF_NFPREG * sizeof(double)); +} + +unsigned long copy_fpr_from_user(struct task_struct *task, + void __user *from) +{ + u64 buf[ELF_NFPREG]; + int i; + + if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double))) + return 1; + for (i = 0; i < (ELF_NFPREG - 1) ; i++) + task->thread.TS_FPR(i) = buf[i]; + task->thread.fp_state.fpscr = buf[i]; + + return 0; +} + +unsigned long copy_vsx_to_user(void __user *to, + struct task_struct *task) +{ + u64 buf[ELF_NVSRHALFREG]; + int i; + + /* save FPR copy to local buffer then write to the thread_struct */ + for (i = 0; i < ELF_NVSRHALFREG; i++) + buf[i] = task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET]; + return __copy_to_user(to, buf, ELF_NVSRHALFREG * sizeof(double)); +} + +unsigned long copy_vsx_from_user(struct task_struct *task, + void __user *from) +{ + u64 buf[ELF_NVSRHALFREG]; + int i; + + if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double))) + return 1; + for (i = 0; i < ELF_NVSRHALFREG ; 
i++) + task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i]; + return 0; +} + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +unsigned long copy_ckfpr_to_user(void __user *to, + struct task_struct *task) +{ + u64 buf[ELF_NFPREG]; + int i; + + /* save FPR copy to local buffer then write to the thread_struct */ + for (i = 0; i < (ELF_NFPREG - 1) ; i++) + buf[i] = task->thread.TS_CKFPR(i); + buf[i] = task->thread.ckfp_state.fpscr; + return __copy_to_user(to, buf, ELF_NFPREG * sizeof(double)); +} + +unsigned long copy_ckfpr_from_user(struct task_struct *task, + void __user *from) +{ + u64 buf[ELF_NFPREG]; + int i; + + if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double))) + return 1; + for (i = 0; i < (ELF_NFPREG - 1) ; i++) + task->thread.TS_CKFPR(i) = buf[i]; + task->thread.ckfp_state.fpscr = buf[i]; + + return 0; +} + +unsigned long copy_ckvsx_to_user(void __user *to, + struct task_struct *task) +{ + u64 buf[ELF_NVSRHALFREG]; + int i; + + /* save FPR copy to local buffer then write to the thread_struct */ + for (i = 0; i < ELF_NVSRHALFREG; i++) + buf[i] = task->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET]; + return __copy_to_user(to, buf, ELF_NVSRHALFREG * sizeof(double)); +} + +unsigned long copy_ckvsx_from_user(struct task_struct *task, + void __user *from) +{ + u64 buf[ELF_NVSRHALFREG]; + int i; + + if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double))) + return 1; + for (i = 0; i < ELF_NVSRHALFREG ; i++) + task->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i]; + return 0; +} +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ +#else +inline unsigned long copy_fpr_to_user(void __user *to, + struct task_struct *task) +{ + return __copy_to_user(to, task->thread.fp_state.fpr, + ELF_NFPREG * sizeof(double)); +} + +inline unsigned long copy_fpr_from_user(struct task_struct *task, + void __user *from) +{ + return __copy_from_user(task->thread.fp_state.fpr, from, + ELF_NFPREG * sizeof(double)); +} + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +inline unsigned long 
copy_ckfpr_to_user(void __user *to, + struct task_struct *task) +{ + return __copy_to_user(to, task->thread.ckfp_state.fpr, + ELF_NFPREG * sizeof(double)); +} + +inline unsigned long copy_ckfpr_from_user(struct task_struct *task, + void __user *from) +{ + return __copy_from_user(task->thread.ckfp_state.fpr, from, + ELF_NFPREG * sizeof(double)); +} +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ +#endif + /* Log an error when sending an unhandled signal to a process. Controlled * through debug.exception-trace sysctl. */ diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 1b090a76b444..4f96d29a22bf 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -235,146 +235,6 @@ struct rt_sigframe { int abigap[56]; }; -#ifdef CONFIG_VSX -unsigned long copy_fpr_to_user(void __user *to, - struct task_struct *task) -{ - u64 buf[ELF_NFPREG]; - int i; - - /* save FPR copy to local buffer then write to the thread_struct */ - for (i = 0; i < (ELF_NFPREG - 1) ; i++) - buf[i] = task->thread.TS_FPR(i); - buf[i] = task->thread.fp_state.fpscr; - return __copy_to_user(to, buf, ELF_NFPREG * sizeof(double)); -} - -unsigned long copy_fpr_from_user(struct task_struct *task, - void __user *from) -{ - u64 buf[ELF_NFPREG]; - int i; - - if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double))) - return 1; - for (i = 0; i < (ELF_NFPREG - 1) ; i++) - task->thread.TS_FPR(i) = buf[i]; - task->thread.fp_state.fpscr = buf[i]; - - return 0; -} - -unsigned long copy_vsx_to_user(void __user *to, - struct task_struct *task) -{ - u64 buf[ELF_NVSRHALFREG]; - int i; - - /* save FPR copy to local buffer then write to the thread_struct */ - for (i = 0; i < ELF_NVSRHALFREG; i++) - buf[i] = task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET]; - return __copy_to_user(to, buf, ELF_NVSRHALFREG * sizeof(double)); -} - -unsigned long copy_vsx_from_user(struct task_struct *task, - void __user *from) -{ - u64 buf[ELF_NVSRHALFREG]; - int i; - - if 
(__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double))) - return 1; - for (i = 0; i < ELF_NVSRHALFREG ; i++) - task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i]; - return 0; -} - -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM -unsigned long copy_ckfpr_to_user(void __user *to, - struct task_struct *task) -{ - u64 buf[ELF_NFPREG]; - int i; - - /* save FPR copy to local buffer then write to the thread_struct */ - for (i = 0; i < (ELF_NFPREG - 1) ; i++) - buf[i] = task->thread.TS_CKFPR(i); - buf[i] = task->thread.ckfp_state.fpscr; - return __copy_to_user(to, buf, ELF_NFPREG * sizeof(double)); -} - -unsigned long copy_ckfpr_from_user(struct task_struct *task, - void __user *from) -{ - u64 buf[ELF_NFPREG]; - int i; - - if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double))) - return 1; - for (i = 0; i < (ELF_NFPREG - 1) ; i++) - task->thread.TS_CKFPR(i) = buf[i]; - task->thread.ckfp_state.fpscr = buf[i]; - - return 0; -} - -unsigned long copy_ckvsx_to_user(void __user *to, - struct task_struct *task) -{ - u64 buf[ELF_NVSRHALFREG]; - int i; - - /* save FPR copy to local buffer then write to the thread_struct */ - for (i = 0; i < ELF_NVSRHALFREG; i++) - buf[i] = task->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET]; - return __copy_to_user(to, buf, ELF_NVSRHALFREG * sizeof(double)); -} - -unsigned long copy_ckvsx_from_user(struct task_struct *task, - void __user *from) -{ - u64 buf[ELF_NVSRHALFREG]; - int i; - - if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double))) - return 1; - for (i = 0; i < ELF_NVSRHALFREG ; i++) - task->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i]; - return 0; -} -#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ -#else -inline unsigned long copy_fpr_to_user(void __user *to, - struct task_struct *task) -{ - return __copy_to_user(to, task->thread.fp_state.fpr, - ELF_NFPREG * sizeof(double)); -} - -inline unsigned long copy_fpr_from_user(struct task_struct *task, - void __user *from) -{ - return __copy_from_user(task->thread.fp_state.fpr, 
from, - ELF_NFPREG * sizeof(double)); -} - -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM -inline unsigned long copy_ckfpr_to_user(void __user *to, - struct task_struct *task) -{ - return __copy_to_user(to, task->thread.ckfp_state.fpr, - ELF_NFPREG * sizeof(double)); -} - -inline unsigned long copy_ckfpr_from_user(struct task_struct *task, - void __user *from) -{ - return __copy_from_user(task->thread.ckfp_state.fpr, from, - ELF_NFPREG * sizeof(double)); -} -#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ -#endif - /* * Save the current user registers on the user stack. * We only save the altivec/spe registers if the process has used From d6c19bdee2ba3d9426d31cb82d036212c3b2fb47 Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Fri, 20 Mar 2020 11:20:14 +0100 Subject: [PATCH 48/65] powerpc/perf: consolidate read_user_stack_32 There are two almost identical copies for 32bit and 64bit. The function is used only in 32bit code which will be split out in next patch so consolidate to one function. Signed-off-by: Michal Suchanek Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/0c21c919ed1296420199c78f7c3cfd29d3c7e909.1584699455.git.msuchanek@suse.de --- arch/powerpc/perf/callchain.c | 48 +++++++++++++++++------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index cbc251981209..c9a78c6e4361 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -161,18 +161,6 @@ static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret) return read_user_stack_slow(ptr, ret, 8); } -static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret) -{ - if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) || - ((unsigned long)ptr & 3)) - return -EFAULT; - - if (!probe_user_read(ret, ptr, sizeof(*ret))) - return 0; - - return read_user_stack_slow(ptr, ret, 4); -} - static inline int valid_user_sp(unsigned long sp, 
int is_64) { if (!sp || (sp & 7) || sp > (is_64 ? TASK_SIZE : 0x100000000UL) - 32) @@ -277,19 +265,9 @@ static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, } #else /* CONFIG_PPC64 */ -/* - * On 32-bit we just access the address and let hash_page create a - * HPTE if necessary, so there is no need to fall back to reading - * the page tables. Since this is called at interrupt level, - * do_page_fault() won't treat a DSI as a page fault. - */ -static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret) +static int read_user_stack_slow(void __user *ptr, void *buf, int nb) { - if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) || - ((unsigned long)ptr & 3)) - return -EFAULT; - - return probe_user_read(ret, ptr, sizeof(*ret)); + return 0; } static inline void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, @@ -312,6 +290,28 @@ static inline int valid_user_sp(unsigned long sp, int is_64) #endif /* CONFIG_PPC64 */ +/* + * On 32-bit we just access the address and let hash_page create a + * HPTE if necessary, so there is no need to fall back to reading + * the page tables. Since this is called at interrupt level, + * do_page_fault() won't treat a DSI as a page fault. + */ +static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret) +{ + int rc; + + if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) || + ((unsigned long)ptr & 3)) + return -EFAULT; + + rc = probe_user_read(ret, ptr, sizeof(*ret)); + + if (IS_ENABLED(CONFIG_PPC64) && rc) + return read_user_stack_slow(ptr, ret, 4); + + return rc; +} + /* * Layout for non-RT signal frames */ From 2910428106ebf23a9a2176cb751749edb2ce57e2 Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Fri, 20 Mar 2020 11:20:15 +0100 Subject: [PATCH 49/65] powerpc/perf: consolidate valid_user_sp -> invalid_user_sp Merge the 32bit and 64bit version. Halve the check constants on 32bit. Use STACK_TOP since it is defined. 
Passing is_64 is now redundant since is_32bit_task() is used to determine which callchain variant should be used. Use STACK_TOP and is_32bit_task() directly. This removes a page from the valid 32bit area on 64bit: #define TASK_SIZE_USER32 (0x0000000100000000UL - (1 * PAGE_SIZE)) #define STACK_TOP_USER32 TASK_SIZE_USER32 Change return value to bool. It is inverted by users anyway. Change to invalid_user_sp to avoid inverting the return value twice. Signed-off-by: Michal Suchanek Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/be8e40fc0737fb28ad08b198552dee7cac1c5ce2.1584699455.git.msuchanek@suse.de --- arch/powerpc/perf/callchain.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index c9a78c6e4361..001d0473a61f 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -102,6 +102,14 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re } } +static inline bool invalid_user_sp(unsigned long sp) +{ + unsigned long mask = is_32bit_task() ? 3 : 7; + unsigned long top = STACK_TOP - (is_32bit_task() ? 16 : 32); + + return (!sp || (sp & mask) || (sp > top)); +} + #ifdef CONFIG_PPC64 /* * On 64-bit we don't want to invoke hash_page on user addresses from @@ -161,13 +169,6 @@ static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret) return read_user_stack_slow(ptr, ret, 8); } -static inline int valid_user_sp(unsigned long sp, int is_64) -{ - if (!sp || (sp & 7) || sp > (is_64 ? TASK_SIZE : 0x100000000UL) - 32) - return 0; - return 1; -} - /* * 64-bit user processes use the same stack frame for RT and non-RT signals. 
*/ @@ -226,7 +227,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, while (entry->nr < entry->max_stack) { fp = (unsigned long __user *) sp; - if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp)) + if (invalid_user_sp(sp) || read_user_stack_64(fp, &next_sp)) return; if (level > 0 && read_user_stack_64(&fp[2], &next_ip)) return; @@ -275,13 +276,6 @@ static inline void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry { } -static inline int valid_user_sp(unsigned long sp, int is_64) -{ - if (!sp || (sp & 7) || sp > TASK_SIZE - 32) - return 0; - return 1; -} - #define __SIGNAL_FRAMESIZE32 __SIGNAL_FRAMESIZE #define sigcontext32 sigcontext #define mcontext32 mcontext @@ -423,7 +417,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry, while (entry->nr < entry->max_stack) { fp = (unsigned int __user *) (unsigned long) sp; - if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp)) + if (invalid_user_sp(sp) || read_user_stack_32(fp, &next_sp)) return; if (level > 0 && read_user_stack_32(&fp[1], &next_ip)) return; From 0a7601b6ffddec11d7cc0bc3264daf0159f5e1a6 Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Fri, 20 Mar 2020 11:20:16 +0100 Subject: [PATCH 50/65] powerpc/64: make buildable without CONFIG_COMPAT There are numerous references to 32bit functions in generic and 64bit code so ifdef them out. 
Signed-off-by: Michal Suchanek Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/e5619617020ef3a1f54f0c076e7d74cb9ec9f3bf.1584699455.git.msuchanek@suse.de --- arch/powerpc/include/asm/thread_info.h | 4 ++-- arch/powerpc/kernel/Makefile | 5 +++-- arch/powerpc/kernel/entry_64.S | 2 ++ arch/powerpc/kernel/ptrace/Makefile | 2 +- arch/powerpc/kernel/signal.c | 3 +-- arch/powerpc/kernel/syscall_64.c | 6 ++---- arch/powerpc/kernel/vdso.c | 3 ++- arch/powerpc/perf/callchain.c | 8 +++++++- 8 files changed, 20 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index a2270749b282..ca6c97025704 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -162,10 +162,10 @@ static inline bool test_thread_local_flags(unsigned int flags) return (ti->local_flags & flags) != 0; } -#ifdef CONFIG_PPC64 +#ifdef CONFIG_COMPAT #define is_32bit_task() (test_thread_flag(TIF_32BIT)) #else -#define is_32bit_task() (1) +#define is_32bit_task() (IS_ENABLED(CONFIG_PPC32)) #endif #if defined(CONFIG_PPC64) diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 570660efbb3d..1c4385852d3d 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -40,16 +40,17 @@ CFLAGS_btext.o += -DDISABLE_BRANCH_PROFILING endif obj-y := cputable.o syscalls.o \ - irq.o align.o signal_32.o pmc.o vdso.o \ + irq.o align.o signal_$(BITS).o pmc.o vdso.o \ process.o systbl.o idle.o \ signal.o sysfs.o cacheinfo.o time.o \ prom.o traps.o setup-common.o \ udbg.o misc.o io.o misc_$(BITS).o \ of_platform.o prom_parse.o obj-y += ptrace/ -obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o signal_64.o \ +obj-$(CONFIG_PPC64) += setup_64.o \ paca.o nvram_64.o firmware.o note.o \ syscall_64.o +obj-$(CONFIG_COMPAT) += sys_ppc32.o signal_32.o obj-$(CONFIG_VDSO32) += vdso32/ obj-$(CONFIG_PPC_WATCHDOG) += watchdog.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 63f0a4414618..9a1e5d636dea 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -52,8 +52,10 @@ SYS_CALL_TABLE: .tc sys_call_table[TC],sys_call_table +#ifdef CONFIG_COMPAT COMPAT_SYS_CALL_TABLE: .tc compat_sys_call_table[TC],compat_sys_call_table +#endif /* This value is used to mark exception frames on the stack. */ exception_marker: diff --git a/arch/powerpc/kernel/ptrace/Makefile b/arch/powerpc/kernel/ptrace/Makefile index e9d97c2d063e..c2f2402ebc8c 100644 --- a/arch/powerpc/kernel/ptrace/Makefile +++ b/arch/powerpc/kernel/ptrace/Makefile @@ -6,7 +6,7 @@ CFLAGS_ptrace-view.o += -DUTS_MACHINE='"$(UTS_MACHINE)"' obj-y += ptrace.o ptrace-view.o -obj-$(CONFIG_PPC64) += ptrace32.o +obj-$(CONFIG_COMPAT) += ptrace32.o obj-$(CONFIG_VSX) += ptrace-vsx.o ifneq ($(CONFIG_VSX),y) obj-y += ptrace-novsx.o diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index 4b0152108f61..a264989626fd 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -247,7 +247,6 @@ static void do_signal(struct task_struct *tsk) sigset_t *oldset = sigmask_to_save(); struct ksignal ksig = { .sig = 0 }; int ret; - int is32 = is_32bit_task(); BUG_ON(tsk != current); @@ -277,7 +276,7 @@ static void do_signal(struct task_struct *tsk) rseq_signal_deliver(&ksig, tsk->thread.regs); - if (is32) { + if (is_32bit_task()) { if (ksig.ka.sa.sa_flags & SA_SIGINFO) ret = handle_rt_signal32(&ksig, oldset, tsk); else diff --git a/arch/powerpc/kernel/syscall_64.c b/arch/powerpc/kernel/syscall_64.c index cf06eb443a80..c74295a7765b 100644 --- a/arch/powerpc/kernel/syscall_64.c +++ b/arch/powerpc/kernel/syscall_64.c @@ -22,7 +22,6 @@ notrace long system_call_exception(long r3, long r4, long r5, long r6, long r7, long r8, unsigned long r0, struct pt_regs *regs) { - unsigned long ti_flags; syscall_fn f; if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) @@ -60,8 +59,7 
@@ notrace long system_call_exception(long r3, long r4, long r5, local_irq_enable(); - ti_flags = current_thread_info()->flags; - if (unlikely(ti_flags & _TIF_SYSCALL_DOTRACE)) { + if (unlikely(current_thread_info()->flags & _TIF_SYSCALL_DOTRACE)) { /* * We use the return value of do_syscall_trace_enter() as the * syscall number. If the syscall was rejected for any reason @@ -86,7 +84,7 @@ notrace long system_call_exception(long r3, long r4, long r5, /* May be faster to do array_index_nospec? */ barrier_nospec(); - if (unlikely(ti_flags & _TIF_32BIT)) { + if (unlikely(is_32bit_task())) { f = (void *)compat_sys_call_table[r0]; r3 &= 0x00000000ffffffffULL; diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index d3b77c15f9ce..f38f26e844b6 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -651,7 +651,8 @@ static void __init vdso_setup_syscall_map(void) if (sys_call_table[i] != sys_ni_syscall) vdso_data->syscall_map_64[i >> 5] |= 0x80000000UL >> (i & 0x1f); - if (compat_sys_call_table[i] != sys_ni_syscall) + if (IS_ENABLED(CONFIG_COMPAT) && + compat_sys_call_table[i] != sys_ni_syscall) vdso_data->syscall_map_32[i >> 5] |= 0x80000000UL >> (i & 0x1f); #else /* CONFIG_PPC64 */ diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index 001d0473a61f..b5afd0bec4f8 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -15,7 +15,7 @@ #include #include #include -#ifdef CONFIG_PPC64 +#ifdef CONFIG_COMPAT #include "../kernel/ppc32.h" #endif #include @@ -284,6 +284,7 @@ static inline void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry #endif /* CONFIG_PPC64 */ +#if defined(CONFIG_PPC32) || defined(CONFIG_COMPAT) /* * On 32-bit we just access the address and let hash_page create a * HPTE if necessary, so there is no need to fall back to reading @@ -447,6 +448,11 @@ static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry, sp = next_sp; } } +#else /* 32bit */ 
+static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry, + struct pt_regs *regs) +{} +#endif /* 32bit */ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) From 6e944aed8859d3b2de32ddb86748db9aefa43667 Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Fri, 20 Mar 2020 11:20:17 +0100 Subject: [PATCH 51/65] powerpc/64: Make COMPAT user-selectable disabled on littleendian by default. On bigendian ppc64 it is common to have 32bit legacy binaries but much less so on littleendian. Signed-off-by: Michal Suchanek Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/41393d6e895b0d3a47ee62f8f51e1cf888ad6226.1584699455.git.msuchanek@suse.de --- arch/powerpc/Kconfig | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 497b7d0b2d7e..29d00b3959b9 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -264,8 +264,9 @@ config PANIC_TIMEOUT default 180 config COMPAT - bool - default y if PPC64 + bool "Enable support for 32bit binaries" + depends on PPC64 + default y if !CPU_LITTLE_ENDIAN select COMPAT_BINFMT_ELF select ARCH_WANT_OLD_COMPAT_IPC select COMPAT_OLD_SIGACTION From 7c0eda1a04340a1de09bdf6521853e3bc0637c3b Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Fri, 20 Mar 2020 11:20:18 +0100 Subject: [PATCH 52/65] powerpc/perf: split callchain.c by bitness Building callchain.c with !COMPAT proved quite ugly with all the defines. Splitting out the 32bit and 64bit parts looks better. No code change intended. 
Signed-off-by: Michal Suchanek Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/a20027bf1074935a7934ee2a6757c99ea047e70d.1584699455.git.msuchanek@suse.de --- arch/powerpc/perf/Makefile | 5 +- arch/powerpc/perf/callchain.c | 356 +------------------------------ arch/powerpc/perf/callchain.h | 19 ++ arch/powerpc/perf/callchain_32.c | 196 +++++++++++++++++ arch/powerpc/perf/callchain_64.c | 174 +++++++++++++++ 5 files changed, 394 insertions(+), 356 deletions(-) create mode 100644 arch/powerpc/perf/callchain.h create mode 100644 arch/powerpc/perf/callchain_32.c create mode 100644 arch/powerpc/perf/callchain_64.c diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile index c155dcbb8691..53d614e98537 100644 --- a/arch/powerpc/perf/Makefile +++ b/arch/powerpc/perf/Makefile @@ -1,6 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_PERF_EVENTS) += callchain.o perf_regs.o +obj-$(CONFIG_PERF_EVENTS) += callchain.o callchain_$(BITS).o perf_regs.o +ifdef CONFIG_COMPAT +obj-$(CONFIG_PERF_EVENTS) += callchain_32.o +endif obj-$(CONFIG_PPC_PERF_CTRS) += core-book3s.o bhrb.o obj64-$(CONFIG_PPC_PERF_CTRS) += ppc970-pmu.o power5-pmu.o \ diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index b5afd0bec4f8..dd5051015008 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -15,11 +15,9 @@ #include #include #include -#ifdef CONFIG_COMPAT -#include "../kernel/ppc32.h" -#endif #include +#include "callchain.h" /* * Is sp valid as the address of the next kernel stack frame after prev_sp? @@ -102,358 +100,6 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re } } -static inline bool invalid_user_sp(unsigned long sp) -{ - unsigned long mask = is_32bit_task() ? 3 : 7; - unsigned long top = STACK_TOP - (is_32bit_task() ? 
16 : 32); - - return (!sp || (sp & mask) || (sp > top)); -} - -#ifdef CONFIG_PPC64 -/* - * On 64-bit we don't want to invoke hash_page on user addresses from - * interrupt context, so if the access faults, we read the page tables - * to find which page (if any) is mapped and access it directly. - */ -static int read_user_stack_slow(void __user *ptr, void *buf, int nb) -{ - int ret = -EFAULT; - pgd_t *pgdir; - pte_t *ptep, pte; - unsigned shift; - unsigned long addr = (unsigned long) ptr; - unsigned long offset; - unsigned long pfn, flags; - void *kaddr; - - pgdir = current->mm->pgd; - if (!pgdir) - return -EFAULT; - - local_irq_save(flags); - ptep = find_current_mm_pte(pgdir, addr, NULL, &shift); - if (!ptep) - goto err_out; - if (!shift) - shift = PAGE_SHIFT; - - /* align address to page boundary */ - offset = addr & ((1UL << shift) - 1); - - pte = READ_ONCE(*ptep); - if (!pte_present(pte) || !pte_user(pte)) - goto err_out; - pfn = pte_pfn(pte); - if (!page_is_ram(pfn)) - goto err_out; - - /* no highmem to worry about here */ - kaddr = pfn_to_kaddr(pfn); - memcpy(buf, kaddr + offset, nb); - ret = 0; -err_out: - local_irq_restore(flags); - return ret; -} - -static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret) -{ - if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned long) || - ((unsigned long)ptr & 7)) - return -EFAULT; - - if (!probe_user_read(ret, ptr, sizeof(*ret))) - return 0; - - return read_user_stack_slow(ptr, ret, 8); -} - -/* - * 64-bit user processes use the same stack frame for RT and non-RT signals. 
- */ -struct signal_frame_64 { - char dummy[__SIGNAL_FRAMESIZE]; - struct ucontext uc; - unsigned long unused[2]; - unsigned int tramp[6]; - struct siginfo *pinfo; - void *puc; - struct siginfo info; - char abigap[288]; -}; - -static int is_sigreturn_64_address(unsigned long nip, unsigned long fp) -{ - if (nip == fp + offsetof(struct signal_frame_64, tramp)) - return 1; - if (vdso64_rt_sigtramp && current->mm->context.vdso_base && - nip == current->mm->context.vdso_base + vdso64_rt_sigtramp) - return 1; - return 0; -} - -/* - * Do some sanity checking on the signal frame pointed to by sp. - * We check the pinfo and puc pointers in the frame. - */ -static int sane_signal_64_frame(unsigned long sp) -{ - struct signal_frame_64 __user *sf; - unsigned long pinfo, puc; - - sf = (struct signal_frame_64 __user *) sp; - if (read_user_stack_64((unsigned long __user *) &sf->pinfo, &pinfo) || - read_user_stack_64((unsigned long __user *) &sf->puc, &puc)) - return 0; - return pinfo == (unsigned long) &sf->info && - puc == (unsigned long) &sf->uc; -} - -static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, - struct pt_regs *regs) -{ - unsigned long sp, next_sp; - unsigned long next_ip; - unsigned long lr; - long level = 0; - struct signal_frame_64 __user *sigframe; - unsigned long __user *fp, *uregs; - - next_ip = perf_instruction_pointer(regs); - lr = regs->link; - sp = regs->gpr[1]; - perf_callchain_store(entry, next_ip); - - while (entry->nr < entry->max_stack) { - fp = (unsigned long __user *) sp; - if (invalid_user_sp(sp) || read_user_stack_64(fp, &next_sp)) - return; - if (level > 0 && read_user_stack_64(&fp[2], &next_ip)) - return; - - /* - * Note: the next_sp - sp >= signal frame size check - * is true when next_sp < sp, which can happen when - * transitioning from an alternate signal stack to the - * normal stack. 
- */ - if (next_sp - sp >= sizeof(struct signal_frame_64) && - (is_sigreturn_64_address(next_ip, sp) || - (level <= 1 && is_sigreturn_64_address(lr, sp))) && - sane_signal_64_frame(sp)) { - /* - * This looks like an signal frame - */ - sigframe = (struct signal_frame_64 __user *) sp; - uregs = sigframe->uc.uc_mcontext.gp_regs; - if (read_user_stack_64(&uregs[PT_NIP], &next_ip) || - read_user_stack_64(&uregs[PT_LNK], &lr) || - read_user_stack_64(&uregs[PT_R1], &sp)) - return; - level = 0; - perf_callchain_store_context(entry, PERF_CONTEXT_USER); - perf_callchain_store(entry, next_ip); - continue; - } - - if (level == 0) - next_ip = lr; - perf_callchain_store(entry, next_ip); - ++level; - sp = next_sp; - } -} - -#else /* CONFIG_PPC64 */ -static int read_user_stack_slow(void __user *ptr, void *buf, int nb) -{ - return 0; -} - -static inline void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, - struct pt_regs *regs) -{ -} - -#define __SIGNAL_FRAMESIZE32 __SIGNAL_FRAMESIZE -#define sigcontext32 sigcontext -#define mcontext32 mcontext -#define ucontext32 ucontext -#define compat_siginfo_t struct siginfo - -#endif /* CONFIG_PPC64 */ - -#if defined(CONFIG_PPC32) || defined(CONFIG_COMPAT) -/* - * On 32-bit we just access the address and let hash_page create a - * HPTE if necessary, so there is no need to fall back to reading - * the page tables. Since this is called at interrupt level, - * do_page_fault() won't treat a DSI as a page fault. 
- */ -static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret) -{ - int rc; - - if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) || - ((unsigned long)ptr & 3)) - return -EFAULT; - - rc = probe_user_read(ret, ptr, sizeof(*ret)); - - if (IS_ENABLED(CONFIG_PPC64) && rc) - return read_user_stack_slow(ptr, ret, 4); - - return rc; -} - -/* - * Layout for non-RT signal frames - */ -struct signal_frame_32 { - char dummy[__SIGNAL_FRAMESIZE32]; - struct sigcontext32 sctx; - struct mcontext32 mctx; - int abigap[56]; -}; - -/* - * Layout for RT signal frames - */ -struct rt_signal_frame_32 { - char dummy[__SIGNAL_FRAMESIZE32 + 16]; - compat_siginfo_t info; - struct ucontext32 uc; - int abigap[56]; -}; - -static int is_sigreturn_32_address(unsigned int nip, unsigned int fp) -{ - if (nip == fp + offsetof(struct signal_frame_32, mctx.mc_pad)) - return 1; - if (vdso32_sigtramp && current->mm->context.vdso_base && - nip == current->mm->context.vdso_base + vdso32_sigtramp) - return 1; - return 0; -} - -static int is_rt_sigreturn_32_address(unsigned int nip, unsigned int fp) -{ - if (nip == fp + offsetof(struct rt_signal_frame_32, - uc.uc_mcontext.mc_pad)) - return 1; - if (vdso32_rt_sigtramp && current->mm->context.vdso_base && - nip == current->mm->context.vdso_base + vdso32_rt_sigtramp) - return 1; - return 0; -} - -static int sane_signal_32_frame(unsigned int sp) -{ - struct signal_frame_32 __user *sf; - unsigned int regs; - - sf = (struct signal_frame_32 __user *) (unsigned long) sp; - if (read_user_stack_32((unsigned int __user *) &sf->sctx.regs, ®s)) - return 0; - return regs == (unsigned long) &sf->mctx; -} - -static int sane_rt_signal_32_frame(unsigned int sp) -{ - struct rt_signal_frame_32 __user *sf; - unsigned int regs; - - sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp; - if (read_user_stack_32((unsigned int __user *) &sf->uc.uc_regs, ®s)) - return 0; - return regs == (unsigned long) &sf->uc.uc_mcontext; -} - -static unsigned int 
__user *signal_frame_32_regs(unsigned int sp, - unsigned int next_sp, unsigned int next_ip) -{ - struct mcontext32 __user *mctx = NULL; - struct signal_frame_32 __user *sf; - struct rt_signal_frame_32 __user *rt_sf; - - /* - * Note: the next_sp - sp >= signal frame size check - * is true when next_sp < sp, for example, when - * transitioning from an alternate signal stack to the - * normal stack. - */ - if (next_sp - sp >= sizeof(struct signal_frame_32) && - is_sigreturn_32_address(next_ip, sp) && - sane_signal_32_frame(sp)) { - sf = (struct signal_frame_32 __user *) (unsigned long) sp; - mctx = &sf->mctx; - } - - if (!mctx && next_sp - sp >= sizeof(struct rt_signal_frame_32) && - is_rt_sigreturn_32_address(next_ip, sp) && - sane_rt_signal_32_frame(sp)) { - rt_sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp; - mctx = &rt_sf->uc.uc_mcontext; - } - - if (!mctx) - return NULL; - return mctx->mc_gregs; -} - -static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry, - struct pt_regs *regs) -{ - unsigned int sp, next_sp; - unsigned int next_ip; - unsigned int lr; - long level = 0; - unsigned int __user *fp, *uregs; - - next_ip = perf_instruction_pointer(regs); - lr = regs->link; - sp = regs->gpr[1]; - perf_callchain_store(entry, next_ip); - - while (entry->nr < entry->max_stack) { - fp = (unsigned int __user *) (unsigned long) sp; - if (invalid_user_sp(sp) || read_user_stack_32(fp, &next_sp)) - return; - if (level > 0 && read_user_stack_32(&fp[1], &next_ip)) - return; - - uregs = signal_frame_32_regs(sp, next_sp, next_ip); - if (!uregs && level <= 1) - uregs = signal_frame_32_regs(sp, next_sp, lr); - if (uregs) { - /* - * This looks like an signal frame, so restart - * the stack trace with the values in it. 
- */ - if (read_user_stack_32(&uregs[PT_NIP], &next_ip) || - read_user_stack_32(&uregs[PT_LNK], &lr) || - read_user_stack_32(&uregs[PT_R1], &sp)) - return; - level = 0; - perf_callchain_store_context(entry, PERF_CONTEXT_USER); - perf_callchain_store(entry, next_ip); - continue; - } - - if (level == 0) - next_ip = lr; - perf_callchain_store(entry, next_ip); - ++level; - sp = next_sp; - } -} -#else /* 32bit */ -static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry, - struct pt_regs *regs) -{} -#endif /* 32bit */ - void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { diff --git a/arch/powerpc/perf/callchain.h b/arch/powerpc/perf/callchain.h new file mode 100644 index 000000000000..7a2cb9e1181a --- /dev/null +++ b/arch/powerpc/perf/callchain.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _POWERPC_PERF_CALLCHAIN_H +#define _POWERPC_PERF_CALLCHAIN_H + +int read_user_stack_slow(void __user *ptr, void *buf, int nb); +void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, + struct pt_regs *regs); +void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry, + struct pt_regs *regs); + +static inline bool invalid_user_sp(unsigned long sp) +{ + unsigned long mask = is_32bit_task() ? 3 : 7; + unsigned long top = STACK_TOP - (is_32bit_task() ? 16 : 32); + + return (!sp || (sp & mask) || (sp > top)); +} + +#endif /* _POWERPC_PERF_CALLCHAIN_H */ diff --git a/arch/powerpc/perf/callchain_32.c b/arch/powerpc/perf/callchain_32.c new file mode 100644 index 000000000000..8aa951003141 --- /dev/null +++ b/arch/powerpc/perf/callchain_32.c @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Performance counter callchain support - powerpc architecture code + * + * Copyright © 2009 Paul Mackerras, IBM Corporation. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "callchain.h" + +#ifdef CONFIG_PPC64 +#include "../kernel/ppc32.h" +#else /* CONFIG_PPC64 */ + +#define __SIGNAL_FRAMESIZE32 __SIGNAL_FRAMESIZE +#define sigcontext32 sigcontext +#define mcontext32 mcontext +#define ucontext32 ucontext +#define compat_siginfo_t struct siginfo + +#endif /* CONFIG_PPC64 */ + +/* + * On 32-bit we just access the address and let hash_page create a + * HPTE if necessary, so there is no need to fall back to reading + * the page tables. Since this is called at interrupt level, + * do_page_fault() won't treat a DSI as a page fault. + */ +static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret) +{ + int rc; + + if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) || + ((unsigned long)ptr & 3)) + return -EFAULT; + + rc = probe_user_read(ret, ptr, sizeof(*ret)); + + if (IS_ENABLED(CONFIG_PPC64) && rc) + return read_user_stack_slow(ptr, ret, 4); + + return rc; +} + +/* + * Layout for non-RT signal frames + */ +struct signal_frame_32 { + char dummy[__SIGNAL_FRAMESIZE32]; + struct sigcontext32 sctx; + struct mcontext32 mctx; + int abigap[56]; +}; + +/* + * Layout for RT signal frames + */ +struct rt_signal_frame_32 { + char dummy[__SIGNAL_FRAMESIZE32 + 16]; + compat_siginfo_t info; + struct ucontext32 uc; + int abigap[56]; +}; + +static int is_sigreturn_32_address(unsigned int nip, unsigned int fp) +{ + if (nip == fp + offsetof(struct signal_frame_32, mctx.mc_pad)) + return 1; + if (vdso32_sigtramp && current->mm->context.vdso_base && + nip == current->mm->context.vdso_base + vdso32_sigtramp) + return 1; + return 0; +} + +static int is_rt_sigreturn_32_address(unsigned int nip, unsigned int fp) +{ + if (nip == fp + offsetof(struct rt_signal_frame_32, + uc.uc_mcontext.mc_pad)) + return 1; + if (vdso32_rt_sigtramp && current->mm->context.vdso_base && + nip == current->mm->context.vdso_base + 
vdso32_rt_sigtramp) + return 1; + return 0; +} + +static int sane_signal_32_frame(unsigned int sp) +{ + struct signal_frame_32 __user *sf; + unsigned int regs; + + sf = (struct signal_frame_32 __user *) (unsigned long) sp; + if (read_user_stack_32((unsigned int __user *) &sf->sctx.regs, ®s)) + return 0; + return regs == (unsigned long) &sf->mctx; +} + +static int sane_rt_signal_32_frame(unsigned int sp) +{ + struct rt_signal_frame_32 __user *sf; + unsigned int regs; + + sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp; + if (read_user_stack_32((unsigned int __user *) &sf->uc.uc_regs, ®s)) + return 0; + return regs == (unsigned long) &sf->uc.uc_mcontext; +} + +static unsigned int __user *signal_frame_32_regs(unsigned int sp, + unsigned int next_sp, unsigned int next_ip) +{ + struct mcontext32 __user *mctx = NULL; + struct signal_frame_32 __user *sf; + struct rt_signal_frame_32 __user *rt_sf; + + /* + * Note: the next_sp - sp >= signal frame size check + * is true when next_sp < sp, for example, when + * transitioning from an alternate signal stack to the + * normal stack. 
+ */ + if (next_sp - sp >= sizeof(struct signal_frame_32) && + is_sigreturn_32_address(next_ip, sp) && + sane_signal_32_frame(sp)) { + sf = (struct signal_frame_32 __user *) (unsigned long) sp; + mctx = &sf->mctx; + } + + if (!mctx && next_sp - sp >= sizeof(struct rt_signal_frame_32) && + is_rt_sigreturn_32_address(next_ip, sp) && + sane_rt_signal_32_frame(sp)) { + rt_sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp; + mctx = &rt_sf->uc.uc_mcontext; + } + + if (!mctx) + return NULL; + return mctx->mc_gregs; +} + +void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry, + struct pt_regs *regs) +{ + unsigned int sp, next_sp; + unsigned int next_ip; + unsigned int lr; + long level = 0; + unsigned int __user *fp, *uregs; + + next_ip = perf_instruction_pointer(regs); + lr = regs->link; + sp = regs->gpr[1]; + perf_callchain_store(entry, next_ip); + + while (entry->nr < entry->max_stack) { + fp = (unsigned int __user *) (unsigned long) sp; + if (invalid_user_sp(sp) || read_user_stack_32(fp, &next_sp)) + return; + if (level > 0 && read_user_stack_32(&fp[1], &next_ip)) + return; + + uregs = signal_frame_32_regs(sp, next_sp, next_ip); + if (!uregs && level <= 1) + uregs = signal_frame_32_regs(sp, next_sp, lr); + if (uregs) { + /* + * This looks like an signal frame, so restart + * the stack trace with the values in it. 
+ */ + if (read_user_stack_32(&uregs[PT_NIP], &next_ip) || + read_user_stack_32(&uregs[PT_LNK], &lr) || + read_user_stack_32(&uregs[PT_R1], &sp)) + return; + level = 0; + perf_callchain_store_context(entry, PERF_CONTEXT_USER); + perf_callchain_store(entry, next_ip); + continue; + } + + if (level == 0) + next_ip = lr; + perf_callchain_store(entry, next_ip); + ++level; + sp = next_sp; + } +} diff --git a/arch/powerpc/perf/callchain_64.c b/arch/powerpc/perf/callchain_64.c new file mode 100644 index 000000000000..df1ffd8b20f2 --- /dev/null +++ b/arch/powerpc/perf/callchain_64.c @@ -0,0 +1,174 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Performance counter callchain support - powerpc architecture code + * + * Copyright © 2009 Paul Mackerras, IBM Corporation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "callchain.h" + +/* + * On 64-bit we don't want to invoke hash_page on user addresses from + * interrupt context, so if the access faults, we read the page tables + * to find which page (if any) is mapped and access it directly. 
+ */ +int read_user_stack_slow(void __user *ptr, void *buf, int nb) +{ + int ret = -EFAULT; + pgd_t *pgdir; + pte_t *ptep, pte; + unsigned int shift; + unsigned long addr = (unsigned long) ptr; + unsigned long offset; + unsigned long pfn, flags; + void *kaddr; + + pgdir = current->mm->pgd; + if (!pgdir) + return -EFAULT; + + local_irq_save(flags); + ptep = find_current_mm_pte(pgdir, addr, NULL, &shift); + if (!ptep) + goto err_out; + if (!shift) + shift = PAGE_SHIFT; + + /* align address to page boundary */ + offset = addr & ((1UL << shift) - 1); + + pte = READ_ONCE(*ptep); + if (!pte_present(pte) || !pte_user(pte)) + goto err_out; + pfn = pte_pfn(pte); + if (!page_is_ram(pfn)) + goto err_out; + + /* no highmem to worry about here */ + kaddr = pfn_to_kaddr(pfn); + memcpy(buf, kaddr + offset, nb); + ret = 0; +err_out: + local_irq_restore(flags); + return ret; +} + +static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret) +{ + if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned long) || + ((unsigned long)ptr & 7)) + return -EFAULT; + + if (!probe_user_read(ret, ptr, sizeof(*ret))) + return 0; + + return read_user_stack_slow(ptr, ret, 8); +} + +/* + * 64-bit user processes use the same stack frame for RT and non-RT signals. + */ +struct signal_frame_64 { + char dummy[__SIGNAL_FRAMESIZE]; + struct ucontext uc; + unsigned long unused[2]; + unsigned int tramp[6]; + struct siginfo *pinfo; + void *puc; + struct siginfo info; + char abigap[288]; +}; + +static int is_sigreturn_64_address(unsigned long nip, unsigned long fp) +{ + if (nip == fp + offsetof(struct signal_frame_64, tramp)) + return 1; + if (vdso64_rt_sigtramp && current->mm->context.vdso_base && + nip == current->mm->context.vdso_base + vdso64_rt_sigtramp) + return 1; + return 0; +} + +/* + * Do some sanity checking on the signal frame pointed to by sp. + * We check the pinfo and puc pointers in the frame. 
+ */ +static int sane_signal_64_frame(unsigned long sp) +{ + struct signal_frame_64 __user *sf; + unsigned long pinfo, puc; + + sf = (struct signal_frame_64 __user *) sp; + if (read_user_stack_64((unsigned long __user *) &sf->pinfo, &pinfo) || + read_user_stack_64((unsigned long __user *) &sf->puc, &puc)) + return 0; + return pinfo == (unsigned long) &sf->info && + puc == (unsigned long) &sf->uc; +} + +void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, + struct pt_regs *regs) +{ + unsigned long sp, next_sp; + unsigned long next_ip; + unsigned long lr; + long level = 0; + struct signal_frame_64 __user *sigframe; + unsigned long __user *fp, *uregs; + + next_ip = perf_instruction_pointer(regs); + lr = regs->link; + sp = regs->gpr[1]; + perf_callchain_store(entry, next_ip); + + while (entry->nr < entry->max_stack) { + fp = (unsigned long __user *) sp; + if (invalid_user_sp(sp) || read_user_stack_64(fp, &next_sp)) + return; + if (level > 0 && read_user_stack_64(&fp[2], &next_ip)) + return; + + /* + * Note: the next_sp - sp >= signal frame size check + * is true when next_sp < sp, which can happen when + * transitioning from an alternate signal stack to the + * normal stack. 
+ */ + if (next_sp - sp >= sizeof(struct signal_frame_64) && + (is_sigreturn_64_address(next_ip, sp) || + (level <= 1 && is_sigreturn_64_address(lr, sp))) && + sane_signal_64_frame(sp)) { + /* + * This looks like an signal frame + */ + sigframe = (struct signal_frame_64 __user *) sp; + uregs = sigframe->uc.uc_mcontext.gp_regs; + if (read_user_stack_64(&uregs[PT_NIP], &next_ip) || + read_user_stack_64(&uregs[PT_LNK], &lr) || + read_user_stack_64(&uregs[PT_R1], &sp)) + return; + level = 0; + perf_callchain_store_context(entry, PERF_CONTEXT_USER); + perf_callchain_store(entry, next_ip); + continue; + } + + if (level == 0) + next_ip = lr; + perf_callchain_store(entry, next_ip); + ++level; + sp = next_sp; + } +} From 54fc3c681ded9437e4548e2501dc1136b23cfa9a Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 31 Mar 2020 12:23:38 +1100 Subject: [PATCH 53/65] powerpc/pseries/ddw: Extend upper limit for huge DMA window for persistent memory Unlike normal memory ("memory" compatible type in the FDT), the persistent memory ("ibm,pmemory" in the FDT) can be mapped anywhere in the guest physical space and it can be used for DMA. In order to maintain 1:1 mapping via the huge DMA window, we need to know the maximum physical address at the time of the window setup. So far we've been looking at "memory" nodes but "ibm,pmemory" does not have fixed addresses and the persistent memory may be mapped afterwards. Since the persistent memory is still backed with page structs, use MAX_PHYSMEM_BITS as the upper limit. This effectively disables huge DMA window in LPAR under pHyp if persistent memory is present but this is the best we can do for the moment. 
Signed-off-by: Alexey Kardashevskiy Tested-by: Wen Xiong Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200331012338.23773-1-aik@ozlabs.ru --- arch/powerpc/platforms/pseries/iommu.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 2e0a8eab5588..6d47b4a3ce39 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -945,6 +945,15 @@ static phys_addr_t ddw_memory_hotplug_max(void) phys_addr_t max_addr = memory_hotplug_max(); struct device_node *memory; + /* + * The "ibm,pmemory" can appear anywhere in the address space. + * Assuming it is still backed by page structs, set the upper limit + * for the huge DMA window as MAX_PHYSMEM_BITS. + */ + if (of_find_node_by_type(NULL, "ibm,pmemory")) + return (sizeof(phys_addr_t) * 8 <= MAX_PHYSMEM_BITS) ? + (phys_addr_t) -1 : (1ULL << MAX_PHYSMEM_BITS); + for_each_node_by_type(memory, "memory") { unsigned long start, size; int n_mem_addr_cells, n_mem_size_cells, len; From 60083063b755e29685902609e024ecd0c4a1a7d9 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 13 Feb 2020 09:38:04 +0100 Subject: [PATCH 54/65] powerpc/time: Replace <linux/clk-provider.h> by <linux/of_clk.h> The PowerPC time code is not a clock provider, and just needs to call of_clk_init(). Hence it can include <linux/of_clk.h> instead of <linux/clk-provider.h>. Remove the #ifdef protecting the of_clk_init() call, as a stub is available for the !CONFIG_COMMON_CLK case.
Signed-off-by: Geert Uytterhoeven Reviewed-by: Stephen Boyd Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200213083804.24315-1-geert+renesas@glider.be --- arch/powerpc/kernel/time.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index bda9cb4a0a5f..2d9d3a3c61d6 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -50,7 +50,7 @@ #include #include #include -#include +#include #include #include #include @@ -1149,9 +1149,7 @@ void __init time_init(void) init_decrementer_clockevent(); tick_setup_hrtimer_broadcast(); -#ifdef CONFIG_COMMON_CLK of_clk_init(NULL); -#endif } /* From abc3fce76adbdfa8f87272c784b388cd20b46049 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 2 Apr 2020 22:04:01 +1000 Subject: [PATCH 55/65] Revert "powerpc/64: irq_work avoid interrupt when called with hardware irqs enabled" This reverts commit ebb37cf3ffd39fdb6ec5b07111f8bb2f11d92c5f. That commit does not play well with soft-masked irq state manipulations in idle, interrupt replay, and possibly others due to tracing code sometimes using irq_work_queue (e.g., in trace_hardirqs_on()). That can cause PACA_IRQ_DEC to become set when it is not expected, and be ignored or cleared or cause warnings. The net result seems to be missing an irq_work until the next timer interrupt in the worst case which is usually not going to be noticed, however it could be a long time if the tick is disabled, which is against the spirit of irq_work and might cause real problems. The idea is still solid, but it would need more work. It's not really clear if it would be worth added complexity, so revert this for now (not a straight revert, but replace with a comment explaining why we might see interrupts happening, and gives git blame something to find). 
Fixes: ebb37cf3ffd3 ("powerpc/64: irq_work avoid interrupt when called with hardware irqs enabled") Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200402120401.1115883-1-npiggin@gmail.com --- arch/powerpc/kernel/time.c | 44 +++++++++++--------------------------- 1 file changed, 13 insertions(+), 31 deletions(-) diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 2d9d3a3c61d6..6fcae436ae51 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -522,35 +522,6 @@ static inline void clear_irq_work_pending(void) "i" (offsetof(struct paca_struct, irq_work_pending))); } -void arch_irq_work_raise(void) -{ - preempt_disable(); - set_irq_work_pending_flag(); - /* - * Non-nmi code running with interrupts disabled will replay - * irq_happened before it re-enables interrupts, so setthe - * decrementer there instead of causing a hardware exception - * which would immediately hit the masked interrupt handler - * and have the net effect of setting the decrementer in - * irq_happened. - * - * NMI interrupts can not check this when they return, so the - * decrementer hardware exception is raised, which will fire - * when interrupts are next enabled. - * - * BookE does not support this yet, it must audit all NMI - * interrupt handlers to ensure they call nmi_enter() so this - * check would be correct. 
- */ - if (IS_ENABLED(CONFIG_BOOKE) || !irqs_disabled() || in_nmi()) { - set_dec(1); - } else { - hard_irq_disable(); - local_paca->irq_happened |= PACA_IRQ_DEC; - } - preempt_enable(); -} - #else /* 32-bit */ DEFINE_PER_CPU(u8, irq_work_pending); @@ -559,16 +530,27 @@ DEFINE_PER_CPU(u8, irq_work_pending); #define test_irq_work_pending() __this_cpu_read(irq_work_pending) #define clear_irq_work_pending() __this_cpu_write(irq_work_pending, 0) +#endif /* 32 vs 64 bit */ + void arch_irq_work_raise(void) { + /* + * 64-bit code that uses irq soft-mask can just cause an immediate + * interrupt here that gets soft masked, if this is called under + * local_irq_disable(). It might be possible to prevent that happening + * by noticing interrupts are disabled and setting decrementer pending + * to be replayed when irqs are enabled. The problem there is that + * tracing can call irq_work_raise, including in code that does low + * level manipulations of irq soft-mask state (e.g., trace_hardirqs_on) + * which could get tangled up if we're messing with the same state + * here. + */ preempt_disable(); set_irq_work_pending_flag(); set_dec(1); preempt_enable(); } -#endif /* 32 vs 64 bit */ - #else /* CONFIG_IRQ_WORK */ #define test_irq_work_pending() 0 From 956d705dd279f70d5a222375fa97b637d6e8c43d Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 16 Mar 2020 09:47:36 +0900 Subject: [PATCH 56/65] riscv: Unaligned load/store handling for M_MODE Add handlers for unaligned load and store traps that may be generated by applications. Code heavily inspired from the OpenSBI project. Handling of the unaligned access traps is suitable for applications compiled with or without compressed instructions and is independent of the kernel CONFIG_RISCV_ISA_C option value. 
Signed-off-by: Damien Le Moal Signed-off-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/Makefile | 2 +- arch/riscv/kernel/traps.c | 27 +- arch/riscv/kernel/traps_misaligned.c | 370 +++++++++++++++++++++++++++ 3 files changed, 395 insertions(+), 4 deletions(-) create mode 100644 arch/riscv/kernel/traps_misaligned.c diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index c121cc491eb8..1bad93f63dba 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -30,7 +30,7 @@ obj-y += cacheinfo.o obj-y += patch.o obj-$(CONFIG_MMU) += vdso.o vdso/ -obj-$(CONFIG_RISCV_M_MODE) += clint.o +obj-$(CONFIG_RISCV_M_MODE) += clint.o traps_misaligned.o obj-$(CONFIG_FPU) += fpu.o obj-$(CONFIG_SMP) += smpboot.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 23a57b92cd1d..29aef5cbaf65 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -97,12 +97,33 @@ DO_ERROR_INFO(do_trap_insn_fault, SIGSEGV, SEGV_ACCERR, "instruction access fault"); DO_ERROR_INFO(do_trap_insn_illegal, SIGILL, ILL_ILLOPC, "illegal instruction"); -DO_ERROR_INFO(do_trap_load_misaligned, - SIGBUS, BUS_ADRALN, "load address misaligned"); DO_ERROR_INFO(do_trap_load_fault, SIGSEGV, SEGV_ACCERR, "load access fault"); +#ifndef CONFIG_RISCV_M_MODE +DO_ERROR_INFO(do_trap_load_misaligned, + SIGBUS, BUS_ADRALN, "Oops - load address misaligned"); DO_ERROR_INFO(do_trap_store_misaligned, - SIGBUS, BUS_ADRALN, "store (or AMO) address misaligned"); + SIGBUS, BUS_ADRALN, "Oops - store (or AMO) address misaligned"); +#else +int handle_misaligned_load(struct pt_regs *regs); +int handle_misaligned_store(struct pt_regs *regs); + +asmlinkage void do_trap_load_misaligned(struct pt_regs *regs) +{ + if (!handle_misaligned_load(regs)) + return; + do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc, + "Oops - load address misaligned"); +} + +asmlinkage void do_trap_store_misaligned(struct pt_regs *regs) +{ + if 
(!handle_misaligned_store(regs)) + return; + do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc, + "Oops - store (or AMO) address misaligned"); +} +#endif DO_ERROR_INFO(do_trap_store_fault, SIGSEGV, SEGV_ACCERR, "store (or AMO) access fault"); DO_ERROR_INFO(do_trap_ecall_u, diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c new file mode 100644 index 000000000000..46c4dafe3ba0 --- /dev/null +++ b/arch/riscv/kernel/traps_misaligned.c @@ -0,0 +1,370 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + */ +#include +#include +#include +#include +#include + +#include +#include +#include + +#define INSN_MATCH_LB 0x3 +#define INSN_MASK_LB 0x707f +#define INSN_MATCH_LH 0x1003 +#define INSN_MASK_LH 0x707f +#define INSN_MATCH_LW 0x2003 +#define INSN_MASK_LW 0x707f +#define INSN_MATCH_LD 0x3003 +#define INSN_MASK_LD 0x707f +#define INSN_MATCH_LBU 0x4003 +#define INSN_MASK_LBU 0x707f +#define INSN_MATCH_LHU 0x5003 +#define INSN_MASK_LHU 0x707f +#define INSN_MATCH_LWU 0x6003 +#define INSN_MASK_LWU 0x707f +#define INSN_MATCH_SB 0x23 +#define INSN_MASK_SB 0x707f +#define INSN_MATCH_SH 0x1023 +#define INSN_MASK_SH 0x707f +#define INSN_MATCH_SW 0x2023 +#define INSN_MASK_SW 0x707f +#define INSN_MATCH_SD 0x3023 +#define INSN_MASK_SD 0x707f + +#define INSN_MATCH_FLW 0x2007 +#define INSN_MASK_FLW 0x707f +#define INSN_MATCH_FLD 0x3007 +#define INSN_MASK_FLD 0x707f +#define INSN_MATCH_FLQ 0x4007 +#define INSN_MASK_FLQ 0x707f +#define INSN_MATCH_FSW 0x2027 +#define INSN_MASK_FSW 0x707f +#define INSN_MATCH_FSD 0x3027 +#define INSN_MASK_FSD 0x707f +#define INSN_MATCH_FSQ 0x4027 +#define INSN_MASK_FSQ 0x707f + +#define INSN_MATCH_C_LD 0x6000 +#define INSN_MASK_C_LD 0xe003 +#define INSN_MATCH_C_SD 0xe000 +#define INSN_MASK_C_SD 0xe003 +#define INSN_MATCH_C_LW 0x4000 +#define INSN_MASK_C_LW 0xe003 +#define INSN_MATCH_C_SW 0xc000 +#define INSN_MASK_C_SW 0xe003 +#define INSN_MATCH_C_LDSP 
0x6002 +#define INSN_MASK_C_LDSP 0xe003 +#define INSN_MATCH_C_SDSP 0xe002 +#define INSN_MASK_C_SDSP 0xe003 +#define INSN_MATCH_C_LWSP 0x4002 +#define INSN_MASK_C_LWSP 0xe003 +#define INSN_MATCH_C_SWSP 0xc002 +#define INSN_MASK_C_SWSP 0xe003 + +#define INSN_MATCH_C_FLD 0x2000 +#define INSN_MASK_C_FLD 0xe003 +#define INSN_MATCH_C_FLW 0x6000 +#define INSN_MASK_C_FLW 0xe003 +#define INSN_MATCH_C_FSD 0xa000 +#define INSN_MASK_C_FSD 0xe003 +#define INSN_MATCH_C_FSW 0xe000 +#define INSN_MASK_C_FSW 0xe003 +#define INSN_MATCH_C_FLDSP 0x2002 +#define INSN_MASK_C_FLDSP 0xe003 +#define INSN_MATCH_C_FSDSP 0xa002 +#define INSN_MASK_C_FSDSP 0xe003 +#define INSN_MATCH_C_FLWSP 0x6002 +#define INSN_MASK_C_FLWSP 0xe003 +#define INSN_MATCH_C_FSWSP 0xe002 +#define INSN_MASK_C_FSWSP 0xe003 + +#define INSN_LEN(insn) ((((insn) & 0x3) < 0x3) ? 2 : 4) + +#if defined(CONFIG_64BIT) +#define LOG_REGBYTES 3 +#define XLEN 64 +#else +#define LOG_REGBYTES 2 +#define XLEN 32 +#endif +#define REGBYTES (1 << LOG_REGBYTES) +#define XLEN_MINUS_16 ((XLEN) - 16) + +#define SH_RD 7 +#define SH_RS1 15 +#define SH_RS2 20 +#define SH_RS2C 2 + +#define RV_X(x, s, n) (((x) >> (s)) & ((1 << (n)) - 1)) +#define RVC_LW_IMM(x) ((RV_X(x, 6, 1) << 2) | \ + (RV_X(x, 10, 3) << 3) | \ + (RV_X(x, 5, 1) << 6)) +#define RVC_LD_IMM(x) ((RV_X(x, 10, 3) << 3) | \ + (RV_X(x, 5, 2) << 6)) +#define RVC_LWSP_IMM(x) ((RV_X(x, 4, 3) << 2) | \ + (RV_X(x, 12, 1) << 5) | \ + (RV_X(x, 2, 2) << 6)) +#define RVC_LDSP_IMM(x) ((RV_X(x, 5, 2) << 3) | \ + (RV_X(x, 12, 1) << 5) | \ + (RV_X(x, 2, 3) << 6)) +#define RVC_SWSP_IMM(x) ((RV_X(x, 9, 4) << 2) | \ + (RV_X(x, 7, 2) << 6)) +#define RVC_SDSP_IMM(x) ((RV_X(x, 10, 3) << 3) | \ + (RV_X(x, 7, 3) << 6)) +#define RVC_RS1S(insn) (8 + RV_X(insn, SH_RD, 3)) +#define RVC_RS2S(insn) (8 + RV_X(insn, SH_RS2C, 3)) +#define RVC_RS2(insn) RV_X(insn, SH_RS2C, 5) + +#define SHIFT_RIGHT(x, y) \ + ((y) < 0 ? 
((x) << -(y)) : ((x) >> (y))) + +#define REG_MASK \ + ((1 << (5 + LOG_REGBYTES)) - (1 << LOG_REGBYTES)) + +#define REG_OFFSET(insn, pos) \ + (SHIFT_RIGHT((insn), (pos) - LOG_REGBYTES) & REG_MASK) + +#define REG_PTR(insn, pos, regs) \ + (ulong *)((ulong)(regs) + REG_OFFSET(insn, pos)) + +#define GET_RM(insn) (((insn) >> 12) & 7) + +#define GET_RS1(insn, regs) (*REG_PTR(insn, SH_RS1, regs)) +#define GET_RS2(insn, regs) (*REG_PTR(insn, SH_RS2, regs)) +#define GET_RS1S(insn, regs) (*REG_PTR(RVC_RS1S(insn), 0, regs)) +#define GET_RS2S(insn, regs) (*REG_PTR(RVC_RS2S(insn), 0, regs)) +#define GET_RS2C(insn, regs) (*REG_PTR(insn, SH_RS2C, regs)) +#define GET_SP(regs) (*REG_PTR(2, 0, regs)) +#define SET_RD(insn, regs, val) (*REG_PTR(insn, SH_RD, regs) = (val)) +#define IMM_I(insn) ((s32)(insn) >> 20) +#define IMM_S(insn) (((s32)(insn) >> 25 << 5) | \ + (s32)(((insn) >> 7) & 0x1f)) +#define MASK_FUNCT3 0x7000 + +#define GET_PRECISION(insn) (((insn) >> 25) & 3) +#define GET_RM(insn) (((insn) >> 12) & 7) +#define PRECISION_S 0 +#define PRECISION_D 1 + +#define STR(x) XSTR(x) +#define XSTR(x) #x + +#define DECLARE_UNPRIVILEGED_LOAD_FUNCTION(type, insn) \ +static inline type load_##type(const type *addr) \ +{ \ + type val; \ + asm (#insn " %0, %1" \ + : "=&r" (val) : "m" (*addr)); \ + return val; \ +} + +#define DECLARE_UNPRIVILEGED_STORE_FUNCTION(type, insn) \ +static inline void store_##type(type *addr, type val) \ +{ \ + asm volatile (#insn " %0, %1\n" \ + : : "r" (val), "m" (*addr)); \ +} + +DECLARE_UNPRIVILEGED_LOAD_FUNCTION(u8, lbu) +DECLARE_UNPRIVILEGED_LOAD_FUNCTION(u16, lhu) +DECLARE_UNPRIVILEGED_LOAD_FUNCTION(s8, lb) +DECLARE_UNPRIVILEGED_LOAD_FUNCTION(s16, lh) +DECLARE_UNPRIVILEGED_LOAD_FUNCTION(s32, lw) +DECLARE_UNPRIVILEGED_STORE_FUNCTION(u8, sb) +DECLARE_UNPRIVILEGED_STORE_FUNCTION(u16, sh) +DECLARE_UNPRIVILEGED_STORE_FUNCTION(u32, sw) +#if defined(CONFIG_64BIT) +DECLARE_UNPRIVILEGED_LOAD_FUNCTION(u32, lwu) +DECLARE_UNPRIVILEGED_LOAD_FUNCTION(u64, ld) 
+DECLARE_UNPRIVILEGED_STORE_FUNCTION(u64, sd) +DECLARE_UNPRIVILEGED_LOAD_FUNCTION(ulong, ld) +#else +DECLARE_UNPRIVILEGED_LOAD_FUNCTION(u32, lw) +DECLARE_UNPRIVILEGED_LOAD_FUNCTION(ulong, lw) + +static inline u64 load_u64(const u64 *addr) +{ + return load_u32((u32 *)addr) + + ((u64)load_u32((u32 *)addr + 1) << 32); +} + +static inline void store_u64(u64 *addr, u64 val) +{ + store_u32((u32 *)addr, val); + store_u32((u32 *)addr + 1, val >> 32); +} +#endif + +static inline ulong get_insn(ulong mepc) +{ + register ulong __mepc asm ("a2") = mepc; + ulong val, rvc_mask = 3, tmp; + + asm ("and %[tmp], %[addr], 2\n" + "bnez %[tmp], 1f\n" +#if defined(CONFIG_64BIT) + STR(LWU) " %[insn], (%[addr])\n" +#else + STR(LW) " %[insn], (%[addr])\n" +#endif + "and %[tmp], %[insn], %[rvc_mask]\n" + "beq %[tmp], %[rvc_mask], 2f\n" + "sll %[insn], %[insn], %[xlen_minus_16]\n" + "srl %[insn], %[insn], %[xlen_minus_16]\n" + "j 2f\n" + "1:\n" + "lhu %[insn], (%[addr])\n" + "and %[tmp], %[insn], %[rvc_mask]\n" + "bne %[tmp], %[rvc_mask], 2f\n" + "lhu %[tmp], 2(%[addr])\n" + "sll %[tmp], %[tmp], 16\n" + "add %[insn], %[insn], %[tmp]\n" + "2:" + : [insn] "=&r" (val), [tmp] "=&r" (tmp) + : [addr] "r" (__mepc), [rvc_mask] "r" (rvc_mask), + [xlen_minus_16] "i" (XLEN_MINUS_16)); + + return val; +} + +union reg_data { + u8 data_bytes[8]; + ulong data_ulong; + u64 data_u64; +}; + +int handle_misaligned_load(struct pt_regs *regs) +{ + union reg_data val; + unsigned long epc = regs->epc; + unsigned long insn = get_insn(epc); + unsigned long addr = csr_read(mtval); + int i, fp = 0, shift = 0, len = 0; + + regs->epc = 0; + + if ((insn & INSN_MASK_LW) == INSN_MATCH_LW) { + len = 4; + shift = 8 * (sizeof(unsigned long) - len); +#if defined(CONFIG_64BIT) + } else if ((insn & INSN_MASK_LD) == INSN_MATCH_LD) { + len = 8; + shift = 8 * (sizeof(unsigned long) - len); + } else if ((insn & INSN_MASK_LWU) == INSN_MATCH_LWU) { + len = 4; +#endif + } else if ((insn & INSN_MASK_FLD) == INSN_MATCH_FLD) { + fp = 1; + 
len = 8; + } else if ((insn & INSN_MASK_FLW) == INSN_MATCH_FLW) { + fp = 1; + len = 4; + } else if ((insn & INSN_MASK_LH) == INSN_MATCH_LH) { + len = 2; + shift = 8 * (sizeof(unsigned long) - len); + } else if ((insn & INSN_MASK_LHU) == INSN_MATCH_LHU) { + len = 2; +#if defined(CONFIG_64BIT) + } else if ((insn & INSN_MASK_C_LD) == INSN_MATCH_C_LD) { + len = 8; + shift = 8 * (sizeof(unsigned long) - len); + insn = RVC_RS2S(insn) << SH_RD; + } else if ((insn & INSN_MASK_C_LDSP) == INSN_MATCH_C_LDSP && + ((insn >> SH_RD) & 0x1f)) { + len = 8; + shift = 8 * (sizeof(unsigned long) - len); +#endif + } else if ((insn & INSN_MASK_C_LW) == INSN_MATCH_C_LW) { + len = 4; + shift = 8 * (sizeof(unsigned long) - len); + insn = RVC_RS2S(insn) << SH_RD; + } else if ((insn & INSN_MASK_C_LWSP) == INSN_MATCH_C_LWSP && + ((insn >> SH_RD) & 0x1f)) { + len = 4; + shift = 8 * (sizeof(unsigned long) - len); + } else if ((insn & INSN_MASK_C_FLD) == INSN_MATCH_C_FLD) { + fp = 1; + len = 8; + insn = RVC_RS2S(insn) << SH_RD; + } else if ((insn & INSN_MASK_C_FLDSP) == INSN_MATCH_C_FLDSP) { + fp = 1; + len = 8; +#if defined(CONFIG_32BIT) + } else if ((insn & INSN_MASK_C_FLW) == INSN_MATCH_C_FLW) { + fp = 1; + len = 4; + insn = RVC_RS2S(insn) << SH_RD; + } else if ((insn & INSN_MASK_C_FLWSP) == INSN_MATCH_C_FLWSP) { + fp = 1; + len = 4; +#endif + } else { + regs->epc = epc; + return -1; + } + + val.data_u64 = 0; + for (i = 0; i < len; i++) + val.data_bytes[i] = load_u8((void *)(addr + i)); + + if (fp) + return -1; + SET_RD(insn, regs, val.data_ulong << shift >> shift); + + regs->epc = epc + INSN_LEN(insn); + + return 0; +} + +int handle_misaligned_store(struct pt_regs *regs) +{ + union reg_data val; + unsigned long epc = regs->epc; + unsigned long insn = get_insn(epc); + unsigned long addr = csr_read(mtval); + int i, len = 0; + + regs->epc = 0; + + val.data_ulong = GET_RS2(insn, regs); + + if ((insn & INSN_MASK_SW) == INSN_MATCH_SW) { + len = 4; +#if defined(CONFIG_64BIT) + } else if ((insn & 
INSN_MASK_SD) == INSN_MATCH_SD) { + len = 8; +#endif + } else if ((insn & INSN_MASK_SH) == INSN_MATCH_SH) { + len = 2; +#if defined(CONFIG_64BIT) + } else if ((insn & INSN_MASK_C_SD) == INSN_MATCH_C_SD) { + len = 8; + val.data_ulong = GET_RS2S(insn, regs); + } else if ((insn & INSN_MASK_C_SDSP) == INSN_MATCH_C_SDSP && + ((insn >> SH_RD) & 0x1f)) { + len = 8; + val.data_ulong = GET_RS2C(insn, regs); +#endif + } else if ((insn & INSN_MASK_C_SW) == INSN_MATCH_C_SW) { + len = 4; + val.data_ulong = GET_RS2S(insn, regs); + } else if ((insn & INSN_MASK_C_SWSP) == INSN_MATCH_C_SWSP && + ((insn >> SH_RD) & 0x1f)) { + len = 4; + val.data_ulong = GET_RS2C(insn, regs); + } else { + regs->epc = epc; + return -1; + } + + for (i = 0; i < len; i++) + store_u8((void *)(addr + i), val.data_bytes[i]); + + regs->epc = epc + INSN_LEN(insn); + + return 0; +} From 335b139057ef79dbede01dea6e8c3f47c2b88802 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 16 Mar 2020 09:47:38 +0900 Subject: [PATCH 57/65] riscv: Add SOC early init support Add a mechanism for early SoC initialization for platforms that need additional hardware initialization not possible through the regular device tree and drivers mechanism. With this, a SoC specific initialization function can be called very early, before DTB parsing is done by parse_dtb() in Linux RISC-V kernel setup code. This can be very useful for early hardware initialization for No-MMU kernels booted directly in M-mode because it is quite likely that no other booting stage exist prior to the No-MMU kernel. Example use of a SoC early initialization is as follows: static void vendor_abc_early_init(const void *fdt) { /* * some early init code here that can use simple matches * against the flat device tree file. 
 */ } SOC_EARLY_INIT_DECLARE(vendor_abc, "vendor,abc", vendor_abc_early_init); This early initialization function is executed only if the flat device tree for the board has a 'compatible = "vendor,abc"' entry. Signed-off-by: Damien Le Moal Signed-off-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/soc.h | 23 +++++++++++++++++++++++ arch/riscv/kernel/Makefile | 1 + arch/riscv/kernel/head.S | 1 + arch/riscv/kernel/soc.c | 28 ++++++++++++++++++++++++++++ arch/riscv/kernel/vmlinux.lds.S | 6 ++++++ 5 files changed, 59 insertions(+) create mode 100644 arch/riscv/include/asm/soc.h create mode 100644 arch/riscv/kernel/soc.c diff --git a/arch/riscv/include/asm/soc.h b/arch/riscv/include/asm/soc.h new file mode 100644 index 000000000000..7cec1968c8b4 --- /dev/null +++ b/arch/riscv/include/asm/soc.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + */ + +#ifndef _ASM_RISCV_SOC_H +#define _ASM_RISCV_SOC_H + +#include +#include +#include + +#define SOC_EARLY_INIT_DECLARE(name, compat, fn) \ + static const struct of_device_id __soc_early_init__##name \ + __used __section(__soc_early_init_table) \ + = { .compatible = compat, .data = fn } + +void soc_early_init(void); + +extern unsigned long __soc_early_init_table_start; +extern unsigned long __soc_early_init_table_end; + +#endif diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 1bad93f63dba..86c83081044f 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -11,6 +11,7 @@ endif extra-y += head.o extra-y += vmlinux.lds +obj-y += soc.o obj-y += cpu.o obj-y += cpufeature.o obj-y += entry.o diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index e5115d5e0b3a..98a406474e7d 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -234,6 +234,7 @@ clear_bss_done: call kasan_early_init #endif /* Start the kernel */ + call soc_early_init call parse_dtb tail 
start_kernel diff --git a/arch/riscv/kernel/soc.c b/arch/riscv/kernel/soc.c new file mode 100644 index 000000000000..0b3b3dc9ad0f --- /dev/null +++ b/arch/riscv/kernel/soc.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + */ +#include +#include +#include +#include + +/* + * This is called extremely early, before parse_dtb(), to allow initializing + * SoC hardware before memory or any device driver initialization. + */ +void __init soc_early_init(void) +{ + void (*early_fn)(const void *fdt); + const struct of_device_id *s; + const void *fdt = dtb_early_va; + + for (s = (void *)&__soc_early_init_table_start; + (void *)s < (void *)&__soc_early_init_table_end; s++) { + if (!fdt_node_check_compatible(fdt, 0, s->compatible)) { + early_fn = s->data; + early_fn(fdt); + return; + } + } +} diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S index 435cd60dca04..0339b6bbe11a 100644 --- a/arch/riscv/kernel/vmlinux.lds.S +++ b/arch/riscv/kernel/vmlinux.lds.S @@ -28,6 +28,12 @@ SECTIONS __init_begin = .; INIT_TEXT_SECTION(PAGE_SIZE) INIT_DATA_SECTION(16) + . = ALIGN(8); + __soc_early_init_table : { + __soc_early_init_table_start = .; + KEEP(*(__soc_early_init_table)) + __soc_early_init_table_end = .; + } /* we have to discard exit text and such at runtime, not link time */ .exit.text : { From c48c4a4c7eadb76593965e2b956e4e2b23a4e388 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Mar 2020 09:47:39 +0900 Subject: [PATCH 58/65] riscv: Add Kendryte K210 SoC support Add support for the Kendryte K210 RISC-V SoC. For now, this support only provides a simple sysctl driver that allows setting up the CPU and uart clock. This support is enabled through the new Kconfig option SOC_KENDRYTE and defines the config option CONFIG_K210_SYSCTL to enable the K210 SoC sysctl driver compilation. 
The sysctl driver also registers an early SoC initialization function allowing enabling the general purpose use of the 2MB of SRAM normally reserved for the SoC AI engine. This initialization function is automatically called before the dt early initialization using the flat dt root node compatible property matching the value "kendryte,k210". Signed-off-by: Christoph Hellwig Signed-off-by: Damien Le Moal [Palmer: Add missing endmenu in Kconfig.socs] Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig.socs | 30 ++++ drivers/soc/Kconfig | 1 + drivers/soc/Makefile | 1 + drivers/soc/kendryte/Kconfig | 14 ++ drivers/soc/kendryte/Makefile | 3 + drivers/soc/kendryte/k210-sysctl.c | 248 +++++++++++++++++++++++++++++ 6 files changed, 297 insertions(+) create mode 100644 drivers/soc/kendryte/Kconfig create mode 100644 drivers/soc/kendryte/Makefile create mode 100644 drivers/soc/kendryte/k210-sysctl.c diff --git a/arch/riscv/Kconfig.socs b/arch/riscv/Kconfig.socs index d325b67d00df..69071578e181 100644 --- a/arch/riscv/Kconfig.socs +++ b/arch/riscv/Kconfig.socs @@ -10,4 +10,34 @@ config SOC_SIFIVE help This enables support for SiFive SoC platform hardware. +config SOC_VIRT + bool "QEMU Virt Machine" + select VIRTIO_PCI + select VIRTIO_BALLOON + select VIRTIO_MMIO + select VIRTIO_CONSOLE + select VIRTIO_NET + select NET_9P_VIRTIO + select VIRTIO_BLK + select SCSI_VIRTIO + select DRM_VIRTIO_GPU + select HW_RANDOM_VIRTIO + select RPMSG_CHAR + select RPMSG_VIRTIO + select CRYPTO_DEV_VIRTIO + select VIRTIO_INPUT + select POWER_RESET_SYSCON + select POWER_RESET_SYSCON_POWEROFF + select GOLDFISH + select RTC_DRV_GOLDFISH + select SIFIVE_PLIC + help + This enables support for QEMU Virt Machine. + +config SOC_KENDRYTE + bool "Kendryte K210 SoC" + depends on !MMU + help + This enables support for Kendryte K210 SoC platform hardware. 
+ endmenu diff --git a/drivers/soc/Kconfig b/drivers/soc/Kconfig index 1778f8c62861..425ab6f7e375 100644 --- a/drivers/soc/Kconfig +++ b/drivers/soc/Kconfig @@ -22,5 +22,6 @@ source "drivers/soc/ux500/Kconfig" source "drivers/soc/versatile/Kconfig" source "drivers/soc/xilinx/Kconfig" source "drivers/soc/zte/Kconfig" +source "drivers/soc/kendryte/Kconfig" endmenu diff --git a/drivers/soc/Makefile b/drivers/soc/Makefile index 8b49d782a1ab..af58063bb989 100644 --- a/drivers/soc/Makefile +++ b/drivers/soc/Makefile @@ -28,3 +28,4 @@ obj-$(CONFIG_ARCH_U8500) += ux500/ obj-$(CONFIG_PLAT_VERSATILE) += versatile/ obj-y += xilinx/ obj-$(CONFIG_ARCH_ZX) += zte/ +obj-$(CONFIG_SOC_KENDRYTE) += kendryte/ diff --git a/drivers/soc/kendryte/Kconfig b/drivers/soc/kendryte/Kconfig new file mode 100644 index 000000000000..49785b1b0217 --- /dev/null +++ b/drivers/soc/kendryte/Kconfig @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0 + +if SOC_KENDRYTE + +config K210_SYSCTL + bool "Kendryte K210 system controller" + default y + depends on RISCV + help + Enables controlling the K210 various clocks and to enable + general purpose use of the extra 2MB of SRAM normally + reserved for the AI engine. + +endif diff --git a/drivers/soc/kendryte/Makefile b/drivers/soc/kendryte/Makefile new file mode 100644 index 000000000000..002d9ce95c0d --- /dev/null +++ b/drivers/soc/kendryte/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_K210_SYSCTL) += k210-sysctl.o diff --git a/drivers/soc/kendryte/k210-sysctl.c b/drivers/soc/kendryte/k210-sysctl.c new file mode 100644 index 000000000000..4608fbca20e1 --- /dev/null +++ b/drivers/soc/kendryte/k210-sysctl.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2019 Christoph Hellwig. + * Copyright (c) 2019 Western Digital Corporation or its affiliates. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +#define K210_SYSCTL_CLK0_FREQ 26000000UL + +/* Registers base address */ +#define K210_SYSCTL_SYSCTL_BASE_ADDR 0x50440000ULL + +/* Registers */ +#define K210_SYSCTL_PLL0 0x08 +#define K210_SYSCTL_PLL1 0x0c +/* clkr: 4bits, clkf1: 6bits, clkod: 4bits, bwadj: 4bits */ +#define PLL_RESET (1 << 20) +#define PLL_PWR (1 << 21) +#define PLL_INTFB (1 << 22) +#define PLL_BYPASS (1 << 23) +#define PLL_TEST (1 << 24) +#define PLL_OUT_EN (1 << 25) +#define PLL_TEST_EN (1 << 26) +#define K210_SYSCTL_PLL_LOCK 0x18 +#define PLL0_LOCK1 (1 << 0) +#define PLL0_LOCK2 (1 << 1) +#define PLL0_SLIP_CLEAR (1 << 2) +#define PLL0_TEST_CLK_OUT (1 << 3) +#define PLL1_LOCK1 (1 << 8) +#define PLL1_LOCK2 (1 << 9) +#define PLL1_SLIP_CLEAR (1 << 10) +#define PLL1_TEST_CLK_OUT (1 << 11) +#define PLL2_LOCK1 (1 << 16) +#define PLL2_LOCK2 (1 << 16) +#define PLL2_SLIP_CLEAR (1 << 18) +#define PLL2_TEST_CLK_OUT (1 << 19) +#define K210_SYSCTL_CLKSEL0 0x20 +#define CLKSEL_ACLK (1 << 0) +#define K210_SYSCTL_CLKEN_CENT 0x28 +#define CLKEN_CPU (1 << 0) +#define CLKEN_SRAM0 (1 << 1) +#define CLKEN_SRAM1 (1 << 2) +#define CLKEN_APB0 (1 << 3) +#define CLKEN_APB1 (1 << 4) +#define CLKEN_APB2 (1 << 5) +#define K210_SYSCTL_CLKEN_PERI 0x2c +#define CLKEN_ROM (1 << 0) +#define CLKEN_DMA (1 << 1) +#define CLKEN_AI (1 << 2) +#define CLKEN_DVP (1 << 3) +#define CLKEN_FFT (1 << 4) +#define CLKEN_GPIO (1 << 5) +#define CLKEN_SPI0 (1 << 6) +#define CLKEN_SPI1 (1 << 7) +#define CLKEN_SPI2 (1 << 8) +#define CLKEN_SPI3 (1 << 9) +#define CLKEN_I2S0 (1 << 10) +#define CLKEN_I2S1 (1 << 11) +#define CLKEN_I2S2 (1 << 12) +#define CLKEN_I2C0 (1 << 13) +#define CLKEN_I2C1 (1 << 14) +#define CLKEN_I2C2 (1 << 15) +#define CLKEN_UART1 (1 << 16) +#define CLKEN_UART2 (1 << 17) +#define CLKEN_UART3 (1 << 18) +#define CLKEN_AES (1 << 19) +#define CLKEN_FPIO (1 << 20) +#define CLKEN_TIMER0 (1 << 21) +#define CLKEN_TIMER1 (1 << 22) +#define CLKEN_TIMER2 
(1 << 23) +#define CLKEN_WDT0 (1 << 24) +#define CLKEN_WDT1 (1 << 25) +#define CLKEN_SHA (1 << 26) +#define CLKEN_OTP (1 << 27) +#define CLKEN_RTC (1 << 29) + +struct k210_sysctl { + void __iomem *regs; + struct clk_hw hw; +}; + +static void k210_set_bits(u32 val, void __iomem *reg) +{ + writel(readl(reg) | val, reg); +} + +static void k210_clear_bits(u32 val, void __iomem *reg) +{ + writel(readl(reg) & ~val, reg); +} + +static void k210_pll1_enable(void __iomem *regs) +{ + u32 val; + + val = readl(regs + K210_SYSCTL_PLL1); + val &= ~GENMASK(19, 0); /* clkr1 = 0 */ + val |= FIELD_PREP(GENMASK(9, 4), 0x3B); /* clkf1 = 59 */ + val |= FIELD_PREP(GENMASK(13, 10), 0x3); /* clkod1 = 3 */ + val |= FIELD_PREP(GENMASK(19, 14), 0x3B); /* bwadj1 = 59 */ + writel(val, regs + K210_SYSCTL_PLL1); + + k210_clear_bits(PLL_BYPASS, regs + K210_SYSCTL_PLL1); + k210_set_bits(PLL_PWR, regs + K210_SYSCTL_PLL1); + + /* + * Reset the pll. The magic NOPs come from the Kendryte reference SDK. + */ + k210_clear_bits(PLL_RESET, regs + K210_SYSCTL_PLL1); + k210_set_bits(PLL_RESET, regs + K210_SYSCTL_PLL1); + nop(); + nop(); + k210_clear_bits(PLL_RESET, regs + K210_SYSCTL_PLL1); + + for (;;) { + val = readl(regs + K210_SYSCTL_PLL_LOCK); + if (val & PLL1_LOCK2) + break; + writel(val | PLL1_SLIP_CLEAR, regs + K210_SYSCTL_PLL_LOCK); + } + + k210_set_bits(PLL_OUT_EN, regs + K210_SYSCTL_PLL1); +} + +static unsigned long k210_sysctl_clk_recalc_rate(struct clk_hw *hw, + unsigned long parent_rate) +{ + struct k210_sysctl *s = container_of(hw, struct k210_sysctl, hw); + u32 clksel0, pll0; + u64 pll0_freq, clkr0, clkf0, clkod0; + + /* + * If the clock selector is not set, use the base frequency. + * Otherwise, use PLL0 frequency with a frequency divisor. 
+ */ + clksel0 = readl(s->regs + K210_SYSCTL_CLKSEL0); + if (!(clksel0 & CLKSEL_ACLK)) + return K210_SYSCTL_CLK0_FREQ; + + /* + * Get PLL0 frequency: + * freq = base frequency * clkf0 / (clkr0 * clkod0) + */ + pll0 = readl(s->regs + K210_SYSCTL_PLL0); + clkr0 = 1 + FIELD_GET(GENMASK(3, 0), pll0); + clkf0 = 1 + FIELD_GET(GENMASK(9, 4), pll0); + clkod0 = 1 + FIELD_GET(GENMASK(13, 10), pll0); + pll0_freq = clkf0 * K210_SYSCTL_CLK0_FREQ / (clkr0 * clkod0); + + /* Get the frequency divisor from the clock selector */ + return pll0_freq / (2ULL << FIELD_GET(0x00000006, clksel0)); +} + +static const struct clk_ops k210_sysctl_clk_ops = { + .recalc_rate = k210_sysctl_clk_recalc_rate, +}; + +static const struct clk_init_data k210_clk_init_data = { + .name = "k210-sysctl-pll1", + .ops = &k210_sysctl_clk_ops, +}; + +static int k210_sysctl_probe(struct platform_device *pdev) +{ + struct k210_sysctl *s; + int error; + + pr_info("Kendryte K210 SoC sysctl\n"); + + s = devm_kzalloc(&pdev->dev, sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + s->regs = devm_ioremap_resource(&pdev->dev, + platform_get_resource(pdev, IORESOURCE_MEM, 0)); + if (IS_ERR(s->regs)) + return PTR_ERR(s->regs); + + s->hw.init = &k210_clk_init_data; + error = devm_clk_hw_register(&pdev->dev, &s->hw); + if (error) { + dev_err(&pdev->dev, "failed to register clk"); + return error; + } + + error = devm_of_clk_add_hw_provider(&pdev->dev, of_clk_hw_simple_get, + &s->hw); + if (error) { + dev_err(&pdev->dev, "adding clk provider failed\n"); + return error; + } + + return 0; +} + +static const struct of_device_id k210_sysctl_of_match[] = { + { .compatible = "kendryte,k210-sysctl", }, + {} +}; + +static struct platform_driver k210_sysctl_driver = { + .driver = { + .name = "k210-sysctl", + .of_match_table = k210_sysctl_of_match, + }, + .probe = k210_sysctl_probe, +}; + +static int __init k210_sysctl_init(void) +{ + return platform_driver_register(&k210_sysctl_driver); +} +core_initcall(k210_sysctl_init); + +/* 
+ * This needs to be called very early during initialization, given that + * PLL1 needs to be enabled to be able to use all SRAM. + */ +static void __init k210_soc_early_init(const void *fdt) +{ + void __iomem *regs; + + regs = ioremap(K210_SYSCTL_SYSCTL_BASE_ADDR, 0x1000); + if (!regs) + panic("K210 sysctl ioremap"); + + /* Enable PLL1 to make the KPU SRAM useable */ + k210_pll1_enable(regs); + + k210_set_bits(PLL_OUT_EN, regs + K210_SYSCTL_PLL0); + + k210_set_bits(CLKEN_CPU | CLKEN_SRAM0 | CLKEN_SRAM1, + regs + K210_SYSCTL_CLKEN_CENT); + k210_set_bits(CLKEN_ROM | CLKEN_TIMER0 | CLKEN_RTC, + regs + K210_SYSCTL_CLKEN_PERI); + + k210_set_bits(CLKSEL_ACLK, regs + K210_SYSCTL_CLKSEL0); + + iounmap(regs); +} +SOC_EARLY_INIT_DECLARE(generic_k210, "kendryte,k210", k210_soc_early_init); From 8759a42bf1d04232835ed9287860fe6c124f3aac Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 16 Mar 2020 09:47:40 +0900 Subject: [PATCH 59/65] riscv: Select required drivers for Kendryte SOC This patch selects drivers required for the Kendryte K210 SOC. Since K210 SoC based boards do not provide a device tree, this patch also enables the BUILTIN_DTB option. Signed-off-by: Damien Le Moal Reviewed-by: Anup Patel Reviewed-by: Palmer Dabbelt Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig.socs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/riscv/Kconfig.socs b/arch/riscv/Kconfig.socs index 69071578e181..a843100124ae 100644 --- a/arch/riscv/Kconfig.socs +++ b/arch/riscv/Kconfig.socs @@ -37,6 +37,10 @@ config SOC_VIRT config SOC_KENDRYTE bool "Kendryte K210 SoC" depends on !MMU + select BUILTIN_DTB + select SERIAL_SIFIVE if TTY + select SERIAL_SIFIVE_CONSOLE if TTY + select SIFIVE_PLIC help This enables support for Kendryte K210 SoC platform hardware. 
From 5ba568f57f0ae4826beb6aaeecb12e68219b8a0b Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 16 Mar 2020 09:47:41 +0900 Subject: [PATCH 60/65] riscv: Add Kendryte K210 device tree Add a generic device tree for Kendryte K210 SoC based boards. This is for now a very simple device tree describing the core elements of the SoC. This is suitable (and tested) for the Kendryte KD233 development board, the Sipeed MAIX M1 Dan Dock board and the Sipeed MAIXDUINO board. Signed-off-by: Damien Le Moal Reviewed-by: Sean Anderson Signed-off-by: Palmer Dabbelt --- arch/riscv/boot/dts/Makefile | 1 + arch/riscv/boot/dts/kendryte/Makefile | 2 + arch/riscv/boot/dts/kendryte/k210.dts | 23 +++++ arch/riscv/boot/dts/kendryte/k210.dtsi | 123 +++++++++++++++++++++++++ include/dt-bindings/clock/k210-clk.h | 20 ++++ 5 files changed, 169 insertions(+) create mode 100644 arch/riscv/boot/dts/kendryte/Makefile create mode 100644 arch/riscv/boot/dts/kendryte/k210.dts create mode 100644 arch/riscv/boot/dts/kendryte/k210.dtsi create mode 100644 include/dt-bindings/clock/k210-clk.h diff --git a/arch/riscv/boot/dts/Makefile b/arch/riscv/boot/dts/Makefile index dcc3ada78455..557f0b519c8e 100644 --- a/arch/riscv/boot/dts/Makefile +++ b/arch/riscv/boot/dts/Makefile @@ -1,2 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 subdir-y += sifive +subdir-y += kendryte diff --git a/arch/riscv/boot/dts/kendryte/Makefile b/arch/riscv/boot/dts/kendryte/Makefile new file mode 100644 index 000000000000..815444e69e89 --- /dev/null +++ b/arch/riscv/boot/dts/kendryte/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +dtb-$(CONFIG_SOC_KENDRYTE) += k210.dtb diff --git a/arch/riscv/boot/dts/kendryte/k210.dts b/arch/riscv/boot/dts/kendryte/k210.dts new file mode 100644 index 000000000000..0d1f28fce6b2 --- /dev/null +++ b/arch/riscv/boot/dts/kendryte/k210.dts @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2020 Western Digital Corporation or its affiliates. 
+ */ + +/dts-v1/; + +#include "k210.dtsi" + +/ { + model = "Kendryte K210 generic"; + compatible = "kendryte,k210"; + + chosen { + bootargs = "earlycon console=ttySIF0"; + stdout-path = "serial0"; + }; +}; + +&uarths0 { + status = "okay"; +}; + diff --git a/arch/riscv/boot/dts/kendryte/k210.dtsi b/arch/riscv/boot/dts/kendryte/k210.dtsi new file mode 100644 index 000000000000..c1df56ccb8d5 --- /dev/null +++ b/arch/riscv/boot/dts/kendryte/k210.dtsi @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019 Sean Anderson + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + */ +#include + +/ { + /* + * Although the K210 is a 64-bit CPU, the address bus is only 32-bits + * wide, and the upper half of all addresses is ignored. + */ + #address-cells = <1>; + #size-cells = <1>; + compatible = "kendryte,k210"; + + aliases { + serial0 = &uarths0; + }; + + /* + * The K210 has an sv39 MMU following the privileged specification v1.9. + * Since this is a non-ratified draft specification, the kernel does not + * support it and the K210 support is enabled only for the !MMU case. + * Be consistent with this by setting the CPUs MMU type to "none". 
+ */ + cpus { + #address-cells = <1>; + #size-cells = <0>; + timebase-frequency = <7800000>; + cpu0: cpu@0 { + device_type = "cpu"; + reg = <0>; + compatible = "kendryte,k210", "sifive,rocket0", "riscv"; + riscv,isa = "rv64imafdc"; + mmu-type = "none"; + i-cache-size = <0x8000>; + i-cache-block-size = <64>; + d-cache-size = <0x8000>; + d-cache-block-size = <64>; + clocks = <&sysctl K210_CLK_CPU>; + clock-frequency = <390000000>; + cpu0_intc: interrupt-controller { + #interrupt-cells = <1>; + interrupt-controller; + compatible = "riscv,cpu-intc"; + }; + }; + cpu1: cpu@1 { + device_type = "cpu"; + reg = <1>; + compatible = "kendryte,k210", "sifive,rocket0", "riscv"; + riscv,isa = "rv64imafdc"; + mmu-type = "none"; + i-cache-size = <0x8000>; + i-cache-block-size = <64>; + d-cache-size = <0x8000>; + d-cache-block-size = <64>; + clocks = <&sysctl K210_CLK_CPU>; + clock-frequency = <390000000>; + cpu1_intc: interrupt-controller { + #interrupt-cells = <1>; + interrupt-controller; + compatible = "riscv,cpu-intc"; + }; + }; + }; + + sram: memory@80000000 { + device_type = "memory"; + reg = <0x80000000 0x400000>, + <0x80400000 0x200000>, + <0x80600000 0x200000>; + reg-names = "sram0", "sram1", "aisram"; + }; + + clocks { + in0: oscillator { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <26000000>; + }; + }; + + soc { + #address-cells = <1>; + #size-cells = <1>; + compatible = "kendryte,k210-soc", "simple-bus"; + ranges; + interrupt-parent = <&plic0>; + + sysctl: sysctl@50440000 { + compatible = "kendryte,k210-sysctl", "simple-mfd"; + reg = <0x50440000 0x1000>; + #clock-cells = <1>; + }; + + clint0: interrupt-controller@2000000 { + compatible = "riscv,clint0"; + reg = <0x2000000 0xC000>; + interrupts-extended = <&cpu0_intc 3>, <&cpu1_intc 3>; + clocks = <&sysctl K210_CLK_ACLK>; + }; + + plic0: interrupt-controller@c000000 { + #interrupt-cells = <1>; + interrupt-controller; + compatible = "kendryte,k210-plic0", "riscv,plic0"; + reg = <0xC000000 
0x4000000>; + interrupts-extended = <&cpu0_intc 11>, <&cpu0_intc 0xffffffff>, + <&cpu1_intc 11>, <&cpu1_intc 0xffffffff>; + riscv,ndev = <65>; + riscv,max-priority = <7>; + }; + + uarths0: serial@38000000 { + compatible = "kendryte,k210-uarths", "sifive,uart0"; + reg = <0x38000000 0x1000>; + interrupts = <33>; + clocks = <&sysctl K210_CLK_CPU>; + }; + }; +}; diff --git a/include/dt-bindings/clock/k210-clk.h b/include/dt-bindings/clock/k210-clk.h new file mode 100644 index 000000000000..5a2fd64d1a49 --- /dev/null +++ b/include/dt-bindings/clock/k210-clk.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2019-20 Sean Anderson + * Copyright (c) 2020 Western Digital Corporation or its affiliates. + */ +#ifndef K210_CLK_H +#define K210_CLK_H + +/* + * Arbitrary identifiers for clocks. + * The structure is: in0 -> pll0 -> aclk -> cpu + * + * Since we use the hardware defaults for now, set all these to the same clock. + */ +#define K210_CLK_PLL0 0 +#define K210_CLK_PLL1 0 +#define K210_CLK_ACLK 0 +#define K210_CLK_CPU 0 + +#endif /* K210_CLK_H */ From aa10eb6bb8a9c12d238e77fdd470db60cd7fb769 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 16 Mar 2020 09:47:42 +0900 Subject: [PATCH 61/65] riscv: Kendryte K210 default config This patch adds a defconfig file to build No-MMU kernels meant for boards based on the Kendryte K210 SoC. 
Signed-off-by: Damien Le Moal Signed-off-by: Palmer Dabbelt --- arch/riscv/configs/nommu_k210_defconfig | 68 +++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 arch/riscv/configs/nommu_k210_defconfig diff --git a/arch/riscv/configs/nommu_k210_defconfig b/arch/riscv/configs/nommu_k210_defconfig new file mode 100644 index 000000000000..632aa2f95e57 --- /dev/null +++ b/arch/riscv/configs/nommu_k210_defconfig @@ -0,0 +1,68 @@ +# CONFIG_CPU_ISOLATION is not set +CONFIG_LOG_BUF_SHIFT=15 +CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=12 +CONFIG_BLK_DEV_INITRD=y +CONFIG_INITRAMFS_SOURCE="" +CONFIG_INITRAMFS_FORCE=y +# CONFIG_RD_BZIP2 is not set +# CONFIG_RD_LZMA is not set +# CONFIG_RD_XZ is not set +# CONFIG_RD_LZO is not set +# CONFIG_RD_LZ4 is not set +# CONFIG_BOOT_CONFIG is not set +CONFIG_CC_OPTIMIZE_FOR_SIZE=y +# CONFIG_SYSFS_SYSCALL is not set +# CONFIG_FHANDLE is not set +# CONFIG_BASE_FULL is not set +# CONFIG_EPOLL is not set +# CONFIG_SIGNALFD is not set +# CONFIG_TIMERFD is not set +# CONFIG_EVENTFD is not set +# CONFIG_AIO is not set +# CONFIG_IO_URING is not set +# CONFIG_ADVISE_SYSCALLS is not set +# CONFIG_MEMBARRIER is not set +# CONFIG_KALLSYMS is not set +CONFIG_EMBEDDED=y +# CONFIG_VM_EVENT_COUNTERS is not set +# CONFIG_COMPAT_BRK is not set +CONFIG_SLOB=y +# CONFIG_SLAB_MERGE_DEFAULT is not set +# CONFIG_MMU is not set +CONFIG_SOC_KENDRYTE=y +CONFIG_MAXPHYSMEM_2GB=y +CONFIG_SMP=y +CONFIG_NR_CPUS=2 +CONFIG_CMDLINE="earlycon console=ttySIF0" +CONFIG_CMDLINE_FORCE=y +CONFIG_USE_BUILTIN_DTB=y +CONFIG_BUILTIN_DTB_SOURCE="kendryte/k210" +# CONFIG_BLOCK is not set +CONFIG_BINFMT_FLAT=y +# CONFIG_COREDUMP is not set +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +# CONFIG_FW_LOADER is not set +# CONFIG_ALLOW_DEV_COREDUMP is not set +# CONFIG_INPUT_KEYBOARD is not set +# CONFIG_INPUT_MOUSE is not set +# CONFIG_SERIO is not set +# CONFIG_LEGACY_PTYS is not set +# CONFIG_LDISC_AUTOLOAD is not set +# CONFIG_DEVMEM is not set +# CONFIG_HW_RANDOM is not 
set +# CONFIG_HWMON is not set +# CONFIG_VGA_CONSOLE is not set +# CONFIG_HID is not set +# CONFIG_USB_SUPPORT is not set +# CONFIG_VIRTIO_MENU is not set +# CONFIG_DNOTIFY is not set +# CONFIG_INOTIFY_USER is not set +# CONFIG_MISC_FILESYSTEMS is not set +CONFIG_LSM="[]" +CONFIG_PRINTK_TIME=y +# CONFIG_DEBUG_MISC is not set +# CONFIG_SCHED_DEBUG is not set +# CONFIG_RCU_TRACE is not set +# CONFIG_FTRACE is not set +# CONFIG_RUNTIME_TESTING_MENU is not set From 37809df4b1c88927fe944eb766e0553811c51f64 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Mar 2020 09:47:43 +0900 Subject: [PATCH 62/65] riscv: create a loader.bin boot image for Kendryte SoC Create the loader.bin bootable image file that can be loaded into Kendryte K210 based boards using the kflash.py tool with the command: kflash.py/kflash.py -t arch/riscv/boot/loader.bin Signed-off-by: Christoph Hellwig Signed-off-by: Damien Le Moal Reviewed-by: Anup Patel Reviewed-by: Palmer Dabbelt Signed-off-by: Palmer Dabbelt --- arch/riscv/Makefile | 6 +++--- arch/riscv/boot/Makefile | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index b9009a2fbaf5..0e94ff8ec1f7 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -83,12 +83,12 @@ PHONY += vdso_install vdso_install: $(Q)$(MAKE) $(build)=arch/riscv/kernel/vdso $@ -ifeq ($(CONFIG_RISCV_M_MODE),y) -KBUILD_IMAGE := $(boot)/loader +ifeq ($(CONFIG_RISCV_M_MODE)$(CONFIG_SOC_KENDRYTE),yy) +KBUILD_IMAGE := $(boot)/loader.bin else KBUILD_IMAGE := $(boot)/Image.gz endif -BOOT_TARGETS := Image Image.gz loader +BOOT_TARGETS := Image Image.gz loader loader.bin all: $(notdir $(KBUILD_IMAGE)) diff --git a/arch/riscv/boot/Makefile b/arch/riscv/boot/Makefile index 36db8145f9f4..3530c59b3ea7 100644 --- a/arch/riscv/boot/Makefile +++ b/arch/riscv/boot/Makefile @@ -41,6 +41,9 @@ $(obj)/Image.lzma: $(obj)/Image FORCE $(obj)/Image.lzo: $(obj)/Image FORCE $(call if_changed,lzo) +$(obj)/loader.bin: 
$(obj)/loader FORCE + $(call if_changed,objcopy) + install: $(CONFIG_SHELL) $(srctree)/$(src)/install.sh $(KERNELRELEASE) \ $(obj)/Image System.map "$(INSTALL_PATH)" From d16a58f8854b194c964a4bbe8156ec624ebfdbd2 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 3 Apr 2020 23:10:05 +1000 Subject: [PATCH 63/65] powerpc: Improve ppc_save_regs() Make ppc_save_regs() a bit more useful: - Set NIP to our caller rather than the caller's caller (which is what we save to LR in the stack frame). - Set SOFTE to the current irq soft-mask state rather than uninitialised. - Zero CFAR rather than leave it uninitialised. In qemu, injecting a nmi to an idle CPU gives a nicer stack trace (note NIP, IRQMASK, CFAR). Oops: System Reset, sig: 6 [#1] LE PAGE_SIZE=64K MMU=Hash PREEMPT SMP NR_CPUS=2048 NUMA PowerNV Modules linked in: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.6.0-rc2-00429-ga76e38fd80bf #1277 NIP: c0000000000b6e5c LR: c0000000000b6e5c CTR: c000000000b06270 REGS: c00000000173fb08 TRAP: 0100 Not tainted MSR: 9000000000001033 CR: 28000224 XER: 00000000 CFAR: c0000000016a2128 IRQMASK: c00000000173fc80 GPR00: c0000000000b6e5c c00000000173fc80 c000000001743400 c00000000173fb08 GPR04: 0000000000000000 0000000000000000 0000000000000008 0000000000000001 GPR08: 00000001fea80000 0000000000000000 0000000000000000 ffffffffffffffff GPR12: c000000000b06270 c000000001930000 00000000300026c0 0000000000000000 GPR16: 0000000000000000 0000000000000000 0000000000000003 c0000000016a2128 GPR20: c0000001ffc97148 0000000000000001 c000000000f289a8 0000000000080000 GPR24: c0000000016e1480 000000011dc870ba 0000000000000000 0000000000000003 GPR28: c0000000016a2128 c0000001ffc97148 c0000000016a2260 0000000000000003 NIP [c0000000000b6e5c] power9_idle_type+0x5c/0x70 LR [c0000000000b6e5c] power9_idle_type+0x5c/0x70 Call Trace: [c00000000173fc80] [c0000000000b6e5c] power9_idle_type+0x5c/0x70 (unreliable) [c00000000173fcb0] [c000000000b062b0] stop_loop+0x40/0x60 [c00000000173fce0]
[c000000000b022d8] cpuidle_enter_state+0xa8/0x660 [c00000000173fd60] [c000000000b0292c] cpuidle_enter+0x4c/0x70 [c00000000173fda0] [c00000000017624c] call_cpuidle+0x4c/0x90 [c00000000173fdc0] [c000000000176768] do_idle+0x338/0x460 [c00000000173fe60] [c000000000176b3c] cpu_startup_entry+0x3c/0x40 [c00000000173fe90] [c0000000000126b4] rest_init+0x124/0x140 [c00000000173fed0] [c0000000010948d4] start_kernel+0x938/0x988 [c00000000173ff90] [c00000000000cdcc] start_here_common+0x1c/0x20 Oops: System Reset, sig: 6 [#1] LE PAGE_SIZE=64K MMU=Hash PREEMPT SMP NR_CPUS=2048 NUMA PowerNV Modules linked in: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.6.0-rc2-00430-gddce91b8712f #1278 NIP: c00000000001d150 LR: c0000000000b6e5c CTR: c000000000b06270 REGS: c00000000173fb08 TRAP: 0100 Not tainted MSR: 9000000000001033 CR: 28000224 XER: 00000000 CFAR: 0000000000000000 IRQMASK: 1 GPR00: c0000000000b6e5c c00000000173fc80 c000000001743400 c00000000173fb08 GPR04: 0000000000000000 0000000000000000 0000000000000008 0000000000000001 GPR08: 00000001fea80000 0000000000000000 0000000000000000 ffffffffffffffff GPR12: c000000000b06270 c000000001930000 00000000300026c0 0000000000000000 GPR16: 0000000000000000 0000000000000000 0000000000000003 c0000000016a2128 GPR20: c0000001ffc97148 0000000000000001 c000000000f289a8 0000000000080000 GPR24: c0000000016e1480 00000000b68db8ce 0000000000000000 0000000000000003 GPR28: c0000000016a2128 c0000001ffc97148 c0000000016a2260 0000000000000003 NIP [c00000000001d150] replay_system_reset+0x30/0xa0 LR [c0000000000b6e5c] power9_idle_type+0x5c/0x70 Call Trace: [c00000000173fc80] [c0000000000b6e5c] power9_idle_type+0x5c/0x70 (unreliable) [c00000000173fcb0] [c000000000b062b0] stop_loop+0x40/0x60 [c00000000173fce0] [c000000000b022d8] cpuidle_enter_state+0xa8/0x660 [c00000000173fd60] [c000000000b0292c] cpuidle_enter+0x4c/0x70 [c00000000173fda0] [c00000000017624c] call_cpuidle+0x4c/0x90 [c00000000173fdc0] [c000000000176768] do_idle+0x338/0x460 [c00000000173fe60] 
[c000000000176b38] cpu_startup_entry+0x38/0x40 [c00000000173fe90] [c0000000000126b4] rest_init+0x124/0x140 [c00000000173fed0] [c0000000010948d4] start_kernel+0x938/0x988 [c00000000173ff90] [c00000000000cdcc] start_here_common+0x1c/0x20 Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200403131006.123243-1-npiggin@gmail.com --- arch/powerpc/kernel/ppc_save_regs.S | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/ppc_save_regs.S b/arch/powerpc/kernel/ppc_save_regs.S index f3bd0bbf2ae8..2d4d21bb46a9 100644 --- a/arch/powerpc/kernel/ppc_save_regs.S +++ b/arch/powerpc/kernel/ppc_save_regs.S @@ -55,14 +55,17 @@ _GLOBAL(ppc_save_regs) PPC_STL r29,29*SZL(r3) PPC_STL r30,30*SZL(r3) PPC_STL r31,31*SZL(r3) + lbz r0,PACAIRQSOFTMASK(r13) + PPC_STL r0,SOFTE-STACK_FRAME_OVERHEAD(r3) #endif /* go up one stack frame for SP */ PPC_LL r4,0(r1) PPC_STL r4,1*SZL(r3) /* get caller's LR */ PPC_LL r0,LRSAVE(r4) - PPC_STL r0,_NIP-STACK_FRAME_OVERHEAD(r3) PPC_STL r0,_LINK-STACK_FRAME_OVERHEAD(r3) + mflr r0 + PPC_STL r0,_NIP-STACK_FRAME_OVERHEAD(r3) mfmsr r0 PPC_STL r0,_MSR-STACK_FRAME_OVERHEAD(r3) mfctr r0 @@ -73,4 +76,5 @@ _GLOBAL(ppc_save_regs) PPC_STL r0,_CCR-STACK_FRAME_OVERHEAD(r3) li r0,0 PPC_STL r0,_TRAP-STACK_FRAME_OVERHEAD(r3) + PPC_STL r0,ORIG_GPR3-STACK_FRAME_OVERHEAD(r3) blr From 6ba4a2d3591039aea1cb45c7c42262d26351a2fa Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 3 Apr 2020 20:56:56 +1100 Subject: [PATCH 64/65] selftests/powerpc: Always build the tm-poison test 64-bit The tm-poison test includes inline asm which is 64-bit only, so the test must be built 64-bit in order to work. Otherwise it fails, eg: # file tm-poison tm-poison: ELF 32-bit MSB executable, PowerPC or cisco 4500, version 1 (SYSV) ... # ./tm-poison test: tm_poison_test Unknown value 0x1fff71150 leaked into f31! Unknown value 0x1fff710c0 leaked into vr31! 
failure: tm_poison_test Fixes: a003365cab64 ("powerpc/tm: Add tm-poison test") Signed-off-by: Michael Ellerman Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200403095656.3772005-1-mpe@ellerman.id.au --- tools/testing/selftests/powerpc/tm/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile index 0b0db8d3857c..5881e97c73c1 100644 --- a/tools/testing/selftests/powerpc/tm/Makefile +++ b/tools/testing/selftests/powerpc/tm/Makefile @@ -25,6 +25,7 @@ $(OUTPUT)/tm-unavailable: CFLAGS += -O0 -pthread -m64 -Wno-error=uninitialized - $(OUTPUT)/tm-trap: CFLAGS += -O0 -pthread -m64 $(OUTPUT)/tm-signal-context-force-tm: CFLAGS += -pthread -m64 $(OUTPUT)/tm-signal-pagefault: CFLAGS += -pthread -m64 +$(OUTPUT)/tm-poison: CFLAGS += -m64 SIGNAL_CONTEXT_CHK_TESTS := $(patsubst %,$(OUTPUT)/%,$(SIGNAL_CONTEXT_CHK_TESTS)) $(SIGNAL_CONTEXT_CHK_TESTS): tm-signal.S From fc2266011accd5aeb8ebc335c381991f20e26e33 Mon Sep 17 00:00:00 2001 From: Fredrik Strupe Date: Wed, 8 Apr 2020 13:29:41 +0200 Subject: [PATCH 65/65] arm64: armv8_deprecated: Fix undef_hook mask for thumb setend For thumb instructions, call_undef_hook() in traps.c first reads a u16, and if the u16 indicates a T32 instruction (u16 >= 0xe800), a second u16 is read, which then makes up the lower half-word of a T32 instruction. For T16 instructions, the second u16 is not read, which makes the resulting u32 opcode always have the upper half set to 0. However, having the upper half of instr_mask in the undef_hook set to 0 masks out the upper half of all thumb instructions - both T16 and T32. This results in trapped T32 instructions with the lower half-word equal to the T16 encoding of setend (b650) being matched, even though the upper half-word is not 0000 and thus indicates a T32 opcode.
An example of such a T32 instruction is eaa0b650, which should raise a SIGILL since T32 instructions with an eaa prefix are unallocated as per Arm ARM, but instead works as a SETEND because the second half-word is set to b650. This patch fixes the issue by extending instr_mask to include the upper u32 half, which will still match T16 instructions where the upper half is 0, but not T32 instructions. Fixes: 2d888f48e056 ("arm64: Emulate SETEND for AArch32 tasks") Cc: # 4.0.x- Reviewed-by: Suzuki K Poulose Signed-off-by: Fredrik Strupe Signed-off-by: Catalin Marinas --- arch/arm64/kernel/armv8_deprecated.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index 4cc581af2d96..c19aa81ddc8c 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -601,7 +601,7 @@ static struct undef_hook setend_hooks[] = { }, { /* Thumb mode */ - .instr_mask = 0x0000fff7, + .instr_mask = 0xfffffff7, .instr_val = 0x0000b650, .pstate_mask = (PSR_AA32_T_BIT | PSR_AA32_MODE_MASK), .pstate_val = (PSR_AA32_T_BIT | PSR_AA32_MODE_USR),