From dd9a8c5a87395b6f05552c3b44e42fdc95760552 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Tue, 11 Sep 2018 13:07:56 +1000 Subject: [PATCH 001/221] powerpc/tm: Fix HFSCR bit for no suspend case Currently on P9N DD2.1 we end up taking infinite TM facility unavailable exceptions on the first TM usage by userspace. In the special case of TM no suspend (P9N DD2.1), Linux is told TM is off via CPU dt-ftrs but told to (partially) use it via OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED. So HFSCR[TM] will be off from dt-ftrs but we need to turn it on for the no suspend case. This patch fixes this by enabling HFSCR TM in this case. Cc: stable@vger.kernel.org # 4.15+ Signed-off-by: Michael Neuling Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/setup_64.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 6a501b25dd85..faf00222b324 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -243,13 +243,19 @@ static void cpu_ready_for_interrupts(void) } /* - * Fixup HFSCR:TM based on CPU features. The bit is set by our - * early asm init because at that point we haven't updated our - * CPU features from firmware and device-tree. Here we have, - * so let's do it. + * Set HFSCR:TM based on CPU features: + * In the special case of TM no suspend (P9N DD2.1), Linux is + * told TM is off via the dt-ftrs but told to (partially) use + * it via OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED. So HFSCR[TM] + * will be off from dt-ftrs but we need to turn it on for the + * no suspend case. */ - if (cpu_has_feature(CPU_FTR_HVMODE) && !cpu_has_feature(CPU_FTR_TM_COMP)) - mtspr(SPRN_HFSCR, mfspr(SPRN_HFSCR) & ~HFSCR_TM); + if (cpu_has_feature(CPU_FTR_HVMODE)) { + if (cpu_has_feature(CPU_FTR_TM_COMP)) + mtspr(SPRN_HFSCR, mfspr(SPRN_HFSCR) | HFSCR_TM); + else + mtspr(SPRN_HFSCR, mfspr(SPRN_HFSCR) & ~HFSCR_TM); + } /* Set IR and DR in PACA MSR */ get_paca()->kernel_msr = MSR_KERNEL; From 56d20861c027498b5a1112b4f9f05b56d906fdda Mon Sep 17 00:00:00 2001 From: Alan Modra Date: Fri, 14 Sep 2018 13:10:04 +0930 Subject: [PATCH 002/221] powerpc/vdso: Correct call frame information Call Frame Information is used by gdb for back-traces and inserting breakpoints on function return for the "finish" command. This failed when inside __kernel_clock_gettime. More concerning than difficulty debugging is that CFI is also used by stack frame unwinding code to implement exceptions. If you have an app that needs to handle asynchronous exceptions for some reason, and you are unlucky enough to get one inside the VDSO time functions, your app will crash. What's wrong: There is control flow in __kernel_clock_gettime that reaches label 99 without saving lr in r12. CFI info however is interpreted by the unwinder without reference to control flow: It's a simple matter of "Execute all the CFI opcodes up to the current address". That means the unwinder thinks r12 contains the return address at label 99. Disabuse it of that notion by resetting CFI for the return address at label 99. Note that the ".cfi_restore lr" could have gone anywhere from the "mtlr r12" a few instructions earlier to the instruction at label 99. I put the CFI as late as possible, because in general that's best practice (and if possible grouped with other CFI in order to reduce the number of CFI opcodes executed when unwinding). 
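For illustration only (placeholder code, not the vdso routines themselves; func and helper are made-up names), the shape of the annotations being discussed is:

    func:
            .cfi_startproc
            mflr    r12                     /* return address moves into r12 */
            .cfi_register lr, r12           /* unwinder: return address is in r12 */
            bl      helper                  /* LR is clobbered here */
            mtlr    r12                     /* return address is back in LR */
            .cfi_restore lr                 /* unwinder: use LR again */
            blr
            .cfi_endproc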
Using r12 as the return address is perfectly fine after the "mtlr r12" since r12 on that code path still contains the return address. __get_datapage also has a CFI error. That function temporarily saves lr in r0, and reflects that fact with ".cfi_register lr,r0". A later use of r0 means the CFI at that point isn't correct, as r0 no longer contains the return address. Fix that too. Signed-off-by: Alan Modra Tested-by: Reza Arbab Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/vdso32/datapage.S | 1 + arch/powerpc/kernel/vdso32/gettimeofday.S | 1 + arch/powerpc/kernel/vdso64/datapage.S | 1 + arch/powerpc/kernel/vdso64/gettimeofday.S | 1 + 4 files changed, 4 insertions(+) diff --git a/arch/powerpc/kernel/vdso32/datapage.S b/arch/powerpc/kernel/vdso32/datapage.S index 3745113fcc65..2a7eb5452aba 100644 --- a/arch/powerpc/kernel/vdso32/datapage.S +++ b/arch/powerpc/kernel/vdso32/datapage.S @@ -37,6 +37,7 @@ data_page_branch: mtlr r0 addi r3, r3, __kernel_datapage_offset-data_page_branch lwz r0,0(r3) + .cfi_restore lr add r3,r0,r3 blr .cfi_endproc diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S index 769c2624e0a6..1e0bc5955a40 100644 --- a/arch/powerpc/kernel/vdso32/gettimeofday.S +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S @@ -139,6 +139,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) */ 99: li r0,__NR_clock_gettime + .cfi_restore lr sc blr .cfi_endproc diff --git a/arch/powerpc/kernel/vdso64/datapage.S b/arch/powerpc/kernel/vdso64/datapage.S index abf17feffe40..bf9668691511 100644 --- a/arch/powerpc/kernel/vdso64/datapage.S +++ b/arch/powerpc/kernel/vdso64/datapage.S @@ -37,6 +37,7 @@ data_page_branch: mtlr r0 addi r3, r3, __kernel_datapage_offset-data_page_branch lwz r0,0(r3) + .cfi_restore lr add r3,r0,r3 blr .cfi_endproc diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S index c002adcc694c..a4ed9edfd5f0 100644 --- a/arch/powerpc/kernel/vdso64/gettimeofday.S +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S @@ -169,6 +169,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) */ 99: li r0,__NR_clock_gettime + .cfi_restore lr sc blr .cfi_endproc From b0dc0f8618e87f1b2c7f04b977fda16e961bbec1 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Tue, 21 Aug 2018 11:44:28 +0930 Subject: [PATCH 003/221] powerpc/powernv: Don't select the cpufreq governors Deciding wich govenors should be built into the kernel can be left to users to configure. 
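For example, a user who still wants the extra governors can simply enable them in their own config; these are the same options the defconfig updates below turn on:

    CONFIG_CPU_FREQ_GOV_POWERSAVE=y
    CONFIG_CPU_FREQ_GOV_USERSPACE=y
    CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y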
Fixes: 81f359027a3a ("cpufreq: powernv: Select CPUFreq related Kconfig options for powernv") Signed-off-by: Joel Stanley [mpe: Update powernv/ppc64 defconfigs to enable them by default] Signed-off-by: Michael Ellerman --- arch/powerpc/configs/powernv_defconfig | 3 +++ arch/powerpc/configs/ppc64_defconfig | 3 +++ arch/powerpc/platforms/powernv/Kconfig | 5 ----- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig index 6ab34e60495f..b035d909e681 100644 --- a/arch/powerpc/configs/powernv_defconfig +++ b/arch/powerpc/configs/powernv_defconfig @@ -44,6 +44,9 @@ CONFIG_PPC_MEMTRACE=y # CONFIG_PPC_PSERIES is not set # CONFIG_PPC_OF_BOOT_TRAMPOLINE is not set CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y +CONFIG_CPU_FREQ_GOV_POWERSAVE=y +CONFIG_CPU_FREQ_GOV_USERSPACE=y +CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y CONFIG_CPU_IDLE=y CONFIG_HZ_100=y CONFIG_BINFMT_MISC=m diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index 5033e630afea..1b4753d4070b 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -40,6 +40,9 @@ CONFIG_PS3_LPM=m CONFIG_PPC_IBM_CELL_BLADE=y CONFIG_RTAS_FLASH=m CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y +CONFIG_CPU_FREQ_GOV_POWERSAVE=y +CONFIG_CPU_FREQ_GOV_USERSPACE=y +CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y CONFIG_CPU_FREQ_PMAC64=y CONFIG_HZ_100=y CONFIG_BINFMT_MISC=m diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index f8dc98d3dc01..028ac941c05c 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -15,11 +15,6 @@ config PPC_POWERNV select PPC_SCOM select ARCH_RANDOM select CPU_FREQ - select CPU_FREQ_GOV_PERFORMANCE - select CPU_FREQ_GOV_POWERSAVE - select CPU_FREQ_GOV_USERSPACE - select CPU_FREQ_GOV_ONDEMAND - select CPU_FREQ_GOV_CONSERVATIVE select PPC_DOORBELL select MMU_NOTIFIER select FORCE_SMP From 693b31b2fc1636f0aa7af53136d3b49f6ad9ff39 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 31 Jul 2018 17:55:57 -0300 Subject: [PATCH 004/221] powerpc/selftests: Wait all threads to join Test tm-tmspr might exit before all threads stop executing, because it just waits for the very last thread to join before proceeding/exiting. This patch makes sure that all threads that were created will join before proceeding/exiting. This patch also guarantees that the amount of threads being created is equal to thread_num. 
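In sketch form (simplified, not the selftest code itself; worker() stands in for the tfiar_tfhar/texasr routines), the create-all-then-join-all pattern is:

    #include <pthread.h>
    #include <stdlib.h>

    static int run_all(long thread_num, void *(*worker)(void *))
    {
            pthread_t *threads = malloc(thread_num * sizeof(pthread_t));
            long i;

            if (!threads)
                    return EXIT_FAILURE;

            for (i = 0; i < thread_num; i++)
                    if (pthread_create(&threads[i], NULL, worker, (void *)i))
                            return EXIT_FAILURE;

            /* Join every thread that was created, not only the last one. */
            for (i = 0; i < thread_num; i++)
                    if (pthread_join(threads[i], NULL))
                            return EXIT_FAILURE;

            free(threads);
            return EXIT_SUCCESS;
    }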
Signed-off-by: Breno Leitao Signed-off-by: Michael Ellerman --- tools/testing/selftests/powerpc/tm/tm-tmspr.c | 31 ++++++++++++------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/powerpc/tm/tm-tmspr.c b/tools/testing/selftests/powerpc/tm/tm-tmspr.c index 2bda81c7bf23..df1d7d4b1c89 100644 --- a/tools/testing/selftests/powerpc/tm/tm-tmspr.c +++ b/tools/testing/selftests/powerpc/tm/tm-tmspr.c @@ -98,7 +98,7 @@ void texasr(void *in) int test_tmspr() { - pthread_t thread; + pthread_t *thread; int thread_num; unsigned long i; @@ -107,21 +107,28 @@ int test_tmspr() /* To cause some context switching */ thread_num = 10 * sysconf(_SC_NPROCESSORS_ONLN); - /* Test TFIAR and TFHAR */ - for (i = 0 ; i < thread_num ; i += 2){ - if (pthread_create(&thread, NULL, (void*)tfiar_tfhar, (void *)i)) - return EXIT_FAILURE; - } - if (pthread_join(thread, NULL) != 0) + thread = malloc(thread_num * sizeof(pthread_t)); + if (thread == NULL) return EXIT_FAILURE; - /* Test TEXASR */ - for (i = 0 ; i < thread_num ; i++){ - if (pthread_create(&thread, NULL, (void*)texasr, (void *)i)) + /* Test TFIAR and TFHAR */ + for (i = 0; i < thread_num; i += 2) { + if (pthread_create(&thread[i], NULL, (void *)tfiar_tfhar, + (void *)i)) return EXIT_FAILURE; } - if (pthread_join(thread, NULL) != 0) - return EXIT_FAILURE; + /* Test TEXASR */ + for (i = 1; i < thread_num; i += 2) { + if (pthread_create(&thread[i], NULL, (void *)texasr, (void *)i)) + return EXIT_FAILURE; + } + + for (i = 0; i < thread_num; i++) { + if (pthread_join(thread[i], NULL) != 0) + return EXIT_FAILURE; + } + + free(thread); if (passed) return 0; From 96695563cebfb810b09479a9951ebbc466fa4c68 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 18 Jun 2018 19:59:42 -0300 Subject: [PATCH 005/221] powerpc/tm: Fix HTM documentation This patch simply fix part of the documentation on the HTM code. This fixes reference to old fields that were renamed in commit 000ec280e3dd ("powerpc: tm: Rename transct_(*) to ck(\1)_state") It also documents better the flow after commit eb5c3f1c8647 ("powerpc: Always save/restore checkpointed regs during treclaim/trecheckpoint"), where tm_recheckpoint can recheckpoint what is in ck{fp,vr}_state blindly. Signed-off-by: Breno Leitao Acked-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/tm.S | 10 +++++----- arch/powerpc/kernel/traps.c | 16 ++++++++++------ 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S index 6bffbc5affe7..50e5cff10d0f 100644 --- a/arch/powerpc/kernel/tm.S +++ b/arch/powerpc/kernel/tm.S @@ -96,9 +96,9 @@ EXPORT_SYMBOL_GPL(tm_abort); * uint8_t cause) * * - Performs a full reclaim. This destroys outstanding - * transactions and updates thread->regs.tm_ckpt_* with the - * original checkpointed state. Note that thread->regs is - * unchanged. + * transactions and updates thread.ckpt_regs, thread.ckfp_state and + * thread.ckvr_state with the original checkpointed state. Note that + * thread->regs is unchanged. * * Purpose is to both abort transactions of, and preserve the state of, * a transactions at a context switch. 
We preserve/restore both sets of process @@ -261,7 +261,7 @@ _GLOBAL(tm_reclaim) /* Altivec (VEC/VMX/VR)*/ addi r7, r3, THREAD_CKVRSTATE - SAVE_32VRS(0, r6, r7) /* r6 scratch, r7 transact vr state */ + SAVE_32VRS(0, r6, r7) /* r6 scratch, r7 ckvr_state */ mfvscr v0 li r6, VRSTATE_VSCR stvx v0, r7, r6 @@ -272,7 +272,7 @@ _GLOBAL(tm_reclaim) /* Floating Point (FP) */ addi r7, r3, THREAD_CKFPSTATE - SAVE_32FPRS_VSRS(0, R6, R7) /* r6 scratch, r7 transact fp state */ + SAVE_32FPRS_VSRS(0, R6, R7) /* r6 scratch, r7 ckfp_state */ mffs fr0 stfd fr0,FPSTATE_FPSCR(r7) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index c85adb858271..6ab66a88db14 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1750,16 +1750,20 @@ void fp_unavailable_tm(struct pt_regs *regs) * checkpointed FP registers need to be loaded. */ tm_reclaim_current(TM_CAUSE_FAC_UNAV); - /* Reclaim didn't save out any FPRs to transact_fprs. */ + + /* + * Reclaim initially saved out bogus (lazy) FPRs to ckfp_state, and + * then it was overwrite by the thr->fp_state by tm_reclaim_thread(). + * + * At this point, ck{fp,vr}_state contains the exact values we want to + * recheckpoint. + */ /* Enable FP for the task: */ current->thread.load_fp = 1; - /* This loads and recheckpoints the FP registers from - * thread.fpr[]. They will remain in registers after the - * checkpoint so we don't need to reload them after. - * If VMX is in use, the VRs now hold checkpointed values, - * so we don't want to load the VRs from the thread_struct. + /* + * Recheckpoint all the checkpointed ckpt, ck{fp, vr}_state registers. */ tm_recheckpoint(¤t->thread); } From 5600fbe340331e2a25d1b277f9c190f5c9948038 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Mon, 20 Aug 2018 16:29:34 +0200 Subject: [PATCH 006/221] powerpc/pseries/mm: Introducing FW_FEATURE_BLOCK_REMOVE This feature tells if the hcall H_BLOCK_REMOVE is available. 
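A follow-up patch in this series keys the hash flush path off this bit, roughly as follows (sketch only; do_block_remove() is introduced by that later patch):

    if (firmware_has_feature(FW_FEATURE_BLOCK_REMOVE)) {
            /* batch up to 8 invalidations per H_BLOCK_REMOVE call */
            do_block_remove(number, batch, param);
            goto out;
    }
    /* otherwise fall through to the existing H_BULK_REMOVE loop */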
Cc: "Aneesh Kumar K.V" Cc: Nicholas Piggin Cc: Michael Ellerman Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Reviewed-by: Aneesh Kumar K.V Signed-off-by: Laurent Dufour Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/firmware.h | 3 ++- arch/powerpc/platforms/pseries/firmware.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h index 7a051bd21f87..2aca2655fe30 100644 --- a/arch/powerpc/include/asm/firmware.h +++ b/arch/powerpc/include/asm/firmware.h @@ -52,6 +52,7 @@ #define FW_FEATURE_PRRN ASM_CONST(0x0000000200000000) #define FW_FEATURE_DRMEM_V2 ASM_CONST(0x0000000400000000) #define FW_FEATURE_DRC_INFO ASM_CONST(0x0000000800000000) +#define FW_FEATURE_BLOCK_REMOVE ASM_CONST(0x0000001000000000) #ifndef __ASSEMBLY__ @@ -69,7 +70,7 @@ enum { FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY | FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN | FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 | - FW_FEATURE_DRC_INFO, + FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE, FW_FEATURE_PSERIES_ALWAYS = 0, FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL, FW_FEATURE_POWERNV_ALWAYS = 0, diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c index a3bbeb43689e..1624501386f4 100644 --- a/arch/powerpc/platforms/pseries/firmware.c +++ b/arch/powerpc/platforms/pseries/firmware.c @@ -65,6 +65,7 @@ hypertas_fw_features_table[] = { {FW_FEATURE_SET_MODE, "hcall-set-mode"}, {FW_FEATURE_BEST_ENERGY, "hcall-best-energy-1*"}, {FW_FEATURE_HPT_RESIZE, "hcall-hpt-resize"}, + {FW_FEATURE_BLOCK_REMOVE, "hcall-block-remove"}, }; /* Build up the firmware features bitmask using the contents of From 0effa488dc1ad082ceed44b5d38c29fd8b3f259e Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Mon, 20 Aug 2018 16:29:35 +0200 Subject: [PATCH 007/221] powerpc/pseries/mm: factorize PTE slot computation This part of code will be called also when dealing with H_BLOCK_REMOVE. Cc: "Aneesh Kumar K.V" Cc: Nicholas Piggin Cc: Michael Ellerman Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Reviewed-by: Aneesh Kumar K.V Signed-off-by: Laurent Dufour Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/lpar.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index d3992ced0782..ebc852e3607d 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -546,6 +546,24 @@ static int pSeries_lpar_hpte_removebolted(unsigned long ea, return 0; } + +static inline unsigned long compute_slot(real_pte_t pte, + unsigned long vpn, + unsigned long index, + unsigned long shift, + int ssize) +{ + unsigned long slot, hash, hidx; + + hash = hpt_hash(vpn, shift, ssize); + hidx = __rpte_to_hidx(pte, index); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + return slot; +} + /* * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie * lock. 
@@ -558,7 +576,7 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local) struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch); int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); unsigned long param[PLPAR_HCALL9_BUFSIZE]; - unsigned long hash, index, shift, hidx, slot; + unsigned long index, shift, slot; real_pte_t pte; int psize, ssize; @@ -572,12 +590,7 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local) vpn = batch->vpn[i]; pte = batch->pte[i]; pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { - hash = hpt_hash(vpn, shift, ssize); - hidx = __rpte_to_hidx(pte, index); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += hidx & _PTEIDX_GROUP_IX; + slot = compute_slot(pte, vpn, index, shift, ssize); if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) { /* * lpar doesn't use the passed actual page size From ba2dd8a26baa7e140555746d396e32952709c42d Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Mon, 20 Aug 2018 16:29:36 +0200 Subject: [PATCH 008/221] powerpc/pseries/mm: call H_BLOCK_REMOVE This hypervisor's call allows to remove up to 8 ptes with only call to tlbie. The virtual pages must be all within the same naturally aligned 8 pages virtual address block and have the same page and segment size encodings. Cc: "Aneesh Kumar K.V" Cc: Nicholas Piggin Cc: Michael Ellerman Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Signed-off-by: Laurent Dufour Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/hvcall.h | 1 + arch/powerpc/platforms/pseries/lpar.c | 214 +++++++++++++++++++++++++- 2 files changed, 207 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index a0b17f9f1ea4..c349d3960d63 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -278,6 +278,7 @@ #define H_COP 0x304 #define H_GET_MPP_X 0x314 #define H_SET_MODE 0x31C +#define H_BLOCK_REMOVE 0x328 #define H_CLEAR_HPT 0x358 #define H_REQUEST_VMC 0x360 #define H_RESIZE_HPT_PREPARE 0x36C diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index ebc852e3607d..0b5081085a44 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -417,6 +417,79 @@ static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn, BUG_ON(lpar_rc != H_SUCCESS); } + +/* + * As defined in the PAPR's section 14.5.4.1.8 + * The control mask doesn't include the returned reference and change bit from + * the processed PTE. + */ +#define HBLKR_AVPN 0x0100000000000000UL +#define HBLKR_CTRL_MASK 0xf800000000000000UL +#define HBLKR_CTRL_SUCCESS 0x8000000000000000UL +#define HBLKR_CTRL_ERRNOTFOUND 0x8800000000000000UL +#define HBLKR_CTRL_ERRBUSY 0xa000000000000000UL + +/** + * H_BLOCK_REMOVE caller. + * @idx should point to the latest @param entry set with a PTEX. + * If PTE cannot be processed because another CPUs has already locked that + * group, those entries are put back in @param starting at index 1. + * If entries has to be retried and @retry_busy is set to true, these entries + * are retried until success. If @retry_busy is set to false, the returned + * is the number of entries yet to process. 
+ */ +static unsigned long call_block_remove(unsigned long idx, unsigned long *param, + bool retry_busy) +{ + unsigned long i, rc, new_idx; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; + + if (idx < 2) { + pr_warn("Unexpected empty call to H_BLOCK_REMOVE"); + return 0; + } +again: + new_idx = 0; + if (idx > PLPAR_HCALL9_BUFSIZE) { + pr_err("Too many PTEs (%lu) for H_BLOCK_REMOVE", idx); + idx = PLPAR_HCALL9_BUFSIZE; + } else if (idx < PLPAR_HCALL9_BUFSIZE) + param[idx] = HBR_END; + + rc = plpar_hcall9(H_BLOCK_REMOVE, retbuf, + param[0], /* AVA */ + param[1], param[2], param[3], param[4], /* TS0-7 */ + param[5], param[6], param[7], param[8]); + if (rc == H_SUCCESS) + return 0; + + BUG_ON(rc != H_PARTIAL); + + /* Check that the unprocessed entries were 'not found' or 'busy' */ + for (i = 0; i < idx-1; i++) { + unsigned long ctrl = retbuf[i] & HBLKR_CTRL_MASK; + + if (ctrl == HBLKR_CTRL_ERRBUSY) { + param[++new_idx] = param[i+1]; + continue; + } + + BUG_ON(ctrl != HBLKR_CTRL_SUCCESS + && ctrl != HBLKR_CTRL_ERRNOTFOUND); + } + + /* + * If there were entries found busy, retry these entries if requested, + * of if all the entries have to be retried. + */ + if (new_idx && (retry_busy || new_idx == (PLPAR_HCALL9_BUFSIZE-1))) { + idx = new_idx + 1; + goto again; + } + + return new_idx; +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * Limit iterations holding pSeries_lpar_tlbie_lock to 3. We also need @@ -424,17 +497,57 @@ static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn, */ #define PPC64_HUGE_HPTE_BATCH 12 -static void __pSeries_lpar_hugepage_invalidate(unsigned long *slot, - unsigned long *vpn, int count, - int psize, int ssize) +static void hugepage_block_invalidate(unsigned long *slot, unsigned long *vpn, + int count, int psize, int ssize) +{ + unsigned long param[PLPAR_HCALL9_BUFSIZE]; + unsigned long shift, current_vpgb, vpgb; + int i, pix = 0; + + shift = mmu_psize_defs[psize].shift; + + for (i = 0; i < count; i++) { + /* + * Shifting 3 bits more on the right to get a + * 8 pages aligned virtual addresse. + */ + vpgb = (vpn[i] >> (shift - VPN_SHIFT + 3)); + if (!pix || vpgb != current_vpgb) { + /* + * Need to start a new 8 pages block, flush + * the current one if needed. + */ + if (pix) + (void)call_block_remove(pix, param, true); + current_vpgb = vpgb; + param[0] = hpte_encode_avpn(vpn[i], psize, ssize); + pix = 1; + } + + param[pix++] = HBR_REQUEST | HBLKR_AVPN | slot[i]; + if (pix == PLPAR_HCALL9_BUFSIZE) { + pix = call_block_remove(pix, param, false); + /* + * pix = 0 means that all the entries were + * removed, we can start a new block. + * Otherwise, this means that there are entries + * to retry, and pix points to latest one, so + * we should increment it and try to continue + * the same block. 
+ */ + if (pix) + pix++; + } + } + if (pix) + (void)call_block_remove(pix, param, true); +} + +static void hugepage_bulk_invalidate(unsigned long *slot, unsigned long *vpn, + int count, int psize, int ssize) { unsigned long param[PLPAR_HCALL9_BUFSIZE]; int i = 0, pix = 0, rc; - unsigned long flags = 0; - int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); - - if (lock_tlbie) - spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags); for (i = 0; i < count; i++) { @@ -462,6 +575,23 @@ static void __pSeries_lpar_hugepage_invalidate(unsigned long *slot, param[6], param[7]); BUG_ON(rc != H_SUCCESS); } +} + +static inline void __pSeries_lpar_hugepage_invalidate(unsigned long *slot, + unsigned long *vpn, + int count, int psize, + int ssize) +{ + unsigned long flags = 0; + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + + if (lock_tlbie) + spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags); + + if (firmware_has_feature(FW_FEATURE_BLOCK_REMOVE)) + hugepage_block_invalidate(slot, vpn, count, psize, ssize); + else + hugepage_bulk_invalidate(slot, vpn, count, psize, ssize); if (lock_tlbie) spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags); @@ -564,6 +694,68 @@ static inline unsigned long compute_slot(real_pte_t pte, return slot; } +/** + * The hcall H_BLOCK_REMOVE implies that the virtual pages to processed are + * "all within the same naturally aligned 8 page virtual address block". + */ +static void do_block_remove(unsigned long number, struct ppc64_tlb_batch *batch, + unsigned long *param) +{ + unsigned long vpn; + unsigned long i, pix = 0; + unsigned long index, shift, slot, current_vpgb, vpgb; + real_pte_t pte; + int psize, ssize; + + psize = batch->psize; + ssize = batch->ssize; + + for (i = 0; i < number; i++) { + vpn = batch->vpn[i]; + pte = batch->pte[i]; + pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { + /* + * Shifting 3 bits more on the right to get a + * 8 pages aligned virtual addresse. + */ + vpgb = (vpn >> (shift - VPN_SHIFT + 3)); + if (!pix || vpgb != current_vpgb) { + /* + * Need to start a new 8 pages block, flush + * the current one if needed. + */ + if (pix) + (void)call_block_remove(pix, param, + true); + current_vpgb = vpgb; + param[0] = hpte_encode_avpn(vpn, psize, + ssize); + pix = 1; + } + + slot = compute_slot(pte, vpn, index, shift, ssize); + param[pix++] = HBR_REQUEST | HBLKR_AVPN | slot; + + if (pix == PLPAR_HCALL9_BUFSIZE) { + pix = call_block_remove(pix, param, false); + /* + * pix = 0 means that all the entries were + * removed, we can start a new block. + * Otherwise, this means that there are entries + * to retry, and pix points to latest one, so + * we should increment it and try to continue + * the same block. + */ + if (pix) + pix++; + } + } pte_iterate_hashed_end(); + } + + if (pix) + (void)call_block_remove(pix, param, true); +} + /* * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie * lock. 
@@ -583,6 +775,11 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local) if (lock_tlbie) spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags); + if (firmware_has_feature(FW_FEATURE_BLOCK_REMOVE)) { + do_block_remove(number, batch, param); + goto out; + } + psize = batch->psize; ssize = batch->ssize; pix = 0; @@ -621,6 +818,7 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local) BUG_ON(rc != H_SUCCESS); } +out: if (lock_tlbie) spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags); } From be54c1216f6689a6eb504e3471d0cb41cc9d9809 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Tue, 21 Aug 2018 11:04:12 +1000 Subject: [PATCH 009/221] powerpc/64: Remove static branch hints from memset() Static branch hints override dynamic branch prediction on recent POWER CPUs. We should only use them when we are overwhelmingly sure of the direction. Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/lib/mem_64.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/lib/mem_64.S b/arch/powerpc/lib/mem_64.S index ec531de99996..3c3be02f33b7 100644 --- a/arch/powerpc/lib/mem_64.S +++ b/arch/powerpc/lib/mem_64.S @@ -40,7 +40,7 @@ _GLOBAL(memset) .Lms: PPC_MTOCRF(1,r0) mr r6,r3 blt cr1,8f - beq+ 3f /* if already 8-byte aligned */ + beq 3f /* if already 8-byte aligned */ subf r5,r0,r5 bf 31,1f stb r4,0(r6) @@ -85,7 +85,7 @@ _GLOBAL(memset) addi r6,r6,8 8: cmpwi r5,0 PPC_MTOCRF(1,r5) - beqlr+ + beqlr bf 29,9f stw r4,0(r6) addi r6,r6,4 From 3f7daf3d7582dc6628ac40a9045dd1bbd80c5f35 Mon Sep 17 00:00:00 2001 From: Rashmica Gupta Date: Fri, 17 Aug 2018 14:25:01 +1000 Subject: [PATCH 010/221] powerpc/memtrace: Remove memory in chunks When hot-removing memory release_mem_region_adjustable() splits iomem resources if they are not the exact size of the memory being hot-deleted. Adding this memory back to the kernel adds a new resource. Eg a node has memory 0x0 - 0xfffffffff. Hot-removing 1GB from 0xf40000000 results in the single resource 0x0-0xfffffffff being split into two resources: 0x0-0xf3fffffff and 0xf80000000-0xfffffffff. When we hot-add the memory back we now have three resources: 0x0-0xf3fffffff, 0xf40000000-0xf7fffffff, and 0xf80000000-0xfffffffff. This is an issue if we try to remove some memory that overlaps resources. Eg when trying to remove 2GB at address 0xf40000000, release_mem_region_adjustable() fails as it expects the chunk of memory to be within the boundaries of a single resource. We then get the warning: "Unable to release resource" and attempting to use memtrace again gives us this error: "bash: echo: write error: Resource temporarily unavailable" This patch makes memtrace remove memory in chunks that are always the same size from an address that is always equal to end_of_memory - n*size, for some n. So hotremoving and hotadding memory of different sizes will now not attempt to remove memory that spans multiple resources. 
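The core of the change, in sketch form (mirroring the hunk below):

    u64 bytes = memory_block_size_bytes();
    u64 pfn;

    /* Remove one memory-block-sized iomem resource at a time. */
    for (pfn = base_pfn; pfn < end_pfn; pfn += bytes >> PAGE_SHIFT)
            remove_memory(nid, pfn << PAGE_SHIFT, bytes);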
Signed-off-by: Rashmica Gupta Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/memtrace.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c index 51dc398ae3f7..a29fdf8a2e56 100644 --- a/arch/powerpc/platforms/powernv/memtrace.c +++ b/arch/powerpc/platforms/powernv/memtrace.c @@ -90,17 +90,15 @@ static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages) walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE, change_memblock_state); - lock_device_hotplug(); - remove_memory(nid, start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT); - unlock_device_hotplug(); return true; } static u64 memtrace_alloc_node(u32 nid, u64 size) { - u64 start_pfn, end_pfn, nr_pages; + u64 start_pfn, end_pfn, nr_pages, pfn; u64 base_pfn; + u64 bytes = memory_block_size_bytes(); if (!node_spanned_pages(nid)) return 0; @@ -113,8 +111,21 @@ static u64 memtrace_alloc_node(u32 nid, u64 size) end_pfn = round_down(end_pfn - nr_pages, nr_pages); for (base_pfn = end_pfn; base_pfn > start_pfn; base_pfn -= nr_pages) { - if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true) + if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true) { + /* + * Remove memory in memory block size chunks so that + * iomem resources are always split to the same size and + * we never try to remove memory that spans two iomem + * resources. + */ + lock_device_hotplug(); + end_pfn = base_pfn + nr_pages; + for (pfn = base_pfn; pfn < end_pfn; pfn += bytes>> PAGE_SHIFT) { + remove_memory(nid, pfn << PAGE_SHIFT, bytes); + } + unlock_device_hotplug(); return base_pfn << PAGE_SHIFT; + } } return 0; From 6f8e45f7eb1bee5efdbe4a9cfe4a45627403c5fb Mon Sep 17 00:00:00 2001 From: Christophe Lombard Date: Tue, 14 Aug 2018 14:45:15 +0200 Subject: [PATCH 011/221] ocxl: Fix access to the AFU Descriptor Data The AFU Information DVSEC capability is a means to extract common, general information about all of the AFUs associated with a Function independent of the specific functionality that each AFU provides. Write in the AFU Index field allows to access to the descriptor data for each AFU. With the current code, we are not able to access to these specific data when the index >= 1 because we are writing to the wrong location. All requests to the data of each AFU are pointing to those of the AFU 0, which could have impacts when using a card with more than one AFU per function. This patch fixes the access to the AFU Descriptor Data indexed by the AFU Info Index field. 
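In sketch form, the fix selects the AFU by writing the one-byte AFU Index field of the AFU Information DVSEC before reading the descriptor registers, instead of doing a 16-bit write at the DVSEC info position, which left AFU 0 selected:

    pci_write_config_byte(dev,
                          fn->dvsec_afu_info_pos + OCXL_DVSEC_AFU_INFO_AFU_IDX,
                          afu_idx);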
Fixes: 5ef3166e8a32 ("ocxl: Driver code for 'generic' opencapi devices") Cc: stable # 4.16 Signed-off-by: Christophe Lombard Acked-by: Frederic Barrat Acked-by: Andrew Donnellan Signed-off-by: Michael Ellerman --- drivers/misc/ocxl/config.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/misc/ocxl/config.c b/drivers/misc/ocxl/config.c index 2e30de9c694a..57a6bb1fd3c9 100644 --- a/drivers/misc/ocxl/config.c +++ b/drivers/misc/ocxl/config.c @@ -280,7 +280,9 @@ int ocxl_config_check_afu_index(struct pci_dev *dev, u32 val; int rc, templ_major, templ_minor, len; - pci_write_config_word(dev, fn->dvsec_afu_info_pos, afu_idx); + pci_write_config_byte(dev, + fn->dvsec_afu_info_pos + OCXL_DVSEC_AFU_INFO_AFU_IDX, + afu_idx); rc = read_afu_info(dev, fn, OCXL_DVSEC_TEMPL_VERSION, &val); if (rc) return rc; From 8ac9e5bfd8cf41ef106ac97267117e5209627c74 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 23 Aug 2018 20:26:39 -0300 Subject: [PATCH 012/221] powerpc/xive: Use xive_cpu->chip_id instead of looking it up again MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Function xive_native_get_ipi() might use chip_id without it being initialized, if the CPU node is not found, as reported by smatch: error: uninitialized symbol 'chip_id' As suggested by Cédric, we can use xc->chip_id instead of consulting the device tree for chip id, which is safe since xive_prepare_cpu() should have initialized ->chip_id by the time xive_native_get_ipi() is called. Signed-off-by: Breno Leitao Reviewed-by: Cédric Le Goater [mpe: Tweak change log] Signed-off-by: Michael Ellerman --- arch/powerpc/sysdev/xive/native.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 5b20a678d755..1ca127d052a6 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -238,20 +238,11 @@ static bool xive_native_match(struct device_node *node) #ifdef CONFIG_SMP static int xive_native_get_ipi(unsigned int cpu, struct xive_cpu *xc) { - struct device_node *np; - unsigned int chip_id; s64 irq; - /* Find the chip ID */ - np = of_get_cpu_node(cpu, NULL); - if (np) { - if (of_property_read_u32(np, "ibm,chip-id", &chip_id) < 0) - chip_id = 0; - } - /* Allocate an IPI and populate info about it */ for (;;) { - irq = opal_xive_allocate_irq(chip_id); + irq = opal_xive_allocate_irq(xc->chip_id); if (irq == OPAL_BUSY) { msleep(OPAL_BUSY_DELAY_MS); continue; From 984ecdd68de0fa1f63ce205d6c19ef5a7bc67b40 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 21 Aug 2018 15:44:48 -0300 Subject: [PATCH 013/221] powerpc/iommu: Avoid derefence before pointer check The tbl pointer is being derefenced by IOMMU_PAGE_SIZE prior the check if it is not NULL. Just moving the dereference code to after the check, where there will be guarantee that 'tbl' will not be NULL. 
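The shape of the fix (sketch): nothing derived from tbl is evaluated until after the NULL check:

    if (tbl) {
            npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE(tbl));
            /* ... rest of the mapping path ... */
    }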
Signed-off-by: Breno Leitao Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index af7a20dc6e09..80b6caaa9b92 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -785,9 +785,9 @@ dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl, vaddr = page_address(page) + offset; uaddr = (unsigned long)vaddr; - npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE(tbl)); if (tbl) { + npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE(tbl)); align = 0; if (tbl->it_page_shift < PAGE_SHIFT && size >= PAGE_SIZE && ((unsigned long)vaddr & ~PAGE_MASK) == 0) From 44d947eff19d64384efc06069509db7a0a1103b0 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 12 Sep 2018 17:31:05 -0300 Subject: [PATCH 014/221] selftests/powerpc: Do not fail with reschedule There are cases where the test is not expecting to have the transaction aborted, but, the test process might have been rescheduled, either in the OS level or by KVM (if it is running on a KVM guest machine). The process reschedule will cause a treclaim/recheckpoint which will cause the transaction to doom, aborting the transaction as soon as the process is rescheduled back to the CPU. This might cause the test to fail, but this is not a failure in essence. If that is the case, TEXASR[FC] is indicated with either TM_CAUSE_RESCHEDULE or TM_CAUSE_KVM_RESCHEDULE for KVM interruptions. In this scenario, ignore these two failures and avoid the whole test to return failure. Signed-off-by: Breno Leitao Reviewed-by: Gustavo Romero Signed-off-by: Michael Ellerman --- tools/testing/selftests/powerpc/tm/tm-unavailable.c | 9 ++++++--- tools/testing/selftests/powerpc/tm/tm.h | 9 +++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/powerpc/tm/tm-unavailable.c b/tools/testing/selftests/powerpc/tm/tm-unavailable.c index 156c8e750259..09894f4ff62e 100644 --- a/tools/testing/selftests/powerpc/tm/tm-unavailable.c +++ b/tools/testing/selftests/powerpc/tm/tm-unavailable.c @@ -236,7 +236,8 @@ void *tm_una_ping(void *input) } /* Check if we were not expecting a failure and a it occurred. */ - if (!expecting_failure() && is_failure(cr_)) { + if (!expecting_failure() && is_failure(cr_) && + !failure_is_reschedule()) { printf("\n\tUnexpected transaction failure 0x%02lx\n\t", failure_code()); return (void *) -1; @@ -244,9 +245,11 @@ void *tm_una_ping(void *input) /* * Check if TM failed due to the cause we were expecting. 0xda is a - * TM_CAUSE_FAC_UNAV cause, otherwise it's an unexpected cause. + * TM_CAUSE_FAC_UNAV cause, otherwise it's an unexpected cause, unless + * it was caused by a reschedule. 
*/ - if (is_failure(cr_) && !failure_is_unavailable()) { + if (is_failure(cr_) && !failure_is_unavailable() && + !failure_is_reschedule()) { printf("\n\tUnexpected failure cause 0x%02lx\n\t", failure_code()); return (void *) -1; diff --git a/tools/testing/selftests/powerpc/tm/tm.h b/tools/testing/selftests/powerpc/tm/tm.h index df4204247d45..5518b1d4ef8b 100644 --- a/tools/testing/selftests/powerpc/tm/tm.h +++ b/tools/testing/selftests/powerpc/tm/tm.h @@ -52,6 +52,15 @@ static inline bool failure_is_unavailable(void) return (failure_code() & TM_CAUSE_FAC_UNAV) == TM_CAUSE_FAC_UNAV; } +static inline bool failure_is_reschedule(void) +{ + if ((failure_code() & TM_CAUSE_RESCHED) == TM_CAUSE_RESCHED || + (failure_code() & TM_CAUSE_KVM_RESCHED) == TM_CAUSE_KVM_RESCHED) + return true; + + return false; +} + static inline bool failure_is_nesting(void) { return (__builtin_get_texasru() & 0x400000); From 04fce21c9db54695389200b50b0b7a7866232ba6 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Tue, 11 Sep 2018 19:56:52 +0530 Subject: [PATCH 015/221] powerpc/pseries: Define MCE error event section. On pseries, the machine check error details are part of RTAS extended event log passed under Machine check exception section. This patch adds the definition of rtas MCE event section and related helper functions. Signed-off-by: Mahesh Salgaonkar Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/rtas.h | 8 +++ arch/powerpc/platforms/pseries/ras.c | 95 ++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+) diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 71e393c46a49..adefa6493d29 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -185,6 +185,13 @@ static inline uint8_t rtas_error_disposition(const struct rtas_error_log *elog) return (elog->byte1 & 0x18) >> 3; } +static inline +void rtas_set_disposition_recovered(struct rtas_error_log *elog) +{ + elog->byte1 &= ~0x18; + elog->byte1 |= (RTAS_DISP_FULLY_RECOVERED << 3); +} + static inline uint8_t rtas_error_extended(const struct rtas_error_log *elog) { return (elog->byte1 & 0x04) >> 2; @@ -275,6 +282,7 @@ inline uint32_t rtas_ext_event_company_id(struct rtas_ext_event_log_v6 *ext_log) #define PSERIES_ELOG_SECT_ID_CALL_HOME (('C' << 8) | 'H') #define PSERIES_ELOG_SECT_ID_USER_DEF (('U' << 8) | 'D') #define PSERIES_ELOG_SECT_ID_HOTPLUG (('H' << 8) | 'P') +#define PSERIES_ELOG_SECT_ID_MCE (('M' << 8) | 'C') /* Vendor specific Platform Event Log Format, Version 6, section header */ struct pseries_errorlog { diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 851ce326874a..3500ad982706 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -50,6 +50,101 @@ static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id); static irqreturn_t ras_epow_interrupt(int irq, void *dev_id); static irqreturn_t ras_error_interrupt(int irq, void *dev_id); +/* RTAS pseries MCE errorlog section. */ +struct pseries_mc_errorlog { + __be32 fru_id; + __be32 proc_id; + u8 error_type; + /* + * sub_err_type (1 byte). Bit fields depends on error_type + * + * MSB0 + * | + * V + * 01234567 + * XXXXXXXX + * + * For error_type == MC_ERROR_TYPE_UE + * XXXXXXXX + * X 1: Permanent or Transient UE. + * X 1: Effective address provided. + * X 1: Logical address provided. + * XX 2: Reserved. + * XXX 3: Type of UE error. + * + * For error_type != MC_ERROR_TYPE_UE + * XXXXXXXX + * X 1: Effective address provided. 
+ * XXXXX 5: Reserved. + * XX 2: Type of SLB/ERAT/TLB error. + */ + u8 sub_err_type; + u8 reserved_1[6]; + __be64 effective_address; + __be64 logical_address; +} __packed; + +/* RTAS pseries MCE error types */ +#define MC_ERROR_TYPE_UE 0x00 +#define MC_ERROR_TYPE_SLB 0x01 +#define MC_ERROR_TYPE_ERAT 0x02 +#define MC_ERROR_TYPE_TLB 0x04 +#define MC_ERROR_TYPE_D_CACHE 0x05 +#define MC_ERROR_TYPE_I_CACHE 0x07 + +/* RTAS pseries MCE error sub types */ +#define MC_ERROR_UE_INDETERMINATE 0 +#define MC_ERROR_UE_IFETCH 1 +#define MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH 2 +#define MC_ERROR_UE_LOAD_STORE 3 +#define MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE 4 + +#define MC_ERROR_SLB_PARITY 0 +#define MC_ERROR_SLB_MULTIHIT 1 +#define MC_ERROR_SLB_INDETERMINATE 2 + +#define MC_ERROR_ERAT_PARITY 1 +#define MC_ERROR_ERAT_MULTIHIT 2 +#define MC_ERROR_ERAT_INDETERMINATE 3 + +#define MC_ERROR_TLB_PARITY 1 +#define MC_ERROR_TLB_MULTIHIT 2 +#define MC_ERROR_TLB_INDETERMINATE 3 + +static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) +{ + switch (mlog->error_type) { + case MC_ERROR_TYPE_UE: + return (mlog->sub_err_type & 0x07); + case MC_ERROR_TYPE_SLB: + case MC_ERROR_TYPE_ERAT: + case MC_ERROR_TYPE_TLB: + return (mlog->sub_err_type & 0x03); + default: + return 0; + } +} + +static +inline u64 rtas_mc_get_effective_addr(const struct pseries_mc_errorlog *mlog) +{ + __be64 addr = 0; + + switch (mlog->error_type) { + case MC_ERROR_TYPE_UE: + if (mlog->sub_err_type & 0x40) + addr = mlog->effective_address; + break; + case MC_ERROR_TYPE_SLB: + case MC_ERROR_TYPE_ERAT: + case MC_ERROR_TYPE_TLB: + if (mlog->sub_err_type & 0x80) + addr = mlog->effective_address; + default: + break; + } + return be64_to_cpu(addr); +} /* * Enable the hotplug interrupt late because processing them may touch other From a43c1590426c44a5c6bbaf51b70a36a5c6d86914 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Tue, 11 Sep 2018 19:57:00 +0530 Subject: [PATCH 016/221] powerpc/pseries: Flush SLB contents on SLB MCE errors. On pseries, as of today system crashes if we get a machine check exceptions due to SLB errors. These are soft errors and can be fixed by flushing the SLBs so the kernel can continue to function instead of system crash. We do this in real mode before turning on MMU. Otherwise we would run into nested machine checks. This patch now fetches the rtas error log in real mode and flushes the SLBs on SLB/ERAT errors. Signed-off-by: Mahesh Salgaonkar Signed-off-by: Michal Suchanek Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/machdep.h | 1 + arch/powerpc/include/asm/mce.h | 3 + arch/powerpc/kernel/exceptions-64s.S | 129 +++++++++++++++++++++++ arch/powerpc/kernel/mce.c | 9 +- arch/powerpc/kernel/mce_power.c | 2 +- arch/powerpc/platforms/powernv/opal.c | 2 + arch/powerpc/platforms/powernv/setup.c | 11 ++ arch/powerpc/platforms/pseries/pseries.h | 1 + arch/powerpc/platforms/pseries/ras.c | 60 ++++++++++- arch/powerpc/platforms/pseries/setup.c | 1 + 10 files changed, 213 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index a47de82fb8e2..b4831f1338db 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -108,6 +108,7 @@ struct machdep_calls { /* Early exception handlers called in realmode */ int (*hmi_exception_early)(struct pt_regs *regs); + long (*machine_check_early)(struct pt_regs *regs); /* Called during machine check exception to retrive fixup address. 
*/ bool (*mce_check_early_recovery)(struct pt_regs *regs); diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 3a1226e9b465..a8b8903e1844 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -210,4 +210,7 @@ extern void release_mce_event(void); extern void machine_check_queue_event(void); extern void machine_check_print_event_info(struct machine_check_event *evt, bool user_mode); +#ifdef CONFIG_PPC_BOOK3S_64 +void flush_and_reload_slb(void); +#endif /* CONFIG_PPC_BOOK3S_64 */ #endif /* __ASM_PPC64_MCE_H__ */ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index ea04dfb8c092..b36e11d73702 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -331,6 +331,9 @@ TRAMP_REAL_BEGIN(machine_check_pSeries) machine_check_fwnmi: SET_SCRATCH0(r13) /* save r13 */ EXCEPTION_PROLOG_0(PACA_EXMC) +BEGIN_FTR_SECTION + b machine_check_pSeries_early +END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) machine_check_pSeries_0: EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST_PR, 0x200) /* @@ -342,6 +345,103 @@ machine_check_pSeries_0: TRAMP_KVM_SKIP(PACA_EXMC, 0x200) +TRAMP_REAL_BEGIN(machine_check_pSeries_early) +BEGIN_FTR_SECTION + EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200) + mr r10,r1 /* Save r1 */ + lhz r11,PACA_IN_MCE(r13) + cmpwi r11,0 /* Are we in nested machine check */ + bne 0f /* Yes, we are. */ + /* First machine check entry */ + ld r1,PACAMCEMERGSP(r13) /* Use MC emergency stack */ +0: subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ + addi r11,r11,1 /* increment paca->in_mce */ + sth r11,PACA_IN_MCE(r13) + /* Limit nested MCE to level 4 to avoid stack overflow */ + cmpwi r11,MAX_MCE_DEPTH + bgt 1f /* Check if we hit limit of 4 */ + mfspr r11,SPRN_SRR0 /* Save SRR0 */ + mfspr r12,SPRN_SRR1 /* Save SRR1 */ + EXCEPTION_PROLOG_COMMON_1() + EXCEPTION_PROLOG_COMMON_2(PACA_EXMC) + EXCEPTION_PROLOG_COMMON_3(0x200) + addi r3,r1,STACK_FRAME_OVERHEAD + BRANCH_LINK_TO_FAR(machine_check_early) /* Function call ABI */ + ld r12,_MSR(r1) + andi. r11,r12,MSR_PR /* See if coming from user. */ + bne 2f /* continue in V mode if we are. */ + + /* + * At this point we are not sure about what context we come from. + * We may be in the middle of switching stack. r1 may not be valid. + * Hence stay on emergency stack, call machine_check_exception and + * return from the interrupt. + * But before that, check if this is an un-recoverable exception. + * If yes, then stay on emergency stack and panic. + */ + andi. r11,r12,MSR_RI + beq 1f + + /* + * Check if we have successfully handled/recovered from error, if not + * then stay on emergency stack and panic. + */ + cmpdi r3,0 /* see if we handled MCE successfully */ + beq 1f /* if !handled then panic */ + + /* Stay on emergency stack and return from interrupt. */ + LOAD_HANDLER(r10,mce_return) + mtspr SPRN_SRR0,r10 + ld r10,PACAKMSR(r13) + mtspr SPRN_SRR1,r10 + RFI_TO_KERNEL + b . + +1: LOAD_HANDLER(r10,unrecover_mce) + mtspr SPRN_SRR0,r10 + ld r10,PACAKMSR(r13) + /* + * We are going down. But there are chances that we might get hit by + * another MCE during panic path and we may run into unstable state + * with no way out. Hence, turn ME bit off while going down, so that + * when another MCE is hit during panic path, hypervisor will + * power cycle the lpar, instead of getting into MCE loop. + */ + li r3,MSR_ME + andc r10,r10,r3 /* Turn off MSR_ME */ + mtspr SPRN_SRR1,r10 + RFI_TO_KERNEL + b . 
+ + /* Move original SRR0 and SRR1 into the respective regs */ +2: ld r9,_MSR(r1) + mtspr SPRN_SRR1,r9 + ld r3,_NIP(r1) + mtspr SPRN_SRR0,r3 + ld r9,_CTR(r1) + mtctr r9 + ld r9,_XER(r1) + mtxer r9 + ld r9,_LINK(r1) + mtlr r9 + REST_GPR(0, r1) + REST_8GPRS(2, r1) + REST_GPR(10, r1) + ld r11,_CCR(r1) + mtcr r11 + /* Decrement paca->in_mce. */ + lhz r12,PACA_IN_MCE(r13) + subi r12,r12,1 + sth r12,PACA_IN_MCE(r13) + REST_GPR(11, r1) + REST_2GPRS(12, r1) + /* restore original r1. */ + ld r1,GPR1(r1) + SET_SCRATCH0(r13) /* save r13 */ + EXCEPTION_PROLOG_0(PACA_EXMC) + b machine_check_pSeries_0 +END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) + EXC_COMMON_BEGIN(machine_check_common) /* * Machine check is different because we use a different @@ -535,6 +635,35 @@ EXC_COMMON_BEGIN(unrecover_mce) bl unrecoverable_exception b 1b +EXC_COMMON_BEGIN(mce_return) + /* Invoke machine_check_exception to print MCE event and return. */ + addi r3,r1,STACK_FRAME_OVERHEAD + bl machine_check_exception + ld r9,_MSR(r1) + mtspr SPRN_SRR1,r9 + ld r3,_NIP(r1) + mtspr SPRN_SRR0,r3 + ld r9,_CTR(r1) + mtctr r9 + ld r9,_XER(r1) + mtxer r9 + ld r9,_LINK(r1) + mtlr r9 + REST_GPR(0, r1) + REST_8GPRS(2, r1) + REST_GPR(10, r1) + ld r11,_CCR(r1) + mtcr r11 + /* Decrement paca->in_mce. */ + lhz r12,PACA_IN_MCE(r13) + subi r12,r12,1 + sth r12,PACA_IN_MCE(r13) + REST_GPR(11, r1) + REST_2GPRS(12, r1) + /* restore original r1. */ + ld r1,GPR1(r1) + RFI_TO_KERNEL + b . EXC_REAL(data_access, 0x300, 0x80) EXC_VIRT(data_access, 0x4300, 0x80, 0x300) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index efdd16a79075..bd933a75f0bc 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -488,10 +488,11 @@ long machine_check_early(struct pt_regs *regs) { long handled = 0; - __this_cpu_inc(irq_stat.mce_exceptions); - - if (cur_cpu_spec && cur_cpu_spec->machine_check_early) - handled = cur_cpu_spec->machine_check_early(regs); + /* + * See if platform is capable of handling machine check. 
+ */ + if (ppc_md.machine_check_early) + handled = ppc_md.machine_check_early(regs); return handled; } diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index 3497c8329c1d..2016b58d564f 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -60,7 +60,7 @@ static unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) /* flush SLBs and reload */ #ifdef CONFIG_PPC_BOOK3S_64 -static void flush_and_reload_slb(void) +void flush_and_reload_slb(void) { /* Invalidate all SLBs */ slb_flush_all_realmode(); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 38fe4087484a..62c291e23dbe 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -578,6 +578,8 @@ int opal_machine_check(struct pt_regs *regs) { struct machine_check_event evt; + __this_cpu_inc(irq_stat.mce_exceptions); + if (!get_mce_event(&evt, MCE_EVENT_RELEASE)) return 0; diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index adddde023622..c9cbd11a442e 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -437,6 +437,16 @@ static unsigned long pnv_get_proc_freq(unsigned int cpu) return ret_freq; } +static long pnv_machine_check_early(struct pt_regs *regs) +{ + long handled = 0; + + if (cur_cpu_spec && cur_cpu_spec->machine_check_early) + handled = cur_cpu_spec->machine_check_early(regs); + + return handled; +} + define_machine(powernv) { .name = "PowerNV", .probe = pnv_probe, @@ -448,6 +458,7 @@ define_machine(powernv) { .machine_shutdown = pnv_shutdown, .power_save = NULL, .calibrate_decr = generic_calibrate_decr, + .machine_check_early = pnv_machine_check_early, #ifdef CONFIG_KEXEC_CORE .kexec_cpu_down = pnv_kexec_cpu_down, #endif diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index 60db2ee511fb..619f8f3fa173 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -24,6 +24,7 @@ struct pt_regs; extern int pSeries_system_reset_exception(struct pt_regs *regs); extern int pSeries_machine_check_exception(struct pt_regs *regs); +extern long pseries_machine_check_realmode(struct pt_regs *regs); #ifdef CONFIG_SMP extern void smp_init_pseries(void); diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 3500ad982706..0578c243ef01 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "pseries.h" @@ -522,6 +523,43 @@ int pSeries_system_reset_exception(struct pt_regs *regs) return 0; /* need to perform reset */ } +static int mce_handle_error(struct rtas_error_log *errp) +{ + struct pseries_errorlog *pseries_log; + struct pseries_mc_errorlog *mce_log; + int disposition = rtas_error_disposition(errp); + u8 error_type; + + if (!rtas_error_extended(errp)) + goto out; + + pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); + if (pseries_log == NULL) + goto out; + + mce_log = (struct pseries_mc_errorlog *)pseries_log->data; + error_type = mce_log->error_type; + +#ifdef CONFIG_PPC_BOOK3S_64 + if (disposition == RTAS_DISP_NOT_RECOVERED) { + switch (error_type) { + case MC_ERROR_TYPE_SLB: + case MC_ERROR_TYPE_ERAT: + /* Store the old slb content someplace. 
*/ + flush_and_reload_slb(); + disposition = RTAS_DISP_FULLY_RECOVERED; + rtas_set_disposition_recovered(errp); + break; + default: + break; + } + } +#endif + +out: + return disposition; +} + /* * Process MCE rtas errlog event. */ @@ -598,11 +636,31 @@ int pSeries_machine_check_exception(struct pt_regs *regs) struct rtas_error_log *errp; if (fwnmi_active) { - errp = fwnmi_get_errinfo(regs); fwnmi_release_errinfo(); + errp = fwnmi_get_errlog(); if (errp && recover_mce(regs, errp)) return 1; } return 0; } + +long pseries_machine_check_realmode(struct pt_regs *regs) +{ + struct rtas_error_log *errp; + int disposition; + + if (fwnmi_active) { + errp = fwnmi_get_errinfo(regs); + /* + * Call to fwnmi_release_errinfo() in real mode causes kernel + * to panic. Hence we will call it as soon as we go into + * virtual mode. + */ + disposition = mce_handle_error(errp); + if (disposition == RTAS_DISP_FULLY_RECOVERED) + return 1; + } + + return 0; +} diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index ba1791fd3234..e03f62a78649 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -1017,6 +1017,7 @@ define_machine(pseries) { .calibrate_decr = generic_calibrate_decr, .progress = rtas_progress, .system_reset_exception = pSeries_system_reset_exception, + .machine_check_early = pseries_machine_check_realmode, .machine_check_exception = pSeries_machine_check_exception, #ifdef CONFIG_KEXEC_CORE .machine_kexec = pSeries_machine_kexec, From 8f0b80561f217e3f379d7819a6c6b429bebdaea6 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Tue, 11 Sep 2018 19:57:07 +0530 Subject: [PATCH 017/221] powerpc/pseries: Display machine check error details. Extract the MCE error details from RTAS extended log and display it to console. With this patch you should now see mce logs like below: [ 142.371818] Severe Machine check interrupt [Recovered] [ 142.371822] NIP [d00000000ca301b8]: init_module+0x1b8/0x338 [bork_kernel] [ 142.371822] Initiator: CPU [ 142.371823] Error type: SLB [Multihit] [ 142.371824] Effective address: d00000000ca70000 Signed-off-by: Mahesh Salgaonkar Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/rtas.h | 5 + arch/powerpc/platforms/pseries/ras.c | 133 +++++++++++++++++++++++++++ 2 files changed, 138 insertions(+) diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index adefa6493d29..0183e9595acc 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -197,6 +197,11 @@ static inline uint8_t rtas_error_extended(const struct rtas_error_log *elog) return (elog->byte1 & 0x04) >> 2; } +static inline uint8_t rtas_error_initiator(const struct rtas_error_log *elog) +{ + return (elog->byte2 & 0xf0) >> 4; +} + #define rtas_error_type(x) ((x)->byte3) static inline diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 0578c243ef01..49e83c954d2c 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -523,6 +523,136 @@ int pSeries_system_reset_exception(struct pt_regs *regs) return 0; /* need to perform reset */ } +#define VAL_TO_STRING(ar, val) \ + (((val) < ARRAY_SIZE(ar)) ? 
ar[(val)] : "Unknown") + +static void pseries_print_mce_info(struct pt_regs *regs, + struct rtas_error_log *errp) +{ + const char *level, *sevstr; + struct pseries_errorlog *pseries_log; + struct pseries_mc_errorlog *mce_log; + u8 error_type, err_sub_type; + u64 addr; + u8 initiator = rtas_error_initiator(errp); + int disposition = rtas_error_disposition(errp); + + static const char * const initiators[] = { + "Unknown", + "CPU", + "PCI", + "ISA", + "Memory", + "Power Mgmt", + }; + static const char * const mc_err_types[] = { + "UE", + "SLB", + "ERAT", + "TLB", + "D-Cache", + "Unknown", + "I-Cache", + }; + static const char * const mc_ue_types[] = { + "Indeterminate", + "Instruction fetch", + "Page table walk ifetch", + "Load/Store", + "Page table walk Load/Store", + }; + + /* SLB sub errors valid values are 0x0, 0x1, 0x2 */ + static const char * const mc_slb_types[] = { + "Parity", + "Multihit", + "Indeterminate", + }; + + /* TLB and ERAT sub errors valid values are 0x1, 0x2, 0x3 */ + static const char * const mc_soft_types[] = { + "Unknown", + "Parity", + "Multihit", + "Indeterminate", + }; + + if (!rtas_error_extended(errp)) { + pr_err("Machine check interrupt: Missing extended error log\n"); + return; + } + + pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); + if (pseries_log == NULL) + return; + + mce_log = (struct pseries_mc_errorlog *)pseries_log->data; + + error_type = mce_log->error_type; + err_sub_type = rtas_mc_error_sub_type(mce_log); + + switch (rtas_error_severity(errp)) { + case RTAS_SEVERITY_NO_ERROR: + level = KERN_INFO; + sevstr = "Harmless"; + break; + case RTAS_SEVERITY_WARNING: + level = KERN_WARNING; + sevstr = ""; + break; + case RTAS_SEVERITY_ERROR: + case RTAS_SEVERITY_ERROR_SYNC: + level = KERN_ERR; + sevstr = "Severe"; + break; + case RTAS_SEVERITY_FATAL: + default: + level = KERN_ERR; + sevstr = "Fatal"; + break; + } + + printk("%s%s Machine check interrupt [%s]\n", level, sevstr, + disposition == RTAS_DISP_FULLY_RECOVERED ? 
+ "Recovered" : "Not recovered"); + if (user_mode(regs)) { + printk("%s NIP: [%016lx] PID: %d Comm: %s\n", level, + regs->nip, current->pid, current->comm); + } else { + printk("%s NIP [%016lx]: %pS\n", level, regs->nip, + (void *)regs->nip); + } + printk("%s Initiator: %s\n", level, + VAL_TO_STRING(initiators, initiator)); + + switch (error_type) { + case MC_ERROR_TYPE_UE: + printk("%s Error type: %s [%s]\n", level, + VAL_TO_STRING(mc_err_types, error_type), + VAL_TO_STRING(mc_ue_types, err_sub_type)); + break; + case MC_ERROR_TYPE_SLB: + printk("%s Error type: %s [%s]\n", level, + VAL_TO_STRING(mc_err_types, error_type), + VAL_TO_STRING(mc_slb_types, err_sub_type)); + break; + case MC_ERROR_TYPE_ERAT: + case MC_ERROR_TYPE_TLB: + printk("%s Error type: %s [%s]\n", level, + VAL_TO_STRING(mc_err_types, error_type), + VAL_TO_STRING(mc_soft_types, err_sub_type)); + break; + default: + printk("%s Error type: %s\n", level, + VAL_TO_STRING(mc_err_types, error_type)); + break; + } + + addr = rtas_mc_get_effective_addr(mce_log); + if (addr) + printk("%s Effective address: %016llx\n", level, addr); +} + static int mce_handle_error(struct rtas_error_log *errp) { struct pseries_errorlog *pseries_log; @@ -585,8 +715,11 @@ static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) int recovered = 0; int disposition = rtas_error_disposition(err); + pseries_print_mce_info(regs, err); + if (!(regs->msr & MSR_RI)) { /* If MSR_RI isn't set, we cannot recover */ + pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n"); recovered = 0; } else if (disposition == RTAS_DISP_FULLY_RECOVERED) { From c6d15258cdf1c197cad7b11b9848e79068dd21e0 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Tue, 11 Sep 2018 19:57:15 +0530 Subject: [PATCH 018/221] powerpc/pseries: Dump the SLB contents on SLB MCE errors. If we get a machine check exceptions due to SLB errors then dump the current SLB contents which will be very much helpful in debugging the root cause of SLB errors. Introduce an exclusive buffer per cpu to hold faulty SLB entries. In real mode mce handler saves the old SLB contents into this buffer accessible through paca and print it out later in virtual mode. 
With this patch the console will log SLB contents like below on SLB MCE errors: [ 507.297236] SLB contents of cpu 0x1 [ 507.297237] Last SLB entry inserted at slot 16 [ 507.297238] 00 c000000008000000 400ea1b217000500 [ 507.297239] 1T ESID= c00000 VSID= ea1b217 LLP:100 [ 507.297240] 01 d000000008000000 400d43642f000510 [ 507.297242] 1T ESID= d00000 VSID= d43642f LLP:110 [ 507.297243] 11 f000000008000000 400a86c85f000500 [ 507.297244] 1T ESID= f00000 VSID= a86c85f LLP:100 [ 507.297245] 12 00007f0008000000 4008119624000d90 [ 507.297246] 1T ESID= 7f VSID= 8119624 LLP:110 [ 507.297247] 13 0000000018000000 00092885f5150d90 [ 507.297247] 256M ESID= 1 VSID= 92885f5150 LLP:110 [ 507.297248] 14 0000010008000000 4009e7cb50000d90 [ 507.297249] 1T ESID= 1 VSID= 9e7cb50 LLP:110 [ 507.297250] 15 d000000008000000 400d43642f000510 [ 507.297251] 1T ESID= d00000 VSID= d43642f LLP:110 [ 507.297252] 16 d000000008000000 400d43642f000510 [ 507.297253] 1T ESID= d00000 VSID= d43642f LLP:110 [ 507.297253] ---------------------------------- [ 507.297254] SLB cache ptr value = 3 [ 507.297254] Valid SLB cache entries: [ 507.297255] 00 EA[0-35]= 7f000 [ 507.297256] 01 EA[0-35]= 1 [ 507.297257] 02 EA[0-35]= 1000 [ 507.297257] Rest of SLB cache entries: [ 507.297258] 03 EA[0-35]= 7f000 [ 507.297258] 04 EA[0-35]= 1 [ 507.297259] 05 EA[0-35]= 1000 [ 507.297260] 06 EA[0-35]= 12 [ 507.297260] 07 EA[0-35]= 7f000 Suggested-by: Aneesh Kumar K.V Suggested-by: Michael Ellerman Signed-off-by: Mahesh Salgaonkar Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 7 ++ arch/powerpc/include/asm/paca.h | 6 ++ arch/powerpc/mm/slb.c | 70 +++++++++++++++++++ arch/powerpc/platforms/pseries/ras.c | 17 ++++- arch/powerpc/platforms/pseries/setup.c | 13 ++++ 5 files changed, 112 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index b3520b549cba..e577ccffe301 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -495,11 +495,18 @@ static inline void hpte_init_pseries(void) { } extern void hpte_init_native(void); +struct slb_entry { + u64 esid; + u64 vsid; +}; + extern void slb_initialize(void); extern void slb_flush_and_rebolt(void); void slb_flush_all_realmode(void); void __slb_restore_bolted_realmode(void); void slb_restore_bolted_realmode(void); +void slb_save_contents(struct slb_entry *slb_ptr); +void slb_dump_contents(struct slb_entry *slb_ptr); extern void slb_vmalloc_update(void); extern void slb_set_size(u16 size); diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index ad4f16164619..7b6e23af3808 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -250,6 +250,12 @@ struct paca_struct { #ifdef CONFIG_PPC_PSERIES u8 *mce_data_buf; /* buffer to hold per cpu rtas errlog */ #endif /* CONFIG_PPC_PSERIES */ + +#ifdef CONFIG_PPC_BOOK3S_64 + /* Capture SLB related old contents in MCE handler. 
*/ + struct slb_entry *mce_faulty_slbs; + u16 slb_save_cache_ptr; +#endif /* CONFIG_PPC_BOOK3S_64 */ } ____cacheline_aligned; extern void copy_mm_to_paca(struct mm_struct *mm); diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 9f574e59d178..e941189d9bd6 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -184,6 +184,76 @@ void slb_flush_and_rebolt(void) get_paca()->slb_cache_ptr = 0; } +void slb_save_contents(struct slb_entry *slb_ptr) +{ + int i; + unsigned long e, v; + + /* Save slb_cache_ptr value. */ + get_paca()->slb_save_cache_ptr = get_paca()->slb_cache_ptr; + + if (!slb_ptr) + return; + + for (i = 0; i < mmu_slb_size; i++) { + asm volatile("slbmfee %0,%1" : "=r" (e) : "r" (i)); + asm volatile("slbmfev %0,%1" : "=r" (v) : "r" (i)); + slb_ptr->esid = e; + slb_ptr->vsid = v; + slb_ptr++; + } +} + +void slb_dump_contents(struct slb_entry *slb_ptr) +{ + int i, n; + unsigned long e, v; + unsigned long llp; + + if (!slb_ptr) + return; + + pr_err("SLB contents of cpu 0x%x\n", smp_processor_id()); + pr_err("Last SLB entry inserted at slot %lld\n", get_paca()->stab_rr); + + for (i = 0; i < mmu_slb_size; i++) { + e = slb_ptr->esid; + v = slb_ptr->vsid; + slb_ptr++; + + if (!e && !v) + continue; + + pr_err("%02d %016lx %016lx\n", i, e, v); + + if (!(e & SLB_ESID_V)) { + pr_err("\n"); + continue; + } + llp = v & SLB_VSID_LLP; + if (v & SLB_VSID_B_1T) { + pr_err(" 1T ESID=%9lx VSID=%13lx LLP:%3lx\n", + GET_ESID_1T(e), + (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T, llp); + } else { + pr_err(" 256M ESID=%9lx VSID=%13lx LLP:%3lx\n", + GET_ESID(e), + (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT, llp); + } + } + pr_err("----------------------------------\n"); + + /* Dump slb cache entires as well. */ + pr_err("SLB cache ptr value = %d\n", get_paca()->slb_save_cache_ptr); + pr_err("Valid SLB cache entries:\n"); + n = min_t(int, get_paca()->slb_save_cache_ptr, SLB_CACHE_ENTRIES); + for (i = 0; i < n; i++) + pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]); + pr_err("Rest of SLB cache entries:\n"); + for (i = n; i < SLB_CACHE_ENTRIES; i++) + pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]); +} + void slb_vmalloc_update(void) { unsigned long vflags; diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 49e83c954d2c..8d2ead2d7591 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -612,6 +612,12 @@ static void pseries_print_mce_info(struct pt_regs *regs, break; } +#ifdef CONFIG_PPC_BOOK3S_64 + /* Display faulty slb contents for SLB errors. */ + if (error_type == MC_ERROR_TYPE_SLB) + slb_dump_contents(local_paca->mce_faulty_slbs); +#endif + printk("%s%s Machine check interrupt [%s]\n", level, sevstr, disposition == RTAS_DISP_FULLY_RECOVERED ? "Recovered" : "Not recovered"); @@ -675,7 +681,16 @@ static int mce_handle_error(struct rtas_error_log *errp) switch (error_type) { case MC_ERROR_TYPE_SLB: case MC_ERROR_TYPE_ERAT: - /* Store the old slb content someplace. */ + /* + * Store the old slb content in paca before flushing. + * Print this when we go to virtual mode. + * There are chances that we may hit MCE again if there + * is a parity error on the SLB entry we trying to read + * for saving. Hence limit the slb saving to single + * level of recursion. 
+ */ + if (local_paca->in_mce == 1) + slb_save_contents(local_paca->mce_faulty_slbs); flush_and_reload_slb(); disposition = RTAS_DISP_FULLY_RECOVERED; rtas_set_disposition_recovered(errp); diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index e03f62a78649..0f553dcfa548 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -107,6 +107,10 @@ static void __init fwnmi_init(void) u8 *mce_data_buf; unsigned int i; int nr_cpus = num_possible_cpus(); +#ifdef CONFIG_PPC_BOOK3S_64 + struct slb_entry *slb_ptr; + size_t size; +#endif int ibm_nmi_register = rtas_token("ibm,nmi-register"); if (ibm_nmi_register == RTAS_UNKNOWN_SERVICE) @@ -132,6 +136,15 @@ static void __init fwnmi_init(void) paca_ptrs[i]->mce_data_buf = mce_data_buf + (RTAS_ERROR_LOG_MAX * i); } + +#ifdef CONFIG_PPC_BOOK3S_64 + /* Allocate per cpu slb area to save old slb contents during MCE */ + size = sizeof(struct slb_entry) * mmu_slb_size * nr_cpus; + slb_ptr = __va(memblock_alloc_base(size, sizeof(struct slb_entry), + ppc64_rma_size)); + for_each_possible_cpu(i) + paca_ptrs[i]->mce_faulty_slbs = slb_ptr + (mmu_slb_size * i); +#endif } static void pseries_8259_cascade(struct irq_desc *desc) From db7d31ac04133fc18893725d348fabf91d6e808e Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Tue, 11 Sep 2018 19:57:23 +0530 Subject: [PATCH 019/221] powernv/pseries: consolidate code for mce early handling. Now that other platforms also implements real mode mce handler, lets consolidate the code by sharing existing powernv machine check early code. Rename machine_check_powernv_early to machine_check_common_early and reuse the code. Signed-off-by: Mahesh Salgaonkar Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 155 +++++---------------------- 1 file changed, 28 insertions(+), 127 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index b36e11d73702..301a6a86a20f 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -244,14 +244,13 @@ EXC_REAL_BEGIN(machine_check, 0x200, 0x100) SET_SCRATCH0(r13) /* save r13 */ EXCEPTION_PROLOG_0(PACA_EXMC) BEGIN_FTR_SECTION - b machine_check_powernv_early + b machine_check_common_early FTR_SECTION_ELSE b machine_check_pSeries_0 ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) EXC_REAL_END(machine_check, 0x200, 0x100) EXC_VIRT_NONE(0x4200, 0x100) -TRAMP_REAL_BEGIN(machine_check_powernv_early) -BEGIN_FTR_SECTION +TRAMP_REAL_BEGIN(machine_check_common_early) EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200) /* * Register contents: @@ -305,7 +304,9 @@ BEGIN_FTR_SECTION /* Save r9 through r13 from EXMC save area to stack frame. */ EXCEPTION_PROLOG_COMMON_2(PACA_EXMC) mfmsr r11 /* get MSR value */ +BEGIN_FTR_SECTION ori r11,r11,MSR_ME /* turn on ME bit */ +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) ori r11,r11,MSR_RI /* turn on RI bit */ LOAD_HANDLER(r12, machine_check_handle_early) 1: mtspr SPRN_SRR0,r12 @@ -324,7 +325,6 @@ BEGIN_FTR_SECTION andc r11,r11,r10 /* Turn off MSR_ME */ b 1b b . 
/* prevent speculative execution */ -END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) TRAMP_REAL_BEGIN(machine_check_pSeries) .globl machine_check_fwnmi @@ -332,7 +332,7 @@ machine_check_fwnmi: SET_SCRATCH0(r13) /* save r13 */ EXCEPTION_PROLOG_0(PACA_EXMC) BEGIN_FTR_SECTION - b machine_check_pSeries_early + b machine_check_common_early END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) machine_check_pSeries_0: EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST_PR, 0x200) @@ -345,103 +345,6 @@ machine_check_pSeries_0: TRAMP_KVM_SKIP(PACA_EXMC, 0x200) -TRAMP_REAL_BEGIN(machine_check_pSeries_early) -BEGIN_FTR_SECTION - EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200) - mr r10,r1 /* Save r1 */ - lhz r11,PACA_IN_MCE(r13) - cmpwi r11,0 /* Are we in nested machine check */ - bne 0f /* Yes, we are. */ - /* First machine check entry */ - ld r1,PACAMCEMERGSP(r13) /* Use MC emergency stack */ -0: subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ - addi r11,r11,1 /* increment paca->in_mce */ - sth r11,PACA_IN_MCE(r13) - /* Limit nested MCE to level 4 to avoid stack overflow */ - cmpwi r11,MAX_MCE_DEPTH - bgt 1f /* Check if we hit limit of 4 */ - mfspr r11,SPRN_SRR0 /* Save SRR0 */ - mfspr r12,SPRN_SRR1 /* Save SRR1 */ - EXCEPTION_PROLOG_COMMON_1() - EXCEPTION_PROLOG_COMMON_2(PACA_EXMC) - EXCEPTION_PROLOG_COMMON_3(0x200) - addi r3,r1,STACK_FRAME_OVERHEAD - BRANCH_LINK_TO_FAR(machine_check_early) /* Function call ABI */ - ld r12,_MSR(r1) - andi. r11,r12,MSR_PR /* See if coming from user. */ - bne 2f /* continue in V mode if we are. */ - - /* - * At this point we are not sure about what context we come from. - * We may be in the middle of switching stack. r1 may not be valid. - * Hence stay on emergency stack, call machine_check_exception and - * return from the interrupt. - * But before that, check if this is an un-recoverable exception. - * If yes, then stay on emergency stack and panic. - */ - andi. r11,r12,MSR_RI - beq 1f - - /* - * Check if we have successfully handled/recovered from error, if not - * then stay on emergency stack and panic. - */ - cmpdi r3,0 /* see if we handled MCE successfully */ - beq 1f /* if !handled then panic */ - - /* Stay on emergency stack and return from interrupt. */ - LOAD_HANDLER(r10,mce_return) - mtspr SPRN_SRR0,r10 - ld r10,PACAKMSR(r13) - mtspr SPRN_SRR1,r10 - RFI_TO_KERNEL - b . - -1: LOAD_HANDLER(r10,unrecover_mce) - mtspr SPRN_SRR0,r10 - ld r10,PACAKMSR(r13) - /* - * We are going down. But there are chances that we might get hit by - * another MCE during panic path and we may run into unstable state - * with no way out. Hence, turn ME bit off while going down, so that - * when another MCE is hit during panic path, hypervisor will - * power cycle the lpar, instead of getting into MCE loop. - */ - li r3,MSR_ME - andc r10,r10,r3 /* Turn off MSR_ME */ - mtspr SPRN_SRR1,r10 - RFI_TO_KERNEL - b . - - /* Move original SRR0 and SRR1 into the respective regs */ -2: ld r9,_MSR(r1) - mtspr SPRN_SRR1,r9 - ld r3,_NIP(r1) - mtspr SPRN_SRR0,r3 - ld r9,_CTR(r1) - mtctr r9 - ld r9,_XER(r1) - mtxer r9 - ld r9,_LINK(r1) - mtlr r9 - REST_GPR(0, r1) - REST_8GPRS(2, r1) - REST_GPR(10, r1) - ld r11,_CCR(r1) - mtcr r11 - /* Decrement paca->in_mce. */ - lhz r12,PACA_IN_MCE(r13) - subi r12,r12,1 - sth r12,PACA_IN_MCE(r13) - REST_GPR(11, r1) - REST_2GPRS(12, r1) - /* restore original r1. 
*/ - ld r1,GPR1(r1) - SET_SCRATCH0(r13) /* save r13 */ - EXCEPTION_PROLOG_0(PACA_EXMC) - b machine_check_pSeries_0 -END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) - EXC_COMMON_BEGIN(machine_check_common) /* * Machine check is different because we use a different @@ -540,6 +443,9 @@ EXC_COMMON_BEGIN(machine_check_handle_early) bl machine_check_early std r3,RESULT(r1) /* Save result */ ld r12,_MSR(r1) +BEGIN_FTR_SECTION + b 4f +END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) #ifdef CONFIG_PPC_P7_NAP /* @@ -563,11 +469,12 @@ EXC_COMMON_BEGIN(machine_check_handle_early) */ rldicl. r11,r12,4,63 /* See if MC hit while in HV mode. */ beq 5f - andi. r11,r12,MSR_PR /* See if coming from user. */ +4: andi. r11,r12,MSR_PR /* See if coming from user. */ bne 9f /* continue in V mode if we are. */ 5: #ifdef CONFIG_KVM_BOOK3S_64_HANDLER +BEGIN_FTR_SECTION /* * We are coming from kernel context. Check if we are coming from * guest. if yes, then we can continue. We will fall through @@ -576,6 +483,7 @@ EXC_COMMON_BEGIN(machine_check_handle_early) lbz r11,HSTATE_IN_GUEST(r13) cmpwi r11,0 /* Check if coming from guest */ bne 9f /* continue if we are. */ +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) #endif /* * At this point we are not sure about what context we come from. @@ -610,6 +518,7 @@ EXC_COMMON_BEGIN(machine_check_handle_early) cmpdi r3,0 /* see if we handled MCE successfully */ beq 1b /* if !handled then panic */ +BEGIN_FTR_SECTION /* * Return from MC interrupt. * Queue up the MCE event so that we can log it later, while @@ -618,10 +527,24 @@ EXC_COMMON_BEGIN(machine_check_handle_early) bl machine_check_queue_event MACHINE_CHECK_HANDLER_WINDUP RFI_TO_USER_OR_KERNEL +FTR_SECTION_ELSE + /* + * pSeries: Return from MC interrupt. Before that stay on emergency + * stack and call machine_check_exception to log the MCE event. + */ + LOAD_HANDLER(r10,mce_return) + mtspr SPRN_SRR0,r10 + ld r10,PACAKMSR(r13) + mtspr SPRN_SRR1,r10 + RFI_TO_KERNEL + b . +ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) 9: /* Deliver the machine check to host kernel in V mode. */ MACHINE_CHECK_HANDLER_WINDUP - b machine_check_pSeries + SET_SCRATCH0(r13) /* save r13 */ + EXCEPTION_PROLOG_0(PACA_EXMC) + b machine_check_pSeries_0 EXC_COMMON_BEGIN(unrecover_mce) /* Invoke machine_check_exception to print MCE event and panic. */ @@ -639,29 +562,7 @@ EXC_COMMON_BEGIN(mce_return) /* Invoke machine_check_exception to print MCE event and return. */ addi r3,r1,STACK_FRAME_OVERHEAD bl machine_check_exception - ld r9,_MSR(r1) - mtspr SPRN_SRR1,r9 - ld r3,_NIP(r1) - mtspr SPRN_SRR0,r3 - ld r9,_CTR(r1) - mtctr r9 - ld r9,_XER(r1) - mtxer r9 - ld r9,_LINK(r1) - mtlr r9 - REST_GPR(0, r1) - REST_8GPRS(2, r1) - REST_GPR(10, r1) - ld r11,_CCR(r1) - mtcr r11 - /* Decrement paca->in_mce. */ - lhz r12,PACA_IN_MCE(r13) - subi r12,r12,1 - sth r12,PACA_IN_MCE(r13) - REST_GPR(11, r1) - REST_2GPRS(12, r1) - /* restore original r1. */ - ld r1,GPR1(r1) + MACHINE_CHECK_HANDLER_WINDUP RFI_TO_KERNEL b . From 09b4438db13fa83b6219aee5993711a2aa2a0c64 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 15 Sep 2018 01:30:45 +1000 Subject: [PATCH 020/221] powerpc/64s/hash: Fix stab_rr off by one initialization This causes SLB allocation to start 1 beyond the start of the SLB. There is no real problem because after it wraps it starts behaving properly, it's just surprising to see when looking at SLB traces.
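For reference, the round-robin allocator that consumes stab_rr increments the index before using it, so initialising stab_rr to SLB_NUM_BOLTED - 1 makes the very first allocation land on slot SLB_NUM_BOLTED. A sketch of that logic in C (abridged; not part of this patch):

	index = get_paca()->stab_rr;
	if (index < (mmu_slb_size - 1))
		index++;
	else
		index = SLB_NUM_BOLTED;		/* wrap back to the first non-bolted slot */
	get_paca()->stab_rr = index;		/* this slot receives the new SLBE */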
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/mm/slb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index e941189d9bd6..07ece013856b 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -425,7 +425,7 @@ void slb_initialize(void) #endif } - get_paca()->stab_rr = SLB_NUM_BOLTED; + get_paca()->stab_rr = SLB_NUM_BOLTED - 1; lflags = SLB_VSID_KERNEL | linear_llp; vflags = SLB_VSID_KERNEL | vmalloc_llp; From 505ea82eabd2dfc69ca7a50c2996dbe969a89bec Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 15 Sep 2018 01:30:46 +1000 Subject: [PATCH 021/221] powerpc/64s/hash: avoid the POWER5 < DD2.1 slb invalidate workaround on POWER8/9 I only have POWER8/9 to test, so just remove it for those. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_64.S | 2 ++ arch/powerpc/mm/slb.c | 8 +++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 2206912ea4f0..77a888bfcb53 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -672,7 +672,9 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) isync slbie r6 +BEGIN_FTR_SECTION slbie r6 /* Workaround POWER5 < DD2.1 issue */ +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) slbmte r7,r0 isync 2: diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 07ece013856b..0656d20d59ec 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -326,9 +326,11 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) __slb_flush_and_rebolt(); } - /* Workaround POWER5 < DD2.1 issue */ - if (offset == 1 || offset > SLB_CACHE_ENTRIES) - asm volatile("slbie %0" : : "r" (slbie_data)); + if (!cpu_has_feature(CPU_FTR_ARCH_207S)) { + /* Workaround POWER5 < DD2.1 issue */ + if (offset == 1 || offset > SLB_CACHE_ENTRIES) + asm volatile("slbie %0" : : "r" (slbie_data)); + } get_paca()->slb_cache_ptr = 0; copy_mm_to_paca(mm); From 8b92887ced2e3fce223412487f99d4ef3f07b490 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 15 Sep 2018 01:30:47 +1000 Subject: [PATCH 022/221] powerpc/64s/hash: move POWER5 < DD2.1 slbie workaround where it is needed The POWER5 < DD2.1 issue is that slbie needs to be issued more than once. 
It came in with this change: ChangeSet@1.1608, 2004-04-29 07:12:31-07:00, david@gibson.dropbear.id.au [PATCH] POWER5 erratum workaround Early POWER5 revisions ( [mpe: Retain slbie_data initialisation to avoid compiler warning] Signed-off-by: Michael Ellerman --- arch/powerpc/mm/slb.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 0656d20d59ec..a9c66feb3c43 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -296,7 +296,6 @@ static inline int esids_match(unsigned long addr1, unsigned long addr2) void switch_slb(struct task_struct *tsk, struct mm_struct *mm) { unsigned long offset; - unsigned long slbie_data = 0; unsigned long pc = KSTK_EIP(tsk); unsigned long stack = KSTK_ESP(tsk); unsigned long exec_base; @@ -311,7 +310,9 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) offset = get_paca()->slb_cache_ptr; if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) && offset <= SLB_CACHE_ENTRIES) { + unsigned long slbie_data = 0; int i; + asm volatile("isync" : : : "memory"); for (i = 0; i < offset; i++) { slbie_data = (unsigned long)get_paca()->slb_cache[i] @@ -321,17 +322,16 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) slbie_data |= SLBIE_C; /* C set for user addresses */ asm volatile("slbie %0" : : "r" (slbie_data)); } + + /* Workaround POWER5 < DD2.1 issue */ + if (!cpu_has_feature(CPU_FTR_ARCH_207S) && offset == 1) + asm volatile("slbie %0" : : "r" (slbie_data)); + asm volatile("isync" : : : "memory"); } else { __slb_flush_and_rebolt(); } - if (!cpu_has_feature(CPU_FTR_ARCH_207S)) { - /* Workaround POWER5 < DD2.1 issue */ - if (offset == 1 || offset > SLB_CACHE_ENTRIES) - asm volatile("slbie %0" : : "r" (slbie_data)); - } - get_paca()->slb_cache_ptr = 0; copy_mm_to_paca(mm); From 85376e2a17ec152e76f6a87fcb66332a68926218 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 15 Sep 2018 01:30:48 +1000 Subject: [PATCH 023/221] powerpc/64s/hash: remove the vmalloc segment from the bolted SLB Remove the vmalloc segment from bolted SLBEs. This is not required to be bolted, and seems like it was added to help pre-load the SLB on context switch. However there are now other segments like the vmemmap segment and non-zero node memory that often take misses after a context switch, so it is better to solve this in a more general way. A subsequent change will track free SLB entries and uses those rather than round-robin overwrite valid entries, which makes it far less likely for kernel SLBEs to be evicted after they are installed. 
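With this change only two bolted slots remain (abridged from the slb.c hunk below); vmalloc SLBEs are installed on demand by the SLB miss path like any other kernel segment:

	enum slb_index {
		LINEAR_INDEX = 0,	/* Kernel linear map (0xc000000000000000) */
		KSTACK_INDEX = 1,	/* Kernel stack map */
	};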
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 2 +- arch/powerpc/mm/slb.c | 23 ++++--------------- 2 files changed, 6 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index e577ccffe301..e0e4ce8f77d6 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -30,7 +30,7 @@ * SLB */ -#define SLB_NUM_BOLTED 3 +#define SLB_NUM_BOLTED 2 #define SLB_CACHE_ENTRIES 8 #define SLB_MIN_SIZE 32 diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index a9c66feb3c43..4a0b059ad104 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -30,8 +30,7 @@ enum slb_index { LINEAR_INDEX = 0, /* Kernel linear map (0xc000000000000000) */ - VMALLOC_INDEX = 1, /* Kernel virtual map (0xd000000000000000) */ - KSTACK_INDEX = 2, /* Kernel stack map */ + KSTACK_INDEX = 1, /* Kernel stack map */ }; extern void slb_allocate(unsigned long ea); @@ -133,13 +132,11 @@ static void __slb_flush_and_rebolt(void) { /* If you change this make sure you change SLB_NUM_BOLTED * and PR KVM appropriately too. */ - unsigned long linear_llp, vmalloc_llp, lflags, vflags; + unsigned long linear_llp, lflags; unsigned long ksp_esid_data, ksp_vsid_data; linear_llp = mmu_psize_defs[mmu_linear_psize].sllp; - vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp; lflags = SLB_VSID_KERNEL | linear_llp; - vflags = SLB_VSID_KERNEL | vmalloc_llp; ksp_esid_data = mk_esid_data(get_paca()->kstack, mmu_kernel_ssize, KSTACK_INDEX); if ((ksp_esid_data & ~0xfffffffUL) <= PAGE_OFFSET) { @@ -157,14 +154,10 @@ static void __slb_flush_and_rebolt(void) * the stack between the slbia and rebolting it. 
*/ asm volatile("isync\n" "slbia\n" - /* Slot 1 - first VMALLOC segment */ + /* Slot 1 - kernel stack */ "slbmte %0,%1\n" - /* Slot 2 - kernel stack */ - "slbmte %2,%3\n" "isync" - :: "r"(mk_vsid_data(VMALLOC_START, mmu_kernel_ssize, vflags)), - "r"(mk_esid_data(VMALLOC_START, mmu_kernel_ssize, VMALLOC_INDEX)), - "r"(ksp_vsid_data), + :: "r"(ksp_vsid_data), "r"(ksp_esid_data) : "memory"); } @@ -256,10 +249,6 @@ void slb_dump_contents(struct slb_entry *slb_ptr) void slb_vmalloc_update(void) { - unsigned long vflags; - - vflags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmalloc_psize].sllp; - slb_shadow_update(VMALLOC_START, mmu_kernel_ssize, vflags, VMALLOC_INDEX); slb_flush_and_rebolt(); } @@ -394,7 +383,7 @@ void slb_set_size(u16 size) void slb_initialize(void) { unsigned long linear_llp, vmalloc_llp, io_llp; - unsigned long lflags, vflags; + unsigned long lflags; static int slb_encoding_inited; #ifdef CONFIG_SPARSEMEM_VMEMMAP unsigned long vmemmap_llp; @@ -430,14 +419,12 @@ void slb_initialize(void) get_paca()->stab_rr = SLB_NUM_BOLTED - 1; lflags = SLB_VSID_KERNEL | linear_llp; - vflags = SLB_VSID_KERNEL | vmalloc_llp; /* Invalidate the entire SLB (even entry 0) & all the ERATS */ asm volatile("isync":::"memory"); asm volatile("slbmte %0,%0"::"r" (0) : "memory"); asm volatile("isync; slbia; isync":::"memory"); create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, LINEAR_INDEX); - create_shadowed_slbe(VMALLOC_START, mmu_kernel_ssize, vflags, VMALLOC_INDEX); /* For the boot cpu, we're running on the stack in init_thread_union, * which is in the first segment of the linear mapping, and also From 5141c182d75b4004c41ac2dc5af081b457b3e8cb Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 15 Sep 2018 01:30:49 +1000 Subject: [PATCH 024/221] powerpc/64s/hash: Use POWER6 SLBIA IH=1 variant in switch_slb The SLBIA IH=1 hint will remove all non-zero SLBEs, but only invalidate ERAT entries associated with a class value of 1, for processors that support the hint (e.g., POWER6 and newer), which Linux assigns to user addresses. This prevents kernel ERAT entries from being invalidated when context switchig (if the thread faulted in more than 8 user SLBEs). Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/mm/slb.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 4a0b059ad104..a8f27fee6a23 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -128,13 +128,21 @@ void slb_flush_all_realmode(void) asm volatile("slbmte %0,%0; slbia" : : "r" (0)); } -static void __slb_flush_and_rebolt(void) +void slb_flush_and_rebolt(void) { /* If you change this make sure you change SLB_NUM_BOLTED * and PR KVM appropriately too. */ unsigned long linear_llp, lflags; unsigned long ksp_esid_data, ksp_vsid_data; + WARN_ON(!irqs_disabled()); + + /* + * We can't take a PMU exception in the following code, so hard + * disable interrupts. + */ + hard_irq_disable(); + linear_llp = mmu_psize_defs[mmu_linear_psize].sllp; lflags = SLB_VSID_KERNEL | linear_llp; @@ -160,20 +168,7 @@ static void __slb_flush_and_rebolt(void) :: "r"(ksp_vsid_data), "r"(ksp_esid_data) : "memory"); -} -void slb_flush_and_rebolt(void) -{ - - WARN_ON(!irqs_disabled()); - - /* - * We can't take a PMU exception in the following code, so hard - * disable interrupts. 
- */ - hard_irq_disable(); - - __slb_flush_and_rebolt(); get_paca()->slb_cache_ptr = 0; } @@ -318,7 +313,20 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) asm volatile("isync" : : : "memory"); } else { - __slb_flush_and_rebolt(); + struct slb_shadow *p = get_slb_shadow(); + unsigned long ksp_esid_data = + be64_to_cpu(p->save_area[KSTACK_INDEX].esid); + unsigned long ksp_vsid_data = + be64_to_cpu(p->save_area[KSTACK_INDEX].vsid); + + asm volatile("isync\n" + PPC_SLBIA(1) "\n" + "slbmte %0,%1\n" + "isync" + :: "r"(ksp_vsid_data), + "r"(ksp_esid_data)); + + asm volatile("isync" : : : "memory"); } get_paca()->slb_cache_ptr = 0; From 82d8f4c22f3514eface7e082750bc917193d91f9 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 15 Sep 2018 01:30:50 +1000 Subject: [PATCH 025/221] powerpc/64s/hash: Use POWER9 SLBIA IH=3 variant in switch_slb POWER9 introduces SLBIA IH=3, which invalidates all SLB entries and associated lookaside information that have a class value of 1, which Linux assigns to user addresses. This matches what switch_slb wants, and allows a simple fast implementation that avoids the slb_cache complexity. As a side-effect, the POWER5 < DD2.1 SLB invalidation workaround is also avoided on POWER9. Process context switching rate is improved about 2.2% for a small process that hits the slb cache which is the best case for the current code. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/mm/slb.c | 83 +++++++++++++++++++++++----------------- arch/powerpc/xmon/xmon.c | 11 ++++-- 2 files changed, 55 insertions(+), 39 deletions(-) diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index a8f27fee6a23..513c6596140d 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -279,7 +279,6 @@ static inline int esids_match(unsigned long addr1, unsigned long addr2) /* Flush all user entries from the segment table of the current processor. */ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) { - unsigned long offset; unsigned long pc = KSTK_EIP(tsk); unsigned long stack = KSTK_ESP(tsk); unsigned long exec_base; @@ -291,45 +290,56 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) * which would update the slb_cache/slb_cache_ptr fields in the PACA. */ hard_irq_disable(); - offset = get_paca()->slb_cache_ptr; - if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) && - offset <= SLB_CACHE_ENTRIES) { - unsigned long slbie_data = 0; - int i; + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + /* + * SLBIA IH=3 invalidates all Class=1 SLBEs and their + * associated lookaside structures, which matches what + * switch_slb wants. So ARCH_300 does not use the slb + * cache. 
+ */ + asm volatile("isync ; " PPC_SLBIA(3)" ; isync"); + } else { + unsigned long offset = get_paca()->slb_cache_ptr; - asm volatile("isync" : : : "memory"); - for (i = 0; i < offset; i++) { - slbie_data = (unsigned long)get_paca()->slb_cache[i] - << SID_SHIFT; /* EA */ - slbie_data |= user_segment_size(slbie_data) - << SLBIE_SSIZE_SHIFT; - slbie_data |= SLBIE_C; /* C set for user addresses */ - asm volatile("slbie %0" : : "r" (slbie_data)); + if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) && + offset <= SLB_CACHE_ENTRIES) { + unsigned long slbie_data = 0; + int i; + + asm volatile("isync" : : : "memory"); + for (i = 0; i < offset; i++) { + /* EA */ + slbie_data = (unsigned long) + get_paca()->slb_cache[i] << SID_SHIFT; + slbie_data |= user_segment_size(slbie_data) + << SLBIE_SSIZE_SHIFT; + slbie_data |= SLBIE_C; /* user slbs have C=1 */ + asm volatile("slbie %0" : : "r" (slbie_data)); + } + + /* Workaround POWER5 < DD2.1 issue */ + if (!cpu_has_feature(CPU_FTR_ARCH_207S) && offset == 1) + asm volatile("slbie %0" : : "r" (slbie_data)); + + asm volatile("isync" : : : "memory"); + } else { + struct slb_shadow *p = get_slb_shadow(); + unsigned long ksp_esid_data = + be64_to_cpu(p->save_area[KSTACK_INDEX].esid); + unsigned long ksp_vsid_data = + be64_to_cpu(p->save_area[KSTACK_INDEX].vsid); + + asm volatile("isync\n" + PPC_SLBIA(1) "\n" + "slbmte %0,%1\n" + "isync" + :: "r"(ksp_vsid_data), + "r"(ksp_esid_data)); } - /* Workaround POWER5 < DD2.1 issue */ - if (!cpu_has_feature(CPU_FTR_ARCH_207S) && offset == 1) - asm volatile("slbie %0" : : "r" (slbie_data)); - - asm volatile("isync" : : : "memory"); - } else { - struct slb_shadow *p = get_slb_shadow(); - unsigned long ksp_esid_data = - be64_to_cpu(p->save_area[KSTACK_INDEX].esid); - unsigned long ksp_vsid_data = - be64_to_cpu(p->save_area[KSTACK_INDEX].vsid); - - asm volatile("isync\n" - PPC_SLBIA(1) "\n" - "slbmte %0,%1\n" - "isync" - :: "r"(ksp_vsid_data), - "r"(ksp_esid_data)); - - asm volatile("isync" : : : "memory"); + get_paca()->slb_cache_ptr = 0; } - get_paca()->slb_cache_ptr = 0; copy_mm_to_paca(mm); /* @@ -455,6 +465,9 @@ static void insert_slb_entry(unsigned long vsid, unsigned long ea, enum slb_index index; int slb_cache_index; + if (cpu_has_feature(CPU_FTR_ARCH_300)) + return; /* ISAv3.0B and later does not use slb_cache */ + /* * We are irq disabled, hence should be safe to access PACA. */ diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 4264aedc7775..cd43c168dc1b 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2393,10 +2393,13 @@ static void dump_one_paca(int cpu) } } DUMP(p, vmalloc_sllp, "%#-*x"); - DUMP(p, slb_cache_ptr, "%#-*x"); - for (i = 0; i < SLB_CACHE_ENTRIES; i++) - printf(" %-*s[%d] = 0x%016x\n", - 22, "slb_cache", i, p->slb_cache[i]); + + if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) { + DUMP(p, slb_cache_ptr, "%#-*x"); + for (i = 0; i < SLB_CACHE_ENTRIES; i++) + printf(" %-*s[%d] = 0x%016x\n", + 22, "slb_cache", i, p->slb_cache[i]); + } DUMP(p, rfi_flush_fallback_area, "%-*px"); #endif From 5e46e29e6a977a71f6b5bed414b7bcdbff5a6a43 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 15 Sep 2018 01:30:51 +1000 Subject: [PATCH 026/221] powerpc/64s/hash: convert SLB miss handlers to C This patch moves SLB miss handlers completely to C, using the standard exception handler macros to set up the stack and branch to C. This can be done because the segment containing the kernel stack is always bolted, so accessing it with relocation on will not cause an SLB exception. 
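The new entry point ends up roughly as below (an abridged sketch of do_slb_fault from the diff that follows; the MSR[RI] check and error reporting are elided):

	long do_slb_fault(struct pt_regs *regs, unsigned long ea)
	{
		unsigned long id = REGION_ID(ea);

		if (id >= KERNEL_REGION_ID)
			return slb_allocate_kernel(ea, id);	/* may only touch bolted memory */

		if (unlikely(!current->mm))
			return -EFAULT;

		return slb_allocate_user(current->mm, ea);
	}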
Arbitrary kernel memory may not be accessed when handling kernel space SLB misses, so care should be taken there. However user SLB misses can access any kernel memory, which can be used to move some fields out of the paca (in later patches). User SLB misses could quite easily reconcile IRQs and set up a first class kernel environment and exit via ret_from_except, however that doesn't seem to be necessary at the moment, so we only do that if a bad fault is encountered. [ Credit to Aneesh for bug fixes, error checks, and improvements to bad address handling, etc ] Signed-off-by: Nicholas Piggin Since RFC: - Added MSR[RI] handling - Fixed up a register loss bug exposed by irq tracing (Aneesh) - Reject misses outside the defined kernel regions (Aneesh) - Added several more sanity checks and error handling (Aneesh), we may look at consolidating these tests and tightenig up the code but for a first pass we decided it's better to check carefully. Since v1: - Fixed SLB cache corruption (Aneesh) - Fixed untidy SLBE allocation "leak" in get_vsid error case - Now survives some stress testing on real hardware Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/asm-prototypes.h | 2 + arch/powerpc/include/asm/exception-64s.h | 8 - arch/powerpc/kernel/exceptions-64s.S | 202 +++---------- arch/powerpc/mm/Makefile | 2 +- arch/powerpc/mm/slb.c | 273 ++++++++++-------- arch/powerpc/mm/slb_low.S | 335 ---------------------- 6 files changed, 197 insertions(+), 625 deletions(-) delete mode 100644 arch/powerpc/mm/slb_low.S diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 1f4691ce4126..78ed3c3f879a 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -78,6 +78,8 @@ void kernel_bad_stack(struct pt_regs *regs); void system_reset_exception(struct pt_regs *regs); void machine_check_exception(struct pt_regs *regs); void emulation_assist_interrupt(struct pt_regs *regs); +long do_slb_fault(struct pt_regs *regs, unsigned long ea); +void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err); /* signals, syscalls and interrupts */ long sys_swapcontext(struct ucontext __user *old_ctx, diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index a86feddddad0..47578b79f0fb 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -60,14 +60,6 @@ */ #define MAX_MCE_DEPTH 4 -/* - * EX_LR is only used in EXSLB and where it does not overlap with EX_DAR - * EX_CCR similarly with DSISR, but being 4 byte registers there is a hole - * in the save area so it's not necessary to overlap them. Could be used - * for future savings though if another 4 byte register was to be saved. - */ -#define EX_LR EX_DAR - /* * EX_R3 is only used by the bad_stack handler. bad_stack reloads and * saves DAR from SPRN_DAR, and EX_DAR is not used. 
So EX_R3 can overlap diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 301a6a86a20f..786f4fa5100a 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -596,28 +596,36 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) - SET_SCRATCH0(r13) - EXCEPTION_PROLOG_0(PACA_EXSLB) - EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380) - mr r12,r3 /* save r3 */ - mfspr r3,SPRN_DAR - mfspr r11,SPRN_SRR1 - crset 4*cr6+eq - BRANCH_TO_COMMON(r10, slb_miss_common) +EXCEPTION_PROLOG(PACA_EXSLB, data_access_slb_common, EXC_STD, KVMTEST_PR, 0x380); EXC_REAL_END(data_access_slb, 0x380, 0x80) EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) - SET_SCRATCH0(r13) - EXCEPTION_PROLOG_0(PACA_EXSLB) - EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x380) - mr r12,r3 /* save r3 */ - mfspr r3,SPRN_DAR - mfspr r11,SPRN_SRR1 - crset 4*cr6+eq - BRANCH_TO_COMMON(r10, slb_miss_common) +EXCEPTION_RELON_PROLOG(PACA_EXSLB, data_access_slb_common, EXC_STD, NOTEST, 0x380); EXC_VIRT_END(data_access_slb, 0x4380, 0x80) + TRAMP_KVM_SKIP(PACA_EXSLB, 0x380) +EXC_COMMON_BEGIN(data_access_slb_common) + mfspr r10,SPRN_DAR + std r10,PACA_EXSLB+EX_DAR(r13) + EXCEPTION_PROLOG_COMMON(0x380, PACA_EXSLB) + ld r4,PACA_EXSLB+EX_DAR(r13) + std r4,_DAR(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + bl do_slb_fault + cmpdi r3,0 + bne- 1f + b fast_exception_return +1: /* Error case */ + std r3,RESULT(r1) + bl save_nvgprs + RECONCILE_IRQ_STATE(r10, r11) + ld r4,_DAR(r1) + ld r5,RESULT(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + bl do_bad_slb_fault + b ret_from_except + EXC_REAL(instruction_access, 0x400, 0x80) EXC_VIRT(instruction_access, 0x4400, 0x80, 0x400) @@ -640,160 +648,34 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80) - SET_SCRATCH0(r13) - EXCEPTION_PROLOG_0(PACA_EXSLB) - EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480) - mr r12,r3 /* save r3 */ - mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ - mfspr r11,SPRN_SRR1 - crclr 4*cr6+eq - BRANCH_TO_COMMON(r10, slb_miss_common) +EXCEPTION_PROLOG(PACA_EXSLB, instruction_access_slb_common, EXC_STD, KVMTEST_PR, 0x480); EXC_REAL_END(instruction_access_slb, 0x480, 0x80) EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) - SET_SCRATCH0(r13) - EXCEPTION_PROLOG_0(PACA_EXSLB) - EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x480) - mr r12,r3 /* save r3 */ - mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ - mfspr r11,SPRN_SRR1 - crclr 4*cr6+eq - BRANCH_TO_COMMON(r10, slb_miss_common) +EXCEPTION_RELON_PROLOG(PACA_EXSLB, instruction_access_slb_common, EXC_STD, NOTEST, 0x480); EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80) + TRAMP_KVM(PACA_EXSLB, 0x480) - -/* - * This handler is used by the 0x380 and 0x480 SLB miss interrupts, as well as - * the virtual mode 0x4380 and 0x4480 interrupts if AIL is enabled. - */ -EXC_COMMON_BEGIN(slb_miss_common) - /* - * r13 points to the PACA, r9 contains the saved CR, - * r12 contains the saved r3, - * r11 contain the saved SRR1, SRR0 is still ready for return - * r3 has the faulting address - * r9 - r13 are saved in paca->exslb. - * cr6.eq is set for a D-SLB miss, clear for a I-SLB miss - * We assume we aren't going to take any exceptions during this - * procedure. - */ - mflr r10 - stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ - std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ - - andi. 
r9,r11,MSR_PR // Check for exception from userspace - cmpdi cr4,r9,MSR_PR // And save the result in CR4 for later - - /* - * Test MSR_RI before calling slb_allocate_realmode, because the - * MSR in r11 gets clobbered. However we still want to allocate - * SLB in case MSR_RI=0, to minimise the risk of getting stuck in - * recursive SLB faults. So use cr5 for this, which is preserved. - */ - andi. r11,r11,MSR_RI /* check for unrecoverable exception */ - cmpdi cr5,r11,MSR_RI - - crset 4*cr0+eq -#ifdef CONFIG_PPC_BOOK3S_64 -BEGIN_MMU_FTR_SECTION - bl slb_allocate -END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) -#endif - - ld r10,PACA_EXSLB+EX_LR(r13) - lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */ - mtlr r10 - - /* - * Large address, check whether we have to allocate new contexts. - */ - beq- 8f - - bne- cr5,2f /* if unrecoverable exception, oops */ - - /* All done -- return from exception. */ - - bne cr4,1f /* returning to kernel */ - - mtcrf 0x80,r9 - mtcrf 0x08,r9 /* MSR[PR] indication is in cr4 */ - mtcrf 0x04,r9 /* MSR[RI] indication is in cr5 */ - mtcrf 0x02,r9 /* I/D indication is in cr6 */ - mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ - - RESTORE_CTR(r9, PACA_EXSLB) - RESTORE_PPR_PACA(PACA_EXSLB, r9) - mr r3,r12 - ld r9,PACA_EXSLB+EX_R9(r13) - ld r10,PACA_EXSLB+EX_R10(r13) - ld r11,PACA_EXSLB+EX_R11(r13) - ld r12,PACA_EXSLB+EX_R12(r13) - ld r13,PACA_EXSLB+EX_R13(r13) - RFI_TO_USER - b . /* prevent speculative execution */ -1: - mtcrf 0x80,r9 - mtcrf 0x08,r9 /* MSR[PR] indication is in cr4 */ - mtcrf 0x04,r9 /* MSR[RI] indication is in cr5 */ - mtcrf 0x02,r9 /* I/D indication is in cr6 */ - mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ - - RESTORE_CTR(r9, PACA_EXSLB) - RESTORE_PPR_PACA(PACA_EXSLB, r9) - mr r3,r12 - ld r9,PACA_EXSLB+EX_R9(r13) - ld r10,PACA_EXSLB+EX_R10(r13) - ld r11,PACA_EXSLB+EX_R11(r13) - ld r12,PACA_EXSLB+EX_R12(r13) - ld r13,PACA_EXSLB+EX_R13(r13) - RFI_TO_KERNEL - b . /* prevent speculative execution */ - - -2: std r3,PACA_EXSLB+EX_DAR(r13) - mr r3,r12 - mfspr r11,SPRN_SRR0 - mfspr r12,SPRN_SRR1 - LOAD_HANDLER(r10,unrecov_slb) - mtspr SPRN_SRR0,r10 - ld r10,PACAKMSR(r13) - mtspr SPRN_SRR1,r10 - RFI_TO_KERNEL - b . - -8: std r3,PACA_EXSLB+EX_DAR(r13) - mr r3,r12 - mfspr r11,SPRN_SRR0 - mfspr r12,SPRN_SRR1 - LOAD_HANDLER(r10, large_addr_slb) - mtspr SPRN_SRR0,r10 - ld r10,PACAKMSR(r13) - mtspr SPRN_SRR1,r10 - RFI_TO_KERNEL - b . 
- -EXC_COMMON_BEGIN(unrecov_slb) - EXCEPTION_PROLOG_COMMON(0x4100, PACA_EXSLB) - RECONCILE_IRQ_STATE(r10, r11) +EXC_COMMON_BEGIN(instruction_access_slb_common) + EXCEPTION_PROLOG_COMMON(0x480, PACA_EXSLB) + ld r4,_NIP(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + bl do_slb_fault + cmpdi r3,0 + bne- 1f + b fast_exception_return +1: /* Error case */ + std r3,RESULT(r1) bl save_nvgprs -1: addi r3,r1,STACK_FRAME_OVERHEAD - bl unrecoverable_exception - b 1b - -EXC_COMMON_BEGIN(large_addr_slb) - EXCEPTION_PROLOG_COMMON(0x380, PACA_EXSLB) RECONCILE_IRQ_STATE(r10, r11) - ld r3, PACA_EXSLB+EX_DAR(r13) - std r3, _DAR(r1) - beq cr6, 2f - li r10, 0x481 /* fix trap number for I-SLB miss */ - std r10, _TRAP(r1) -2: bl save_nvgprs - addi r3, r1, STACK_FRAME_OVERHEAD - bl slb_miss_large_addr + ld r4,_NIP(r1) + ld r5,RESULT(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + bl do_bad_slb_fault b ret_from_except + EXC_REAL_BEGIN(hardware_interrupt, 0x500, 0x100) .globl hardware_interrupt_hv; hardware_interrupt_hv: diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index cdf6a9960046..892d4e061d62 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -15,7 +15,7 @@ obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(BITS)e.o hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o -obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o +obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-hash64.o hash_utils_64.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o tlb-radix.o obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o obj-$(CONFIG_PPC_STD_MMU) += tlb_hash$(BITS).o diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 513c6596140d..6fec2ce3ccf4 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -14,6 +14,7 @@ * 2 of the License, or (at your option) any later version. */ +#include #include #include #include @@ -33,7 +34,7 @@ enum slb_index { KSTACK_INDEX = 1, /* Kernel stack map */ }; -extern void slb_allocate(unsigned long ea); +static long slb_allocate_user(struct mm_struct *mm, unsigned long ea); #define slb_esid_mask(ssize) \ (((ssize) == MMU_SEGSIZE_256M)? 
ESID_MASK: ESID_MASK_1T) @@ -44,11 +45,17 @@ static inline unsigned long mk_esid_data(unsigned long ea, int ssize, return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index; } +static inline unsigned long __mk_vsid_data(unsigned long vsid, int ssize, + unsigned long flags) +{ + return (vsid << slb_vsid_shift(ssize)) | flags | + ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT); +} + static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, unsigned long flags) { - return (get_kernel_vsid(ea, ssize) << slb_vsid_shift(ssize)) | flags | - ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT); + return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags); } static inline void slb_shadow_update(unsigned long ea, int ssize, @@ -353,49 +360,19 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) is_kernel_addr(exec_base)) return; - slb_allocate(pc); + slb_allocate_user(mm, pc); if (!esids_match(pc, stack)) - slb_allocate(stack); + slb_allocate_user(mm, stack); if (!esids_match(pc, exec_base) && !esids_match(stack, exec_base)) - slb_allocate(exec_base); + slb_allocate_user(mm, exec_base); } -static inline void patch_slb_encoding(unsigned int *insn_addr, - unsigned int immed) -{ - - /* - * This function patches either an li or a cmpldi instruction with - * a new immediate value. This relies on the fact that both li - * (which is actually addi) and cmpldi both take a 16-bit immediate - * value, and it is situated in the same location in the instruction, - * ie. bits 16-31 (Big endian bit order) or the lower 16 bits. - * The signedness of the immediate operand differs between the two - * instructions however this code is only ever patching a small value, - * much less than 1 << 15, so we can get away with it. - * To patch the value we read the existing instruction, clear the - * immediate value, and or in our new value, then write the instruction - * back. - */ - unsigned int insn = (*insn_addr & 0xffff0000) | immed; - patch_instruction(insn_addr, insn); -} - -extern u32 slb_miss_kernel_load_linear[]; -extern u32 slb_miss_kernel_load_io[]; -extern u32 slb_compare_rr_to_size[]; -extern u32 slb_miss_kernel_load_vmemmap[]; - void slb_set_size(u16 size) { - if (mmu_slb_size == size) - return; - mmu_slb_size = size; - patch_slb_encoding(slb_compare_rr_to_size, mmu_slb_size); } void slb_initialize(void) @@ -417,19 +394,9 @@ void slb_initialize(void) #endif if (!slb_encoding_inited) { slb_encoding_inited = 1; - patch_slb_encoding(slb_miss_kernel_load_linear, - SLB_VSID_KERNEL | linear_llp); - patch_slb_encoding(slb_miss_kernel_load_io, - SLB_VSID_KERNEL | io_llp); - patch_slb_encoding(slb_compare_rr_to_size, - mmu_slb_size); - pr_devel("SLB: linear LLP = %04lx\n", linear_llp); pr_devel("SLB: io LLP = %04lx\n", io_llp); - #ifdef CONFIG_SPARSEMEM_VMEMMAP - patch_slb_encoding(slb_miss_kernel_load_vmemmap, - SLB_VSID_KERNEL | vmemmap_llp); pr_devel("SLB: vmemmap LLP = %04lx\n", vmemmap_llp); #endif } @@ -458,52 +425,13 @@ void slb_initialize(void) asm volatile("isync":::"memory"); } -static void insert_slb_entry(unsigned long vsid, unsigned long ea, - int bpsize, int ssize) +static void slb_cache_update(unsigned long esid_data) { - unsigned long flags, vsid_data, esid_data; - enum slb_index index; int slb_cache_index; if (cpu_has_feature(CPU_FTR_ARCH_300)) return; /* ISAv3.0B and later does not use slb_cache */ - /* - * We are irq disabled, hence should be safe to access PACA. 
- */ - VM_WARN_ON(!irqs_disabled()); - - /* - * We can't take a PMU exception in the following code, so hard - * disable interrupts. - */ - hard_irq_disable(); - - index = get_paca()->stab_rr; - - /* - * simple round-robin replacement of slb starting at SLB_NUM_BOLTED. - */ - if (index < (mmu_slb_size - 1)) - index++; - else - index = SLB_NUM_BOLTED; - - get_paca()->stab_rr = index; - - flags = SLB_VSID_USER | mmu_psize_defs[bpsize].sllp; - vsid_data = (vsid << slb_vsid_shift(ssize)) | flags | - ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT); - esid_data = mk_esid_data(ea, ssize, index); - - /* - * No need for an isync before or after this slbmte. The exception - * we enter with and the rfid we exit with are context synchronizing. - * Also we only handle user segments here. - */ - asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data) - : "memory"); - /* * Now update slb cache entries */ @@ -525,58 +453,161 @@ static void insert_slb_entry(unsigned long vsid, unsigned long ea, } } -static void handle_multi_context_slb_miss(int context_id, unsigned long ea) +static enum slb_index alloc_slb_index(void) { - struct mm_struct *mm = current->mm; - unsigned long vsid; - int bpsize; + enum slb_index index; - /* - * We are always above 1TB, hence use high user segment size. - */ - vsid = get_vsid(context_id, ea, mmu_highuser_ssize); - bpsize = get_slice_psize(mm, ea); - insert_slb_entry(vsid, ea, bpsize, mmu_highuser_ssize); + /* round-robin replacement of slb starting at SLB_NUM_BOLTED. */ + index = get_paca()->stab_rr; + if (index < (mmu_slb_size - 1)) + index++; + else + index = SLB_NUM_BOLTED; + get_paca()->stab_rr = index; + + return index; } -void slb_miss_large_addr(struct pt_regs *regs) +static long slb_insert_entry(unsigned long ea, unsigned long context, + unsigned long flags, int ssize, bool kernel) { - enum ctx_state prev_state = exception_enter(); - unsigned long ea = regs->dar; - int context; + unsigned long vsid; + unsigned long vsid_data, esid_data; + enum slb_index index; - if (REGION_ID(ea) != USER_REGION_ID) - goto slb_bad_addr; + vsid = get_vsid(context, ea, ssize); + if (!vsid) + return -EFAULT; + + index = alloc_slb_index(); + + vsid_data = __mk_vsid_data(vsid, ssize, flags); + esid_data = mk_esid_data(ea, ssize, index); /* - * Are we beyound what the page table layout supports ? + * No need for an isync before or after this slbmte. The exception + * we enter with and the rfid we exit with are context synchronizing. + * Also we only handle user segments here. 
*/ - if ((ea & ~REGION_MASK) >= H_PGTABLE_RANGE) - goto slb_bad_addr; + asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data)); - /* Lower address should have been handled by asm code */ - if (ea < (1UL << MAX_EA_BITS_PER_CONTEXT)) - goto slb_bad_addr; + if (!kernel) + slb_cache_update(esid_data); + + return 0; +} + +static long slb_allocate_kernel(unsigned long ea, unsigned long id) +{ + unsigned long context; + unsigned long flags; + int ssize; + + if ((ea & ~REGION_MASK) >= (1ULL << MAX_EA_BITS_PER_CONTEXT)) + return -EFAULT; + + if (id == KERNEL_REGION_ID) { + flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp; +#ifdef CONFIG_SPARSEMEM_VMEMMAP + } else if (id == VMEMMAP_REGION_ID) { + flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmemmap_psize].sllp; +#endif + } else if (id == VMALLOC_REGION_ID) { + if (ea < H_VMALLOC_END) + flags = get_paca()->vmalloc_sllp; + else + flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_io_psize].sllp; + } else { + return -EFAULT; + } + + ssize = MMU_SEGSIZE_1T; + if (!mmu_has_feature(MMU_FTR_1T_SEGMENT)) + ssize = MMU_SEGSIZE_256M; + + context = id - KERNEL_REGION_CONTEXT_OFFSET; + + return slb_insert_entry(ea, context, flags, ssize, true); +} + +static long slb_allocate_user(struct mm_struct *mm, unsigned long ea) +{ + unsigned long context; + unsigned long flags; + int bpsize; + int ssize; /* * consider this as bad access if we take a SLB miss * on an address above addr limit. */ - if (ea >= current->mm->context.slb_addr_limit) - goto slb_bad_addr; + if (ea >= mm->context.slb_addr_limit) + return -EFAULT; - context = get_ea_context(¤t->mm->context, ea); + context = get_ea_context(&mm->context, ea); if (!context) - goto slb_bad_addr; + return -EFAULT; - handle_multi_context_slb_miss(context, ea); - exception_exit(prev_state); - return; + if (unlikely(ea >= H_PGTABLE_RANGE)) { + WARN_ON(1); + return -EFAULT; + } -slb_bad_addr: - if (user_mode(regs)) - _exception(SIGSEGV, regs, SEGV_BNDERR, ea); - else - bad_page_fault(regs, ea, SIGSEGV); - exception_exit(prev_state); + ssize = user_segment_size(ea); + + bpsize = get_slice_psize(mm, ea); + flags = SLB_VSID_USER | mmu_psize_defs[bpsize].sllp; + + return slb_insert_entry(ea, context, flags, ssize, false); +} + +long do_slb_fault(struct pt_regs *regs, unsigned long ea) +{ + unsigned long id = REGION_ID(ea); + + /* IRQs are not reconciled here, so can't check irqs_disabled */ + VM_WARN_ON(mfmsr() & MSR_EE); + + if (unlikely(!(regs->msr & MSR_RI))) + return -EINVAL; + + /* + * SLB kernel faults must be very careful not to touch anything + * that is not bolted. E.g., PACA and global variables are okay, + * mm->context stuff is not. + * + * SLB user faults can access all of kernel memory, but must be + * careful not to touch things like IRQ state because it is not + * "reconciled" here. The difficulty is that we must use + * fast_exception_return to return from kernel SLB faults without + * looking at possible non-bolted memory. We could test user vs + * kernel faults in the interrupt handler asm and do a full fault, + * reconcile, ret_from_except for user faults which would make them + * first class kernel code. But for performance it's probably nicer + * if they go via fast_exception_return too. 
+ */ + if (id >= KERNEL_REGION_ID) { + return slb_allocate_kernel(ea, id); + } else { + struct mm_struct *mm = current->mm; + + if (unlikely(!mm)) + return -EFAULT; + + return slb_allocate_user(mm, ea); + } +} + +void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err) +{ + if (err == -EFAULT) { + if (user_mode(regs)) + _exception(SIGSEGV, regs, SEGV_BNDERR, ea); + else + bad_page_fault(regs, ea, SIGSEGV); + } else if (err == -EINVAL) { + unrecoverable_exception(regs); + } else { + BUG(); + } } diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S deleted file mode 100644 index 4ac5057ad439..000000000000 --- a/arch/powerpc/mm/slb_low.S +++ /dev/null @@ -1,335 +0,0 @@ -/* - * Low-level SLB routines - * - * Copyright (C) 2004 David Gibson , IBM - * - * Based on earlier C version: - * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com - * Copyright (c) 2001 Dave Engebretsen - * Copyright (C) 2002 Anton Blanchard , IBM - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * This macro generates asm code to compute the VSID scramble - * function. Used in slb_allocate() and do_stab_bolted. The function - * computed is: (protovsid*VSID_MULTIPLIER) % VSID_MODULUS - * - * rt = register containing the proto-VSID and into which the - * VSID will be stored - * rx = scratch register (clobbered) - * rf = flags - * - * - rt and rx must be different registers - * - The answer will end up in the low VSID_BITS bits of rt. The higher - * bits may contain other garbage, so you may need to mask the - * result. - */ -#define ASM_VSID_SCRAMBLE(rt, rx, rf, size) \ - lis rx,VSID_MULTIPLIER_##size@h; \ - ori rx,rx,VSID_MULTIPLIER_##size@l; \ - mulld rt,rt,rx; /* rt = rt * MULTIPLIER */ \ -/* \ - * powermac get slb fault before feature fixup, so make 65 bit part \ - * the default part of feature fixup \ - */ \ -BEGIN_MMU_FTR_SECTION \ - srdi rx,rt,VSID_BITS_65_##size; \ - clrldi rt,rt,(64-VSID_BITS_65_##size); \ - add rt,rt,rx; \ - addi rx,rt,1; \ - srdi rx,rx,VSID_BITS_65_##size; \ - add rt,rt,rx; \ - rldimi rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_65_##size)); \ -MMU_FTR_SECTION_ELSE \ - srdi rx,rt,VSID_BITS_##size; \ - clrldi rt,rt,(64-VSID_BITS_##size); \ - add rt,rt,rx; /* add high and low bits */ \ - addi rx,rt,1; \ - srdi rx,rx,VSID_BITS_##size; /* extract 2^VSID_BITS bit */ \ - add rt,rt,rx; \ - rldimi rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_##size)); \ -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_68_BIT_VA) - - -/* void slb_allocate(unsigned long ea); - * - * Create an SLB entry for the given EA (user or kernel). - * r3 = faulting address, r13 = PACA - * r9, r10, r11 are clobbered by this function - * r3 is preserved. - * No other registers are examined or changed. - */ -_GLOBAL(slb_allocate) - /* - * Check if the address falls within the range of the first context, or - * if we may need to handle multi context. For the first context we - * allocate the slb entry via the fast path below. For large address we - * branch out to C-code and see if additional contexts have been - * allocated. - * The test here is: - * (ea & ~REGION_MASK) >= (1ull << MAX_EA_BITS_PER_CONTEXT) - */ - rldicr. 
r9,r3,4,(63 - MAX_EA_BITS_PER_CONTEXT - 4) - bne- 8f - - srdi r9,r3,60 /* get region */ - srdi r10,r3,SID_SHIFT /* get esid */ - cmpldi cr7,r9,0xc /* cmp PAGE_OFFSET for later use */ - - /* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */ - blt cr7,0f /* user or kernel? */ - - /* Check if hitting the linear mapping or some other kernel space - */ - bne cr7,1f - - /* Linear mapping encoding bits, the "li" instruction below will - * be patched by the kernel at boot - */ -.globl slb_miss_kernel_load_linear -slb_miss_kernel_load_linear: - li r11,0 - /* - * context = (ea >> 60) - (0xc - 1) - * r9 = region id. - */ - subi r9,r9,KERNEL_REGION_CONTEXT_OFFSET - -BEGIN_FTR_SECTION - b .Lslb_finish_load -END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) - b .Lslb_finish_load_1T - -1: -#ifdef CONFIG_SPARSEMEM_VMEMMAP - cmpldi cr0,r9,0xf - bne 1f -/* Check virtual memmap region. To be patched at kernel boot */ -.globl slb_miss_kernel_load_vmemmap -slb_miss_kernel_load_vmemmap: - li r11,0 - b 6f -1: -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ - - /* - * r10 contains the ESID, which is the original faulting EA shifted - * right by 28 bits. We need to compare that with (H_VMALLOC_END >> 28) - * which is 0xd00038000. That can't be used as an immediate, even if we - * ignored the 0xd, so we have to load it into a register, and we only - * have one register free. So we must load all of (H_VMALLOC_END >> 28) - * into a register and compare ESID against that. - */ - lis r11,(H_VMALLOC_END >> 32)@h // r11 = 0xffffffffd0000000 - ori r11,r11,(H_VMALLOC_END >> 32)@l // r11 = 0xffffffffd0003800 - // Rotate left 4, then mask with 0xffffffff0 - rldic r11,r11,4,28 // r11 = 0xd00038000 - cmpld r10,r11 // if r10 >= r11 - bge 5f // goto io_mapping - - /* - * vmalloc mapping gets the encoding from the PACA as the mapping - * can be demoted from 64K -> 4K dynamically on some machines. - */ - lhz r11,PACAVMALLOCSLLP(r13) - b 6f -5: - /* IO mapping */ -.globl slb_miss_kernel_load_io -slb_miss_kernel_load_io: - li r11,0 -6: - /* - * context = (ea >> 60) - (0xc - 1) - * r9 = region id. - */ - subi r9,r9,KERNEL_REGION_CONTEXT_OFFSET - -BEGIN_FTR_SECTION - b .Lslb_finish_load -END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) - b .Lslb_finish_load_1T - -0: /* - * For userspace addresses, make sure this is region 0. - */ - cmpdi r9, 0 - bne- 8f - /* - * user space make sure we are within the allowed limit - */ - ld r11,PACA_SLB_ADDR_LIMIT(r13) - cmpld r3,r11 - bge- 8f - - /* when using slices, we extract the psize off the slice bitmaps - * and then we need to get the sllp encoding off the mmu_psize_defs - * array. - * - * XXX This is a bit inefficient especially for the normal case, - * so we should try to implement a fast path for the standard page - * size using the old sllp value so we avoid the array. 
We cannot - * really do dynamic patching unfortunately as processes might flip - * between 4k and 64k standard page size - */ -#ifdef CONFIG_PPC_MM_SLICES - /* r10 have esid */ - cmpldi r10,16 - /* below SLICE_LOW_TOP */ - blt 5f - /* - * Handle hpsizes, - * r9 is get_paca()->context.high_slices_psize[index], r11 is mask_index - */ - srdi r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT + 1) /* index */ - addi r9,r11,PACAHIGHSLICEPSIZE - lbzx r9,r13,r9 /* r9 is hpsizes[r11] */ - /* r11 = (r10 >> (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)) & 0x1 */ - rldicl r11,r10,(64 - (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)),63 - b 6f - -5: - /* - * Handle lpsizes - * r9 is get_paca()->context.low_slices_psize[index], r11 is mask_index - */ - srdi r11,r10,1 /* index */ - addi r9,r11,PACALOWSLICESPSIZE - lbzx r9,r13,r9 /* r9 is lpsizes[r11] */ - rldicl r11,r10,0,63 /* r11 = r10 & 0x1 */ -6: - sldi r11,r11,2 /* index * 4 */ - /* Extract the psize and multiply to get an array offset */ - srd r9,r9,r11 - andi. r9,r9,0xf - mulli r9,r9,MMUPSIZEDEFSIZE - - /* Now get to the array and obtain the sllp - */ - ld r11,PACATOC(r13) - ld r11,mmu_psize_defs@got(r11) - add r11,r11,r9 - ld r11,MMUPSIZESLLP(r11) - ori r11,r11,SLB_VSID_USER -#else - /* paca context sllp already contains the SLB_VSID_USER bits */ - lhz r11,PACACONTEXTSLLP(r13) -#endif /* CONFIG_PPC_MM_SLICES */ - - ld r9,PACACONTEXTID(r13) -BEGIN_FTR_SECTION - cmpldi r10,0x1000 - bge .Lslb_finish_load_1T -END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) - b .Lslb_finish_load - -8: /* invalid EA - return an error indication */ - crset 4*cr0+eq /* indicate failure */ - blr - -/* - * Finish loading of an SLB entry and return - * - * r3 = EA, r9 = context, r10 = ESID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET - */ -.Lslb_finish_load: - rldimi r10,r9,ESID_BITS,0 - ASM_VSID_SCRAMBLE(r10,r9,r11,256M) - /* r3 = EA, r11 = VSID data */ - /* - * Find a slot, round robin. Previously we tried to find a - * free slot first but that took too long. Unfortunately we - * dont have any LRU information to help us choose a slot. - */ - - mr r9,r3 - - /* slb_finish_load_1T continues here. r9=EA with non-ESID bits clear */ -7: ld r10,PACASTABRR(r13) - addi r10,r10,1 - /* This gets soft patched on boot. */ -.globl slb_compare_rr_to_size -slb_compare_rr_to_size: - cmpldi r10,0 - - blt+ 4f - li r10,SLB_NUM_BOLTED - -4: - std r10,PACASTABRR(r13) - -3: - rldimi r9,r10,0,36 /* r9 = EA[0:35] | entry */ - oris r10,r9,SLB_ESID_V@h /* r10 = r9 | SLB_ESID_V */ - - /* r9 = ESID data, r11 = VSID data */ - - /* - * No need for an isync before or after this slbmte. The exception - * we enter with and the rfid we exit with are context synchronizing. - */ - slbmte r11,r10 - - /* we're done for kernel addresses */ - crclr 4*cr0+eq /* set result to "success" */ - bgelr cr7 - - /* Update the slb cache */ - lhz r9,PACASLBCACHEPTR(r13) /* offset = paca->slb_cache_ptr */ - cmpldi r9,SLB_CACHE_ENTRIES - bge 1f - - /* still room in the slb cache */ - sldi r11,r9,2 /* r11 = offset * sizeof(u32) */ - srdi r10,r10,28 /* get the 36 bits of the ESID */ - add r11,r11,r13 /* r11 = (u32 *)paca + offset */ - stw r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */ - addi r9,r9,1 /* offset++ */ - b 2f -1: /* offset >= SLB_CACHE_ENTRIES */ - li r9,SLB_CACHE_ENTRIES+1 -2: - sth r9,PACASLBCACHEPTR(r13) /* paca->slb_cache_ptr = offset */ - crclr 4*cr0+eq /* set result to "success" */ - blr - -/* - * Finish loading of a 1T SLB entry (for the kernel linear mapping) and return. 
- * - * r3 = EA, r9 = context, r10 = ESID(256MB), r11 = flags, clobbers r9 - */ -.Lslb_finish_load_1T: - srdi r10,r10,(SID_SHIFT_1T - SID_SHIFT) /* get 1T ESID */ - rldimi r10,r9,ESID_BITS_1T,0 - ASM_VSID_SCRAMBLE(r10,r9,r11,1T) - - li r10,MMU_SEGSIZE_1T - rldimi r11,r10,SLB_VSID_SSIZE_SHIFT,0 /* insert segment size */ - - /* r3 = EA, r11 = VSID data */ - clrrdi r9,r3,SID_SHIFT_1T /* clear out non-ESID bits */ - b 7b - - -_ASM_NOKPROBE_SYMBOL(slb_allocate) -_ASM_NOKPROBE_SYMBOL(slb_miss_kernel_load_linear) -_ASM_NOKPROBE_SYMBOL(slb_miss_kernel_load_io) -_ASM_NOKPROBE_SYMBOL(slb_compare_rr_to_size) -#ifdef CONFIG_SPARSEMEM_VMEMMAP -_ASM_NOKPROBE_SYMBOL(slb_miss_kernel_load_vmemmap) -#endif From 8fed04d0f6aedf99b3d811ba58d38bb7f938a47a Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 15 Sep 2018 01:30:52 +1000 Subject: [PATCH 027/221] powerpc/64s/hash: remove user SLB data from the paca User SLB mappig data is copied into the PACA from the mm->context so it can be accessed by the SLB miss handlers. After the C conversion, SLB miss handlers now run with relocation on, and user SLB misses are able to take recursive kernel SLB misses, so the user SLB mapping data can be removed from the paca and accessed directly. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 1 + arch/powerpc/include/asm/paca.h | 13 ------ arch/powerpc/kernel/asm-offsets.c | 9 ---- arch/powerpc/kernel/paca.c | 22 --------- arch/powerpc/mm/hash_utils_64.c | 46 +++++-------------- arch/powerpc/mm/mmu_context.c | 3 +- arch/powerpc/mm/slb.c | 20 +++++++- arch/powerpc/mm/slice.c | 29 ++++-------- 8 files changed, 40 insertions(+), 103 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index e0e4ce8f77d6..d3064c7d1b1f 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -501,6 +501,7 @@ struct slb_entry { }; extern void slb_initialize(void); +extern void core_flush_all_slbs(struct mm_struct *mm); extern void slb_flush_and_rebolt(void); void slb_flush_all_realmode(void); void __slb_restore_bolted_realmode(void); diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 7b6e23af3808..8144d673541a 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -143,18 +143,6 @@ struct paca_struct { struct tlb_core_data tcd; #endif /* CONFIG_PPC_BOOK3E */ -#ifdef CONFIG_PPC_BOOK3S - mm_context_id_t mm_ctx_id; -#ifdef CONFIG_PPC_MM_SLICES - unsigned char mm_ctx_low_slices_psize[BITS_PER_LONG / BITS_PER_BYTE]; - unsigned char mm_ctx_high_slices_psize[SLICE_ARRAY_SIZE]; - unsigned long mm_ctx_slb_addr_limit; -#else - u16 mm_ctx_user_psize; - u16 mm_ctx_sllp; -#endif -#endif - /* * then miscellaneous read-write fields */ @@ -258,7 +246,6 @@ struct paca_struct { #endif /* CONFIG_PPC_BOOK3S_64 */ } ____cacheline_aligned; -extern void copy_mm_to_paca(struct mm_struct *mm); extern struct paca_struct **paca_ptrs; extern void initialise_paca(struct paca_struct *new_paca, int cpu); extern void setup_paca(struct paca_struct *new_paca); diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 89cf15566c4e..ce3ac40fd96e 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -181,15 +181,6 @@ int main(void) OFFSET(PACAIRQSOFTMASK, paca_struct, irq_soft_mask); OFFSET(PACAIRQHAPPENED, paca_struct, irq_happened); 
OFFSET(PACA_FTRACE_ENABLED, paca_struct, ftrace_enabled); -#ifdef CONFIG_PPC_BOOK3S - OFFSET(PACACONTEXTID, paca_struct, mm_ctx_id); -#ifdef CONFIG_PPC_MM_SLICES - OFFSET(PACALOWSLICESPSIZE, paca_struct, mm_ctx_low_slices_psize); - OFFSET(PACAHIGHSLICEPSIZE, paca_struct, mm_ctx_high_slices_psize); - OFFSET(PACA_SLB_ADDR_LIMIT, paca_struct, mm_ctx_slb_addr_limit); - DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def)); -#endif /* CONFIG_PPC_MM_SLICES */ -#endif #ifdef CONFIG_PPC_BOOK3E OFFSET(PACAPGD, paca_struct, pgd); diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index 0ee3e6d50f28..0cf84e30d1cd 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -258,25 +258,3 @@ void __init free_unused_pacas(void) printk(KERN_DEBUG "Allocated %u bytes for %u pacas\n", paca_ptrs_size + paca_struct_size, nr_cpu_ids); } - -void copy_mm_to_paca(struct mm_struct *mm) -{ -#ifdef CONFIG_PPC_BOOK3S - mm_context_t *context = &mm->context; - - get_paca()->mm_ctx_id = context->id; -#ifdef CONFIG_PPC_MM_SLICES - VM_BUG_ON(!mm->context.slb_addr_limit); - get_paca()->mm_ctx_slb_addr_limit = mm->context.slb_addr_limit; - memcpy(&get_paca()->mm_ctx_low_slices_psize, - &context->low_slices_psize, sizeof(context->low_slices_psize)); - memcpy(&get_paca()->mm_ctx_high_slices_psize, - &context->high_slices_psize, TASK_SLICE_ARRAY_SZ(mm)); -#else /* CONFIG_PPC_MM_SLICES */ - get_paca()->mm_ctx_user_psize = context->user_psize; - get_paca()->mm_ctx_sllp = context->sllp; -#endif -#else /* !CONFIG_PPC_BOOK3S */ - return; -#endif -} diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index f23a89d8e4ce..88c95dc8b141 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1088,16 +1088,16 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) } #ifdef CONFIG_PPC_MM_SLICES -static unsigned int get_paca_psize(unsigned long addr) +static unsigned int get_psize(struct mm_struct *mm, unsigned long addr) { unsigned char *psizes; unsigned long index, mask_index; if (addr < SLICE_LOW_TOP) { - psizes = get_paca()->mm_ctx_low_slices_psize; + psizes = mm->context.low_slices_psize; index = GET_LOW_SLICE_INDEX(addr); } else { - psizes = get_paca()->mm_ctx_high_slices_psize; + psizes = mm->context.high_slices_psize; index = GET_HIGH_SLICE_INDEX(addr); } mask_index = index & 0x1; @@ -1105,9 +1105,9 @@ static unsigned int get_paca_psize(unsigned long addr) } #else -unsigned int get_paca_psize(unsigned long addr) +unsigned int get_psize(struct mm_struct *mm, unsigned long addr) { - return get_paca()->mm_ctx_user_psize; + return mm->context.user_psize; } #endif @@ -1118,15 +1118,11 @@ unsigned int get_paca_psize(unsigned long addr) #ifdef CONFIG_PPC_64K_PAGES void demote_segment_4k(struct mm_struct *mm, unsigned long addr) { - if (get_slice_psize(mm, addr) == MMU_PAGE_4K) + if (get_psize(mm, addr) == MMU_PAGE_4K) return; slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K); copro_flush_all_slbs(mm); - if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) { - - copy_mm_to_paca(mm); - slb_flush_and_rebolt(); - } + core_flush_all_slbs(mm); } #endif /* CONFIG_PPC_64K_PAGES */ @@ -1191,22 +1187,6 @@ void hash_failure_debug(unsigned long ea, unsigned long access, trap, vsid, ssize, psize, lpsize, pte); } -static void check_paca_psize(unsigned long ea, struct mm_struct *mm, - int psize, bool user_region) -{ - if (user_region) { - if (psize != get_paca_psize(ea)) { - copy_mm_to_paca(mm); - slb_flush_and_rebolt(); - } - } 
else if (get_paca()->vmalloc_sllp != - mmu_psize_defs[mmu_vmalloc_psize].sllp) { - get_paca()->vmalloc_sllp = - mmu_psize_defs[mmu_vmalloc_psize].sllp; - slb_vmalloc_update(); - } -} - /* Result code is: * 0 - handled * 1 - normal page fault @@ -1239,7 +1219,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, rc = 1; goto bail; } - psize = get_slice_psize(mm, ea); + psize = get_psize(mm, ea); ssize = user_segment_size(ea); vsid = get_user_vsid(&mm->context, ea, ssize); break; @@ -1327,9 +1307,6 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, WARN_ON(1); } #endif - if (current->mm == mm) - check_paca_psize(ea, mm, psize, user_region); - goto bail; } @@ -1364,15 +1341,14 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, "to 4kB pages because of " "non-cacheable mapping\n"); psize = mmu_vmalloc_psize = MMU_PAGE_4K; + slb_vmalloc_update(); copro_flush_all_slbs(mm); + core_flush_all_slbs(mm); } } #endif /* CONFIG_PPC_64K_PAGES */ - if (current->mm == mm) - check_paca_psize(ea, mm, psize, user_region); - #ifdef CONFIG_PPC_64K_PAGES if (psize == MMU_PAGE_64K) rc = __hash_page_64K(ea, access, vsid, ptep, trap, @@ -1460,7 +1436,7 @@ int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, #ifdef CONFIG_PPC_MM_SLICES static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) { - int psize = get_slice_psize(mm, ea); + int psize = get_psize(mm, ea); /* We only prefault standard pages for now */ if (unlikely(psize != mm->context.user_psize)) diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c index f84e14f23e50..28ae2835db3d 100644 --- a/arch/powerpc/mm/mmu_context.c +++ b/arch/powerpc/mm/mmu_context.c @@ -54,8 +54,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * MMU context id, which is then moved to SPRN_PID. * * For the hash MMU it is either the first load from slb_cache - * in switch_slb(), and/or the store of paca->mm_ctx_id in - * copy_mm_to_paca(). + * in switch_slb(), and/or load of MMU context id. * * On the other side, the barrier is in mm/tlb-radix.c for * radix which orders earlier stores to clear the PTEs vs diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 6fec2ce3ccf4..1347ab86d32e 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -347,8 +347,6 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) get_paca()->slb_cache_ptr = 0; } - copy_mm_to_paca(mm); - /* * preload some userspace segments into the SLB. 
* Almost all 32 and 64bit PowerPC executables are linked at @@ -375,6 +373,24 @@ void slb_set_size(u16 size) mmu_slb_size = size; } +static void cpu_flush_slb(void *parm) +{ + struct mm_struct *mm = parm; + unsigned long flags; + + if (mm != current->active_mm) + return; + + local_irq_save(flags); + slb_flush_and_rebolt(); + local_irq_restore(flags); +} + +void core_flush_all_slbs(struct mm_struct *mm) +{ + on_each_cpu(cpu_flush_slb, mm, 1); +} + void slb_initialize(void) { unsigned long linear_llp, vmalloc_llp, io_llp; diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 205fe557ca10..606f424aac47 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -207,23 +207,6 @@ static bool slice_check_range_fits(struct mm_struct *mm, return true; } -static void slice_flush_segments(void *parm) -{ -#ifdef CONFIG_PPC64 - struct mm_struct *mm = parm; - unsigned long flags; - - if (mm != current->active_mm) - return; - - copy_mm_to_paca(current->active_mm); - - local_irq_save(flags); - slb_flush_and_rebolt(); - local_irq_restore(flags); -#endif -} - static void slice_convert(struct mm_struct *mm, const struct slice_mask *mask, int psize) { @@ -289,6 +272,9 @@ static void slice_convert(struct mm_struct *mm, spin_unlock_irqrestore(&slice_convert_lock, flags); copro_flush_all_slbs(mm); +#ifdef CONFIG_PPC64 + core_flush_all_slbs(mm); +#endif } /* @@ -502,8 +488,9 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, * be already initialised beyond the old address limit. */ mm->context.slb_addr_limit = high_limit; - - on_each_cpu(slice_flush_segments, mm, 1); +#ifdef CONFIG_PPC64 + core_flush_all_slbs(mm); +#endif } /* Sanity checks */ @@ -665,8 +652,10 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, (SLICE_NUM_HIGH && !bitmap_empty(potential_mask.high_slices, SLICE_NUM_HIGH))) { slice_convert(mm, &potential_mask, psize); +#ifdef CONFIG_PPC64 if (psize > MMU_PAGE_BASE) - on_each_cpu(slice_flush_segments, mm, 1); + core_flush_all_slbs(mm); +#endif } return newaddr; From 655deecf67b240bf7bb4e73df4e1235900c26a01 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 15 Sep 2018 01:30:53 +1000 Subject: [PATCH 028/221] powerpc/64s/hash: SLB allocation status bitmaps Add 32-entry bitmaps to track the allocation status of the first 32 SLB entries, and whether they are user or kernel entries. These are used to allocate free SLB entries first, before resorting to the round robin allocator. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/paca.h | 6 ++- arch/powerpc/kernel/asm-offsets.c | 2 +- arch/powerpc/mm/slb.c | 64 ++++++++++++++++++++++++------- arch/powerpc/xmon/xmon.c | 4 +- 4 files changed, 59 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 8144d673541a..6d6b3706232c 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -113,7 +113,10 @@ struct paca_struct { * on the linear mapping */ /* SLB related definitions */ u16 vmalloc_sllp; - u16 slb_cache_ptr; + u8 slb_cache_ptr; + u8 stab_rr; /* stab/slb round-robin counter */ + u32 slb_used_bitmap; /* Bitmaps for first 32 SLB entries. 
*/ + u32 slb_kern_bitmap; u32 slb_cache[SLB_CACHE_ENTRIES]; #endif /* CONFIG_PPC_BOOK3S_64 */ @@ -148,7 +151,6 @@ struct paca_struct { */ struct task_struct *__current; /* Pointer to current */ u64 kstack; /* Saved Kernel stack addr */ - u64 stab_rr; /* stab/slb round-robin counter */ u64 saved_r1; /* r1 save for RTAS calls or PM or EE=0 */ u64 saved_msr; /* MSR saved here by enter_rtas */ u16 trap_save; /* Used when bad stack is encountered */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index ce3ac40fd96e..ba9d0fc98730 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -173,7 +173,6 @@ int main(void) OFFSET(PACAKSAVE, paca_struct, kstack); OFFSET(PACACURRENT, paca_struct, __current); OFFSET(PACASAVEDMSR, paca_struct, saved_msr); - OFFSET(PACASTABRR, paca_struct, stab_rr); OFFSET(PACAR1, paca_struct, saved_r1); OFFSET(PACATOC, paca_struct, kernel_toc); OFFSET(PACAKBASE, paca_struct, kernelbase); @@ -203,6 +202,7 @@ int main(void) #ifdef CONFIG_PPC_BOOK3S_64 OFFSET(PACASLBCACHE, paca_struct, slb_cache); OFFSET(PACASLBCACHEPTR, paca_struct, slb_cache_ptr); + OFFSET(PACASTABRR, paca_struct, stab_rr); OFFSET(PACAVMALLOCSLLP, paca_struct, vmalloc_sllp); #ifdef CONFIG_PPC_MM_SLICES OFFSET(MMUPSIZESLLP, mmu_psize_def, sllp); diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 1347ab86d32e..5bfbd3f61312 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -122,6 +122,9 @@ void slb_restore_bolted_realmode(void) { __slb_restore_bolted_realmode(); get_paca()->slb_cache_ptr = 0; + + get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; + get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; } /* @@ -129,9 +132,6 @@ void slb_restore_bolted_realmode(void) */ void slb_flush_all_realmode(void) { - /* - * This flushes all SLB entries including 0, so it must be realmode. - */ asm volatile("slbmte %0,%0; slbia" : : "r" (0)); } @@ -177,6 +177,9 @@ void slb_flush_and_rebolt(void) : "memory"); get_paca()->slb_cache_ptr = 0; + + get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; + get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; } void slb_save_contents(struct slb_entry *slb_ptr) @@ -209,7 +212,7 @@ void slb_dump_contents(struct slb_entry *slb_ptr) return; pr_err("SLB contents of cpu 0x%x\n", smp_processor_id()); - pr_err("Last SLB entry inserted at slot %lld\n", get_paca()->stab_rr); + pr_err("Last SLB entry inserted at slot %u\n", get_paca()->stab_rr); for (i = 0; i < mmu_slb_size; i++) { e = slb_ptr->esid; @@ -342,10 +345,13 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) "isync" :: "r"(ksp_vsid_data), "r"(ksp_esid_data)); + + get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; } get_paca()->slb_cache_ptr = 0; } + get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; /* * preload some userspace segments into the SLB. @@ -418,6 +424,8 @@ void slb_initialize(void) } get_paca()->stab_rr = SLB_NUM_BOLTED - 1; + get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; + get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; lflags = SLB_VSID_KERNEL | linear_llp; @@ -469,17 +477,47 @@ static void slb_cache_update(unsigned long esid_data) } } -static enum slb_index alloc_slb_index(void) +static enum slb_index alloc_slb_index(bool kernel) { enum slb_index index; - /* round-robin replacement of slb starting at SLB_NUM_BOLTED. 
*/ - index = get_paca()->stab_rr; - if (index < (mmu_slb_size - 1)) - index++; - else - index = SLB_NUM_BOLTED; - get_paca()->stab_rr = index; + /* + * The allocation bitmaps can become out of synch with the SLB + * when the _switch code does slbie when bolting a new stack + * segment and it must not be anywhere else in the SLB. This leaves + * a kernel allocated entry that is unused in the SLB. With very + * large systems or small segment sizes, the bitmaps could slowly + * fill with these entries. They will eventually be cleared out + * by the round robin allocator in that case, so it's probably not + * worth accounting for. + */ + + /* + * SLBs beyond 32 entries are allocated with stab_rr only + * POWER7/8/9 have 32 SLB entries, this could be expanded if a + * future CPU has more. + */ + if (get_paca()->slb_used_bitmap != U32_MAX) { + index = ffz(get_paca()->slb_used_bitmap); + get_paca()->slb_used_bitmap |= 1U << index; + if (kernel) + get_paca()->slb_kern_bitmap |= 1U << index; + } else { + /* round-robin replacement of slb starting at SLB_NUM_BOLTED. */ + index = get_paca()->stab_rr; + if (index < (mmu_slb_size - 1)) + index++; + else + index = SLB_NUM_BOLTED; + get_paca()->stab_rr = index; + if (index < 32) { + if (kernel) + get_paca()->slb_kern_bitmap |= 1U << index; + else + get_paca()->slb_kern_bitmap &= ~(1U << index); + } + } + BUG_ON(index < SLB_NUM_BOLTED); return index; } @@ -495,7 +533,7 @@ static long slb_insert_entry(unsigned long ea, unsigned long context, if (!vsid) return -EFAULT; - index = alloc_slb_index(); + index = alloc_slb_index(kernel); vsid_data = __mk_vsid_data(vsid, ssize, flags); esid_data = mk_esid_data(ea, ssize, index); diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index cd43c168dc1b..ad6a549a3080 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2393,6 +2393,9 @@ static void dump_one_paca(int cpu) } } DUMP(p, vmalloc_sllp, "%#-*x"); + DUMP(p, stab_rr, "%#-*x"); + DUMP(p, slb_used_bitmap, "%#-*x"); + DUMP(p, slb_kern_bitmap, "%#-*x"); if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) { DUMP(p, slb_cache_ptr, "%#-*x"); @@ -2415,7 +2418,6 @@ static void dump_one_paca(int cpu) DUMP(p, __current, "%-*px"); DUMP(p, kstack, "%#-*llx"); printf(" %-*s = 0x%016llx\n", 25, "kstack_base", p->kstack & ~(THREAD_SIZE - 1)); - DUMP(p, stab_rr, "%#-*llx"); DUMP(p, saved_r1, "%#-*llx"); DUMP(p, trap_save, "%#-*x"); DUMP(p, irq_soft_mask, "%#-*x"); From e83cbf7fb7d17618a5d8a415d5c7bb760812a5cb Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 15 Sep 2018 01:30:54 +1000 Subject: [PATCH 029/221] powerpc/64s: xmon do not dump hash fields when using radix mode Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/xmon/xmon.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index ad6a549a3080..694c1d92e796 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2378,30 +2378,32 @@ static void dump_one_paca(int cpu) DUMP(p, cpu_start, "%#-*x"); DUMP(p, kexec_state, "%#-*x"); #ifdef CONFIG_PPC_BOOK3S_64 - for (i = 0; i < SLB_NUM_BOLTED; i++) { - u64 esid, vsid; + if (!early_radix_enabled()) { + for (i = 0; i < SLB_NUM_BOLTED; i++) { + u64 esid, vsid; - if (!p->slb_shadow_ptr) - continue; + if (!p->slb_shadow_ptr) + continue; - esid = be64_to_cpu(p->slb_shadow_ptr->save_area[i].esid); - vsid = be64_to_cpu(p->slb_shadow_ptr->save_area[i].vsid); + esid = 
be64_to_cpu(p->slb_shadow_ptr->save_area[i].esid); + vsid = be64_to_cpu(p->slb_shadow_ptr->save_area[i].vsid); - if (esid || vsid) { - printf(" %-*s[%d] = 0x%016llx 0x%016llx\n", - 22, "slb_shadow", i, esid, vsid); + if (esid || vsid) { + printf(" %-*s[%d] = 0x%016llx 0x%016llx\n", + 22, "slb_shadow", i, esid, vsid); + } } - } - DUMP(p, vmalloc_sllp, "%#-*x"); - DUMP(p, stab_rr, "%#-*x"); - DUMP(p, slb_used_bitmap, "%#-*x"); - DUMP(p, slb_kern_bitmap, "%#-*x"); + DUMP(p, vmalloc_sllp, "%#-*x"); + DUMP(p, stab_rr, "%#-*x"); + DUMP(p, slb_used_bitmap, "%#-*x"); + DUMP(p, slb_kern_bitmap, "%#-*x"); - if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) { - DUMP(p, slb_cache_ptr, "%#-*x"); - for (i = 0; i < SLB_CACHE_ENTRIES; i++) - printf(" %-*s[%d] = 0x%016x\n", - 22, "slb_cache", i, p->slb_cache[i]); + if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) { + DUMP(p, slb_cache_ptr, "%#-*x"); + for (i = 0; i < SLB_CACHE_ENTRIES; i++) + printf(" %-*s[%d] = 0x%016x\n", + 22, "slb_cache", i, p->slb_cache[i]); + } } DUMP(p, rfi_flush_fallback_area, "%-*px"); From 2e1626744e8da01eb5a2a0aaa3f365e41f1feb49 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 15 Sep 2018 01:30:55 +1000 Subject: [PATCH 030/221] powerpc/64s/hash: provide arch_setup_exec hooks for hash slice setup This will be used by the SLB code in the next patch, but for now this sets the slb_addr_limit to the correct size for 32-bit tasks. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 2 ++ arch/powerpc/include/asm/slice.h | 1 + arch/powerpc/include/asm/thread_info.h | 6 ++++++ arch/powerpc/kernel/process.c | 9 +++++++++ arch/powerpc/mm/mmu_context_book3s64.c | 5 +++++ arch/powerpc/mm/slice.c | 14 ++++++++++++++ 6 files changed, 37 insertions(+) diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index d3064c7d1b1f..bbeaf6adf93c 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -487,6 +487,8 @@ int htab_remove_mapping(unsigned long vstart, unsigned long vend, extern void pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages); extern void demote_segment_4k(struct mm_struct *mm, unsigned long addr); +extern void hash__setup_new_exec(void); + #ifdef CONFIG_PPC_PSERIES void hpte_init_pseries(void); #else diff --git a/arch/powerpc/include/asm/slice.h b/arch/powerpc/include/asm/slice.h index e40406cf5628..a595461c9cb0 100644 --- a/arch/powerpc/include/asm/slice.h +++ b/arch/powerpc/include/asm/slice.h @@ -32,6 +32,7 @@ void slice_set_range_psize(struct mm_struct *mm, unsigned long start, unsigned long len, unsigned int psize); void slice_init_new_context_exec(struct mm_struct *mm); +void slice_setup_new_exec(void); #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 3c0002044bc9..f9a442bb5a72 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -72,6 +72,12 @@ static inline struct thread_info *current_thread_info(void) } extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); + +#ifdef CONFIG_PPC_BOOK3S_64 +void arch_setup_new_exec(void); +#define arch_setup_new_exec arch_setup_new_exec +#endif + #endif /* __ASSEMBLY__ */ /* diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 913c5725cdb2..e4feb45ae4c6 100644 --- a/arch/powerpc/kernel/process.c +++ 
b/arch/powerpc/kernel/process.c @@ -1482,6 +1482,15 @@ void flush_thread(void) #endif /* CONFIG_HAVE_HW_BREAKPOINT */ } +#ifdef CONFIG_PPC_BOOK3S_64 +void arch_setup_new_exec(void) +{ + if (radix_enabled()) + return; + hash__setup_new_exec(); +} +#endif + int set_thread_uses_vas(void) { #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index dbd8f762140b..f7352c66b6b8 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -84,6 +84,11 @@ static int hash__init_new_context(struct mm_struct *mm) return index; } +void hash__setup_new_exec(void) +{ + slice_setup_new_exec(); +} + static int radix__init_new_context(struct mm_struct *mm) { unsigned long rts_field; diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 606f424aac47..fc5b3a1ec666 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -746,6 +746,20 @@ void slice_init_new_context_exec(struct mm_struct *mm) bitmap_fill(mask->high_slices, SLICE_NUM_HIGH); } +#ifdef CONFIG_PPC_BOOK3S_64 +void slice_setup_new_exec(void) +{ + struct mm_struct *mm = current->mm; + + slice_dbg("slice_setup_new_exec(mm=%p)\n", mm); + + if (!is_32bit_task()) + return; + + mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW; +} +#endif + void slice_set_range_psize(struct mm_struct *mm, unsigned long start, unsigned long len, unsigned int psize) { From 89ca4e126a3f519ccbd42670b38d78700802c10b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 15 Sep 2018 01:30:56 +1000 Subject: [PATCH 031/221] powerpc/64s/hash: Add a SLB preload cache When switching processes, currently all user SLBEs are cleared, and a few (exec_base, pc, and stack) are preloaded. In trivial testing with small apps, this tends to miss the heap and low 256MB segments, and it will also miss commonly accessed segments on large memory workloads. Add a simple round-robin preload cache that just inserts the last SLB miss into the head of the cache and preloads those at context switch time. Every 256 context switches, the oldest entry is removed from the cache to shrink the cache and require fewer slbmte if they are unused. Much more could go into this, including into the SLB entry reclaim side to track some LRU information etc, which would require a study of large memory workloads. But this is a simple thing we can do now that is an obvious win for common workloads. With the full series, process switching speed on the context_switch benchmark on POWER9/hash (with kernel speculation security masures disabled) increases from 140K/s to 178K/s (27%). POWER8 does not change much (within 1%), it's unclear why it does not see a big gain like POWER9. Booting to busybox init with 256MB segments has SLB misses go down from 945 to 69, and with 1T segments 900 to 21. These could almost all be eliminated by preloading a bit more carefully with ELF binary loading. 
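As a rough illustration of the policy described above, the following stand-alone sketch models the 16-entry ring buffer and the aging step. It is an editorial sketch rather than the kernel code: the cache geometry and the add/age operations mirror the thread_info fields this patch introduces, but the struct, the user-space types and the main() harness are assumptions made only so the model compiles on its own.

/*
 * Sketch of the round-robin preload cache (user-space model, not kernel code).
 * The oldest entry sits at 'tail'; new entries go to (tail + nr) % SLB_PRELOAD_NR.
 */
#include <stdio.h>

#define SLB_PRELOAD_NR 16U

struct preload_cache {
	unsigned char nr;	/* number of valid entries */
	unsigned char tail;	/* index of the oldest entry */
	unsigned long esid[SLB_PRELOAD_NR];
};

/* Record an ESID unless it is already cached; when full, evict the oldest. */
static void preload_add(struct preload_cache *c, unsigned long esid)
{
	unsigned char i, idx;

	for (i = 0; i < c->nr; i++) {
		idx = (c->tail + i) % SLB_PRELOAD_NR;
		if (c->esid[idx] == esid)
			return;		/* already present */
	}

	idx = (c->tail + c->nr) % SLB_PRELOAD_NR;
	c->esid[idx] = esid;
	if (c->nr == SLB_PRELOAD_NR)
		c->tail = (c->tail + 1) % SLB_PRELOAD_NR;	/* overwrote the oldest slot */
	else
		c->nr++;
}

/* Drop the oldest entry; in the kernel this runs when the per-thread
 * 8-bit context-switch counter wraps, i.e. every 256 switches. */
static void preload_age(struct preload_cache *c)
{
	if (!c->nr)
		return;
	c->nr--;
	c->tail = (c->tail + 1) % SLB_PRELOAD_NR;
}

int main(void)
{
	struct preload_cache c = { 0 };

	preload_add(&c, 0x10000000UL >> 28);	/* typical ELF link address */
	preload_add(&c, 0x7fffd0000000UL >> 28);	/* a stack segment */
	preload_age(&c);
	printf("entries cached: %u\n", (unsigned)c.nr);
	return 0;
}

The modulo arithmetic keeps both insertion and eviction O(1): overwriting the slot at tail and advancing tail drops the oldest ESID without shifting the array, which is why the kernel version can afford to run this on every SLB miss and context switch.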
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/processor.h | 1 + arch/powerpc/include/asm/thread_info.h | 5 + arch/powerpc/kernel/process.c | 7 + arch/powerpc/mm/mmu_context_book3s64.c | 4 + arch/powerpc/mm/slb.c | 170 +++++++++++++++++++------ 5 files changed, 145 insertions(+), 42 deletions(-) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 52fadded5c1e..350c584ca179 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -273,6 +273,7 @@ struct thread_struct { #endif /* CONFIG_HAVE_HW_BREAKPOINT */ struct arch_hw_breakpoint hw_brk; /* info on the hardware breakpoint */ unsigned long trap_nr; /* last trap # on this thread */ + u8 load_slb; /* Ages out SLB preload cache entries */ u8 load_fp; #ifdef CONFIG_ALTIVEC u8 load_vec; diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index f9a442bb5a72..406eb952b808 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -29,6 +29,7 @@ #include #include +#define SLB_PRELOAD_NR 16U /* * low level task data. */ @@ -44,6 +45,10 @@ struct thread_info { #if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC32) struct cpu_accounting_data accounting; #endif + u8 slb_preload_nr; + u8 slb_preload_tail; + u32 slb_preload_esid[SLB_PRELOAD_NR]; + /* low level flags - has atomic operations done on it */ unsigned long flags ____cacheline_aligned_in_smp; }; diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index e4feb45ae4c6..03c2e1f134bc 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1719,6 +1719,8 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, return 0; } +void preload_new_slb_context(unsigned long start, unsigned long sp); + /* * Set up a thread for executing a new program */ @@ -1726,6 +1728,10 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) { #ifdef CONFIG_PPC64 unsigned long load_addr = regs->gpr[2]; /* saved by ELF_PLAT_INIT */ + +#ifdef CONFIG_PPC_BOOK3S_64 + preload_new_slb_context(start, sp); +#endif #endif /* @@ -1816,6 +1822,7 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) #ifdef CONFIG_VSX current->thread.used_vsr = 0; #endif + current->thread.load_slb = 0; current->thread.load_fp = 0; memset(¤t->thread.fp_state, 0, sizeof(current->thread.fp_state)); current->thread.fp_save_area = NULL; diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index f7352c66b6b8..510f103d7813 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -53,6 +53,8 @@ int hash__alloc_context_id(void) } EXPORT_SYMBOL_GPL(hash__alloc_context_id); +void slb_setup_new_exec(void); + static int hash__init_new_context(struct mm_struct *mm) { int index; @@ -87,6 +89,8 @@ static int hash__init_new_context(struct mm_struct *mm) void hash__setup_new_exec(void) { slice_setup_new_exec(); + + slb_setup_new_exec(); } static int radix__init_new_context(struct mm_struct *mm) diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 5bfbd3f61312..b438220c4336 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -257,41 +257,119 @@ void slb_vmalloc_update(void) slb_flush_and_rebolt(); } -/* Helper function to compare esids. There are four cases to handle. - * 1. The system is not 1T segment size capable. 
Use the GET_ESID compare. - * 2. The system is 1T capable, both addresses are < 1T, use the GET_ESID compare. - * 3. The system is 1T capable, only one of the two addresses is > 1T. This is not a match. - * 4. The system is 1T capable, both addresses are > 1T, use the GET_ESID_1T macro to compare. - */ -static inline int esids_match(unsigned long addr1, unsigned long addr2) +static bool preload_hit(struct thread_info *ti, unsigned long esid) { - int esid_1t_count; + u8 i; - /* System is not 1T segment size capable. */ - if (!mmu_has_feature(MMU_FTR_1T_SEGMENT)) - return (GET_ESID(addr1) == GET_ESID(addr2)); + for (i = 0; i < ti->slb_preload_nr; i++) { + u8 idx; - esid_1t_count = (((addr1 >> SID_SHIFT_1T) != 0) + - ((addr2 >> SID_SHIFT_1T) != 0)); - - /* both addresses are < 1T */ - if (esid_1t_count == 0) - return (GET_ESID(addr1) == GET_ESID(addr2)); - - /* One address < 1T, the other > 1T. Not a match */ - if (esid_1t_count == 1) - return 0; - - /* Both addresses are > 1T. */ - return (GET_ESID_1T(addr1) == GET_ESID_1T(addr2)); + idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR; + if (esid == ti->slb_preload_esid[idx]) + return true; + } + return false; } +static bool preload_add(struct thread_info *ti, unsigned long ea) +{ + unsigned long esid; + u8 idx; + + if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) { + /* EAs are stored >> 28 so 256MB segments don't need clearing */ + if (ea & ESID_MASK_1T) + ea &= ESID_MASK_1T; + } + + esid = ea >> SID_SHIFT; + + if (preload_hit(ti, esid)) + return false; + + idx = (ti->slb_preload_tail + ti->slb_preload_nr) % SLB_PRELOAD_NR; + ti->slb_preload_esid[idx] = esid; + if (ti->slb_preload_nr == SLB_PRELOAD_NR) + ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR; + else + ti->slb_preload_nr++; + + return true; +} + +static void preload_age(struct thread_info *ti) +{ + if (!ti->slb_preload_nr) + return; + ti->slb_preload_nr--; + ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR; +} + +void slb_setup_new_exec(void) +{ + struct thread_info *ti = current_thread_info(); + struct mm_struct *mm = current->mm; + unsigned long exec = 0x10000000; + + /* + * We have no good place to clear the slb preload cache on exec, + * flush_thread is about the earliest arch hook but that happens + * after we switch to the mm and have aleady preloaded the SLBEs. + * + * For the most part that's probably okay to use entries from the + * previous exec, they will age out if unused. It may turn out to + * be an advantage to clear the cache before switching to it, + * however. + */ + + /* + * preload some userspace segments into the SLB. + * Almost all 32 and 64bit PowerPC executables are linked at + * 0x10000000 so it makes sense to preload this segment. + */ + if (!is_kernel_addr(exec)) { + if (preload_add(ti, exec)) + slb_allocate_user(mm, exec); + } + + /* Libraries and mmaps. */ + if (!is_kernel_addr(mm->mmap_base)) { + if (preload_add(ti, mm->mmap_base)) + slb_allocate_user(mm, mm->mmap_base); + } +} + +void preload_new_slb_context(unsigned long start, unsigned long sp) +{ + struct thread_info *ti = current_thread_info(); + struct mm_struct *mm = current->mm; + unsigned long heap = mm->start_brk; + + /* Userspace entry address. */ + if (!is_kernel_addr(start)) { + if (preload_add(ti, start)) + slb_allocate_user(mm, start); + } + + /* Top of stack, grows down. */ + if (!is_kernel_addr(sp)) { + if (preload_add(ti, sp)) + slb_allocate_user(mm, sp); + } + + /* Bottom of heap, grows up. 
*/ + if (heap && !is_kernel_addr(heap)) { + if (preload_add(ti, heap)) + slb_allocate_user(mm, heap); + } +} + + /* Flush all user entries from the segment table of the current processor. */ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) { - unsigned long pc = KSTK_EIP(tsk); - unsigned long stack = KSTK_ESP(tsk); - unsigned long exec_base; + struct thread_info *ti = task_thread_info(tsk); + u8 i; /* * We need interrupts hard-disabled here, not just soft-disabled, @@ -314,7 +392,6 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) && offset <= SLB_CACHE_ENTRIES) { unsigned long slbie_data = 0; - int i; asm volatile("isync" : : : "memory"); for (i = 0; i < offset; i++) { @@ -354,24 +431,28 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; /* - * preload some userspace segments into the SLB. - * Almost all 32 and 64bit PowerPC executables are linked at - * 0x10000000 so it makes sense to preload this segment. + * We gradually age out SLBs after a number of context switches to + * reduce reload overhead of unused entries (like we do with FP/VEC + * reload). Each time we wrap 256 switches, take an entry out of the + * SLB preload cache. */ - exec_base = 0x10000000; + tsk->thread.load_slb++; + if (!tsk->thread.load_slb) { + unsigned long pc = KSTK_EIP(tsk); - if (is_kernel_addr(pc) || is_kernel_addr(stack) || - is_kernel_addr(exec_base)) - return; + preload_age(ti); + preload_add(ti, pc); + } - slb_allocate_user(mm, pc); + for (i = 0; i < ti->slb_preload_nr; i++) { + unsigned long ea; + u8 idx; - if (!esids_match(pc, stack)) - slb_allocate_user(mm, stack); + idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR; + ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT; - if (!esids_match(pc, exec_base) && - !esids_match(stack, exec_base)) - slb_allocate_user(mm, exec_base); + slb_allocate_user(mm, ea); + } } void slb_set_size(u16 size) @@ -644,11 +725,16 @@ long do_slb_fault(struct pt_regs *regs, unsigned long ea) return slb_allocate_kernel(ea, id); } else { struct mm_struct *mm = current->mm; + long err; if (unlikely(!mm)) return -EFAULT; - return slb_allocate_user(mm, ea); + err = slb_allocate_user(mm, ea); + if (!err) + preload_add(current_thread_info(), ea); + + return err; } } From f2910f0e6835339e6ce82cef22fa15718b7e3bfa Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Sep 2018 15:08:52 +1000 Subject: [PATCH 032/221] powerpc: remove old GCC version checks GCC 4.6 is the minimum supported now. Signed-off-by: Nicholas Piggin Reviewed-by: Joel Stanley Signed-off-by: Michael Ellerman --- arch/powerpc/Makefile | 31 ++----------------------------- 1 file changed, 2 insertions(+), 29 deletions(-) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 11a1acba164a..2ecd0976914a 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -404,36 +404,9 @@ archprepare: checkbin # to stdout and these checks are run even on install targets. TOUT := .tmp_gas_check -# Check gcc and binutils versions: -# - gcc-3.4 and binutils-2.14 are a fatal combination -# - Require gcc 4.0 or above on 64-bit -# - gcc-4.2.0 has issues compiling modules on 64-bit +# Check toolchain versions: +# - gcc-4.6 is the minimum kernel-wide version so nothing required. checkbin: - @if test "$(cc-name)" != "clang" \ - && test "$(cc-version)" = "0304" ; then \ - if ! 
/bin/echo mftb 5 | $(AS) -v -mppc -many -o $(TOUT) >/dev/null 2>&1 ; then \ - echo -n '*** ${VERSION}.${PATCHLEVEL} kernels no longer build '; \ - echo 'correctly with gcc-3.4 and your version of binutils.'; \ - echo '*** Please upgrade your binutils or downgrade your gcc'; \ - false; \ - fi ; \ - fi - @if test "$(cc-name)" != "clang" \ - && test "$(cc-version)" -lt "0400" \ - && test "x${CONFIG_PPC64}" = "xy" ; then \ - echo -n "Sorry, GCC v4.0 or above is required to build " ; \ - echo "the 64-bit powerpc kernel." ; \ - false ; \ - fi - @if test "$(cc-name)" != "clang" \ - && test "$(cc-fullversion)" = "040200" \ - && test "x${CONFIG_MODULES}${CONFIG_PPC64}" = "xyy" ; then \ - echo -n '*** GCC-4.2.0 cannot compile the 64-bit powerpc ' ; \ - echo 'kernel with modules enabled.' ; \ - echo -n '*** Please use a different GCC version or ' ; \ - echo 'disable kernel modules' ; \ - false ; \ - fi @if test "x${CONFIG_CPU_LITTLE_ENDIAN}" = "xy" \ && $(LD) --version | head -1 | grep ' 2\.24$$' >/dev/null ; then \ echo -n '*** binutils 2.24 miscompiles weak symbols ' ; \ From 2a056f58fd33ccc6a0261b552b0f17e7fa4a12f3 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Sep 2018 15:08:53 +1000 Subject: [PATCH 033/221] powerpc: consolidate -mno-sched-epilog into FTRACE flags Signed-off-by: Nicholas Piggin Reviewed-by: Joel Stanley Signed-off-by: Michael Ellerman --- arch/powerpc/Makefile | 12 ++++++------ arch/powerpc/kernel/Makefile | 8 ++++---- arch/powerpc/kernel/trace/Makefile | 2 +- arch/powerpc/platforms/powermac/Makefile | 2 +- arch/powerpc/xmon/Makefile | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 2ecd0976914a..be47cf8a0798 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -160,8 +160,13 @@ else CFLAGS-$(CONFIG_GENERIC_CPU) += -mcpu=powerpc64 endif +ifdef CONFIG_FUNCTION_TRACER +CC_FLAGS_FTRACE := -pg ifdef CONFIG_MPROFILE_KERNEL - CC_FLAGS_FTRACE := -pg -mprofile-kernel +CC_FLAGS_FTRACE += -mprofile-kernel +endif +# Work around a gcc code-gen bug with -fno-omit-frame-pointer. +CC_FLAGS_FTRACE += -mno-sched-epilog endif CFLAGS-$(CONFIG_TARGET_CPU_BOOL) += $(call cc-option,-mcpu=$(CONFIG_TARGET_CPU)) @@ -229,11 +234,6 @@ ifdef CONFIG_6xx KBUILD_CFLAGS += -mcpu=powerpc endif -# Work around a gcc code-gen bug with -fno-omit-frame-pointer. 
-ifdef CONFIG_FUNCTION_TRACER -KBUILD_CFLAGS += -mno-sched-epilog -endif - cpu-as-$(CONFIG_4xx) += -Wa,-m405 cpu-as-$(CONFIG_ALTIVEC) += $(call as-option,-Wa$(comma)-maltivec) cpu-as-$(CONFIG_E200) += -Wa,-me200 diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 3b66f2c19c84..1e64cfe22a83 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -22,10 +22,10 @@ CFLAGS_prom.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) ifdef CONFIG_FUNCTION_TRACER # Do not trace early boot code -CFLAGS_REMOVE_cputable.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_prom_init.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_btext.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_prom.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_cputable.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_prom_init.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_btext.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_prom.o = $(CC_FLAGS_FTRACE) endif obj-y := cputable.o ptrace.o syscalls.o \ diff --git a/arch/powerpc/kernel/trace/Makefile b/arch/powerpc/kernel/trace/Makefile index d22d8bafb643..d868ba42032f 100644 --- a/arch/powerpc/kernel/trace/Makefile +++ b/arch/powerpc/kernel/trace/Makefile @@ -7,7 +7,7 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror ifdef CONFIG_FUNCTION_TRACER # do not trace tracer code -CFLAGS_REMOVE_ftrace.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) endif obj32-$(CONFIG_FUNCTION_TRACER) += ftrace_32.o diff --git a/arch/powerpc/platforms/powermac/Makefile b/arch/powerpc/platforms/powermac/Makefile index f2839eed0f89..561a67d65e4d 100644 --- a/arch/powerpc/platforms/powermac/Makefile +++ b/arch/powerpc/platforms/powermac/Makefile @@ -3,7 +3,7 @@ CFLAGS_bootx_init.o += -fPIC ifdef CONFIG_FUNCTION_TRACER # Do not trace early boot code -CFLAGS_REMOVE_bootx_init.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_bootx_init.o = $(CC_FLAGS_FTRACE) endif obj-y += pic.o setup.o time.o feature.o pci.o \ diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile index 1bc3abb237cd..93cc1f1b8b61 100644 --- a/arch/powerpc/xmon/Makefile +++ b/arch/powerpc/xmon/Makefile @@ -8,7 +8,7 @@ UBSAN_SANITIZE := n # Disable ftrace for the entire directory ORIG_CFLAGS := $(KBUILD_CFLAGS) -KBUILD_CFLAGS = $(subst -mno-sched-epilog,,$(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS))) +KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS)) ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) From 6977f95e63b9b3fb4a5973481a800dd9f48a1338 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Sep 2018 15:08:54 +1000 Subject: [PATCH 034/221] powerpc: avoid -mno-sched-epilog on GCC 4.9 and newer Signed-off-by: Nicholas Piggin Reviewed-by: Joel Stanley Signed-off-by: Michael Ellerman --- arch/powerpc/Makefile | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index be47cf8a0798..07d9dce7eda6 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -165,8 +165,12 @@ CC_FLAGS_FTRACE := -pg ifdef CONFIG_MPROFILE_KERNEL CC_FLAGS_FTRACE += -mprofile-kernel endif -# Work around a gcc code-gen bug with -fno-omit-frame-pointer. 
-CC_FLAGS_FTRACE += -mno-sched-epilog +# Work around gcc code-gen bugs with -pg / -fno-omit-frame-pointer in gcc <= 4.8 +# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=44199 +# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=52828 +ifneq ($(cc-name),clang) +CC_FLAGS_FTRACE += $(call cc-ifversion, -lt, 0409, -mno-sched-epilog) +endif endif CFLAGS-$(CONFIG_TARGET_CPU_BOOL) += $(call cc-option,-mcpu=$(CONFIG_TARGET_CPU)) From 063b8b1251fd069f3740339fca56119d218f11ba Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Fri, 20 Apr 2018 15:29:48 -0500 Subject: [PATCH 035/221] powerpc/pseries/memory-hotplug: Only update DT once per memory DLPAR request The updates to powerpc numa and memory hotplug code now use the in-kernel LMB array instead of the device tree. This change allows the pseries memory DLPAR code to only update the device tree once after successfully handling a DLPAR request. Prior to the in-kernel LMB array, the numa code looked up the affinity for memory being added in the device tree, the code now looks this up in the LMB array. This change means the memory hotplug code can just update the affinity for an LMB in the LMB array instead of updating the device tree. This also provides a savings in kernel memory. When updating the device tree old properties are never free'ed since there is no usecount on properties. This behavior leads to a new copy of the property being allocated every time a LMB is added or removed (i.e. a request to add 100 LMBs creates 100 new copies of the property). With this update only a single new property is created when a DLPAR request completes successfully. Signed-off-by: Nathan Fontenot Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/drmem.h | 5 ++ .../platforms/pseries/hotplug-memory.c | 55 ++++++------------- 2 files changed, 21 insertions(+), 39 deletions(-) diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h index ce242b9ea8c6..7c1d8e74b25d 100644 --- a/arch/powerpc/include/asm/drmem.h +++ b/arch/powerpc/include/asm/drmem.h @@ -99,4 +99,9 @@ void __init walk_drmem_lmbs_early(unsigned long node, void (*func)(struct drmem_lmb *, const __be32 **)); #endif +static inline void invalidate_lmb_associativity_index(struct drmem_lmb *lmb) +{ + lmb->aa_index = 0xffffffff; +} + #endif /* _ASM_POWERPC_LMB_H */ diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index c1578f54c626..9a15d39995e5 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -163,7 +163,7 @@ static u32 find_aa_index(struct device_node *dr_node, return aa_index; } -static u32 lookup_lmb_associativity_index(struct drmem_lmb *lmb) +static int update_lmb_associativity_index(struct drmem_lmb *lmb) { struct device_node *parent, *lmb_node, *dr_node; struct property *ala_prop; @@ -203,43 +203,14 @@ static u32 lookup_lmb_associativity_index(struct drmem_lmb *lmb) aa_index = find_aa_index(dr_node, ala_prop, lmb_assoc); dlpar_free_cc_nodes(lmb_node); - return aa_index; -} -static int dlpar_add_device_tree_lmb(struct drmem_lmb *lmb) -{ - int rc, aa_index; - - lmb->flags |= DRCONF_MEM_ASSIGNED; - - aa_index = lookup_lmb_associativity_index(lmb); if (aa_index < 0) { - pr_err("Couldn't find associativity index for drc index %x\n", - lmb->drc_index); - return aa_index; + pr_err("Could not find LMB associativity\n"); + return -1; } lmb->aa_index = aa_index; - - rtas_hp_event = true; - rc = drmem_update_dt(); - rtas_hp_event = false; - - return rc; 
-} - -static int dlpar_remove_device_tree_lmb(struct drmem_lmb *lmb) -{ - int rc; - - lmb->flags &= ~DRCONF_MEM_ASSIGNED; - lmb->aa_index = 0xffffffff; - - rtas_hp_event = true; - rc = drmem_update_dt(); - rtas_hp_event = false; - - return rc; + return 0; } static struct memory_block *lmb_to_memblock(struct drmem_lmb *lmb) @@ -428,7 +399,9 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb) /* Update memory regions for memory remove */ memblock_remove(lmb->base_addr, block_sz); - dlpar_remove_device_tree_lmb(lmb); + invalidate_lmb_associativity_index(lmb); + lmb->flags &= ~DRCONF_MEM_ASSIGNED; + return 0; } @@ -688,10 +661,8 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb) if (lmb->flags & DRCONF_MEM_ASSIGNED) return -EINVAL; - rc = dlpar_add_device_tree_lmb(lmb); + rc = update_lmb_associativity_index(lmb); if (rc) { - pr_err("Couldn't update device tree for drc index %x\n", - lmb->drc_index); dlpar_release_drc(lmb->drc_index); return rc; } @@ -704,14 +675,14 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb) /* Add the memory */ rc = add_memory(nid, lmb->base_addr, block_sz); if (rc) { - dlpar_remove_device_tree_lmb(lmb); + invalidate_lmb_associativity_index(lmb); return rc; } rc = dlpar_online_lmb(lmb); if (rc) { remove_memory(nid, lmb->base_addr, block_sz); - dlpar_remove_device_tree_lmb(lmb); + invalidate_lmb_associativity_index(lmb); } else { lmb->flags |= DRCONF_MEM_ASSIGNED; } @@ -958,6 +929,12 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog) break; } + if (!rc) { + rtas_hp_event = true; + rc = drmem_update_dt(); + rtas_hp_event = false; + } + unlock_device_hotplug(); return rc; } From cd24e457fd8b2d087d9236700c8d2957054598bf Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Mon, 10 Sep 2018 09:57:00 -0500 Subject: [PATCH 036/221] powerpc/pseries: Remove prrn_work workqueue When a PRRN event is received we are already running in a worker thread. Instead of spawning off another worker thread on the prrn_work workqueue to handle the PRRN event we can just call the PRRN handler routine directly. With this update we can also pass the scope variable for the PRRN event directly to the handler instead of it being a global variable. 
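In outline, the change replaces the old "stash the scope in a global, then queue more work" pattern with a direct call that takes the scope as an argument. The sketch below is an editorial condensation of the rtasd.c hunk further down, written as ordinary user-space C so it stands alone; the printf is a stand-in for the device-tree update and NUMA topology calls, which exist only in the kernel.

/* Sketch, not kernel code: the handler now receives the PRRN scope as a
 * parameter and is called directly from the RTAS event path, which is
 * already running in worker-thread context. */
#include <stdio.h>

typedef int s32;

static void handle_prrn_event(s32 scope)
{
	/* Kernel version: pseries_devicetree_update(-scope);
	 *                 numa_update_cpu_topology(false); */
	printf("PRRN device-tree update, scope %d\n", -scope);
}

static void handle_rtas_event(s32 extended_log_length)
{
	/* No global scope variable, no flush_work()/schedule_work() round trip. */
	handle_prrn_event(extended_log_length);
}

int main(void)
{
	handle_rtas_event(1);
	return 0;
}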
This patch fixes the following oops mnessage we are seeing in PRRN testing: Oops: Bad kernel stack pointer, sig: 6 [#1] SMP NR_CPUS=2048 NUMA pSeries Modules linked in: nfsv3 nfs_acl rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace sunrpc fscache binfmt_misc reiserfs vfat fat rpadlpar_io(X) rpaphp(X) tcp_diag udp_diag inet_diag unix_diag af_packet_diag netlink_diag af_packet xfs libcrc32c dm_service_time ibmveth(X) ses enclosure scsi_transport_sas rtc_generic btrfs xor raid6_pq sd_mod ibmvscsi(X) scsi_transport_srp ipr(X) libata sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc scsi_dh_alua scsi_mod autofs4 Supported: Yes, External 54 CPU: 7 PID: 18967 Comm: kworker/u96:0 Tainted: G X 4.4.126-94.22-default #1 Workqueue: pseries hotplug workque pseries_hp_work_fn task: c000000775367790 ti: c00000001ebd4000 task.ti: c00000070d140000 NIP: 0000000000000000 LR: 000000001fb3d050 CTR: 0000000000000000 REGS: c00000001ebd7d40 TRAP: 0700 Tainted: G X (4.4.126-94.22-default) MSR: 8000000102081000 <41,VEC,ME5 CR: 28000002 XER: 20040018 4 CFAR: 000000001fb3d084 40 419 1 3 GPR00: 000000000000000040000000000010007 000000001ffff400 000000041fffe200 GPR04: 000000000000008050000000000000000 000000001fb15fa8 0000000500000500 GPR08: 000000000001f40040000000000000001 0000000000000000 000005:5200040002 GPR12: 00000000000000005c000000007a05400 c0000000000e89f8 000000001ed9f668 GPR16: 000000001fbeff944000000001fbeff94 000000001fb545e4 0000006000000060 GPR20: ffffffffffffffff4ffffffffffffffff 0000000000000000 0000000000000000 GPR24: 00000000000000005400000001fb3c000 0000000000000000 000000001fb1b040 GPR28: 000000001fb240004000000001fb440d8 0000000000000008 0000000000000000 NIP [0000000000000000] 5 (null) LR [000000001fb3d050] 031fb3d050 Call Trace: 4 Instruction dump: 4 5:47 12 2 XXXXXXXX XXXXXXXX XXXXX4XX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXX5XX XXXXXXXX 60000000 60000000 60000000 60000000 ---[ end trace aa5627b04a7d9d6b ]--- 3NMI watchdog: BUG: soft lockup - CPU#27 stuck for 23s! 
[kworker/27:0:13903] Modules linked in: nfsv3 nfs_acl rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace sunrpc fscache binfmt_misc reiserfs vfat fat rpadlpar_io(X) rpaphp(X) tcp_diag udp_diag inet_diag unix_diag af_packet_diag netlink_diag af_packet xfs libcrc32c dm_service_time ibmveth(X) ses enclosure scsi_transport_sas rtc_generic btrfs xor raid6_pq sd_mod ibmvscsi(X) scsi_transport_srp ipr(X) libata sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc scsi_dh_alua scsi_mod autofs4 Supported: Yes, External CPU: 27 PID: 13903 Comm: kworker/27:0 Tainted: G D X 4.4.126-94.22-default #1 Workqueue: events prrn_work_fn task: c000000747cfa390 ti: c00000074712c000 task.ti: c00000074712c000 NIP: c0000000008002a8 LR: c000000000090770 CTR: 000000000032e088 REGS: c00000074712f7b0 TRAP: 0901 Tainted: G D X (4.4.126-94.22-default) MSR: 8000000100009033 CR: 22482044 XER: 20040000 CFAR: c0000000008002c4 SOFTE: 1 GPR00: c000000000090770 c00000074712fa30 c000000000f09800 c000000000fa1928 6:02 GPR04: c000000775f5e000 fffffffffffffffe 0000000000000001 c000000000f42db8 GPR08: 0000000000000001 0000000080000007 0000000000000000 0000000000000000 GPR12: 8006210083180000 c000000007a14400 NIP [c0000000008002a8] _raw_spin_lock+0x68/0xd0 LR [c000000000090770] mobility_rtas_call+0x50/0x100 Call Trace: 59 5 [c00000074712fa60] [c000000000090770] mobility_rtas_call+0x50/0x100 [c00000074712faf0] [c000000000090b08] pseries_devicetree_update+0xf8/0x530 [c00000074712fc20] [c000000000031ba4] prrn_work_fn+0x34/0x50 [c00000074712fc40] [c0000000000e0390] process_one_work+0x1a0/0x4e0 [c00000074712fcd0] [c0000000000e0870] worker_thread+0x1a0/0x6105:57 2 [c00000074712fd80] [c0000000000e8b18] kthread+0x128/0x150 [c00000074712fe30] [c0000000000096f8] ret_from_kernel_thread+0x5c/0x64 Instruction dump: 2c090000 40c20010 7d40192d 40c2fff0 7c2004ac 2fa90000 40de0018 5:540030 3 e8010010 ebe1fff8 7c0803a6 4e800020 <7c210b78> e92d0000 89290009 792affe3 Signed-off-by: John Allen Signed-off-by: Haren Myneni Signed-off-by: Nathan Fontenot Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/rtasd.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index 44d66c33d59d..23b88b923f06 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -274,27 +274,16 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal) } #ifdef CONFIG_PPC_PSERIES -static s32 prrn_update_scope; - -static void prrn_work_fn(struct work_struct *work) +static void handle_prrn_event(s32 scope) { /* * For PRRN, we must pass the negative of the scope value in * the RTAS event. */ - pseries_devicetree_update(-prrn_update_scope); + pseries_devicetree_update(-scope); numa_update_cpu_topology(false); } -static DECLARE_WORK(prrn_work, prrn_work_fn); - -static void prrn_schedule_update(u32 scope) -{ - flush_work(&prrn_work); - prrn_update_scope = scope; - schedule_work(&prrn_work); -} - static void handle_rtas_event(const struct rtas_error_log *log) { if (rtas_error_type(log) != RTAS_TYPE_PRRN || !prrn_is_enabled()) @@ -303,7 +292,7 @@ static void handle_rtas_event(const struct rtas_error_log *log) /* For PRRN Events the extended log length is used to denote * the scope for calling rtas update-nodes. 
*/ - prrn_schedule_update(rtas_error_extended_log_length(log)); + handle_prrn_event(rtas_error_extended_log_length(log)); } #else From fd12527a1da42dcb906b694e01794e8d438f7d10 Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Mon, 10 Sep 2018 09:57:07 -0500 Subject: [PATCH 037/221] powerpc/pseries: Remove unneeded uses of dlpar work queue There are three instances in which dlpar hotplug events are invoked; handling a hotplug interrupt (in a kvm guest), handling a dlpar request through sysfs, and updating LMB affinity when handling a PRRN event. Only in the case of handling a hotplug interrupt do we have to put the work on a workqueue, the other cases can handle the dlpar request directly. This patch exports the handle_dlpar_errorlog() function so that dlpar hotplug events can be handled directly and updates the two instances mentioned above to use the direct invocation. Signed-off-by: Nathan Fontenot Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/dlpar.c | 37 ++++++----------------- arch/powerpc/platforms/pseries/mobility.c | 18 ++++------- arch/powerpc/platforms/pseries/pseries.h | 5 +-- arch/powerpc/platforms/pseries/ras.c | 2 +- 4 files changed, 19 insertions(+), 43 deletions(-) diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index a0b20c03f078..052c4f2ba0a0 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -32,8 +32,6 @@ static struct workqueue_struct *pseries_hp_wq; struct pseries_hp_work { struct work_struct work; struct pseries_hp_errorlog *errlog; - struct completion *hp_completion; - int *rc; }; struct cc_workarea { @@ -329,7 +327,7 @@ int dlpar_release_drc(u32 drc_index) return 0; } -static int handle_dlpar_errorlog(struct pseries_hp_errorlog *hp_elog) +int handle_dlpar_errorlog(struct pseries_hp_errorlog *hp_elog) { int rc; @@ -371,20 +369,13 @@ static void pseries_hp_work_fn(struct work_struct *work) struct pseries_hp_work *hp_work = container_of(work, struct pseries_hp_work, work); - if (hp_work->rc) - *(hp_work->rc) = handle_dlpar_errorlog(hp_work->errlog); - else - handle_dlpar_errorlog(hp_work->errlog); - - if (hp_work->hp_completion) - complete(hp_work->hp_completion); + handle_dlpar_errorlog(hp_work->errlog); kfree(hp_work->errlog); kfree((void *)work); } -void queue_hotplug_event(struct pseries_hp_errorlog *hp_errlog, - struct completion *hotplug_done, int *rc) +void queue_hotplug_event(struct pseries_hp_errorlog *hp_errlog) { struct pseries_hp_work *work; struct pseries_hp_errorlog *hp_errlog_copy; @@ -397,13 +388,9 @@ void queue_hotplug_event(struct pseries_hp_errorlog *hp_errlog, if (work) { INIT_WORK((struct work_struct *)work, pseries_hp_work_fn); work->errlog = hp_errlog_copy; - work->hp_completion = hotplug_done; - work->rc = rc; queue_work(pseries_hp_wq, (struct work_struct *)work); } else { - *rc = -ENOMEM; kfree(hp_errlog_copy); - complete(hotplug_done); } } @@ -521,18 +508,15 @@ static int dlpar_parse_id_type(char **cmd, struct pseries_hp_errorlog *hp_elog) static ssize_t dlpar_store(struct class *class, struct class_attribute *attr, const char *buf, size_t count) { - struct pseries_hp_errorlog *hp_elog; - struct completion hotplug_done; + struct pseries_hp_errorlog hp_elog; char *argbuf; char *args; int rc; args = argbuf = kstrdup(buf, GFP_KERNEL); - hp_elog = kzalloc(sizeof(*hp_elog), GFP_KERNEL); - if (!hp_elog || !argbuf) { + if (!argbuf) { pr_info("Could not allocate resources for DLPAR operation\n"); kfree(argbuf); - kfree(hp_elog); 
return -ENOMEM; } @@ -540,25 +524,22 @@ static ssize_t dlpar_store(struct class *class, struct class_attribute *attr, * Parse out the request from the user, this will be in the form: * */ - rc = dlpar_parse_resource(&args, hp_elog); + rc = dlpar_parse_resource(&args, &hp_elog); if (rc) goto dlpar_store_out; - rc = dlpar_parse_action(&args, hp_elog); + rc = dlpar_parse_action(&args, &hp_elog); if (rc) goto dlpar_store_out; - rc = dlpar_parse_id_type(&args, hp_elog); + rc = dlpar_parse_id_type(&args, &hp_elog); if (rc) goto dlpar_store_out; - init_completion(&hotplug_done); - queue_hotplug_event(hp_elog, &hotplug_done, &rc); - wait_for_completion(&hotplug_done); + rc = handle_dlpar_errorlog(&hp_elog); dlpar_store_out: kfree(argbuf); - kfree(hp_elog); if (rc) pr_err("Could not handle DLPAR request \"%s\"\n", buf); diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index f0e30dc94988..6f27d00505cf 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -242,7 +242,7 @@ static int add_dt_node(__be32 parent_phandle, __be32 drc_index) static void prrn_update_node(__be32 phandle) { - struct pseries_hp_errorlog *hp_elog; + struct pseries_hp_errorlog hp_elog; struct device_node *dn; /* @@ -255,18 +255,12 @@ static void prrn_update_node(__be32 phandle) return; } - hp_elog = kzalloc(sizeof(*hp_elog), GFP_KERNEL); - if(!hp_elog) - return; + hp_elog.resource = PSERIES_HP_ELOG_RESOURCE_MEM; + hp_elog.action = PSERIES_HP_ELOG_ACTION_READD; + hp_elog.id_type = PSERIES_HP_ELOG_ID_DRC_INDEX; + hp_elog._drc_u.drc_index = phandle; - hp_elog->resource = PSERIES_HP_ELOG_RESOURCE_MEM; - hp_elog->action = PSERIES_HP_ELOG_ACTION_READD; - hp_elog->id_type = PSERIES_HP_ELOG_ID_DRC_INDEX; - hp_elog->_drc_u.drc_index = phandle; - - queue_hotplug_event(hp_elog, NULL, NULL); - - kfree(hp_elog); + handle_dlpar_errorlog(&hp_elog); } int pseries_devicetree_update(s32 scope) diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index 619f8f3fa173..72c0b8986536 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -60,8 +60,9 @@ extern int dlpar_detach_node(struct device_node *); extern int dlpar_acquire_drc(u32 drc_index); extern int dlpar_release_drc(u32 drc_index); -void queue_hotplug_event(struct pseries_hp_errorlog *hp_errlog, - struct completion *hotplug_done, int *rc); +void queue_hotplug_event(struct pseries_hp_errorlog *hp_errlog); +int handle_dlpar_errorlog(struct pseries_hp_errorlog *hp_errlog); + #ifdef CONFIG_MEMORY_HOTPLUG int dlpar_memory(struct pseries_hp_errorlog *hp_elog); #else diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 8d2ead2d7591..2a9c28e4d4f9 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -334,7 +334,7 @@ static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id) */ if (hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_MEM || hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_CPU) - queue_hotplug_event(hp_elog, NULL, NULL); + queue_hotplug_event(hp_elog); else log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); From 85a88cabad57d26d826dd94ea34d3a785824d802 Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Mon, 17 Sep 2018 14:14:02 -0500 Subject: [PATCH 038/221] powerpc/pseries: Disable CPU hotplug across migrations When performing partition migrations all present CPUs must be online as all present CPUs must make the H_JOIN 
call as part of the migration process. Once all present CPUs make the H_JOIN call, one CPU is returned to make the rtas call to perform the migration to the destination system. During testing of migration and changing the SMT state we have found instances where CPUs are offlined, as part of the SMT state change, before they make the H_JOIN call. This results in a hung system where every CPU is either in H_JOIN or offline. To prevent this, this patch disables CPU hotplug during the migration process. Signed-off-by: Nathan Fontenot Reviewed-by: Tyrel Datwyler Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/rtas.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 8afd146bc9c7..2c7ed31c736e 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -981,6 +981,7 @@ int rtas_ibm_suspend_me(u64 handle) goto out; } + cpu_hotplug_disable(); stop_topology_update(); /* Call function on all CPUs. One of us will make the @@ -995,6 +996,7 @@ int rtas_ibm_suspend_me(u64 handle) printk(KERN_ERR "Error doing global join\n"); start_topology_update(); + cpu_hotplug_enable(); /* Take down CPUs not online prior to suspend */ cpuret = rtas_offline_cpus_mask(offline_mask); From cbc39809a398b1a30c2bdb0f3123eef18c28a729 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Tue, 18 Sep 2018 13:06:17 +0930 Subject: [PATCH 039/221] powerpc/configs: Update skiroot defconfig Disable new features from recent releases, and clean out some other unused options: - Enable EXPERT, so we can disable some things - Disable non-powerpc BPF decoders - Disable TASKSTATS - Disable unused syscalls - Set more things to be modules - Turn off unused network vendors - PPC_OF_BOOT_TRAMPOLINE and FB_OF are unused on powernv - Drop unused Radeon and Matrox GPU drivers - IPV6 support landed in petitboot - Bringup related command line powersave=off dropped, switch to quiet Set CONFIG_I2C_CHARDEV=y as the module is not loaded automatically, and without this i2cget etc. will fail in the skiroot environment. This defconfig gets us build coverage of KERNEL_XZ, which was broken in the 4.19 merge window for powerpc.
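As a rough illustration of why the char-device interface matters here (a sketch only: the bus number and the 0x50 device address below are invented for the example, not taken from this patch), tools like i2cget talk to /dev/i2c-* nodes, which only exist when CONFIG_I2C_CHARDEV is enabled:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/i2c-dev.h>

int main(void)
{
	unsigned char reg = 0x00, val;
	/* Without CONFIG_I2C_CHARDEV this device node does not exist. */
	int fd = open("/dev/i2c-0", O_RDWR);

	if (fd < 0) {
		perror("open /dev/i2c-0");
		return 1;
	}
	/* Address an (assumed) device at 0x50, e.g. an EEPROM. */
	if (ioctl(fd, I2C_SLAVE, 0x50) < 0) {
		perror("I2C_SLAVE");
		return 1;
	}
	/* Write the register offset, then read one byte back. */
	if (write(fd, &reg, 1) != 1 || read(fd, &val, 1) != 1) {
		perror("i2c transfer");
		return 1;
	}
	printf("0x%02x\n", val);
	close(fd);
	return 0;
}

This is roughly what i2cget does underneath, which is why the char device has to be built in for the skiroot environment rather than left as an unloaded module.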
Signed-off-by: Joel Stanley Signed-off-by: Michael Ellerman --- arch/powerpc/configs/skiroot_defconfig | 154 +++++++++++++++++-------- 1 file changed, 108 insertions(+), 46 deletions(-) diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig index 6bd5e7261335..cfdd08897a06 100644 --- a/arch/powerpc/configs/skiroot_defconfig +++ b/arch/powerpc/configs/skiroot_defconfig @@ -3,20 +3,17 @@ CONFIG_ALTIVEC=y CONFIG_VSX=y CONFIG_NR_CPUS=2048 CONFIG_CPU_LITTLE_ENDIAN=y +CONFIG_KERNEL_XZ=y # CONFIG_SWAP is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y # CONFIG_CROSS_MEMORY_ATTACH is not set CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y -CONFIG_TASKSTATS=y -CONFIG_TASK_DELAY_ACCT=y -CONFIG_TASK_XACCT=y -CONFIG_TASK_IO_ACCOUNTING=y +# CONFIG_CPU_ISOLATION is not set CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=20 -CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_RD_GZIP is not set # CONFIG_RD_BZIP2 is not set @@ -24,8 +21,14 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_RD_LZO is not set # CONFIG_RD_LZ4 is not set CONFIG_CC_OPTIMIZE_FOR_SIZE=y +CONFIG_EXPERT=y +# CONFIG_SGETMASK_SYSCALL is not set +# CONFIG_SYSFS_SYSCALL is not set +# CONFIG_SHMEM is not set +# CONFIG_AIO is not set CONFIG_PERF_EVENTS=y # CONFIG_COMPAT_BRK is not set +CONFIG_SLAB_FREELIST_HARDENED=y CONFIG_JUMP_LABEL=y CONFIG_STRICT_KERNEL_RWX=y CONFIG_MODULES=y @@ -35,7 +38,9 @@ CONFIG_MODULE_SIG_FORCE=y CONFIG_MODULE_SIG_SHA512=y CONFIG_PARTITION_ADVANCED=y # CONFIG_IOSCHED_DEADLINE is not set +# CONFIG_PPC_VAS is not set # CONFIG_PPC_PSERIES is not set +# CONFIG_PPC_OF_BOOT_TRAMPOLINE is not set CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y CONFIG_CPU_IDLE=y CONFIG_HZ_100=y @@ -48,8 +53,9 @@ CONFIG_NUMA=y CONFIG_PPC_64K_PAGES=y CONFIG_SCHED_SMT=y CONFIG_CMDLINE_BOOL=y -CONFIG_CMDLINE="console=tty0 console=hvc0 powersave=off" +CONFIG_CMDLINE="console=tty0 console=hvc0 ipr.fast_reboot=1 quiet" # CONFIG_SECCOMP is not set +# CONFIG_PPC_MEM_KEYS is not set CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -60,7 +66,6 @@ CONFIG_SYN_COOKIES=y # CONFIG_INET_XFRM_MODE_TRANSPORT is not set # CONFIG_INET_XFRM_MODE_TUNNEL is not set # CONFIG_INET_XFRM_MODE_BEET is not set -# CONFIG_IPV6 is not set CONFIG_DNS_RESOLVER=y # CONFIG_WIRELESS is not set CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" @@ -73,8 +78,10 @@ CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=65536 CONFIG_VIRTIO_BLK=m CONFIG_BLK_DEV_NVME=m -CONFIG_EEPROM_AT24=y +CONFIG_NVME_MULTIPATH=y +CONFIG_EEPROM_AT24=m # CONFIG_CXL is not set +# CONFIG_OCXL is not set CONFIG_BLK_DEV_SD=m CONFIG_BLK_DEV_SR=m CONFIG_BLK_DEV_SR_VENDOR=y @@ -85,7 +92,6 @@ CONFIG_SCSI_FC_ATTRS=y CONFIG_SCSI_CXGB3_ISCSI=m CONFIG_SCSI_CXGB4_ISCSI=m CONFIG_SCSI_BNX2_ISCSI=m -CONFIG_BE2ISCSI=m CONFIG_SCSI_AACRAID=m CONFIG_MEGARAID_NEWGEN=y CONFIG_MEGARAID_MM=m @@ -102,7 +108,7 @@ CONFIG_SCSI_VIRTIO=m CONFIG_SCSI_DH=y CONFIG_SCSI_DH_ALUA=m CONFIG_ATA=y -CONFIG_SATA_AHCI=y +CONFIG_SATA_AHCI=m # CONFIG_ATA_SFF is not set CONFIG_MD=y CONFIG_BLK_DEV_MD=m @@ -119,25 +125,72 @@ CONFIG_DM_SNAPSHOT=m CONFIG_DM_MIRROR=m CONFIG_DM_ZERO=m CONFIG_DM_MULTIPATH=m +# CONFIG_NET_VENDOR_3COM is not set +# CONFIG_NET_VENDOR_ADAPTEC is not set +# CONFIG_NET_VENDOR_AGERE is not set +# CONFIG_NET_VENDOR_ALACRITECH is not set CONFIG_ACENIC=m CONFIG_ACENIC_OMIT_TIGON_I=y -CONFIG_TIGON3=y +# CONFIG_NET_VENDOR_AMAZON is not set +# CONFIG_NET_VENDOR_AMD is not set +# CONFIG_NET_VENDOR_AQUANTIA is not set +# CONFIG_NET_VENDOR_ARC is not set +# CONFIG_NET_VENDOR_ATHEROS is not set +CONFIG_TIGON3=m CONFIG_BNX2X=m 
-CONFIG_CHELSIO_T1=y +# CONFIG_NET_VENDOR_BROCADE is not set +# CONFIG_NET_CADENCE is not set +# CONFIG_NET_VENDOR_CAVIUM is not set +CONFIG_CHELSIO_T1=m +# CONFIG_NET_VENDOR_CISCO is not set +# CONFIG_NET_VENDOR_CORTINA is not set +# CONFIG_NET_VENDOR_DEC is not set +# CONFIG_NET_VENDOR_DLINK is not set CONFIG_BE2NET=m -CONFIG_S2IO=m -CONFIG_E100=m +# CONFIG_NET_VENDOR_EZCHIP is not set +# CONFIG_NET_VENDOR_HP is not set +# CONFIG_NET_VENDOR_HUAWEI is not set CONFIG_E1000=m -CONFIG_E1000E=m +CONFIG_IGB=m CONFIG_IXGB=m CONFIG_IXGBE=m +CONFIG_I40E=m +CONFIG_S2IO=m +# CONFIG_NET_VENDOR_MARVELL is not set CONFIG_MLX4_EN=m +# CONFIG_MLX4_CORE_GEN2 is not set CONFIG_MLX5_CORE=m -CONFIG_MLX5_CORE_EN=y +# CONFIG_NET_VENDOR_MICREL is not set CONFIG_MYRI10GE=m +# CONFIG_NET_VENDOR_NATSEMI is not set +# CONFIG_NET_VENDOR_NETRONOME is not set +# CONFIG_NET_VENDOR_NI is not set +# CONFIG_NET_VENDOR_NVIDIA is not set +# CONFIG_NET_VENDOR_OKI is not set +# CONFIG_NET_PACKET_ENGINE is not set CONFIG_QLGE=m CONFIG_NETXEN_NIC=m +# CONFIG_NET_VENDOR_QUALCOMM is not set +# CONFIG_NET_VENDOR_RDC is not set +# CONFIG_NET_VENDOR_REALTEK is not set +# CONFIG_NET_VENDOR_RENESAS is not set +# CONFIG_NET_VENDOR_ROCKER is not set +# CONFIG_NET_VENDOR_SAMSUNG is not set +# CONFIG_NET_VENDOR_SEEQ is not set CONFIG_SFC=m +# CONFIG_NET_VENDOR_SILAN is not set +# CONFIG_NET_VENDOR_SIS is not set +# CONFIG_NET_VENDOR_SMSC is not set +# CONFIG_NET_VENDOR_SOCIONEXT is not set +# CONFIG_NET_VENDOR_STMICRO is not set +# CONFIG_NET_VENDOR_SUN is not set +# CONFIG_NET_VENDOR_SYNOPSYS is not set +# CONFIG_NET_VENDOR_TEHUTI is not set +# CONFIG_NET_VENDOR_TI is not set +# CONFIG_NET_VENDOR_VIA is not set +# CONFIG_NET_VENDOR_WIZNET is not set +# CONFIG_NET_VENDOR_XILINX is not set +CONFIG_PHYLIB=y # CONFIG_USB_NET_DRIVERS is not set # CONFIG_WLAN is not set CONFIG_INPUT_EVDEV=y @@ -149,39 +202,51 @@ CONFIG_SERIAL_8250_CONSOLE=y CONFIG_IPMI_HANDLER=y CONFIG_IPMI_DEVICE_INTERFACE=y CONFIG_IPMI_POWERNV=y +CONFIG_IPMI_WATCHDOG=y CONFIG_HW_RANDOM=y +CONFIG_TCG_TPM=y CONFIG_TCG_TIS_I2C_NUVOTON=y +CONFIG_I2C=y # CONFIG_I2C_COMPAT is not set CONFIG_I2C_CHARDEV=y # CONFIG_I2C_HELPER_AUTO is not set -CONFIG_DRM=y -CONFIG_DRM_RADEON=y +CONFIG_I2C_ALGOBIT=y +CONFIG_I2C_OPAL=m +CONFIG_PPS=y +CONFIG_SENSORS_IBMPOWERNV=m +CONFIG_DRM=m CONFIG_DRM_AST=m +CONFIG_FB=y CONFIG_FIRMWARE_EDID=y -CONFIG_FB_MODE_HELPERS=y -CONFIG_FB_OF=y -CONFIG_FB_MATROX=y -CONFIG_FB_MATROX_MILLENIUM=y -CONFIG_FB_MATROX_MYSTIQUE=y -CONFIG_FB_MATROX_G=y -# CONFIG_LCD_CLASS_DEVICE is not set -# CONFIG_BACKLIGHT_GENERIC is not set # CONFIG_VGA_CONSOLE is not set +CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y # CONFIG_LOGO_LINUX_MONO is not set # CONFIG_LOGO_LINUX_VGA16 is not set +CONFIG_HID_GENERIC=m +CONFIG_HID_A4TECH=y +CONFIG_HID_BELKIN=y +CONFIG_HID_CHERRY=y +CONFIG_HID_CHICONY=y +CONFIG_HID_CYPRESS=y +CONFIG_HID_EZKEY=y +CONFIG_HID_ITE=y +CONFIG_HID_KENSINGTON=y +CONFIG_HID_LOGITECH=y +CONFIG_HID_MICROSOFT=y +CONFIG_HID_MONTEREY=y CONFIG_USB_HIDDEV=y -CONFIG_USB=y -CONFIG_USB_MON=y -CONFIG_USB_XHCI_HCD=y -CONFIG_USB_EHCI_HCD=y +CONFIG_USB=m +CONFIG_USB_XHCI_HCD=m +CONFIG_USB_EHCI_HCD=m # CONFIG_USB_EHCI_HCD_PPC_OF is not set -CONFIG_USB_OHCI_HCD=y -CONFIG_USB_STORAGE=y +CONFIG_USB_OHCI_HCD=m +CONFIG_USB_STORAGE=m CONFIG_RTC_CLASS=y +CONFIG_RTC_DRV_OPAL=m CONFIG_RTC_DRV_GENERIC=m CONFIG_VIRT_DRIVERS=y -CONFIG_VIRTIO_PCI=y +CONFIG_VIRTIO_PCI=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_EXT4_FS=m CONFIG_EXT4_FS_POSIX_ACL=y @@ -195,10 +260,9 @@ CONFIG_UDF_FS=m 
CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_PROC_KCORE=y -CONFIG_TMPFS=y -CONFIG_TMPFS_POSIX_ACL=y # CONFIG_MISC_FILESYSTEMS is not set # CONFIG_NETWORK_FILESYSTEMS is not set +CONFIG_NLS=y CONFIG_NLS_DEFAULT="utf8" CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ASCII=y @@ -207,26 +271,24 @@ CONFIG_NLS_UTF8=y CONFIG_CRC16=y CONFIG_CRC_ITU_T=y CONFIG_LIBCRC32C=y +# CONFIG_XZ_DEC_X86 is not set +# CONFIG_XZ_DEC_IA64 is not set +# CONFIG_XZ_DEC_ARM is not set +# CONFIG_XZ_DEC_ARMTHUMB is not set +# CONFIG_XZ_DEC_SPARC is not set CONFIG_PRINTK_TIME=y CONFIG_MAGIC_SYSRQ=y -CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_STACKOVERFLOW=y CONFIG_SOFTLOCKUP_DETECTOR=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y CONFIG_HARDLOCKUP_DETECTOR=y CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y CONFIG_WQ_WATCHDOG=y -CONFIG_SCHEDSTATS=y +# CONFIG_SCHED_DEBUG is not set # CONFIG_FTRACE is not set +# CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_XMON=y CONFIG_XMON_DEFAULT=y -CONFIG_SECURITY=y -CONFIG_IMA=y -CONFIG_EVM=y +CONFIG_ENCRYPTED_KEYS=y # CONFIG_CRYPTO_ECHAINIV is not set -CONFIG_CRYPTO_ECB=y -CONFIG_CRYPTO_CMAC=y -CONFIG_CRYPTO_MD4=y -CONFIG_CRYPTO_ARC4=y -CONFIG_CRYPTO_DES=y # CONFIG_CRYPTO_HW is not set From ee9d21b3b3583712029a0db65a4b7c081d08d3b3 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Fri, 14 Sep 2018 13:36:47 +0930 Subject: [PATCH 040/221] powerpc/boot: Ensure _zimage_start is a weak symbol When building with clang crt0's _zimage_start is not marked weak, which breaks the build when linking the kernel image: $ objdump -t arch/powerpc/boot/crt0.o |grep _zimage_start$ 0000000000000058 g .text 0000000000000000 _zimage_start ld: arch/powerpc/boot/wrapper.a(crt0.o): in function '_zimage_start': (.text+0x58): multiple definition of '_zimage_start'; arch/powerpc/boot/pseries-head.o:(.text+0x0): first defined here Clang requires the .weak directive to appear after the symbol is declared. The binutils manual says: This directive sets the weak attribute on the comma separated list of symbol names. If the symbols do not already exist, they will be created. So it appears this is different with clang. The only reference I could see for this was an OpenBSD mailing list post[1]. Changing it to be after the declaration fixes building with Clang, and still works with GCC. $ objdump -t arch/powerpc/boot/crt0.o |grep _zimage_start$ 0000000000000058 w .text 0000000000000000 _zimage_start Reported to clang as https://bugs.llvm.org/show_bug.cgi?id=38921 [1] https://groups.google.com/forum/#!topic/fa.openbsd.tech/PAgKKen2YCY Signed-off-by: Joel Stanley Reviewed-by: Nick Desaulniers Signed-off-by: Michael Ellerman --- arch/powerpc/boot/crt0.S | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/boot/crt0.S b/arch/powerpc/boot/crt0.S index dcf2f15e6797..32dfe6d083f3 100644 --- a/arch/powerpc/boot/crt0.S +++ b/arch/powerpc/boot/crt0.S @@ -47,8 +47,10 @@ p_end: .long _end p_pstack: .long _platform_stack_top #endif - .weak _zimage_start .globl _zimage_start + /* Clang appears to require the .weak directive to be after the symbol + * is defined. 
See https://bugs.llvm.org/show_bug.cgi?id=38921 */ + .weak _zimage_start _zimage_start: .globl _zimage_start_lib _zimage_start_lib: From e00d93ac9a189673028ac125a74b9bc8ae73eebc Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Fri, 14 Sep 2018 13:36:48 +0930 Subject: [PATCH 041/221] powerpc: Fix duplicate const clang warning in user access code This re-applies commit b91c1e3e7a6f ("powerpc: Fix duplicate const clang warning in user access code") (Jun 2015) which was undone in commits: f2ca80905929 ("powerpc/sparse: Constify the address pointer in __get_user_nosleep()") (Feb 2017) d466f6c5cac1 ("powerpc/sparse: Constify the address pointer in __get_user_nocheck()") (Feb 2017) f84ed59a612d ("powerpc/sparse: Constify the address pointer in __get_user_check()") (Feb 2017) We see a large number of duplicate const errors in the user access code when building with llvm/clang: include/linux/pagemap.h:576:8: warning: duplicate 'const' declaration specifier [-Wduplicate-decl-specifier] ret = __get_user(c, uaddr); The problem is we are doing const __typeof__(*(ptr)), which will hit the warning if ptr is marked const. Removing const does not seem to have any effect on GCC code generation. Signed-off-by: Anton Blanchard Signed-off-by: Joel Stanley Reviewed-by: Nick Desaulniers Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/uaccess.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index bac225bb7f64..15bea9a0f260 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -260,7 +260,7 @@ do { \ ({ \ long __gu_err; \ __long_type(*(ptr)) __gu_val; \ - const __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ + __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ __chk_user_ptr(ptr); \ if (!is_kernel_addr((unsigned long)__gu_addr)) \ might_fault(); \ @@ -274,7 +274,7 @@ do { \ ({ \ long __gu_err = -EFAULT; \ __long_type(*(ptr)) __gu_val = 0; \ - const __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ + __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ might_fault(); \ if (access_ok(VERIFY_READ, __gu_addr, (size))) { \ barrier_nospec(); \ @@ -288,7 +288,7 @@ do { \ ({ \ long __gu_err; \ __long_type(*(ptr)) __gu_val; \ - const __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ + __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ __chk_user_ptr(ptr); \ barrier_nospec(); \ __get_user_size(__gu_val, __gu_addr, (size), __gu_err); \ From ab91239942a900e209f724886273060d9288f6fb Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Wed, 5 Sep 2018 12:09:50 +1000 Subject: [PATCH 042/221] powerpc/prom: Remove VLA in prom_check_platform_support() In prom_check_platform_support() we retrieve and parse the "ibm,arch-vec-5-platform-support" property of the chosen node. Currently we use a variable length array however to avoid this use an array of constant length 8. This property is used to indicate the supported options of vector 5 bytes 23-26 of the ibm,architecture.vec node. Each of these options is a pair of bytes, thus for 4 options we have a max length of 8 bytes. 
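For context, here is a small userspace-style sketch of the same bounded-parse pattern the patch adopts (the property bytes below are invented for illustration; in the kernel the data comes from firmware via prom_getprop):

#include <stdio.h>
#include <string.h>

#define VEC5_SUPPORT_MAX 8	/* 4 option/value byte pairs */

int main(void)
{
	/* Made-up stand-in for the "ibm,arch-vec-5-platform-support" data. */
	const unsigned char prop[] = { 0x17, 0x80, 0x18, 0xc0, 0x19, 0x80 };
	unsigned char vec[VEC5_SUPPORT_MAX];
	size_t len = sizeof(prop);
	size_t i;

	if (len > sizeof(vec)) {
		fprintf(stderr, "property longer than expected (%zu bytes)\n", len);
		len = sizeof(vec);	/* copy only what fits, as the patch does */
	}
	memcpy(vec, prop, len);

	/* Each pair is an option index followed by its supported value. */
	for (i = 0; i + 1 < len; i += 2)
		printf("%zu: index = 0x%x val = 0x%x\n", i / 2, vec[i], vec[i + 1]);
	return 0;
}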
Signed-off-by: Suraj Jitindar Singh Reviewed-by: Joel Stanley Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/prom_init.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 9b38a2e5dd35..1af453a61991 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -1131,12 +1131,15 @@ static void __init prom_check_platform_support(void) "ibm,arch-vec-5-platform-support"); if (prop_len > 1) { int i; - u8 vec[prop_len]; + u8 vec[8]; prom_debug("Found ibm,arch-vec-5-platform-support, len: %d\n", prop_len); + if (prop_len > sizeof(vec)) + prom_printf("WARNING: ibm,arch-vec-5-platform-support longer than expected (len: %d)\n", + prop_len); prom_getprop(prom.chosen, "ibm,arch-vec-5-platform-support", &vec, sizeof(vec)); - for (i = 0; i < prop_len; i += 2) { + for (i = 0; i < sizeof(vec); i += 2) { prom_debug("%d: index = 0x%x val = 0x%x\n", i / 2 , vec[i] , vec[i + 1]); From 74422e2b19391e0bb9f11b0ab4522bbf7f93c4ba Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Wed, 5 Sep 2018 12:09:51 +1000 Subject: [PATCH 043/221] powerpc/pseries: Remove VLA from lparcfg_write() In lparcfg_write we hard code kbuf_sz and then use this as the variable length of kbuf, creating a variable length array. Since we're hard coding the length anyway, just define the array using this as the length and remove the need for kbuf_sz, thus removing the variable length array. Signed-off-by: Suraj Jitindar Singh Reviewed-by: Joel Stanley Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/lparcfg.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index 7c872dc01bdb..8bd590af488a 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -585,8 +585,7 @@ static ssize_t update_mpp(u64 *entitlement, u8 *weight) static ssize_t lparcfg_write(struct file *file, const char __user * buf, size_t count, loff_t * off) { - int kbuf_sz = 64; - char kbuf[kbuf_sz]; + char kbuf[64]; char *tmp; u64 new_entitled, *new_entitled_ptr = &new_entitled; u8 new_weight, *new_weight_ptr = &new_weight; @@ -595,7 +594,7 @@ static ssize_t lparcfg_write(struct file *file, const char __user * buf, if (!firmware_has_feature(FW_FEATURE_SPLPAR)) return -EINVAL; - if (count > kbuf_sz) + if (count > sizeof(kbuf)) return -EINVAL; if (copy_from_user(kbuf, buf, count)) From 0823c68b054bca9dc321adea829af5cf36afb30b Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Fri, 14 Sep 2018 19:36:02 +0530 Subject: [PATCH 044/221] powerpc/fadump: re-register firmware-assisted dump if already registered Firmware-Assisted Dump (FADump) needs to be registered again after any memory hot add/remove operation to update the crash memory ranges. But currently, the kernel returns '-EEXIST' if we try to register without unregistering it first. This could expose the system to racing issues while unregistering and registering FADump from userspace during udev events. Spare the userspace of this and let it be taken care of in the kernel space for a simpler interface. Since this change, running 'echo 1 > /sys/kernel/fadump_registered' would result in re-registering (unregistering and registering) FADump, if it was already registered.
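A minimal sketch of the userspace side this simplifies (assuming the standard /sys/kernel/fadump_registered attribute; error handling kept to a minimum): a udev helper reacting to a memory hotplug event can now just write "1" and let the kernel do the unregister/register step itself:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/kernel/fadump_registered", O_WRONLY);

	if (fd < 0) {
		perror("open /sys/kernel/fadump_registered");
		return 1;
	}
	/* With this patch a plain "1" re-registers FADump even if it is
	 * already registered; no prior "0" write is needed. */
	if (write(fd, "1", 1) != 1) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}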
Signed-off-by: Hari Bathini Acked-by: Mahesh Salgaonkar Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/fadump.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index a711d22339ea..761b28b1427d 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -1444,8 +1444,8 @@ static ssize_t fadump_register_store(struct kobject *kobj, break; case 1: if (fw_dump.dump_registered == 1) { - ret = -EEXIST; - goto unlock_out; + /* Un-register Firmware-assisted dump */ + fadump_unregister_dump(&fdm); } /* Register Firmware-assisted dump */ ret = register_fadump(); From 54be0b9c7c9888ebe63b89a31a17ee3df6a68d61 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 2 Oct 2018 23:56:39 +1000 Subject: [PATCH 045/221] Revert "convert SLB miss handlers to C" and subsequent commits This reverts commits: 5e46e29e6a97 ("powerpc/64s/hash: convert SLB miss handlers to C") 8fed04d0f6ae ("powerpc/64s/hash: remove user SLB data from the paca") 655deecf67b2 ("powerpc/64s/hash: SLB allocation status bitmaps") 2e1626744e8d ("powerpc/64s/hash: provide arch_setup_exec hooks for hash slice setup") 89ca4e126a3f ("powerpc/64s/hash: Add a SLB preload cache") This series had a few bugs, and the fixes are not all trivial. So revert most of it for now. Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/asm-prototypes.h | 2 - arch/powerpc/include/asm/book3s/64/mmu-hash.h | 3 - arch/powerpc/include/asm/exception-64s.h | 8 + arch/powerpc/include/asm/paca.h | 19 +- arch/powerpc/include/asm/processor.h | 1 - arch/powerpc/include/asm/slice.h | 1 - arch/powerpc/include/asm/thread_info.h | 11 - arch/powerpc/kernel/asm-offsets.c | 11 +- arch/powerpc/kernel/exceptions-64s.S | 206 ++++++-- arch/powerpc/kernel/paca.c | 22 + arch/powerpc/kernel/process.c | 16 - arch/powerpc/mm/Makefile | 2 +- arch/powerpc/mm/hash_utils_64.c | 46 +- arch/powerpc/mm/mmu_context.c | 3 +- arch/powerpc/mm/mmu_context_book3s64.c | 9 - arch/powerpc/mm/slb.c | 497 ++++++------------ arch/powerpc/mm/slb_low.S | 335 ++++++++++++ arch/powerpc/mm/slice.c | 43 +- arch/powerpc/xmon/xmon.c | 4 +- 19 files changed, 774 insertions(+), 465 deletions(-) create mode 100644 arch/powerpc/mm/slb_low.S diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 78ed3c3f879a..1f4691ce4126 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -78,8 +78,6 @@ void kernel_bad_stack(struct pt_regs *regs); void system_reset_exception(struct pt_regs *regs); void machine_check_exception(struct pt_regs *regs); void emulation_assist_interrupt(struct pt_regs *regs); -long do_slb_fault(struct pt_regs *regs, unsigned long ea); -void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err); /* signals, syscalls and interrupts */ long sys_swapcontext(struct ucontext __user *old_ctx, diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index bbeaf6adf93c..e0e4ce8f77d6 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -487,8 +487,6 @@ int htab_remove_mapping(unsigned long vstart, unsigned long vend, extern void pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages); extern void demote_segment_4k(struct mm_struct *mm, unsigned long addr); -extern void hash__setup_new_exec(void); - #ifdef CONFIG_PPC_PSERIES void hpte_init_pseries(void); 
#else @@ -503,7 +501,6 @@ struct slb_entry { }; extern void slb_initialize(void); -extern void core_flush_all_slbs(struct mm_struct *mm); extern void slb_flush_and_rebolt(void); void slb_flush_all_realmode(void); void __slb_restore_bolted_realmode(void); diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 47578b79f0fb..a86feddddad0 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -60,6 +60,14 @@ */ #define MAX_MCE_DEPTH 4 +/* + * EX_LR is only used in EXSLB and where it does not overlap with EX_DAR + * EX_CCR similarly with DSISR, but being 4 byte registers there is a hole + * in the save area so it's not necessary to overlap them. Could be used + * for future savings though if another 4 byte register was to be saved. + */ +#define EX_LR EX_DAR + /* * EX_R3 is only used by the bad_stack handler. bad_stack reloads and * saves DAR from SPRN_DAR, and EX_DAR is not used. So EX_R3 can overlap diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 6d6b3706232c..7b6e23af3808 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -113,10 +113,7 @@ struct paca_struct { * on the linear mapping */ /* SLB related definitions */ u16 vmalloc_sllp; - u8 slb_cache_ptr; - u8 stab_rr; /* stab/slb round-robin counter */ - u32 slb_used_bitmap; /* Bitmaps for first 32 SLB entries. */ - u32 slb_kern_bitmap; + u16 slb_cache_ptr; u32 slb_cache[SLB_CACHE_ENTRIES]; #endif /* CONFIG_PPC_BOOK3S_64 */ @@ -146,11 +143,24 @@ struct paca_struct { struct tlb_core_data tcd; #endif /* CONFIG_PPC_BOOK3E */ +#ifdef CONFIG_PPC_BOOK3S + mm_context_id_t mm_ctx_id; +#ifdef CONFIG_PPC_MM_SLICES + unsigned char mm_ctx_low_slices_psize[BITS_PER_LONG / BITS_PER_BYTE]; + unsigned char mm_ctx_high_slices_psize[SLICE_ARRAY_SIZE]; + unsigned long mm_ctx_slb_addr_limit; +#else + u16 mm_ctx_user_psize; + u16 mm_ctx_sllp; +#endif +#endif + /* * then miscellaneous read-write fields */ struct task_struct *__current; /* Pointer to current */ u64 kstack; /* Saved Kernel stack addr */ + u64 stab_rr; /* stab/slb round-robin counter */ u64 saved_r1; /* r1 save for RTAS calls or PM or EE=0 */ u64 saved_msr; /* MSR saved here by enter_rtas */ u16 trap_save; /* Used when bad stack is encountered */ @@ -248,6 +258,7 @@ struct paca_struct { #endif /* CONFIG_PPC_BOOK3S_64 */ } ____cacheline_aligned; +extern void copy_mm_to_paca(struct mm_struct *mm); extern struct paca_struct **paca_ptrs; extern void initialise_paca(struct paca_struct *new_paca, int cpu); extern void setup_paca(struct paca_struct *new_paca); diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 350c584ca179..52fadded5c1e 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -273,7 +273,6 @@ struct thread_struct { #endif /* CONFIG_HAVE_HW_BREAKPOINT */ struct arch_hw_breakpoint hw_brk; /* info on the hardware breakpoint */ unsigned long trap_nr; /* last trap # on this thread */ - u8 load_slb; /* Ages out SLB preload cache entries */ u8 load_fp; #ifdef CONFIG_ALTIVEC u8 load_vec; diff --git a/arch/powerpc/include/asm/slice.h b/arch/powerpc/include/asm/slice.h index a595461c9cb0..e40406cf5628 100644 --- a/arch/powerpc/include/asm/slice.h +++ b/arch/powerpc/include/asm/slice.h @@ -32,7 +32,6 @@ void slice_set_range_psize(struct mm_struct *mm, unsigned long start, unsigned long len, unsigned int psize); void slice_init_new_context_exec(struct 
mm_struct *mm); -void slice_setup_new_exec(void); #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 406eb952b808..3c0002044bc9 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -29,7 +29,6 @@ #include #include -#define SLB_PRELOAD_NR 16U /* * low level task data. */ @@ -45,10 +44,6 @@ struct thread_info { #if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC32) struct cpu_accounting_data accounting; #endif - u8 slb_preload_nr; - u8 slb_preload_tail; - u32 slb_preload_esid[SLB_PRELOAD_NR]; - /* low level flags - has atomic operations done on it */ unsigned long flags ____cacheline_aligned_in_smp; }; @@ -77,12 +72,6 @@ static inline struct thread_info *current_thread_info(void) } extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); - -#ifdef CONFIG_PPC_BOOK3S_64 -void arch_setup_new_exec(void); -#define arch_setup_new_exec arch_setup_new_exec -#endif - #endif /* __ASSEMBLY__ */ /* diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index ba9d0fc98730..89cf15566c4e 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -173,6 +173,7 @@ int main(void) OFFSET(PACAKSAVE, paca_struct, kstack); OFFSET(PACACURRENT, paca_struct, __current); OFFSET(PACASAVEDMSR, paca_struct, saved_msr); + OFFSET(PACASTABRR, paca_struct, stab_rr); OFFSET(PACAR1, paca_struct, saved_r1); OFFSET(PACATOC, paca_struct, kernel_toc); OFFSET(PACAKBASE, paca_struct, kernelbase); @@ -180,6 +181,15 @@ int main(void) OFFSET(PACAIRQSOFTMASK, paca_struct, irq_soft_mask); OFFSET(PACAIRQHAPPENED, paca_struct, irq_happened); OFFSET(PACA_FTRACE_ENABLED, paca_struct, ftrace_enabled); +#ifdef CONFIG_PPC_BOOK3S + OFFSET(PACACONTEXTID, paca_struct, mm_ctx_id); +#ifdef CONFIG_PPC_MM_SLICES + OFFSET(PACALOWSLICESPSIZE, paca_struct, mm_ctx_low_slices_psize); + OFFSET(PACAHIGHSLICEPSIZE, paca_struct, mm_ctx_high_slices_psize); + OFFSET(PACA_SLB_ADDR_LIMIT, paca_struct, mm_ctx_slb_addr_limit); + DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def)); +#endif /* CONFIG_PPC_MM_SLICES */ +#endif #ifdef CONFIG_PPC_BOOK3E OFFSET(PACAPGD, paca_struct, pgd); @@ -202,7 +212,6 @@ int main(void) #ifdef CONFIG_PPC_BOOK3S_64 OFFSET(PACASLBCACHE, paca_struct, slb_cache); OFFSET(PACASLBCACHEPTR, paca_struct, slb_cache_ptr); - OFFSET(PACASTABRR, paca_struct, stab_rr); OFFSET(PACAVMALLOCSLLP, paca_struct, vmalloc_sllp); #ifdef CONFIG_PPC_MM_SLICES OFFSET(MMUPSIZESLLP, mmu_psize_def, sllp); diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 786f4fa5100a..301a6a86a20f 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -596,36 +596,28 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) -EXCEPTION_PROLOG(PACA_EXSLB, data_access_slb_common, EXC_STD, KVMTEST_PR, 0x380); + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXSLB) + EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380) + mr r12,r3 /* save r3 */ + mfspr r3,SPRN_DAR + mfspr r11,SPRN_SRR1 + crset 4*cr6+eq + BRANCH_TO_COMMON(r10, slb_miss_common) EXC_REAL_END(data_access_slb, 0x380, 0x80) EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) -EXCEPTION_RELON_PROLOG(PACA_EXSLB, data_access_slb_common, EXC_STD, NOTEST, 0x380); + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXSLB) + EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x380) + mr r12,r3 /* save r3 */ + mfspr 
r3,SPRN_DAR + mfspr r11,SPRN_SRR1 + crset 4*cr6+eq + BRANCH_TO_COMMON(r10, slb_miss_common) EXC_VIRT_END(data_access_slb, 0x4380, 0x80) - TRAMP_KVM_SKIP(PACA_EXSLB, 0x380) -EXC_COMMON_BEGIN(data_access_slb_common) - mfspr r10,SPRN_DAR - std r10,PACA_EXSLB+EX_DAR(r13) - EXCEPTION_PROLOG_COMMON(0x380, PACA_EXSLB) - ld r4,PACA_EXSLB+EX_DAR(r13) - std r4,_DAR(r1) - addi r3,r1,STACK_FRAME_OVERHEAD - bl do_slb_fault - cmpdi r3,0 - bne- 1f - b fast_exception_return -1: /* Error case */ - std r3,RESULT(r1) - bl save_nvgprs - RECONCILE_IRQ_STATE(r10, r11) - ld r4,_DAR(r1) - ld r5,RESULT(r1) - addi r3,r1,STACK_FRAME_OVERHEAD - bl do_bad_slb_fault - b ret_from_except - EXC_REAL(instruction_access, 0x400, 0x80) EXC_VIRT(instruction_access, 0x4400, 0x80, 0x400) @@ -648,33 +640,159 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80) -EXCEPTION_PROLOG(PACA_EXSLB, instruction_access_slb_common, EXC_STD, KVMTEST_PR, 0x480); + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXSLB) + EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480) + mr r12,r3 /* save r3 */ + mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ + mfspr r11,SPRN_SRR1 + crclr 4*cr6+eq + BRANCH_TO_COMMON(r10, slb_miss_common) EXC_REAL_END(instruction_access_slb, 0x480, 0x80) EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) -EXCEPTION_RELON_PROLOG(PACA_EXSLB, instruction_access_slb_common, EXC_STD, NOTEST, 0x480); + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXSLB) + EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x480) + mr r12,r3 /* save r3 */ + mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ + mfspr r11,SPRN_SRR1 + crclr 4*cr6+eq + BRANCH_TO_COMMON(r10, slb_miss_common) EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80) - TRAMP_KVM(PACA_EXSLB, 0x480) -EXC_COMMON_BEGIN(instruction_access_slb_common) - EXCEPTION_PROLOG_COMMON(0x480, PACA_EXSLB) - ld r4,_NIP(r1) - addi r3,r1,STACK_FRAME_OVERHEAD - bl do_slb_fault - cmpdi r3,0 - bne- 1f - b fast_exception_return -1: /* Error case */ - std r3,RESULT(r1) - bl save_nvgprs - RECONCILE_IRQ_STATE(r10, r11) - ld r4,_NIP(r1) - ld r5,RESULT(r1) - addi r3,r1,STACK_FRAME_OVERHEAD - bl do_bad_slb_fault - b ret_from_except +/* + * This handler is used by the 0x380 and 0x480 SLB miss interrupts, as well as + * the virtual mode 0x4380 and 0x4480 interrupts if AIL is enabled. + */ +EXC_COMMON_BEGIN(slb_miss_common) + /* + * r13 points to the PACA, r9 contains the saved CR, + * r12 contains the saved r3, + * r11 contain the saved SRR1, SRR0 is still ready for return + * r3 has the faulting address + * r9 - r13 are saved in paca->exslb. + * cr6.eq is set for a D-SLB miss, clear for a I-SLB miss + * We assume we aren't going to take any exceptions during this + * procedure. + */ + mflr r10 + stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ + std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ + + andi. r9,r11,MSR_PR // Check for exception from userspace + cmpdi cr4,r9,MSR_PR // And save the result in CR4 for later + + /* + * Test MSR_RI before calling slb_allocate_realmode, because the + * MSR in r11 gets clobbered. However we still want to allocate + * SLB in case MSR_RI=0, to minimise the risk of getting stuck in + * recursive SLB faults. So use cr5 for this, which is preserved. + */ + andi. 
r11,r11,MSR_RI /* check for unrecoverable exception */ + cmpdi cr5,r11,MSR_RI + + crset 4*cr0+eq +#ifdef CONFIG_PPC_BOOK3S_64 +BEGIN_MMU_FTR_SECTION + bl slb_allocate +END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) +#endif + + ld r10,PACA_EXSLB+EX_LR(r13) + lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */ + mtlr r10 + + /* + * Large address, check whether we have to allocate new contexts. + */ + beq- 8f + + bne- cr5,2f /* if unrecoverable exception, oops */ + + /* All done -- return from exception. */ + + bne cr4,1f /* returning to kernel */ + + mtcrf 0x80,r9 + mtcrf 0x08,r9 /* MSR[PR] indication is in cr4 */ + mtcrf 0x04,r9 /* MSR[RI] indication is in cr5 */ + mtcrf 0x02,r9 /* I/D indication is in cr6 */ + mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ + + RESTORE_CTR(r9, PACA_EXSLB) + RESTORE_PPR_PACA(PACA_EXSLB, r9) + mr r3,r12 + ld r9,PACA_EXSLB+EX_R9(r13) + ld r10,PACA_EXSLB+EX_R10(r13) + ld r11,PACA_EXSLB+EX_R11(r13) + ld r12,PACA_EXSLB+EX_R12(r13) + ld r13,PACA_EXSLB+EX_R13(r13) + RFI_TO_USER + b . /* prevent speculative execution */ +1: + mtcrf 0x80,r9 + mtcrf 0x08,r9 /* MSR[PR] indication is in cr4 */ + mtcrf 0x04,r9 /* MSR[RI] indication is in cr5 */ + mtcrf 0x02,r9 /* I/D indication is in cr6 */ + mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ + + RESTORE_CTR(r9, PACA_EXSLB) + RESTORE_PPR_PACA(PACA_EXSLB, r9) + mr r3,r12 + ld r9,PACA_EXSLB+EX_R9(r13) + ld r10,PACA_EXSLB+EX_R10(r13) + ld r11,PACA_EXSLB+EX_R11(r13) + ld r12,PACA_EXSLB+EX_R12(r13) + ld r13,PACA_EXSLB+EX_R13(r13) + RFI_TO_KERNEL + b . /* prevent speculative execution */ + + +2: std r3,PACA_EXSLB+EX_DAR(r13) + mr r3,r12 + mfspr r11,SPRN_SRR0 + mfspr r12,SPRN_SRR1 + LOAD_HANDLER(r10,unrecov_slb) + mtspr SPRN_SRR0,r10 + ld r10,PACAKMSR(r13) + mtspr SPRN_SRR1,r10 + RFI_TO_KERNEL + b . + +8: std r3,PACA_EXSLB+EX_DAR(r13) + mr r3,r12 + mfspr r11,SPRN_SRR0 + mfspr r12,SPRN_SRR1 + LOAD_HANDLER(r10, large_addr_slb) + mtspr SPRN_SRR0,r10 + ld r10,PACAKMSR(r13) + mtspr SPRN_SRR1,r10 + RFI_TO_KERNEL + b . 
+ +EXC_COMMON_BEGIN(unrecov_slb) + EXCEPTION_PROLOG_COMMON(0x4100, PACA_EXSLB) + RECONCILE_IRQ_STATE(r10, r11) + bl save_nvgprs +1: addi r3,r1,STACK_FRAME_OVERHEAD + bl unrecoverable_exception + b 1b + +EXC_COMMON_BEGIN(large_addr_slb) + EXCEPTION_PROLOG_COMMON(0x380, PACA_EXSLB) + RECONCILE_IRQ_STATE(r10, r11) + ld r3, PACA_EXSLB+EX_DAR(r13) + std r3, _DAR(r1) + beq cr6, 2f + li r10, 0x481 /* fix trap number for I-SLB miss */ + std r10, _TRAP(r1) +2: bl save_nvgprs + addi r3, r1, STACK_FRAME_OVERHEAD + bl slb_miss_large_addr + b ret_from_except EXC_REAL_BEGIN(hardware_interrupt, 0x500, 0x100) .globl hardware_interrupt_hv; diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index 0cf84e30d1cd..0ee3e6d50f28 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -258,3 +258,25 @@ void __init free_unused_pacas(void) printk(KERN_DEBUG "Allocated %u bytes for %u pacas\n", paca_ptrs_size + paca_struct_size, nr_cpu_ids); } + +void copy_mm_to_paca(struct mm_struct *mm) +{ +#ifdef CONFIG_PPC_BOOK3S + mm_context_t *context = &mm->context; + + get_paca()->mm_ctx_id = context->id; +#ifdef CONFIG_PPC_MM_SLICES + VM_BUG_ON(!mm->context.slb_addr_limit); + get_paca()->mm_ctx_slb_addr_limit = mm->context.slb_addr_limit; + memcpy(&get_paca()->mm_ctx_low_slices_psize, + &context->low_slices_psize, sizeof(context->low_slices_psize)); + memcpy(&get_paca()->mm_ctx_high_slices_psize, + &context->high_slices_psize, TASK_SLICE_ARRAY_SZ(mm)); +#else /* CONFIG_PPC_MM_SLICES */ + get_paca()->mm_ctx_user_psize = context->user_psize; + get_paca()->mm_ctx_sllp = context->sllp; +#endif +#else /* !CONFIG_PPC_BOOK3S */ + return; +#endif +} diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 03c2e1f134bc..913c5725cdb2 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1482,15 +1482,6 @@ void flush_thread(void) #endif /* CONFIG_HAVE_HW_BREAKPOINT */ } -#ifdef CONFIG_PPC_BOOK3S_64 -void arch_setup_new_exec(void) -{ - if (radix_enabled()) - return; - hash__setup_new_exec(); -} -#endif - int set_thread_uses_vas(void) { #ifdef CONFIG_PPC_BOOK3S_64 @@ -1719,8 +1710,6 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, return 0; } -void preload_new_slb_context(unsigned long start, unsigned long sp); - /* * Set up a thread for executing a new program */ @@ -1728,10 +1717,6 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) { #ifdef CONFIG_PPC64 unsigned long load_addr = regs->gpr[2]; /* saved by ELF_PLAT_INIT */ - -#ifdef CONFIG_PPC_BOOK3S_64 - preload_new_slb_context(start, sp); -#endif #endif /* @@ -1822,7 +1807,6 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) #ifdef CONFIG_VSX current->thread.used_vsr = 0; #endif - current->thread.load_slb = 0; current->thread.load_fp = 0; memset(¤t->thread.fp_state, 0, sizeof(current->thread.fp_state)); current->thread.fp_save_area = NULL; diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 892d4e061d62..cdf6a9960046 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -15,7 +15,7 @@ obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(BITS)e.o hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o -obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-hash64.o hash_utils_64.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o +obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-hash64.o hash_utils_64.o 
slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o tlb-radix.o obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o obj-$(CONFIG_PPC_STD_MMU) += tlb_hash$(BITS).o diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 88c95dc8b141..f23a89d8e4ce 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1088,16 +1088,16 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) } #ifdef CONFIG_PPC_MM_SLICES -static unsigned int get_psize(struct mm_struct *mm, unsigned long addr) +static unsigned int get_paca_psize(unsigned long addr) { unsigned char *psizes; unsigned long index, mask_index; if (addr < SLICE_LOW_TOP) { - psizes = mm->context.low_slices_psize; + psizes = get_paca()->mm_ctx_low_slices_psize; index = GET_LOW_SLICE_INDEX(addr); } else { - psizes = mm->context.high_slices_psize; + psizes = get_paca()->mm_ctx_high_slices_psize; index = GET_HIGH_SLICE_INDEX(addr); } mask_index = index & 0x1; @@ -1105,9 +1105,9 @@ static unsigned int get_psize(struct mm_struct *mm, unsigned long addr) } #else -unsigned int get_psize(struct mm_struct *mm, unsigned long addr) +unsigned int get_paca_psize(unsigned long addr) { - return mm->context.user_psize; + return get_paca()->mm_ctx_user_psize; } #endif @@ -1118,11 +1118,15 @@ unsigned int get_psize(struct mm_struct *mm, unsigned long addr) #ifdef CONFIG_PPC_64K_PAGES void demote_segment_4k(struct mm_struct *mm, unsigned long addr) { - if (get_psize(mm, addr) == MMU_PAGE_4K) + if (get_slice_psize(mm, addr) == MMU_PAGE_4K) return; slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K); copro_flush_all_slbs(mm); - core_flush_all_slbs(mm); + if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) { + + copy_mm_to_paca(mm); + slb_flush_and_rebolt(); + } } #endif /* CONFIG_PPC_64K_PAGES */ @@ -1187,6 +1191,22 @@ void hash_failure_debug(unsigned long ea, unsigned long access, trap, vsid, ssize, psize, lpsize, pte); } +static void check_paca_psize(unsigned long ea, struct mm_struct *mm, + int psize, bool user_region) +{ + if (user_region) { + if (psize != get_paca_psize(ea)) { + copy_mm_to_paca(mm); + slb_flush_and_rebolt(); + } + } else if (get_paca()->vmalloc_sllp != + mmu_psize_defs[mmu_vmalloc_psize].sllp) { + get_paca()->vmalloc_sllp = + mmu_psize_defs[mmu_vmalloc_psize].sllp; + slb_vmalloc_update(); + } +} + /* Result code is: * 0 - handled * 1 - normal page fault @@ -1219,7 +1239,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, rc = 1; goto bail; } - psize = get_psize(mm, ea); + psize = get_slice_psize(mm, ea); ssize = user_segment_size(ea); vsid = get_user_vsid(&mm->context, ea, ssize); break; @@ -1307,6 +1327,9 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, WARN_ON(1); } #endif + if (current->mm == mm) + check_paca_psize(ea, mm, psize, user_region); + goto bail; } @@ -1341,14 +1364,15 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, "to 4kB pages because of " "non-cacheable mapping\n"); psize = mmu_vmalloc_psize = MMU_PAGE_4K; - slb_vmalloc_update(); copro_flush_all_slbs(mm); - core_flush_all_slbs(mm); } } #endif /* CONFIG_PPC_64K_PAGES */ + if (current->mm == mm) + check_paca_psize(ea, mm, psize, user_region); + #ifdef CONFIG_PPC_64K_PAGES if (psize == MMU_PAGE_64K) rc = __hash_page_64K(ea, access, vsid, ptep, trap, @@ -1436,7 +1460,7 @@ int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, #ifdef CONFIG_PPC_MM_SLICES 
static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) { - int psize = get_psize(mm, ea); + int psize = get_slice_psize(mm, ea); /* We only prefault standard pages for now */ if (unlikely(psize != mm->context.user_psize)) diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c index 28ae2835db3d..f84e14f23e50 100644 --- a/arch/powerpc/mm/mmu_context.c +++ b/arch/powerpc/mm/mmu_context.c @@ -54,7 +54,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * MMU context id, which is then moved to SPRN_PID. * * For the hash MMU it is either the first load from slb_cache - * in switch_slb(), and/or load of MMU context id. + * in switch_slb(), and/or the store of paca->mm_ctx_id in + * copy_mm_to_paca(). * * On the other side, the barrier is in mm/tlb-radix.c for * radix which orders earlier stores to clear the PTEs vs diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index 510f103d7813..dbd8f762140b 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -53,8 +53,6 @@ int hash__alloc_context_id(void) } EXPORT_SYMBOL_GPL(hash__alloc_context_id); -void slb_setup_new_exec(void); - static int hash__init_new_context(struct mm_struct *mm) { int index; @@ -86,13 +84,6 @@ static int hash__init_new_context(struct mm_struct *mm) return index; } -void hash__setup_new_exec(void) -{ - slice_setup_new_exec(); - - slb_setup_new_exec(); -} - static int radix__init_new_context(struct mm_struct *mm) { unsigned long rts_field; diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index b438220c4336..513c6596140d 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -14,7 +14,6 @@ * 2 of the License, or (at your option) any later version. */ -#include #include #include #include @@ -34,7 +33,7 @@ enum slb_index { KSTACK_INDEX = 1, /* Kernel stack map */ }; -static long slb_allocate_user(struct mm_struct *mm, unsigned long ea); +extern void slb_allocate(unsigned long ea); #define slb_esid_mask(ssize) \ (((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T) @@ -45,17 +44,11 @@ static inline unsigned long mk_esid_data(unsigned long ea, int ssize, return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index; } -static inline unsigned long __mk_vsid_data(unsigned long vsid, int ssize, - unsigned long flags) -{ - return (vsid << slb_vsid_shift(ssize)) | flags | - ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT); -} - static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, unsigned long flags) { - return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags); + return (get_kernel_vsid(ea, ssize) << slb_vsid_shift(ssize)) | flags | + ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT); } static inline void slb_shadow_update(unsigned long ea, int ssize, @@ -122,9 +115,6 @@ void slb_restore_bolted_realmode(void) { __slb_restore_bolted_realmode(); get_paca()->slb_cache_ptr = 0; - - get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; - get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; } /* @@ -132,6 +122,9 @@ void slb_restore_bolted_realmode(void) */ void slb_flush_all_realmode(void) { + /* + * This flushes all SLB entries including 0, so it must be realmode. 
+ */ asm volatile("slbmte %0,%0; slbia" : : "r" (0)); } @@ -177,9 +170,6 @@ void slb_flush_and_rebolt(void) : "memory"); get_paca()->slb_cache_ptr = 0; - - get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; - get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; } void slb_save_contents(struct slb_entry *slb_ptr) @@ -212,7 +202,7 @@ void slb_dump_contents(struct slb_entry *slb_ptr) return; pr_err("SLB contents of cpu 0x%x\n", smp_processor_id()); - pr_err("Last SLB entry inserted at slot %u\n", get_paca()->stab_rr); + pr_err("Last SLB entry inserted at slot %lld\n", get_paca()->stab_rr); for (i = 0; i < mmu_slb_size; i++) { e = slb_ptr->esid; @@ -257,119 +247,41 @@ void slb_vmalloc_update(void) slb_flush_and_rebolt(); } -static bool preload_hit(struct thread_info *ti, unsigned long esid) +/* Helper function to compare esids. There are four cases to handle. + * 1. The system is not 1T segment size capable. Use the GET_ESID compare. + * 2. The system is 1T capable, both addresses are < 1T, use the GET_ESID compare. + * 3. The system is 1T capable, only one of the two addresses is > 1T. This is not a match. + * 4. The system is 1T capable, both addresses are > 1T, use the GET_ESID_1T macro to compare. + */ +static inline int esids_match(unsigned long addr1, unsigned long addr2) { - u8 i; + int esid_1t_count; - for (i = 0; i < ti->slb_preload_nr; i++) { - u8 idx; + /* System is not 1T segment size capable. */ + if (!mmu_has_feature(MMU_FTR_1T_SEGMENT)) + return (GET_ESID(addr1) == GET_ESID(addr2)); - idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR; - if (esid == ti->slb_preload_esid[idx]) - return true; - } - return false; + esid_1t_count = (((addr1 >> SID_SHIFT_1T) != 0) + + ((addr2 >> SID_SHIFT_1T) != 0)); + + /* both addresses are < 1T */ + if (esid_1t_count == 0) + return (GET_ESID(addr1) == GET_ESID(addr2)); + + /* One address < 1T, the other > 1T. Not a match */ + if (esid_1t_count == 1) + return 0; + + /* Both addresses are > 1T. */ + return (GET_ESID_1T(addr1) == GET_ESID_1T(addr2)); } -static bool preload_add(struct thread_info *ti, unsigned long ea) -{ - unsigned long esid; - u8 idx; - - if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) { - /* EAs are stored >> 28 so 256MB segments don't need clearing */ - if (ea & ESID_MASK_1T) - ea &= ESID_MASK_1T; - } - - esid = ea >> SID_SHIFT; - - if (preload_hit(ti, esid)) - return false; - - idx = (ti->slb_preload_tail + ti->slb_preload_nr) % SLB_PRELOAD_NR; - ti->slb_preload_esid[idx] = esid; - if (ti->slb_preload_nr == SLB_PRELOAD_NR) - ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR; - else - ti->slb_preload_nr++; - - return true; -} - -static void preload_age(struct thread_info *ti) -{ - if (!ti->slb_preload_nr) - return; - ti->slb_preload_nr--; - ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR; -} - -void slb_setup_new_exec(void) -{ - struct thread_info *ti = current_thread_info(); - struct mm_struct *mm = current->mm; - unsigned long exec = 0x10000000; - - /* - * We have no good place to clear the slb preload cache on exec, - * flush_thread is about the earliest arch hook but that happens - * after we switch to the mm and have aleady preloaded the SLBEs. - * - * For the most part that's probably okay to use entries from the - * previous exec, they will age out if unused. It may turn out to - * be an advantage to clear the cache before switching to it, - * however. - */ - - /* - * preload some userspace segments into the SLB. 
- * Almost all 32 and 64bit PowerPC executables are linked at - * 0x10000000 so it makes sense to preload this segment. - */ - if (!is_kernel_addr(exec)) { - if (preload_add(ti, exec)) - slb_allocate_user(mm, exec); - } - - /* Libraries and mmaps. */ - if (!is_kernel_addr(mm->mmap_base)) { - if (preload_add(ti, mm->mmap_base)) - slb_allocate_user(mm, mm->mmap_base); - } -} - -void preload_new_slb_context(unsigned long start, unsigned long sp) -{ - struct thread_info *ti = current_thread_info(); - struct mm_struct *mm = current->mm; - unsigned long heap = mm->start_brk; - - /* Userspace entry address. */ - if (!is_kernel_addr(start)) { - if (preload_add(ti, start)) - slb_allocate_user(mm, start); - } - - /* Top of stack, grows down. */ - if (!is_kernel_addr(sp)) { - if (preload_add(ti, sp)) - slb_allocate_user(mm, sp); - } - - /* Bottom of heap, grows up. */ - if (heap && !is_kernel_addr(heap)) { - if (preload_add(ti, heap)) - slb_allocate_user(mm, heap); - } -} - - /* Flush all user entries from the segment table of the current processor. */ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) { - struct thread_info *ti = task_thread_info(tsk); - u8 i; + unsigned long pc = KSTK_EIP(tsk); + unsigned long stack = KSTK_ESP(tsk); + unsigned long exec_base; /* * We need interrupts hard-disabled here, not just soft-disabled, @@ -392,6 +304,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) && offset <= SLB_CACHE_ENTRIES) { unsigned long slbie_data = 0; + int i; asm volatile("isync" : : : "memory"); for (i = 0; i < offset; i++) { @@ -422,60 +335,67 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) "isync" :: "r"(ksp_vsid_data), "r"(ksp_esid_data)); - - get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; } get_paca()->slb_cache_ptr = 0; } - get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; + + copy_mm_to_paca(mm); /* - * We gradually age out SLBs after a number of context switches to - * reduce reload overhead of unused entries (like we do with FP/VEC - * reload). Each time we wrap 256 switches, take an entry out of the - * SLB preload cache. + * preload some userspace segments into the SLB. + * Almost all 32 and 64bit PowerPC executables are linked at + * 0x10000000 so it makes sense to preload this segment. */ - tsk->thread.load_slb++; - if (!tsk->thread.load_slb) { - unsigned long pc = KSTK_EIP(tsk); + exec_base = 0x10000000; - preload_age(ti); - preload_add(ti, pc); - } + if (is_kernel_addr(pc) || is_kernel_addr(stack) || + is_kernel_addr(exec_base)) + return; - for (i = 0; i < ti->slb_preload_nr; i++) { - unsigned long ea; - u8 idx; + slb_allocate(pc); - idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR; - ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT; + if (!esids_match(pc, stack)) + slb_allocate(stack); - slb_allocate_user(mm, ea); - } + if (!esids_match(pc, exec_base) && + !esids_match(stack, exec_base)) + slb_allocate(exec_base); } +static inline void patch_slb_encoding(unsigned int *insn_addr, + unsigned int immed) +{ + + /* + * This function patches either an li or a cmpldi instruction with + * a new immediate value. This relies on the fact that both li + * (which is actually addi) and cmpldi both take a 16-bit immediate + * value, and it is situated in the same location in the instruction, + * ie. bits 16-31 (Big endian bit order) or the lower 16 bits. 
+ * The signedness of the immediate operand differs between the two + * instructions however this code is only ever patching a small value, + * much less than 1 << 15, so we can get away with it. + * To patch the value we read the existing instruction, clear the + * immediate value, and or in our new value, then write the instruction + * back. + */ + unsigned int insn = (*insn_addr & 0xffff0000) | immed; + patch_instruction(insn_addr, insn); +} + +extern u32 slb_miss_kernel_load_linear[]; +extern u32 slb_miss_kernel_load_io[]; +extern u32 slb_compare_rr_to_size[]; +extern u32 slb_miss_kernel_load_vmemmap[]; + void slb_set_size(u16 size) { - mmu_slb_size = size; -} - -static void cpu_flush_slb(void *parm) -{ - struct mm_struct *mm = parm; - unsigned long flags; - - if (mm != current->active_mm) + if (mmu_slb_size == size) return; - local_irq_save(flags); - slb_flush_and_rebolt(); - local_irq_restore(flags); -} - -void core_flush_all_slbs(struct mm_struct *mm) -{ - on_each_cpu(cpu_flush_slb, mm, 1); + mmu_slb_size = size; + patch_slb_encoding(slb_compare_rr_to_size, mmu_slb_size); } void slb_initialize(void) @@ -497,16 +417,24 @@ void slb_initialize(void) #endif if (!slb_encoding_inited) { slb_encoding_inited = 1; + patch_slb_encoding(slb_miss_kernel_load_linear, + SLB_VSID_KERNEL | linear_llp); + patch_slb_encoding(slb_miss_kernel_load_io, + SLB_VSID_KERNEL | io_llp); + patch_slb_encoding(slb_compare_rr_to_size, + mmu_slb_size); + pr_devel("SLB: linear LLP = %04lx\n", linear_llp); pr_devel("SLB: io LLP = %04lx\n", io_llp); + #ifdef CONFIG_SPARSEMEM_VMEMMAP + patch_slb_encoding(slb_miss_kernel_load_vmemmap, + SLB_VSID_KERNEL | vmemmap_llp); pr_devel("SLB: vmemmap LLP = %04lx\n", vmemmap_llp); #endif } get_paca()->stab_rr = SLB_NUM_BOLTED - 1; - get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; - get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; lflags = SLB_VSID_KERNEL | linear_llp; @@ -530,13 +458,52 @@ void slb_initialize(void) asm volatile("isync":::"memory"); } -static void slb_cache_update(unsigned long esid_data) +static void insert_slb_entry(unsigned long vsid, unsigned long ea, + int bpsize, int ssize) { + unsigned long flags, vsid_data, esid_data; + enum slb_index index; int slb_cache_index; if (cpu_has_feature(CPU_FTR_ARCH_300)) return; /* ISAv3.0B and later does not use slb_cache */ + /* + * We are irq disabled, hence should be safe to access PACA. + */ + VM_WARN_ON(!irqs_disabled()); + + /* + * We can't take a PMU exception in the following code, so hard + * disable interrupts. + */ + hard_irq_disable(); + + index = get_paca()->stab_rr; + + /* + * simple round-robin replacement of slb starting at SLB_NUM_BOLTED. + */ + if (index < (mmu_slb_size - 1)) + index++; + else + index = SLB_NUM_BOLTED; + + get_paca()->stab_rr = index; + + flags = SLB_VSID_USER | mmu_psize_defs[bpsize].sllp; + vsid_data = (vsid << slb_vsid_shift(ssize)) | flags | + ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT); + esid_data = mk_esid_data(ea, ssize, index); + + /* + * No need for an isync before or after this slbmte. The exception + * we enter with and the rfid we exit with are context synchronizing. + * Also we only handle user segments here. 
+ */ + asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data) + : "memory"); + /* * Now update slb cache entries */ @@ -558,196 +525,58 @@ static void slb_cache_update(unsigned long esid_data) } } -static enum slb_index alloc_slb_index(bool kernel) -{ - enum slb_index index; - - /* - * The allocation bitmaps can become out of synch with the SLB - * when the _switch code does slbie when bolting a new stack - * segment and it must not be anywhere else in the SLB. This leaves - * a kernel allocated entry that is unused in the SLB. With very - * large systems or small segment sizes, the bitmaps could slowly - * fill with these entries. They will eventually be cleared out - * by the round robin allocator in that case, so it's probably not - * worth accounting for. - */ - - /* - * SLBs beyond 32 entries are allocated with stab_rr only - * POWER7/8/9 have 32 SLB entries, this could be expanded if a - * future CPU has more. - */ - if (get_paca()->slb_used_bitmap != U32_MAX) { - index = ffz(get_paca()->slb_used_bitmap); - get_paca()->slb_used_bitmap |= 1U << index; - if (kernel) - get_paca()->slb_kern_bitmap |= 1U << index; - } else { - /* round-robin replacement of slb starting at SLB_NUM_BOLTED. */ - index = get_paca()->stab_rr; - if (index < (mmu_slb_size - 1)) - index++; - else - index = SLB_NUM_BOLTED; - get_paca()->stab_rr = index; - if (index < 32) { - if (kernel) - get_paca()->slb_kern_bitmap |= 1U << index; - else - get_paca()->slb_kern_bitmap &= ~(1U << index); - } - } - BUG_ON(index < SLB_NUM_BOLTED); - - return index; -} - -static long slb_insert_entry(unsigned long ea, unsigned long context, - unsigned long flags, int ssize, bool kernel) +static void handle_multi_context_slb_miss(int context_id, unsigned long ea) { + struct mm_struct *mm = current->mm; unsigned long vsid; - unsigned long vsid_data, esid_data; - enum slb_index index; - - vsid = get_vsid(context, ea, ssize); - if (!vsid) - return -EFAULT; - - index = alloc_slb_index(kernel); - - vsid_data = __mk_vsid_data(vsid, ssize, flags); - esid_data = mk_esid_data(ea, ssize, index); + int bpsize; /* - * No need for an isync before or after this slbmte. The exception - * we enter with and the rfid we exit with are context synchronizing. - * Also we only handle user segments here. + * We are always above 1TB, hence use high user segment size. 
*/ - asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data)); - - if (!kernel) - slb_cache_update(esid_data); - - return 0; + vsid = get_vsid(context_id, ea, mmu_highuser_ssize); + bpsize = get_slice_psize(mm, ea); + insert_slb_entry(vsid, ea, bpsize, mmu_highuser_ssize); } -static long slb_allocate_kernel(unsigned long ea, unsigned long id) +void slb_miss_large_addr(struct pt_regs *regs) { - unsigned long context; - unsigned long flags; - int ssize; + enum ctx_state prev_state = exception_enter(); + unsigned long ea = regs->dar; + int context; - if ((ea & ~REGION_MASK) >= (1ULL << MAX_EA_BITS_PER_CONTEXT)) - return -EFAULT; + if (REGION_ID(ea) != USER_REGION_ID) + goto slb_bad_addr; - if (id == KERNEL_REGION_ID) { - flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp; -#ifdef CONFIG_SPARSEMEM_VMEMMAP - } else if (id == VMEMMAP_REGION_ID) { - flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmemmap_psize].sllp; -#endif - } else if (id == VMALLOC_REGION_ID) { - if (ea < H_VMALLOC_END) - flags = get_paca()->vmalloc_sllp; - else - flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_io_psize].sllp; - } else { - return -EFAULT; - } + /* + * Are we beyound what the page table layout supports ? + */ + if ((ea & ~REGION_MASK) >= H_PGTABLE_RANGE) + goto slb_bad_addr; - ssize = MMU_SEGSIZE_1T; - if (!mmu_has_feature(MMU_FTR_1T_SEGMENT)) - ssize = MMU_SEGSIZE_256M; - - context = id - KERNEL_REGION_CONTEXT_OFFSET; - - return slb_insert_entry(ea, context, flags, ssize, true); -} - -static long slb_allocate_user(struct mm_struct *mm, unsigned long ea) -{ - unsigned long context; - unsigned long flags; - int bpsize; - int ssize; + /* Lower address should have been handled by asm code */ + if (ea < (1UL << MAX_EA_BITS_PER_CONTEXT)) + goto slb_bad_addr; /* * consider this as bad access if we take a SLB miss * on an address above addr limit. */ - if (ea >= mm->context.slb_addr_limit) - return -EFAULT; + if (ea >= current->mm->context.slb_addr_limit) + goto slb_bad_addr; - context = get_ea_context(&mm->context, ea); + context = get_ea_context(¤t->mm->context, ea); if (!context) - return -EFAULT; + goto slb_bad_addr; - if (unlikely(ea >= H_PGTABLE_RANGE)) { - WARN_ON(1); - return -EFAULT; - } + handle_multi_context_slb_miss(context, ea); + exception_exit(prev_state); + return; - ssize = user_segment_size(ea); - - bpsize = get_slice_psize(mm, ea); - flags = SLB_VSID_USER | mmu_psize_defs[bpsize].sllp; - - return slb_insert_entry(ea, context, flags, ssize, false); -} - -long do_slb_fault(struct pt_regs *regs, unsigned long ea) -{ - unsigned long id = REGION_ID(ea); - - /* IRQs are not reconciled here, so can't check irqs_disabled */ - VM_WARN_ON(mfmsr() & MSR_EE); - - if (unlikely(!(regs->msr & MSR_RI))) - return -EINVAL; - - /* - * SLB kernel faults must be very careful not to touch anything - * that is not bolted. E.g., PACA and global variables are okay, - * mm->context stuff is not. - * - * SLB user faults can access all of kernel memory, but must be - * careful not to touch things like IRQ state because it is not - * "reconciled" here. The difficulty is that we must use - * fast_exception_return to return from kernel SLB faults without - * looking at possible non-bolted memory. We could test user vs - * kernel faults in the interrupt handler asm and do a full fault, - * reconcile, ret_from_except for user faults which would make them - * first class kernel code. But for performance it's probably nicer - * if they go via fast_exception_return too. 
- */ - if (id >= KERNEL_REGION_ID) { - return slb_allocate_kernel(ea, id); - } else { - struct mm_struct *mm = current->mm; - long err; - - if (unlikely(!mm)) - return -EFAULT; - - err = slb_allocate_user(mm, ea); - if (!err) - preload_add(current_thread_info(), ea); - - return err; - } -} - -void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err) -{ - if (err == -EFAULT) { - if (user_mode(regs)) - _exception(SIGSEGV, regs, SEGV_BNDERR, ea); - else - bad_page_fault(regs, ea, SIGSEGV); - } else if (err == -EINVAL) { - unrecoverable_exception(regs); - } else { - BUG(); - } +slb_bad_addr: + if (user_mode(regs)) + _exception(SIGSEGV, regs, SEGV_BNDERR, ea); + else + bad_page_fault(regs, ea, SIGSEGV); + exception_exit(prev_state); } diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S new file mode 100644 index 000000000000..4d2e921d696e --- /dev/null +++ b/arch/powerpc/mm/slb_low.S @@ -0,0 +1,335 @@ +/* + * Low-level SLB routines + * + * Copyright (C) 2004 David Gibson , IBM + * + * Based on earlier C version: + * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com + * Copyright (c) 2001 Dave Engebretsen + * Copyright (C) 2002 Anton Blanchard , IBM + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This macro generates asm code to compute the VSID scramble + * function. Used in slb_allocate() and do_stab_bolted. The function + * computed is: (protovsid*VSID_MULTIPLIER) % VSID_MODULUS + * + * rt = register containing the proto-VSID and into which the + * VSID will be stored + * rx = scratch register (clobbered) + * rf = flags + * + * - rt and rx must be different registers + * - The answer will end up in the low VSID_BITS bits of rt. The higher + * bits may contain other garbage, so you may need to mask the + * result. + */ +#define ASM_VSID_SCRAMBLE(rt, rx, rf, size) \ + lis rx,VSID_MULTIPLIER_##size@h; \ + ori rx,rx,VSID_MULTIPLIER_##size@l; \ + mulld rt,rt,rx; /* rt = rt * MULTIPLIER */ \ +/* \ + * powermac get slb fault before feature fixup, so make 65 bit part \ + * the default part of feature fixup \ + */ \ +BEGIN_MMU_FTR_SECTION \ + srdi rx,rt,VSID_BITS_65_##size; \ + clrldi rt,rt,(64-VSID_BITS_65_##size); \ + add rt,rt,rx; \ + addi rx,rt,1; \ + srdi rx,rx,VSID_BITS_65_##size; \ + add rt,rt,rx; \ + rldimi rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_65_##size)); \ +MMU_FTR_SECTION_ELSE \ + srdi rx,rt,VSID_BITS_##size; \ + clrldi rt,rt,(64-VSID_BITS_##size); \ + add rt,rt,rx; /* add high and low bits */ \ + addi rx,rt,1; \ + srdi rx,rx,VSID_BITS_##size; /* extract 2^VSID_BITS bit */ \ + add rt,rt,rx; \ + rldimi rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_##size)); \ +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_68_BIT_VA) + + +/* void slb_allocate(unsigned long ea); + * + * Create an SLB entry for the given EA (user or kernel). + * r3 = faulting address, r13 = PACA + * r9, r10, r11 are clobbered by this function + * r3 is preserved. + * No other registers are examined or changed. + */ +_GLOBAL(slb_allocate) + /* + * Check if the address falls within the range of the first context, or + * if we may need to handle multi context. 
For the first context we + * allocate the slb entry via the fast path below. For large address we + * branch out to C-code and see if additional contexts have been + * allocated. + * The test here is: + * (ea & ~REGION_MASK) >= (1ull << MAX_EA_BITS_PER_CONTEXT) + */ + rldicr. r9,r3,4,(63 - MAX_EA_BITS_PER_CONTEXT - 4) + bne- 8f + + srdi r9,r3,60 /* get region */ + srdi r10,r3,SID_SHIFT /* get esid */ + cmpldi cr7,r9,0xc /* cmp PAGE_OFFSET for later use */ + + /* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */ + blt cr7,0f /* user or kernel? */ + + /* Check if hitting the linear mapping or some other kernel space + */ + bne cr7,1f + + /* Linear mapping encoding bits, the "li" instruction below will + * be patched by the kernel at boot + */ +.globl slb_miss_kernel_load_linear +slb_miss_kernel_load_linear: + li r11,0 + /* + * context = (ea >> 60) - (0xc - 1) + * r9 = region id. + */ + subi r9,r9,KERNEL_REGION_CONTEXT_OFFSET + +BEGIN_FTR_SECTION + b .Lslb_finish_load +END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) + b .Lslb_finish_load_1T + +1: +#ifdef CONFIG_SPARSEMEM_VMEMMAP + cmpldi cr0,r9,0xf + bne 1f +/* Check virtual memmap region. To be patched at kernel boot */ +.globl slb_miss_kernel_load_vmemmap +slb_miss_kernel_load_vmemmap: + li r11,0 + b 6f +1: +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + + /* + * r10 contains the ESID, which is the original faulting EA shifted + * right by 28 bits. We need to compare that with (H_VMALLOC_END >> 28) + * which is 0xd00038000. That can't be used as an immediate, even if we + * ignored the 0xd, so we have to load it into a register, and we only + * have one register free. So we must load all of (H_VMALLOC_END >> 28) + * into a register and compare ESID against that. + */ + lis r11,(H_VMALLOC_END >> 32)@h // r11 = 0xffffffffd0000000 + ori r11,r11,(H_VMALLOC_END >> 32)@l // r11 = 0xffffffffd0003800 + // Rotate left 4, then mask with 0xffffffff0 + rldic r11,r11,4,28 // r11 = 0xd00038000 + cmpld r10,r11 // if r10 >= r11 + bge 5f // goto io_mapping + + /* + * vmalloc mapping gets the encoding from the PACA as the mapping + * can be demoted from 64K -> 4K dynamically on some machines. + */ + lhz r11,PACAVMALLOCSLLP(r13) + b 6f +5: + /* IO mapping */ +.globl slb_miss_kernel_load_io +slb_miss_kernel_load_io: + li r11,0 +6: + /* + * context = (ea >> 60) - (0xc - 1) + * r9 = region id. + */ + subi r9,r9,KERNEL_REGION_CONTEXT_OFFSET + +BEGIN_FTR_SECTION + b .Lslb_finish_load +END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) + b .Lslb_finish_load_1T + +0: /* + * For userspace addresses, make sure this is region 0. + */ + cmpdi r9, 0 + bne- 8f + /* + * user space make sure we are within the allowed limit + */ + ld r11,PACA_SLB_ADDR_LIMIT(r13) + cmpld r3,r11 + bge- 8f + + /* when using slices, we extract the psize off the slice bitmaps + * and then we need to get the sllp encoding off the mmu_psize_defs + * array. + * + * XXX This is a bit inefficient especially for the normal case, + * so we should try to implement a fast path for the standard page + * size using the old sllp value so we avoid the array. 
We cannot + * really do dynamic patching unfortunately as processes might flip + * between 4k and 64k standard page size + */ +#ifdef CONFIG_PPC_MM_SLICES + /* r10 have esid */ + cmpldi r10,16 + /* below SLICE_LOW_TOP */ + blt 5f + /* + * Handle hpsizes, + * r9 is get_paca()->context.high_slices_psize[index], r11 is mask_index + */ + srdi r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT + 1) /* index */ + addi r9,r11,PACAHIGHSLICEPSIZE + lbzx r9,r13,r9 /* r9 is hpsizes[r11] */ + /* r11 = (r10 >> (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)) & 0x1 */ + rldicl r11,r10,(64 - (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)),63 + b 6f + +5: + /* + * Handle lpsizes + * r9 is get_paca()->context.low_slices_psize[index], r11 is mask_index + */ + srdi r11,r10,1 /* index */ + addi r9,r11,PACALOWSLICESPSIZE + lbzx r9,r13,r9 /* r9 is lpsizes[r11] */ + rldicl r11,r10,0,63 /* r11 = r10 & 0x1 */ +6: + sldi r11,r11,2 /* index * 4 */ + /* Extract the psize and multiply to get an array offset */ + srd r9,r9,r11 + andi. r9,r9,0xf + mulli r9,r9,MMUPSIZEDEFSIZE + + /* Now get to the array and obtain the sllp + */ + ld r11,PACATOC(r13) + ld r11,mmu_psize_defs@got(r11) + add r11,r11,r9 + ld r11,MMUPSIZESLLP(r11) + ori r11,r11,SLB_VSID_USER +#else + /* paca context sllp already contains the SLB_VSID_USER bits */ + lhz r11,PACACONTEXTSLLP(r13) +#endif /* CONFIG_PPC_MM_SLICES */ + + ld r9,PACACONTEXTID(r13) +BEGIN_FTR_SECTION + cmpldi r10,0x1000 + bge .Lslb_finish_load_1T +END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) + b .Lslb_finish_load + +8: /* invalid EA - return an error indication */ + crset 4*cr0+eq /* indicate failure */ + blr + +/* + * Finish loading of an SLB entry and return + * + * r3 = EA, r9 = context, r10 = ESID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET + */ +.Lslb_finish_load: + rldimi r10,r9,ESID_BITS,0 + ASM_VSID_SCRAMBLE(r10,r9,r11,256M) + /* r3 = EA, r11 = VSID data */ + /* + * Find a slot, round robin. Previously we tried to find a + * free slot first but that took too long. Unfortunately we + * dont have any LRU information to help us choose a slot. + */ + + mr r9,r3 + + /* slb_finish_load_1T continues here. r9=EA with non-ESID bits clear */ +7: ld r10,PACASTABRR(r13) + addi r10,r10,1 + /* This gets soft patched on boot. */ +.globl slb_compare_rr_to_size +slb_compare_rr_to_size: + cmpldi r10,0 + + blt+ 4f + li r10,SLB_NUM_BOLTED + +4: + std r10,PACASTABRR(r13) + +3: + rldimi r9,r10,0,36 /* r9 = EA[0:35] | entry */ + oris r10,r9,SLB_ESID_V@h /* r10 = r9 | SLB_ESID_V */ + + /* r9 = ESID data, r11 = VSID data */ + + /* + * No need for an isync before or after this slbmte. The exception + * we enter with and the rfid we exit with are context synchronizing. + */ + slbmte r11,r10 + + /* we're done for kernel addresses */ + crclr 4*cr0+eq /* set result to "success" */ + bgelr cr7 + + /* Update the slb cache */ + lhz r9,PACASLBCACHEPTR(r13) /* offset = paca->slb_cache_ptr */ + cmpldi r9,SLB_CACHE_ENTRIES + bge 1f + + /* still room in the slb cache */ + sldi r11,r9,2 /* r11 = offset * sizeof(u32) */ + srdi r10,r10,28 /* get the 36 bits of the ESID */ + add r11,r11,r13 /* r11 = (u32 *)paca + offset */ + stw r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */ + addi r9,r9,1 /* offset++ */ + b 2f +1: /* offset >= SLB_CACHE_ENTRIES */ + li r9,SLB_CACHE_ENTRIES+1 +2: + sth r9,PACASLBCACHEPTR(r13) /* paca->slb_cache_ptr = offset */ + crclr 4*cr0+eq /* set result to "success" */ + blr + +/* + * Finish loading of a 1T SLB entry (for the kernel linear mapping) and return. 
+ * + * r3 = EA, r9 = context, r10 = ESID(256MB), r11 = flags, clobbers r9 + */ +.Lslb_finish_load_1T: + srdi r10,r10,(SID_SHIFT_1T - SID_SHIFT) /* get 1T ESID */ + rldimi r10,r9,ESID_BITS_1T,0 + ASM_VSID_SCRAMBLE(r10,r9,r11,1T) + + li r10,MMU_SEGSIZE_1T + rldimi r11,r10,SLB_VSID_SSIZE_SHIFT,0 /* insert segment size */ + + /* r3 = EA, r11 = VSID data */ + clrrdi r9,r3,SID_SHIFT_1T /* clear out non-ESID bits */ + b 7b + + +_ASM_NOKPROBE_SYMBOL(slb_allocate) +_ASM_NOKPROBE_SYMBOL(slb_miss_kernel_load_linear) +_ASM_NOKPROBE_SYMBOL(slb_miss_kernel_load_io) +_ASM_NOKPROBE_SYMBOL(slb_compare_rr_to_size) +#ifdef CONFIG_SPARSEMEM_VMEMMAP +_ASM_NOKPROBE_SYMBOL(slb_miss_kernel_load_vmemmap) +#endif diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index fc5b3a1ec666..205fe557ca10 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -207,6 +207,23 @@ static bool slice_check_range_fits(struct mm_struct *mm, return true; } +static void slice_flush_segments(void *parm) +{ +#ifdef CONFIG_PPC64 + struct mm_struct *mm = parm; + unsigned long flags; + + if (mm != current->active_mm) + return; + + copy_mm_to_paca(current->active_mm); + + local_irq_save(flags); + slb_flush_and_rebolt(); + local_irq_restore(flags); +#endif +} + static void slice_convert(struct mm_struct *mm, const struct slice_mask *mask, int psize) { @@ -272,9 +289,6 @@ static void slice_convert(struct mm_struct *mm, spin_unlock_irqrestore(&slice_convert_lock, flags); copro_flush_all_slbs(mm); -#ifdef CONFIG_PPC64 - core_flush_all_slbs(mm); -#endif } /* @@ -488,9 +502,8 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, * be already initialised beyond the old address limit. */ mm->context.slb_addr_limit = high_limit; -#ifdef CONFIG_PPC64 - core_flush_all_slbs(mm); -#endif + + on_each_cpu(slice_flush_segments, mm, 1); } /* Sanity checks */ @@ -652,10 +665,8 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, (SLICE_NUM_HIGH && !bitmap_empty(potential_mask.high_slices, SLICE_NUM_HIGH))) { slice_convert(mm, &potential_mask, psize); -#ifdef CONFIG_PPC64 if (psize > MMU_PAGE_BASE) - core_flush_all_slbs(mm); -#endif + on_each_cpu(slice_flush_segments, mm, 1); } return newaddr; @@ -746,20 +757,6 @@ void slice_init_new_context_exec(struct mm_struct *mm) bitmap_fill(mask->high_slices, SLICE_NUM_HIGH); } -#ifdef CONFIG_PPC_BOOK3S_64 -void slice_setup_new_exec(void) -{ - struct mm_struct *mm = current->mm; - - slice_dbg("slice_setup_new_exec(mm=%p)\n", mm); - - if (!is_32bit_task()) - return; - - mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW; -} -#endif - void slice_set_range_psize(struct mm_struct *mm, unsigned long start, unsigned long len, unsigned int psize) { diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 694c1d92e796..c70d17c9a6ba 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2394,9 +2394,7 @@ static void dump_one_paca(int cpu) } } DUMP(p, vmalloc_sllp, "%#-*x"); - DUMP(p, stab_rr, "%#-*x"); - DUMP(p, slb_used_bitmap, "%#-*x"); - DUMP(p, slb_kern_bitmap, "%#-*x"); + DUMP(p, stab_rr, "%#-*llx"); if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) { DUMP(p, slb_cache_ptr, "%#-*x"); From db6711b7a17f03921e734e11e3a1e9bccb28bf46 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 20 Sep 2018 19:41:11 +1000 Subject: [PATCH 046/221] powerpc/perf: Add missing break in power7_marked_instr_event() In power7_marked_instr_event() there is a switch case that is missing a break or an explicit fallthrough, it's not 
immediately clear which it should be. The function determines based on the PMU event code, whether the event is a "marked" event (which then requires us to configure the PMU in a certain way). On Power7 there is no specific bit(s) in the event to tell us that, we just have to know. Rather than having a full list of every event and whether they are marked, we pull apart the event code and for events with certain values of certain fields we can say that those are all marked events. We take the psel (bits 0-7) of the event, and look at bits 4-7. For a value of 6 we say that if the entire psel == 0x64 then if the pmc == 3 the event is marked, else not, and otherwise we continue. It is then that we fallthrough to the 8 case, where we return true if the unit == 0xd. The question is should the 6 case also fallthrough and check for unit == 0xd, or should it return. Looking at the full list of events we see that there are zero events where (psel >> 4) == 0x6 and unit == 0xd. So the answer is it doesn't really matter, there are no valid event codes that will return a different result whether we fallthrough or break. But equally, testing the 6 case events against unit == 0xd is slightly bogus, as there are no such events. So to make the code clearer, and avoid any future confusion, have the 6 case break rather than falling through. Signed-off-by: Michael Ellerman Reviewed-by: Madhavan Srinivasan --- arch/powerpc/perf/power7-pmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/perf/power7-pmu.c b/arch/powerpc/perf/power7-pmu.c index 7963658dbc22..6dbae9884ec4 100644 --- a/arch/powerpc/perf/power7-pmu.c +++ b/arch/powerpc/perf/power7-pmu.c @@ -238,6 +238,7 @@ static int power7_marked_instr_event(u64 event) case 6: if (psel == 0x64) return pmc >= 3; + break; case 8: return unit == 0xd; } From 8139046a5a34787849df81f4a5875cf4b404a7a1 Mon Sep 17 00:00:00 2001 From: Vaibhav Jain Date: Fri, 7 Sep 2018 13:04:48 +0530 Subject: [PATCH 047/221] powerpc/powernv: Make possible for user to force a full ipl cec reboot Ever since fast reboot is enabled by default in opal, opal_cec_reboot() will use fast-reset instead of full IPL to perform system reboot. This leaves the user with no direct way to force a full IPL reboot except changing an nvram setting that persistently disables fast-reset for all subsequent reboots. This patch provides a more direct way for the user to force a one-shot full IPL reboot by passing the command line argument 'full' to the reboot command. So the user will be able to tweak the reboot behavior via: $ sudo reboot full # Force a full ipl reboot skipping fast-reset or $ sudo reboot # default reboot path (usually fast-reset) The reboot command passes the un-parsed command argument to the kernel via the 'Reboot' syscall which is then passed on to the arch function pnv_restart(). The patch updates pnv_restart() to handle this cmd-arg and issues opal_cec_reboot2 with OPAL_REBOOT_FULL_IPL to force a full IPL reset. 
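For illustration, a minimal userspace sketch of the same request issued directly via reboot(2) with LINUX_REBOOT_CMD_RESTART2; this mirrors what reboot(8) is assumed to do with the extra argument and is not part of the patch:

    /* Sketch only: request a restart, passing "full" through to
     * ppc_md.restart() (pnv_restart() on powernv). */
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/reboot.h>

    int main(void)
    {
            sync();   /* flush filesystems first */
            return syscall(SYS_reboot, LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2,
                           LINUX_REBOOT_CMD_RESTART2, "full");
    }
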
Signed-off-by: Vaibhav Jain Acked-by: Andrew Donnellan Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/opal-api.h | 1 + arch/powerpc/platforms/powernv/setup.c | 36 +++++++++++++++++++++----- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 8365353330b4..870fb7b239ea 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -1050,6 +1050,7 @@ enum OpalSysCooling { enum { OPAL_REBOOT_NORMAL = 0, OPAL_REBOOT_PLATFORM_ERROR = 1, + OPAL_REBOOT_FULL_IPL = 2, }; /* Argument to OPAL_PCI_TCE_KILL */ diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index c9cbd11a442e..14befee4b3f1 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -219,17 +219,41 @@ static void pnv_prepare_going_down(void) static void __noreturn pnv_restart(char *cmd) { - long rc = OPAL_BUSY; + long rc; pnv_prepare_going_down(); - while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { - rc = opal_cec_reboot(); - if (rc == OPAL_BUSY_EVENT) - opal_poll_events(NULL); + do { + if (!cmd) + rc = opal_cec_reboot(); + else if (strcmp(cmd, "full") == 0) + rc = opal_cec_reboot2(OPAL_REBOOT_FULL_IPL, NULL); else + rc = OPAL_UNSUPPORTED; + + if (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { + /* Opal is busy wait for some time and retry */ + opal_poll_events(NULL); mdelay(10); - } + + } else if (cmd && rc) { + /* Unknown error while issuing reboot */ + if (rc == OPAL_UNSUPPORTED) + pr_err("Unsupported '%s' reboot.\n", cmd); + else + pr_err("Unable to issue '%s' reboot. Err=%ld\n", + cmd, rc); + pr_info("Forcing a cec-reboot\n"); + cmd = NULL; + rc = OPAL_BUSY; + + } else if (rc != OPAL_SUCCESS) { + /* Unknown error while issuing cec-reboot */ + pr_err("Unable to reboot. Err=%ld\n", rc); + } + + } while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT); + for (;;) opal_poll_events(NULL); } From da7ad366b497f5fc1d4a416f168057ba40bddb98 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Sep 2018 23:39:42 +0530 Subject: [PATCH 048/221] powerpc/mm/book3s: Update pmd_present to look at _PAGE_PRESENT bit With this patch we use 0x8000000000000000UL (_PAGE_PRESENT) to indicate a valid pgd/pud/pmd entry. We also switch the p**_present() to look at this bit. With pmd_present, we have a special case. We need to make sure we consider a pmd marked invalid during THP split as present. Right now we clear the _PAGE_PRESENT bit during a pmdp_invalidate. Inorder to consider this special case we add a new pte bit _PAGE_INVALID (mapped to _RPAGE_SW0). This bit is only used with _PAGE_PRESENT cleared. Hence we are not really losing a pte bit for this special case. pmd_present is also updated to look at _PAGE_INVALID. 
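In short, the rule described above can be sketched as follows (a paraphrase of the pmd_present() hunk in the diff below, not new behaviour; the helper name is illustrative):

    /* Sketch: a pmd counts as present if _PAGE_PRESENT is set, or if
     * _PAGE_INVALID is set (a THP marked invalid while being split). */
    static inline int pmd_present_sketch(pmd_t pmd)
    {
            return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
    }
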
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hash.h | 5 +++++ arch/powerpc/include/asm/book3s/64/pgtable.h | 14 +++++++++++--- arch/powerpc/mm/hash_utils_64.c | 6 +++--- arch/powerpc/mm/pgtable-book3s64.c | 8 ++++++-- arch/powerpc/mm/pgtable.c | 7 +++---- 5 files changed, 28 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index d52a51b2ce7b..fcf8b10a209f 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -18,6 +18,11 @@ #include #endif +/* Bits to set in a PMD/PUD/PGD entry valid bit*/ +#define HASH_PMD_VAL_BITS (0x8000000000000000UL) +#define HASH_PUD_VAL_BITS (0x8000000000000000UL) +#define HASH_PGD_VAL_BITS (0x8000000000000000UL) + /* * Size of EA range mapped by our pagetables. */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 13a688fc8cd0..8feb4a3240d5 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -875,8 +875,16 @@ static inline int pmd_none(pmd_t pmd) static inline int pmd_present(pmd_t pmd) { + /* + * A pmd is considerent present if _PAGE_PRESENT is set. + * We also need to consider the pmd present which is marked + * invalid during a split. Hence we look for _PAGE_INVALID + * if we find _PAGE_PRESENT cleared. + */ + if (pmd_raw(pmd) & cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID)) + return true; - return !pmd_none(pmd); + return false; } static inline int pmd_bad(pmd_t pmd) @@ -903,7 +911,7 @@ static inline int pud_none(pud_t pud) static inline int pud_present(pud_t pud) { - return !pud_none(pud); + return (pud_raw(pud) & cpu_to_be64(_PAGE_PRESENT)); } extern struct page *pud_page(pud_t pud); @@ -950,7 +958,7 @@ static inline int pgd_none(pgd_t pgd) static inline int pgd_present(pgd_t pgd) { - return !pgd_none(pgd); + return (pgd_raw(pgd) & cpu_to_be64(_PAGE_PRESENT)); } static inline pte_t pgd_pte(pgd_t pgd) diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index f23a89d8e4ce..8ff03c7205a0 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1001,9 +1001,9 @@ void __init hash__early_init_mmu(void) * 4k use hugepd format, so for hash set then to * zero */ - __pmd_val_bits = 0; - __pud_val_bits = 0; - __pgd_val_bits = 0; + __pmd_val_bits = HASH_PMD_VAL_BITS; + __pud_val_bits = HASH_PUD_VAL_BITS; + __pgd_val_bits = HASH_PGD_VAL_BITS; __kernel_virt_start = H_KERN_VIRT_START; __kernel_virt_size = H_KERN_VIRT_SIZE; diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c index 01d7c0f7c4f0..654000da8b15 100644 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -69,7 +69,11 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { #ifdef CONFIG_DEBUG_VM - WARN_ON(pte_present(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); + /* + * Make sure hardware valid bit is not set. We don't do + * tlb flush for this update. 
+ */ + WARN_ON(pte_val(pmd_pte(*pmdp)) & _PAGE_PRESENT); assert_spin_locked(pmd_lockptr(mm, pmdp)); WARN_ON(!(pmd_trans_huge(pmd) || pmd_devmap(pmd))); #endif @@ -106,7 +110,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, { unsigned long old_pmd; - old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); + old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); /* * This ensures that generic code that rely on IRQ disabling diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index d71c7777669c..aee04b209b51 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -188,11 +188,10 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { /* - * When handling numa faults, we already have the pte marked - * _PAGE_PRESENT, but we can be sure that it is not in hpte. - * Hence we can use set_pte_at for them. + * Make sure hardware valid bit is not set. We don't do + * tlb flush for this update. */ - VM_WARN_ON(pte_present(*ptep) && !pte_protnone(*ptep)); + VM_WARN_ON(pte_val(*ptep) & _PAGE_PRESENT); /* Add the pte bit when trying to set a pte */ pte = __pte(pte_val(pte) | _PAGE_PTE); From f1981b5b302f7f3b4c8b9b5e25ea8c48cebb1ae1 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Sep 2018 23:39:43 +0530 Subject: [PATCH 049/221] powerpc/mm/hugetlb/book3s: add _PAGE_PRESENT to hugepd pointer. This make hugetlb directory pointer similar to other page able entries. A hugepd entry is identified by lack of _PAGE_PTE bit set and directory size stored in HUGEPD_SHIFT_MASK. We update that to also look at _PAGE_PRESENT Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hash-4k.h | 2 +- arch/powerpc/include/asm/book3s/64/hugetlb.h | 3 +++ arch/powerpc/mm/hugetlbpage.c | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 9a3798660cef..15bc16b1dc9c 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -66,7 +66,7 @@ static inline int hash__hugepd_ok(hugepd_t hpd) * if it is not a pte and have hugepd shift mask * set, then it is a hugepd directory pointer */ - if (!(hpdval & _PAGE_PTE) && + if (!(hpdval & _PAGE_PTE) && (hpdval & _PAGE_PRESENT) && ((hpdval & HUGEPD_SHIFT_MASK) != 0)) return true; return false; diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index 50888388a359..5b0177733994 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -39,4 +39,7 @@ static inline bool gigantic_page_supported(void) } #endif +/* hugepd entry valid bit */ +#define HUGEPD_VAL_BITS (0x8000000000000000UL) + #endif diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index e87f9ef9115b..c6df73c66c40 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -95,7 +95,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, break; else { #ifdef CONFIG_PPC_BOOK3S_64 - *hpdp = __hugepd(__pa(new) | + *hpdp = __hugepd(__pa(new) | HUGEPD_VAL_BITS | (shift_to_mmu_psize(pshift) << 2)); #elif defined(CONFIG_PPC_8xx) *hpdp = __hugepd(__pa(new) | _PMD_USER | From ae28f17b5eeb6702427ccb59e32e32a0c7e02f6b Mon Sep 17 00:00:00 2001 From: 
"Aneesh Kumar K.V" Date: Thu, 20 Sep 2018 23:39:44 +0530 Subject: [PATCH 050/221] powerpc/mm/book3s: Check for pmd_large instead of pmd_trans_huge Update few code paths to check for pmd_large. set_pmd_at: We want to use this to store swap pte at pmd level. For swap ptes we don't want to set H_PAGE_THP_HUGE. Hence check for pmd_large in set_pmd_at. This remove the false WARN_ON when using this with swap pmd entry. pmd_page: We don't really use them on pmd migration entries. But they can also work with migration entries and we don't differentiate at the pte level. Hence update pmd_page to work with pmd migration entries too __find_linux_pte: lockless page table walk need to handle pmd migration entries. pmd_trans_huge check will return false on them. We don't set thp = 1 for such entries, but update hpage_shift correctly. Without this we will walk pmd migration entries as a pte page pointer which is wrong. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hugetlbpage.c | 8 ++++++-- arch/powerpc/mm/pgtable-book3s64.c | 2 +- arch/powerpc/mm/pgtable_64.c | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index c6df73c66c40..9504641bd4d9 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -837,8 +837,12 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, ret_pte = (pte_t *) pmdp; goto out; } - - if (pmd_huge(pmd)) { + /* + * pmd_large check below will handle the swap pmd pte + * we need to do both the check because they are config + * dependent. + */ + if (pmd_huge(pmd) || pmd_large(pmd)) { ret_pte = (pte_t *) pmdp; goto out; } else if (is_hugepd(__hugepd(pmd_val(pmd)))) diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c index 654000da8b15..43e99e1d947b 100644 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -75,7 +75,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, */ WARN_ON(pte_val(pmd_pte(*pmdp)) & _PAGE_PRESENT); assert_spin_locked(pmd_lockptr(mm, pmdp)); - WARN_ON(!(pmd_trans_huge(pmd) || pmd_devmap(pmd))); + WARN_ON(!(pmd_large(pmd) || pmd_devmap(pmd))); #endif trace_hugepage_set_pmd(addr, pmd_val(pmd)); return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 53e9eeecd5d4..e15e63079ba8 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -306,7 +306,7 @@ struct page *pud_page(pud_t pud) */ struct page *pmd_page(pmd_t pmd) { - if (pmd_trans_huge(pmd) || pmd_huge(pmd) || pmd_devmap(pmd)) + if (pmd_large(pmd) || pmd_huge(pmd) || pmd_devmap(pmd)) return pte_page(pmd_pte(pmd)); return virt_to_page(pmd_page_vaddr(pmd)); } From 75646c480fdeffbc246e9d45f29721ab5569f921 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Sep 2018 23:39:45 +0530 Subject: [PATCH 051/221] arch/powerpc/mm/hash: validate the pte entries before handling the hash fault Make sure we are operating on THP and hugetlb entries in the respective hash fault handling routines. No functional change in this patch. If we walked the table wrongly before, we will retry the access. 
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hugepage-hash64.c | 6 ++++++ arch/powerpc/mm/hugetlbpage-hash64.c | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c index 01f213d2bcb9..dfbc3b32f09b 100644 --- a/arch/powerpc/mm/hugepage-hash64.c +++ b/arch/powerpc/mm/hugepage-hash64.c @@ -51,6 +51,12 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, new_pmd |= _PAGE_DIRTY; } while (!pmd_xchg(pmdp, __pmd(old_pmd), __pmd(new_pmd))); + /* + * Make sure this is thp or devmap entry + */ + if (!(old_pmd & (H_PAGE_THP_HUGE | _PAGE_DEVMAP))) + return 0; + rflags = htab_convert_pte_flags(new_pmd); #if 0 diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c index b320f5097a06..2e6a8f9345d3 100644 --- a/arch/powerpc/mm/hugetlbpage-hash64.c +++ b/arch/powerpc/mm/hugetlbpage-hash64.c @@ -62,6 +62,10 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, new_pte |= _PAGE_DIRTY; } while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + /* Make sure this is a hugetlb entry */ + if (old_pte & (H_PAGE_THP_HUGE | _PAGE_DEVMAP)) + return 0; + rflags = htab_convert_pte_flags(new_pte); if (unlikely(mmu_psize == MMU_PAGE_16G)) offset = PTRS_PER_PUD; From 8890e03380d361f67fdd7f91758d93339fd66ca9 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Sep 2018 23:39:46 +0530 Subject: [PATCH 052/221] powerpc/mm/thp: update pmd_trans_huge to check for pmd_present We need to make sure pmd_trans_huge returns false for a pmd migration entry. We mark the migration entry by clearing the _PAGE_PRESENT bit. We keep the _PAGE_PTE bit set to indicate a leaf page table entry. Hence we need to make sure we check for pmd_present() so that pmd_trans_huge won't return true on pmd migration entry. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- .../include/asm/book3s/64/pgtable-64k.h | 3 +++ arch/powerpc/include/asm/book3s/64/pgtable.h | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h index d7ee249d6890..e3d4dd4ae2fa 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h @@ -10,6 +10,9 @@ * * Defined in such a way that we can optimize away code block at build time * if CONFIG_HUGETLB_PAGE=n. + * + * returns true for pmd migration entries, THP, devmap, hugetlb + * But compile time dependent on CONFIG_HUGETLB_PAGE */ static inline int pmd_huge(pmd_t pmd) { diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 8feb4a3240d5..e24db2aa260f 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1136,6 +1136,10 @@ pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, return hash__pmd_hugepage_update(mm, addr, pmdp, clr, set); } +/* + * returns true for pmd migration entries, THP, devmap, hugetlb + * But compile time dependent on THP config + */ static inline int pmd_large(pmd_t pmd) { return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE)); @@ -1170,8 +1174,22 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_hugepage_update(mm, addr, pmdp, 0, _PAGE_PRIVILEGED); } +/* + * Only returns true for a THP. False for pmd migration entry. 
+ * We also need to return true when we come across a pte that + * in between a thp split. While splitting THP, we mark the pmd + * invalid (pmdp_invalidate()) before we set it with pte page + * address. A pmd_trans_huge() check against a pmd entry during that time + * should return true. + * We should not call this on a hugetlb entry. We should check for HugeTLB + * entry using vma->vm_flags + * The page table walk rule is explained in Documentation/vm/transhuge.rst + */ static inline int pmd_trans_huge(pmd_t pmd) { + if (!pmd_present(pmd)) + return false; + if (radix_enabled()) return radix__pmd_trans_huge(pmd); return hash__pmd_trans_huge(pmd); From a0820ff33451aa1a12ec66812e229ac58beb3f24 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Sep 2018 23:39:47 +0530 Subject: [PATCH 053/221] powerpc/mm:book3s: Enable THP migration support Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/pgtable.h | 8 ++++++++ arch/powerpc/platforms/Kconfig.cputype | 1 + 2 files changed, 9 insertions(+) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index e24db2aa260f..c68cbbff3429 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -741,6 +741,8 @@ static inline bool pte_user(pte_t pte) */ #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) & ~_PAGE_PTE }) #define __swp_entry_to_pte(x) __pte((x).val | _PAGE_PTE) +#define __pmd_to_swp_entry(pmd) (__pte_to_swp_entry(pmd_pte(pmd))) +#define __swp_entry_to_pmd(x) (pte_pmd(__swp_entry_to_pte(x))) #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SWP_SOFT_DIRTY (1UL << (SWP_TYPE_BITS + _PAGE_BIT_SWAP_TYPE)) @@ -1091,6 +1093,12 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd) #define pmd_soft_dirty(pmd) pte_soft_dirty(pmd_pte(pmd)) #define pmd_mksoft_dirty(pmd) pte_pmd(pte_mksoft_dirty(pmd_pte(pmd))) #define pmd_clear_soft_dirty(pmd) pte_pmd(pte_clear_soft_dirty(pmd_pte(pmd))) + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +#define pmd_swp_mksoft_dirty(pmd) pte_pmd(pte_swp_mksoft_dirty(pmd_pte(pmd))) +#define pmd_swp_soft_dirty(pmd) pte_swp_soft_dirty(pmd_pte(pmd)) +#define pmd_swp_clear_soft_dirty(pmd) pte_pmd(pte_swp_clear_soft_dirty(pmd_pte(pmd))) +#endif #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ #ifdef CONFIG_NUMA_BALANCING diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 6c6a7c72cae4..495db17dcbca 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -72,6 +72,7 @@ config PPC_BOOK3S_64 select PPC_HAVE_PMU_SUPPORT select SYS_SUPPORTS_HUGETLBFS select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_SUPPORTS_NUMA_BALANCING select IRQ_WORK From 8c8933eba0c2853ecbd6a9ef7542b9058f8b5e11 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 4 Jan 2018 16:45:41 -0600 Subject: [PATCH 054/221] powerpc/cell: Use irq_of_parse_and_map() helper Instead of calling both of_irq_parse_one() and irq_create_of_mapping(), call of_irq_parse_and_map() instead which does the same thing. This gets us closer to making the former 2 functions static. 
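The conversion pattern, shown on a generic driver snippet rather than the spu_manage.c hunk itself (hedged example):

    /* before: two calls, one to parse the interrupt specifier, one to map it */
    struct of_phandle_args oirq;
    unsigned int virq = 0;

    if (of_irq_parse_one(np, 0, &oirq) == 0)
            virq = irq_create_of_mapping(&oirq);

    /* after: a single helper performs both steps */
    virq = irq_of_parse_and_map(np, 0);
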
Signed-off-by: Rob Herring Acked-by: Arnd Bergmann Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/cell/spu_manage.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/platforms/cell/spu_manage.c b/arch/powerpc/platforms/cell/spu_manage.c index 5c409c98cca8..e7a04af71452 100644 --- a/arch/powerpc/platforms/cell/spu_manage.c +++ b/arch/powerpc/platforms/cell/spu_manage.c @@ -180,35 +180,22 @@ static int __init spu_map_device_old(struct spu *spu) static int __init spu_map_interrupts(struct spu *spu, struct device_node *np) { - struct of_phandle_args oirq; - int ret; int i; for (i=0; i < 3; i++) { - ret = of_irq_parse_one(np, i, &oirq); - if (ret) { - pr_debug("spu_new: failed to get irq %d\n", i); + spu->irqs[i] = irq_of_parse_and_map(np, i); + if (!spu->irqs[i]) goto err; - } - ret = -EINVAL; - pr_debug(" irq %d no 0x%x on %pOF\n", i, oirq.args[0], - oirq.np); - spu->irqs[i] = irq_create_of_mapping(&oirq); - if (!spu->irqs[i]) { - pr_debug("spu_new: failed to map it !\n"); - goto err; - } } return 0; err: - pr_debug("failed to map irq %x for spu %s\n", *oirq.args, - spu->name); + pr_debug("failed to map irq %x for spu %s\n", i, spu->name); for (; i >= 0; i--) { if (spu->irqs[i]) irq_dispose_mapping(spu->irqs[i]); } - return ret; + return -EINVAL; } static int spu_map_resource(struct spu *spu, int nr, From c417596d2409125b1814c05e994a21ef9282b894 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 1 Feb 2018 11:59:22 -0600 Subject: [PATCH 055/221] powerpc/pseries: Use of_irq_get helper() in request_event_sources_irqs() Instead of calling both of_irq_parse_one() and irq_create_of_mapping(), call of_irq_get() instead which does essentially the same thing. of_irq_get() also calls irq_find_host() for deferred probe support, but this should be fine as irq_create_of_mapping() also calls that internally. This gets us closer to making the former 2 functions static. In the process of simplifying request_event_sources_irqs(), combine the the pr_err() and WARN_ON() calls to just a WARN(). 
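The WARN() consolidation mentioned above follows this pattern (generic sketch, not the exact hunk):

    /* before: error message and warning emitted separately */
    if (!virq) {
            pr_err("event-sources: unable to allocate interrupt for %pOF\n", np);
            WARN_ON(1);
    }

    /* after: WARN() tests the condition, prints the message and returns the
     * condition value, so it can drive the control flow directly */
    if (WARN(!virq, "event-sources: unable to allocate interrupt for %pOF\n", np))
            continue;
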
Signed-off-by: Rob Herring Signed-off-by: Michael Ellerman --- .../powerpc/platforms/pseries/event_sources.c | 42 +++++++------------ 1 file changed, 14 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/platforms/pseries/event_sources.c b/arch/powerpc/platforms/pseries/event_sources.c index 6eeb0d4bab61..446ef104fb3a 100644 --- a/arch/powerpc/platforms/pseries/event_sources.c +++ b/arch/powerpc/platforms/pseries/event_sources.c @@ -16,7 +16,8 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include +#include +#include #include "pseries.h" @@ -24,34 +25,19 @@ void request_event_sources_irqs(struct device_node *np, irq_handler_t handler, const char *name) { - int i, index, count = 0; - struct of_phandle_args oirq; - unsigned int virqs[16]; + int i, virq, rc; - /* First try to do a proper OF tree parsing */ - for (index = 0; of_irq_parse_one(np, index, &oirq) == 0; - index++) { - if (count > 15) - break; - virqs[count] = irq_create_of_mapping(&oirq); - if (!virqs[count]) { - pr_err("event-sources: Unable to allocate " - "interrupt number for %pOF\n", - np); - WARN_ON(1); - } else { - count++; - } - } - - /* Now request them */ - for (i = 0; i < count; i++) { - if (request_irq(virqs[i], handler, 0, name, NULL)) { - pr_err("event-sources: Unable to request interrupt " - "%d for %pOF\n", virqs[i], np); - WARN_ON(1); + for (i = 0; i < 16; i++) { + virq = of_irq_get(np, i); + if (virq < 0) + return; + if (WARN(!virq, "event-sources: Unable to allocate " + "interrupt number for %pOF\n", np)) + continue; + + rc = request_irq(virq, handler, 0, name, NULL); + if (WARN(rc, "event-sources: Unable to request interrupt %d for %pOF\n", + virq, np)) return; - } } } - From 0bdba867f01d69cffefee707504d3155a30f2d0f Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 4 Sep 2018 16:27:44 -0500 Subject: [PATCH 056/221] macintosh: Convert to using %pOFn instead of device_node.name In preparation to remove the node name pointer from struct device_node, convert printf users to use the %pOFn format specifier. Signed-off-by: Rob Herring Signed-off-by: Michael Ellerman --- drivers/macintosh/macio_asic.c | 8 +++++--- drivers/macintosh/macio_sysfs.c | 8 +++++++- drivers/macintosh/windfarm_smu_controls.c | 4 ++-- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/macintosh/macio_asic.c b/drivers/macintosh/macio_asic.c index 07074820a167..17d3bc917562 100644 --- a/drivers/macintosh/macio_asic.c +++ b/drivers/macintosh/macio_asic.c @@ -360,9 +360,10 @@ static struct macio_dev * macio_add_one_device(struct macio_chip *chip, struct macio_dev *in_bay, struct resource *parent_res) { + char name[MAX_NODE_NAME_SIZE + 1]; struct macio_dev *dev; const u32 *reg; - + if (np == NULL) return NULL; @@ -402,6 +403,7 @@ static struct macio_dev * macio_add_one_device(struct macio_chip *chip, #endif /* MacIO itself has a different reg, we use it's PCI base */ + snprintf(name, sizeof(name), "%pOFn", np); if (np == chip->of_node) { dev_set_name(&dev->ofdev.dev, "%1d.%08x:%.*s", chip->lbus.index, @@ -410,12 +412,12 @@ static struct macio_dev * macio_add_one_device(struct macio_chip *chip, #else 0, /* NuBus may want to do something better here */ #endif - MAX_NODE_NAME_SIZE, np->name); + MAX_NODE_NAME_SIZE, name); } else { reg = of_get_property(np, "reg", NULL); dev_set_name(&dev->ofdev.dev, "%1d.%08x:%.*s", chip->lbus.index, - reg ? *reg : 0, MAX_NODE_NAME_SIZE, np->name); + reg ? 
*reg : 0, MAX_NODE_NAME_SIZE, name); } /* Setup interrupts & resources */ diff --git a/drivers/macintosh/macio_sysfs.c b/drivers/macintosh/macio_sysfs.c index ca4fcffe454b..d2451e58acb9 100644 --- a/drivers/macintosh/macio_sysfs.c +++ b/drivers/macintosh/macio_sysfs.c @@ -58,7 +58,13 @@ static ssize_t devspec_show(struct device *dev, static DEVICE_ATTR_RO(modalias); static DEVICE_ATTR_RO(devspec); -macio_config_of_attr (name, "%s\n"); +static ssize_t name_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%pOFn\n", dev->of_node); +} +static DEVICE_ATTR_RO(name); + macio_config_of_attr (type, "%s\n"); static struct attribute *macio_dev_attrs[] = { diff --git a/drivers/macintosh/windfarm_smu_controls.c b/drivers/macintosh/windfarm_smu_controls.c index d174c7437337..86d65462a61c 100644 --- a/drivers/macintosh/windfarm_smu_controls.c +++ b/drivers/macintosh/windfarm_smu_controls.c @@ -277,7 +277,7 @@ static int __init smu_controls_init(void) fct = smu_fan_create(fan, 0); if (fct == NULL) { printk(KERN_WARNING "windfarm: Failed to create SMU " - "RPM fan %s\n", fan->name); + "RPM fan %pOFn\n", fan); continue; } list_add(&fct->link, &smu_fans); @@ -296,7 +296,7 @@ static int __init smu_controls_init(void) fct = smu_fan_create(fan, 1); if (fct == NULL) { printk(KERN_WARNING "windfarm: Failed to create SMU " - "PWM fan %s\n", fan->name); + "PWM fan %pOFn\n", fan); continue; } list_add(&fct->link, &smu_fans); From b9ef7b4b867f56114bedbe6bf104cfaba0ca818e Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Mon, 27 Aug 2018 20:52:07 -0500 Subject: [PATCH 057/221] powerpc: Convert to using %pOFn instead of device_node.name In preparation to remove the node name pointer from struct device_node, convert printf users to use the %pOFn format specifier. 
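The conversion itself is mechanical; on a generic printk call it looks like this (sketch only, the real hunks follow in the diff):

    /* before: dereferences the name pointer that is going away */
    pr_info("initialising %s\n", np->name);

    /* after: %pOFn makes vsprintf print the node name from the
     * struct device_node pointer itself */
    pr_info("initialising %pOFn\n", np);
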
Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Arnd Bergmann Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Rob Herring Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/44x/fsp2.c | 8 +++--- arch/powerpc/platforms/cell/spu_manage.c | 4 +-- arch/powerpc/platforms/embedded6xx/wii.c | 2 +- .../powerpc/platforms/powernv/opal-powercap.c | 3 +- .../platforms/powernv/opal-sensor-groups.c | 4 +-- .../powerpc/platforms/powernv/opal-sysparam.c | 2 +- arch/powerpc/platforms/pseries/hotplug-cpu.c | 28 +++++++++---------- arch/powerpc/platforms/pseries/ibmebus.c | 2 +- arch/powerpc/platforms/pseries/vio.c | 27 +++++++++--------- 9 files changed, 40 insertions(+), 40 deletions(-) diff --git a/arch/powerpc/platforms/44x/fsp2.c b/arch/powerpc/platforms/44x/fsp2.c index 04f0c73a9b4f..7a507f775308 100644 --- a/arch/powerpc/platforms/44x/fsp2.c +++ b/arch/powerpc/platforms/44x/fsp2.c @@ -210,15 +210,15 @@ static void node_irq_request(const char *compat, irq_handler_t errirq_handler) for_each_compatible_node(np, NULL, compat) { irq = irq_of_parse_and_map(np, 0); if (irq == NO_IRQ) { - pr_err("device tree node %s is missing a interrupt", - np->name); + pr_err("device tree node %pOFn is missing a interrupt", + np); return; } rc = request_irq(irq, errirq_handler, 0, np->name, np); if (rc) { - pr_err("fsp_of_probe: request_irq failed: np=%s rc=%d", - np->full_name, rc); + pr_err("fsp_of_probe: request_irq failed: np=%pOF rc=%d", + np, rc); return; } } diff --git a/arch/powerpc/platforms/cell/spu_manage.c b/arch/powerpc/platforms/cell/spu_manage.c index e7a04af71452..f7e36373f6e0 100644 --- a/arch/powerpc/platforms/cell/spu_manage.c +++ b/arch/powerpc/platforms/cell/spu_manage.c @@ -282,8 +282,8 @@ static int __init of_enumerate_spus(int (*fn)(void *data)) for_each_node_by_type(node, "spe") { ret = fn(node); if (ret) { - printk(KERN_WARNING "%s: Error initializing %s\n", - __func__, node->name); + printk(KERN_WARNING "%s: Error initializing %pOFn\n", + __func__, node); of_node_put(node); break; } diff --git a/arch/powerpc/platforms/embedded6xx/wii.c b/arch/powerpc/platforms/embedded6xx/wii.c index 403523c061ba..ecf703ee3a76 100644 --- a/arch/powerpc/platforms/embedded6xx/wii.c +++ b/arch/powerpc/platforms/embedded6xx/wii.c @@ -112,7 +112,7 @@ static void __iomem *wii_ioremap_hw_regs(char *name, char *compatible) } error = of_address_to_resource(np, 0, &res); if (error) { - pr_err("no valid reg found for %s\n", np->name); + pr_err("no valid reg found for %pOFn\n", np); goto out_put; } diff --git a/arch/powerpc/platforms/powernv/opal-powercap.c b/arch/powerpc/platforms/powernv/opal-powercap.c index badb29bde93f..d90ee4fc2c6a 100644 --- a/arch/powerpc/platforms/powernv/opal-powercap.c +++ b/arch/powerpc/platforms/powernv/opal-powercap.c @@ -199,7 +199,7 @@ void __init opal_powercap_init(void) } j = 0; - pcaps[i].pg.name = node->name; + pcaps[i].pg.name = kasprintf(GFP_KERNEL, "%pOFn", node); if (has_min) { powercap_add_attr(min, "powercap-min", &pcaps[i].pattrs[j]); @@ -237,6 +237,7 @@ void __init opal_powercap_init(void) while (--i >= 0) { kfree(pcaps[i].pattrs); kfree(pcaps[i].pg.attrs); + kfree(pcaps[i].pg.name); } kobject_put(powercap_kobj); out_pcaps: diff --git a/arch/powerpc/platforms/powernv/opal-sensor-groups.c b/arch/powerpc/platforms/powernv/opal-sensor-groups.c index f7d04b6a2d7a..179609220e6f 100644 --- a/arch/powerpc/platforms/powernv/opal-sensor-groups.c +++ b/arch/powerpc/platforms/powernv/opal-sensor-groups.c @@ -214,9 +214,9 @@ void __init 
opal_sensor_groups_init(void) } if (!of_property_read_u32(node, "ibm,chip-id", &chipid)) - sprintf(sgs[i].name, "%s%d", node->name, chipid); + sprintf(sgs[i].name, "%pOFn%d", node, chipid); else - sprintf(sgs[i].name, "%s", node->name); + sprintf(sgs[i].name, "%pOFn", node); sgs[i].sg.name = sgs[i].name; if (add_attr_group(ops, len, &sgs[i], sgid)) { diff --git a/arch/powerpc/platforms/powernv/opal-sysparam.c b/arch/powerpc/platforms/powernv/opal-sysparam.c index 9aa87df114fd..916a4b7b1bb5 100644 --- a/arch/powerpc/platforms/powernv/opal-sysparam.c +++ b/arch/powerpc/platforms/powernv/opal-sysparam.c @@ -194,7 +194,7 @@ void __init opal_sys_param_init(void) count = of_property_count_strings(sysparam, "param-name"); if (count < 0) { pr_err("SYSPARAM: No string found of property param-name in " - "the node %s\n", sysparam->name); + "the node %pOFn\n", sysparam); goto out_param_buf; } diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 6ef77caf7bcf..2f8e62163602 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -287,7 +287,7 @@ static int pseries_add_processor(struct device_node *np) if (cpumask_empty(tmp)) { printk(KERN_ERR "Unable to find space in cpu_present_mask for" - " processor %s with %d thread(s)\n", np->name, + " processor %pOFn with %d thread(s)\n", np, nthreads); goto out_unlock; } @@ -481,8 +481,8 @@ static ssize_t dlpar_cpu_add(u32 drc_index) if (rc) { saved_rc = rc; - pr_warn("Failed to attach node %s, rc: %d, drc index: %x\n", - dn->name, rc, drc_index); + pr_warn("Failed to attach node %pOFn, rc: %d, drc index: %x\n", + dn, rc, drc_index); rc = dlpar_release_drc(drc_index); if (!rc) @@ -494,8 +494,8 @@ static ssize_t dlpar_cpu_add(u32 drc_index) rc = dlpar_online_cpu(dn); if (rc) { saved_rc = rc; - pr_warn("Failed to online cpu %s, rc: %d, drc index: %x\n", - dn->name, rc, drc_index); + pr_warn("Failed to online cpu %pOFn, rc: %d, drc index: %x\n", + dn, rc, drc_index); rc = dlpar_detach_node(dn); if (!rc) @@ -504,7 +504,7 @@ static ssize_t dlpar_cpu_add(u32 drc_index) return saved_rc; } - pr_debug("Successfully added CPU %s, drc index: %x\n", dn->name, + pr_debug("Successfully added CPU %pOFn, drc index: %x\n", dn, drc_index); return rc; } @@ -570,19 +570,19 @@ static ssize_t dlpar_cpu_remove(struct device_node *dn, u32 drc_index) { int rc; - pr_debug("Attempting to remove CPU %s, drc index: %x\n", - dn->name, drc_index); + pr_debug("Attempting to remove CPU %pOFn, drc index: %x\n", + dn, drc_index); rc = dlpar_offline_cpu(dn); if (rc) { - pr_warn("Failed to offline CPU %s, rc: %d\n", dn->name, rc); + pr_warn("Failed to offline CPU %pOFn, rc: %d\n", dn, rc); return -EINVAL; } rc = dlpar_release_drc(drc_index); if (rc) { - pr_warn("Failed to release drc (%x) for CPU %s, rc: %d\n", - drc_index, dn->name, rc); + pr_warn("Failed to release drc (%x) for CPU %pOFn, rc: %d\n", + drc_index, dn, rc); dlpar_online_cpu(dn); return rc; } @@ -591,7 +591,7 @@ static ssize_t dlpar_cpu_remove(struct device_node *dn, u32 drc_index) if (rc) { int saved_rc = rc; - pr_warn("Failed to detach CPU %s, rc: %d", dn->name, rc); + pr_warn("Failed to detach CPU %pOFn, rc: %d", dn, rc); rc = dlpar_acquire_drc(drc_index); if (!rc) @@ -662,8 +662,8 @@ static int find_dlpar_cpus_to_remove(u32 *cpu_drcs, int cpus_to_remove) rc = of_property_read_u32(dn, "ibm,my-drc-index", &cpu_drcs[cpus_found - 1]); if (rc) { - pr_warn("Error occurred getting drc-index for %s\n", - dn->name); + 
pr_warn("Error occurred getting drc-index for %pOFn\n", + dn); of_node_put(dn); return -1; } diff --git a/arch/powerpc/platforms/pseries/ibmebus.c b/arch/powerpc/platforms/pseries/ibmebus.c index c7c1140c13b6..5b4a56131904 100644 --- a/arch/powerpc/platforms/pseries/ibmebus.c +++ b/arch/powerpc/platforms/pseries/ibmebus.c @@ -404,7 +404,7 @@ static ssize_t name_show(struct device *dev, struct platform_device *ofdev; ofdev = to_platform_device(dev); - return sprintf(buf, "%s\n", ofdev->dev.of_node->name); + return sprintf(buf, "%pOFn\n", ofdev->dev.of_node); } static DEVICE_ATTR_RO(name); diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index 49e04ec19238..88f1ad1d6309 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -1349,7 +1349,6 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node) struct device_node *parent_node; const __be32 *prop; enum vio_dev_family family; - const char *of_node_name = of_node->name ? of_node->name : ""; /* * Determine if this node is a under the /vdevice node or under the @@ -1362,24 +1361,24 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node) else if (!strcmp(parent_node->type, "vdevice")) family = VDEVICE; else { - pr_warn("%s: parent(%pOF) of %s not recognized.\n", + pr_warn("%s: parent(%pOF) of %pOFn not recognized.\n", __func__, parent_node, - of_node_name); + of_node); of_node_put(parent_node); return NULL; } of_node_put(parent_node); } else { - pr_warn("%s: could not determine the parent of node %s.\n", - __func__, of_node_name); + pr_warn("%s: could not determine the parent of node %pOFn.\n", + __func__, of_node); return NULL; } if (family == PFO) { if (of_get_property(of_node, "interrupt-controller", NULL)) { - pr_debug("%s: Skipping the interrupt controller %s.\n", - __func__, of_node_name); + pr_debug("%s: Skipping the interrupt controller %pOFn.\n", + __func__, of_node); return NULL; } } @@ -1399,15 +1398,15 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node) if (of_node->type != NULL) viodev->type = of_node->type; else { - pr_warn("%s: node %s is missing the 'device_type' " - "property.\n", __func__, of_node_name); + pr_warn("%s: node %pOFn is missing the 'device_type' " + "property.\n", __func__, of_node); goto out; } prop = of_get_property(of_node, "reg", NULL); if (prop == NULL) { - pr_warn("%s: node %s missing 'reg'\n", - __func__, of_node_name); + pr_warn("%s: node %pOFn missing 'reg'\n", + __func__, of_node); goto out; } unit_address = of_read_number(prop, 1); @@ -1422,8 +1421,8 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node) if (prop != NULL) viodev->resource_id = of_read_number(prop, 1); - dev_set_name(&viodev->dev, "%s", of_node_name); - viodev->type = of_node_name; + dev_set_name(&viodev->dev, "%pOFn", of_node); + viodev->type = dev_name(&viodev->dev); viodev->irq = 0; } @@ -1694,7 +1693,7 @@ struct vio_dev *vio_find_node(struct device_node *vnode) snprintf(kobj_name, sizeof(kobj_name), "%x", (uint32_t)of_read_number(prop, 1)); } else if (!strcmp(dev_type, "ibm,platform-facilities")) - snprintf(kobj_name, sizeof(kobj_name), "%s", vnode->name); + snprintf(kobj_name, sizeof(kobj_name), "%pOFn", vnode); else return NULL; From 51423a9c9b09352bea1c53b8324db78bf3b170d1 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 25 Sep 2018 14:10:04 +0000 Subject: [PATCH 058/221] powerpc/traps: merge unrecoverable_exception() and nonrecoverable_exception() PPC32 uses 
nonrecoverable_exception() while PPC64 uses unrecoverable_exception(). Both functions are doing almost the same thing. This patch removes nonrecoverable_exception() Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/asm-prototypes.h | 1 - arch/powerpc/kernel/entry_32.S | 4 ++-- arch/powerpc/kernel/traps.c | 12 ++---------- 3 files changed, 4 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 1f4691ce4126..9bc98c239305 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -63,7 +63,6 @@ void program_check_exception(struct pt_regs *regs); void alignment_exception(struct pt_regs *regs); void slb_miss_bad_addr(struct pt_regs *regs); void StackOverflow(struct pt_regs *regs); -void nonrecoverable_exception(struct pt_regs *regs); void kernel_fp_unavailable_exception(struct pt_regs *regs); void altivec_unavailable_exception(struct pt_regs *regs); void vsx_unavailable_exception(struct pt_regs *regs); diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index e58c3f467db5..77decded1175 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -794,7 +794,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_601) lis r10,MSR_KERNEL@h ori r10,r10,MSR_KERNEL@l bl transfer_to_handler_full - .long nonrecoverable_exception + .long unrecoverable_exception .long ret_from_except #endif @@ -1297,7 +1297,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_601) rlwinm r3,r3,0,0,30 stw r3,_TRAP(r1) 4: addi r3,r1,STACK_FRAME_OVERHEAD - bl nonrecoverable_exception + bl unrecoverable_exception /* shouldn't return */ b 4b diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 6ab66a88db14..a29575215198 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1547,14 +1547,6 @@ void StackOverflow(struct pt_regs *regs) panic("kernel stack overflow"); } -void nonrecoverable_exception(struct pt_regs *regs) -{ - printk(KERN_ERR "Non-recoverable exception at PC=%lx MSR=%lx\n", - regs->nip, regs->msr); - debugger(regs); - die("nonrecoverable exception", regs, SIGKILL); -} - void kernel_fp_unavailable_exception(struct pt_regs *regs) { enum ctx_state prev_state = exception_enter(); @@ -2090,8 +2082,8 @@ void SPEFloatingPointRoundException(struct pt_regs *regs) */ void unrecoverable_exception(struct pt_regs *regs) { - printk(KERN_EMERG "Unrecoverable exception %lx at %lx\n", - regs->trap, regs->nip); + pr_emerg("Unrecoverable exception %lx at %lx (msr=%lx)\n", + regs->trap, regs->nip, regs->msr); die("Unrecoverable exception", regs, SIGABRT); } NOKPROBE_SYMBOL(unrecoverable_exception); From db787af1b8a6b4be428ee2ea7d409dafcaa4a43c Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 27 Sep 2018 13:40:57 +0530 Subject: [PATCH 059/221] powerpc/pseries: Fix DTL buffer registration When CONFIG_VIRT_CPU_ACCOUNTING_NATIVE is not set, we register the DTL buffer for a cpu when the associated file under powerpc/dtl in debugfs is opened. When doing so, we need to set the size of the buffer being registered in the second u32 word of the buffer. This needs to be in big endian, but we are not doing the conversion resulting in the below error showing up in dmesg: dtl_start: DTL registration for cpu 0 (hw 0) failed with -4 Fix this in the obvious manner. Fixes: 7c105b63bd98 ("powerpc: Add CONFIG_CPU_LITTLE_ENDIAN kernel config option.") Signed-off-by: Naveen N. 
Rao Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/dtl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index 18014cdeb590..c762689e0eb3 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -149,7 +149,7 @@ static int dtl_start(struct dtl *dtl) /* Register our dtl buffer with the hypervisor. The HV expects the * buffer size to be passed in the second word of the buffer */ - ((u32 *)dtl->buf)[1] = DISPATCH_LOG_BYTES; + ((u32 *)dtl->buf)[1] = cpu_to_be32(DISPATCH_LOG_BYTES); hwcpu = get_hard_smp_processor_id(dtl->cpu); addr = __pa(dtl->buf); From 9258227e9dd1da8feddb07ad9702845546a581c9 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 27 Sep 2018 13:40:58 +0530 Subject: [PATCH 060/221] powerpc/pseries: Fix how we iterate over the DTL entries When CONFIG_VIRT_CPU_ACCOUNTING_NATIVE is not set, we look up dtl_idx in the lppaca to determine the number of entries in the buffer. Since lppaca is in big endian, we need to do an endian conversion before using this in our calculation to determine the number of entries in the buffer. Without this, we do not iterate over the existing entries in the DTL buffer properly. Fixes: 7c105b63bd98 ("powerpc: Add CONFIG_CPU_LITTLE_ENDIAN kernel config option.") Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/dtl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index c762689e0eb3..ef6595153642 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -184,7 +184,7 @@ static void dtl_stop(struct dtl *dtl) static u64 dtl_current_index(struct dtl *dtl) { - return lppaca_of(dtl->cpu).dtl_idx; + return be64_to_cpu(lppaca_of(dtl->cpu).dtl_idx); } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ From cd5ff94577e004e0a4457e70d0ef3a030f4010b8 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Wed, 26 Sep 2018 20:09:32 +0800 Subject: [PATCH 061/221] powerpc/xive: Move a dereference below a NULL test Move the dereference of xc below the NULL test. 
Signed-off-by: zhong jiang Signed-off-by: Michael Ellerman --- arch/powerpc/sysdev/xive/common.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 959a2a62f233..9824074ec1b5 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -1010,12 +1010,13 @@ static void xive_ipi_eoi(struct irq_data *d) { struct xive_cpu *xc = __this_cpu_read(xive_cpu); - DBG_VERBOSE("IPI eoi: irq=%d [0x%lx] (HW IRQ 0x%x) pending=%02x\n", - d->irq, irqd_to_hwirq(d), xc->hw_ipi, xc->pending_prio); - /* Handle possible race with unplug and drop stale IPIs */ if (!xc) return; + + DBG_VERBOSE("IPI eoi: irq=%d [0x%lx] (HW IRQ 0x%x) pending=%02x\n", + d->irq, irqd_to_hwirq(d), xc->hw_ipi, xc->pending_prio); + xive_do_source_eoi(xc->hw_ipi, &xc->ipi_data); xive_do_queue_eoi(xc); } From c3ff2a5193fa61b1b284cfb1d79628814ed0e95a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 27 Sep 2018 07:05:53 +0000 Subject: [PATCH 062/221] powerpc/32: add stack protector support This functionality was tentatively added in the past (commit 6533b7c16ee5 ("powerpc: Initial stack protector (-fstack-protector) support")) but had to be reverted (commit f2574030b0e3 ("powerpc: Revert the initial stack protector support") because of GCC implementing it differently whether it had been built with libc support or not. Now, GCC offers the possibility to manually set the stack-protector mode (global or tls) regardless of libc support. This time, the patch selects HAVE_STACKPROTECTOR only if -mstack-protector-guard=tls is supported by GCC. On PPC32, as register r2 points to current task_struct at all time, the stack_canary located inside task_struct can be used directly by using the following GCC options: -mstack-protector-guard=tls -mstack-protector-guard-reg=r2 -mstack-protector-guard-offset=offsetof(struct task_struct, stack_canary)) The protector is disabled for prom_init and bootx_init as it is too early to handle it properly. 
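As a purely illustrative aside (not part of the patch), the same class of corruption can be reproduced in userspace; the file name and buffer size below are made up for the example, and the kernel variant differs mainly in where the guard value lives (current->stack_canary, reached through r2, as described above):

  /* overflow.c - hypothetical demo, build with: gcc -fstack-protector-strong overflow.c */
  #include <string.h>

  static void smash(const char *src)
  {
          char buf[16];

          strcpy(buf, src);        /* unchecked copy, clobbers the canary */
  }

  int main(void)
  {
          smash("this string is much longer than sixteen bytes");
          return 0;
  }

Built that way, the program aborts with "stack smashing detected" instead of returning through a corrupted link register, which is the behaviour the LKDTM test exercises in the kernel: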
$ echo CORRUPT_STACK > /sys/kernel/debug/provoke-crash/DIRECT [ 134.943666] Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: lkdtm_CORRUPT_STACK+0x64/0x64 [ 134.943666] [ 134.955414] CPU: 0 PID: 283 Comm: sh Not tainted 4.18.0-s3k-dev-12143-ga3272be41209 #835 [ 134.963380] Call Trace: [ 134.965860] [c6615d60] [c001f76c] panic+0x118/0x260 (unreliable) [ 134.971775] [c6615dc0] [c001f654] panic+0x0/0x260 [ 134.976435] [c6615dd0] [c032c368] lkdtm_CORRUPT_STACK_STRONG+0x0/0x64 [ 134.982769] [c6615e00] [ffffffff] 0xffffffff Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 1 + arch/powerpc/Makefile | 10 +++++++ arch/powerpc/include/asm/stackprotector.h | 34 +++++++++++++++++++++++ arch/powerpc/kernel/Makefile | 2 ++ arch/powerpc/kernel/asm-offsets.c | 3 ++ arch/powerpc/platforms/powermac/Makefile | 1 + 6 files changed, 51 insertions(+) create mode 100644 arch/powerpc/include/asm/stackprotector.h diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index a80669209155..3bcb05929931 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -180,6 +180,7 @@ config PPC select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_CBPF_JIT if !PPC64 + select HAVE_STACKPROTECTOR if $(cc-option,-mstack-protector-guard=tls) && PPC32 select HAVE_CONTEXT_TRACKING if PPC64 select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_STACKOVERFLOW diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 07d9dce7eda6..45b8eb4d8fe7 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -112,6 +112,9 @@ KBUILD_LDFLAGS += -m elf$(BITS)$(LDEMULATION) KBUILD_ARFLAGS += --target=elf$(BITS)-$(GNUTARGET) endif +cflags-$(CONFIG_STACKPROTECTOR) += -mstack-protector-guard=tls +cflags-$(CONFIG_STACKPROTECTOR) += -mstack-protector-guard-reg=r2 + LDFLAGS_vmlinux-y := -Bstatic LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) := -pie LDFLAGS_vmlinux := $(LDFLAGS_vmlinux-y) @@ -404,6 +407,13 @@ archclean: archprepare: checkbin +ifdef CONFIG_STACKPROTECTOR +prepare: stack_protector_prepare + +stack_protector_prepare: prepare0 + $(eval KBUILD_CFLAGS += -mstack-protector-guard-offset=$(shell awk '{if ($$2 == "TASK_CANARY") print $$3;}' include/generated/asm-offsets.h)) +endif + # Use the file '.tmp_gas_check' for binutils tests, as gas won't output # to stdout and these checks are run even on install targets. TOUT := .tmp_gas_check diff --git a/arch/powerpc/include/asm/stackprotector.h b/arch/powerpc/include/asm/stackprotector.h new file mode 100644 index 000000000000..d05d969c98c2 --- /dev/null +++ b/arch/powerpc/include/asm/stackprotector.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * GCC stack protector support. + * + */ + +#ifndef _ASM_STACKPROTECTOR_H +#define _ASM_STACKPROTECTOR_H + +#include +#include +#include +#include + +/* + * Initialize the stackprotector canary value. + * + * NOTE: this must only be called from functions that never return, + * and it must always be inlined. + */ +static __always_inline void boot_init_stack_canary(void) +{ + unsigned long canary; + + /* Try to get a semi random initial value. 
*/ + canary = get_random_canary(); + canary ^= mftb(); + canary ^= LINUX_VERSION_CODE; + canary &= CANARY_MASK; + + current->stack_canary = canary; +} + +#endif /* _ASM_STACKPROTECTOR_H */ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 1e64cfe22a83..85ffa488dfb5 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -20,6 +20,8 @@ CFLAGS_prom_init.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) CFLAGS_btext.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) CFLAGS_prom.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) +CFLAGS_prom_init.o += $(call cc-option, -fno-stack-protector) + ifdef CONFIG_FUNCTION_TRACER # Do not trace early boot code CFLAGS_REMOVE_cputable.o = $(CC_FLAGS_FTRACE) diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 89cf15566c4e..9e9ee168e177 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -79,6 +79,9 @@ int main(void) { OFFSET(THREAD, task_struct, thread); OFFSET(MM, task_struct, mm); +#ifdef CONFIG_STACKPROTECTOR + OFFSET(TASK_CANARY, task_struct, stack_canary); +#endif OFFSET(MMCONTEXTID, mm_struct, context.id); #ifdef CONFIG_PPC64 DEFINE(SIGSEGV, SIGSEGV); diff --git a/arch/powerpc/platforms/powermac/Makefile b/arch/powerpc/platforms/powermac/Makefile index 561a67d65e4d..923bfb340433 100644 --- a/arch/powerpc/platforms/powermac/Makefile +++ b/arch/powerpc/platforms/powermac/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 CFLAGS_bootx_init.o += -fPIC +CFLAGS_bootx_init.o += $(call cc-option, -fno-stack-protector) ifdef CONFIG_FUNCTION_TRACER # Do not trace early boot code From 06ec27aea9fc84d9c6d879eb64b5bcf28a8a1eb7 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 27 Sep 2018 07:05:55 +0000 Subject: [PATCH 063/221] powerpc/64: add stack protector support On PPC64, as register r13 points to the paca_struct at all time, this patch adds a copy of the canary there, which is copied at task_switch. 
That new canary is then used by using the following GCC options: -mstack-protector-guard=tls -mstack-protector-guard-reg=r13 -mstack-protector-guard-offset=offsetof(struct paca_struct, canary)) Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 2 +- arch/powerpc/Makefile | 8 ++++++++ arch/powerpc/include/asm/paca.h | 3 +++ arch/powerpc/include/asm/stackprotector.h | 4 ++++ arch/powerpc/kernel/asm-offsets.c | 3 +++ arch/powerpc/kernel/entry_64.S | 4 ++++ 6 files changed, 23 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 3bcb05929931..602eea723624 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -180,7 +180,7 @@ config PPC select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_CBPF_JIT if !PPC64 - select HAVE_STACKPROTECTOR if $(cc-option,-mstack-protector-guard=tls) && PPC32 + select HAVE_STACKPROTECTOR if $(cc-option,-mstack-protector-guard=tls) select HAVE_CONTEXT_TRACKING if PPC64 select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_STACKOVERFLOW diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 45b8eb4d8fe7..81552c7b46eb 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -113,7 +113,11 @@ KBUILD_ARFLAGS += --target=elf$(BITS)-$(GNUTARGET) endif cflags-$(CONFIG_STACKPROTECTOR) += -mstack-protector-guard=tls +ifdef CONFIG_PPC64 +cflags-$(CONFIG_STACKPROTECTOR) += -mstack-protector-guard-reg=r13 +else cflags-$(CONFIG_STACKPROTECTOR) += -mstack-protector-guard-reg=r2 +endif LDFLAGS_vmlinux-y := -Bstatic LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) := -pie @@ -411,8 +415,12 @@ ifdef CONFIG_STACKPROTECTOR prepare: stack_protector_prepare stack_protector_prepare: prepare0 +ifdef CONFIG_PPC64 + $(eval KBUILD_CFLAGS += -mstack-protector-guard-offset=$(shell awk '{if ($$2 == "PACA_CANARY") print $$3;}' include/generated/asm-offsets.h)) +else $(eval KBUILD_CFLAGS += -mstack-protector-guard-offset=$(shell awk '{if ($$2 == "TASK_CANARY") print $$3;}' include/generated/asm-offsets.h)) endif +endif # Use the file '.tmp_gas_check' for binutils tests, as gas won't output # to stdout and these checks are run even on install targets. diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 7b6e23af3808..c6d01f0aa898 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -256,6 +256,9 @@ struct paca_struct { struct slb_entry *mce_faulty_slbs; u16 slb_save_cache_ptr; #endif /* CONFIG_PPC_BOOK3S_64 */ +#ifdef CONFIG_STACKPROTECTOR + unsigned long canary; +#endif } ____cacheline_aligned; extern void copy_mm_to_paca(struct mm_struct *mm); diff --git a/arch/powerpc/include/asm/stackprotector.h b/arch/powerpc/include/asm/stackprotector.h index d05d969c98c2..1c8460e23583 100644 --- a/arch/powerpc/include/asm/stackprotector.h +++ b/arch/powerpc/include/asm/stackprotector.h @@ -11,6 +11,7 @@ #include #include #include +#include /* * Initialize the stackprotector canary value. 
@@ -29,6 +30,9 @@ static __always_inline void boot_init_stack_canary(void) canary &= CANARY_MASK; current->stack_canary = canary; +#ifdef CONFIG_PPC64 + get_paca()->canary = canary; +#endif } #endif /* _ASM_STACKPROTECTOR_H */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 9e9ee168e177..a6d70fd2e499 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -81,6 +81,9 @@ int main(void) OFFSET(MM, task_struct, mm); #ifdef CONFIG_STACKPROTECTOR OFFSET(TASK_CANARY, task_struct, stack_canary); +#ifdef CONFIG_PPC64 + OFFSET(PACA_CANARY, paca_struct, canary); +#endif #endif OFFSET(MMCONTEXTID, mm_struct, context.id); #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 77a888bfcb53..573fa879d785 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -624,6 +624,10 @@ _GLOBAL(_switch) addi r6,r4,-THREAD /* Convert THREAD to 'current' */ std r6,PACACURRENT(r13) /* Set new 'current' */ +#if defined(CONFIG_STACKPROTECTOR) + ld r6, TASK_CANARY(r6) + std r6, PACA_CANARY(r13) +#endif ld r8,KSP(r4) /* new stack pointer */ #ifdef CONFIG_PPC_BOOK3S_64 From 16d7c69c898531210d13dbd1eb2053759ff0946d Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 20 Sep 2018 13:45:05 -0300 Subject: [PATCH 064/221] powerpc: Redefine TIF_32BITS thread flag Moving TIF_32BIT to use bit 20 instead of 4 in the task flag field. This change is making room for an upcoming new task macro (_TIF_SYSCALL_EMU) which is preferred to set a bit in the lower 16-bits part of the word. This upcoming flag macro will take part in a composed macro (_TIF_SYSCALL_DOTRACE) which will contain other flags as well, and it is preferred that the whole _TIF_SYSCALL_DOTRACE macro only sets the lower 16 bits of a word, so, it could be handled using immediate operations (as load immediate, add immediate, ...) where the immediate operand (SI) is limited to 16-bits. Another possible solution would be using the LOAD_REG_IMMEDIATE() macro to load a full 64-bits word immediate, but it takes 5 operations instead of one. Having TIF_32BITS being redefined to use an upper bit is not a problem since there is only one place in the assembly code where TIF_32BIT is being used, and it could be replaced with an operation with right shift (addis), since it is used alone, i.e. not being part of a composed macro, which has different bits set, and would require LOAD_REG_IMMEDIATE(). Tested on a 64 bits Big Endian machine running a 32 bits task. 
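As a quick standalone check of the immediate-size argument above (illustrative only, not part of the patch): a flag in the low 16 bits can be tested with a single andi./ori immediate, while the relocated TIF_32BIT cannot, which is exactly why it moves out of the way:

  /* tif-bits.c - standalone sanity check of the 16-bit immediate argument */
  #include <stdio.h>

  int main(void)
  {
          unsigned long emu = 1UL << 4;     /* bit freed up for the incoming TIF_SYSCALL_EMU */
          unsigned long b32 = 1UL << 20;    /* TIF_32BIT after this patch */

          printf("1<<4  = 0x%lx, fits a 16-bit immediate: %d\n", emu, emu <= 0xffff);
          printf("1<<20 = 0x%lx, fits a 16-bit immediate: %d\n", b32, b32 <= 0xffff);
          return 0;
  }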
Signed-off-by: Breno Leitao Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/thread_info.h | 2 +- arch/powerpc/kernel/entry_64.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 3c0002044bc9..1a52e14ec3ee 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -81,7 +81,6 @@ extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src #define TIF_SIGPENDING 1 /* signal pending */ #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ #define TIF_FSCHECK 3 /* Check FS is USER_DS on return */ -#define TIF_32BIT 4 /* 32 bit binary */ #define TIF_RESTORE_TM 5 /* need to restore TM FP/VEC/VSX */ #define TIF_PATCH_PENDING 6 /* pending live patching update */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ @@ -100,6 +99,7 @@ extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src #define TIF_ELF2ABI 18 /* function descriptors must die! */ #endif #define TIF_POLLING_NRFLAG 19 /* true if poll_idle() is polling TIF_NEED_RESCHED */ +#define TIF_32BIT 20 /* 32 bit binary */ /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1< Date: Thu, 20 Sep 2018 13:45:06 -0300 Subject: [PATCH 065/221] powerpc/ptrace: Add support for PTRACE_SYSEMU This is a patch that adds support for PTRACE_SYSEMU ptrace request in PowerPC architecture. When ptrace(PTRACE_SYSEMU, ...) request is called, it will be handled by the arch independent function ptrace_resume(), which will tag the task with the TIF_SYSCALL_EMU flag. This flag needs to be handled from a platform dependent point of view, which is what this patch does. This patch adds this task's flag as part of the _TIF_SYSCALL_DOTRACE, which is the MACRO that is used to trace syscalls at entrance/exit. Since TIF_SYSCALL_EMU is now part of _TIF_SYSCALL_DOTRACE, if the task has _TIF_SYSCALL_DOTRACE set, it will hit do_syscall_trace_enter() at syscall entrance and do_syscall_trace_leave() at syscall leave. do_syscall_trace_enter() needs to handle the TIF_SYSCALL_EMU flag properly, which will interrupt the syscall executing if TIF_SYSCALL_EMU is set. The output values should not be changed, i.e. the return value (r3) should contain the original syscall argument on exit. With this flag set, the syscall is not executed fundamentally, because do_syscall_trace_enter() is returning -1 which is bigger than NR_syscall, thus, skipping the syscall execution and exiting userspace. 
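For reference, the tracer-side contract is small. The sketch below is illustrative only: the request value is the one used by the selftest added in the next patch, error handling is omitted, and the complete, authoritative example is that selftest.

  #include <sys/ptrace.h>
  #include <sys/types.h>
  #include <sys/wait.h>

  #define PTRACE_SYSEMU_REQ 0x1d    /* request value used by the powerpc selftest */

  static void sysemu_step(pid_t child)
  {
          int status;

          /* Resume the tracee; it stops at syscall entry without running the syscall. */
          ptrace(PTRACE_SYSEMU_REQ, child, 0, 0);
          waitpid(child, &status, 0);

          /* At this stop, PTRACE_GETREGS still shows the syscall number in
           * gpr[0] and the untouched first argument in r3, as described above. */
  }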
Signed-off-by: Breno Leitao Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/thread_info.h | 4 +++- arch/powerpc/include/uapi/asm/ptrace.h | 4 ++++ arch/powerpc/kernel/ptrace.c | 11 +++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 1a52e14ec3ee..3185f8ac1182 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -81,6 +81,7 @@ extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src #define TIF_SIGPENDING 1 /* signal pending */ #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ #define TIF_FSCHECK 3 /* Check FS is USER_DS on return */ +#define TIF_SYSCALL_EMU 4 /* syscall emulation active */ #define TIF_RESTORE_TM 5 /* need to restore TM FP/VEC/VSX */ #define TIF_PATCH_PENDING 6 /* pending live patching update */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ @@ -120,9 +121,10 @@ extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src #define _TIF_EMULATE_STACK_STORE (1< Date: Thu, 20 Sep 2018 13:45:07 -0300 Subject: [PATCH 066/221] selftests/powerpc: New PTRACE_SYSEMU test This patch adds a new test for the new PTRACE_SYSEMU ptrace request. This test also relies on PTRACE_GETREGS and PTRACE_SETREGS requests to run properly, since the trace instruction (gettid() syscall) is being modified at run-time (by PTRACE_SETREGS) and re-executed three times. PTRACE_GETREGS is being used to check that the registers are still sane. This test basically creates a child process that executes syscalls and the parent process check if it is being traced appropriately. The parent process guarantees that the SYSCALLs are being traced, with PTRACE_SYSEMU, and ptrace stops the child application before a syscall is executed. The way the tests validates it, is by guaranteeing that the system calls arguments, as argv[0] (r3) which is the same register that will have the syscall return value on powerpc, are not being corrupted on PTRACE_SYSEMU with a return value, i.e, it continues to have the current arguments instead, meaning that the registers where not clobbered. This test is basically the same test for x86 located at tools/testing/selftests/x86/ptrace_syscall.c, limited to test PTRACE_SYSEMU request, and ported to PowerPC. 
Signed-off-by: Breno Leitao Signed-off-by: Michael Ellerman --- .../testing/selftests/powerpc/ptrace/Makefile | 2 +- .../selftests/powerpc/ptrace/ptrace-syscall.c | 228 ++++++++++++++++++ 2 files changed, 229 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/ptrace/ptrace-syscall.c diff --git a/tools/testing/selftests/powerpc/ptrace/Makefile b/tools/testing/selftests/powerpc/ptrace/Makefile index 28f5b781a553..1ee59978508d 100644 --- a/tools/testing/selftests/powerpc/ptrace/Makefile +++ b/tools/testing/selftests/powerpc/ptrace/Makefile @@ -2,7 +2,7 @@ TEST_PROGS := ptrace-gpr ptrace-tm-gpr ptrace-tm-spd-gpr \ ptrace-tar ptrace-tm-tar ptrace-tm-spd-tar ptrace-vsx ptrace-tm-vsx \ ptrace-tm-spd-vsx ptrace-tm-spr ptrace-hwbreak ptrace-pkey core-pkey \ - perf-hwbreak + perf-hwbreak ptrace-syscall include ../../lib.mk diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-syscall.c b/tools/testing/selftests/powerpc/ptrace/ptrace-syscall.c new file mode 100644 index 000000000000..3353210dcdbd --- /dev/null +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-syscall.c @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A ptrace test for testing PTRACE_SYSEMU, PTRACE_SETREGS and + * PTRACE_GETREG. This test basically create a child process that executes + * syscalls and the parent process check if it is being traced appropriated. + * + * This test is heavily based on tools/testing/selftests/x86/ptrace_syscall.c + * test, and it was adapted to run on Powerpc by + * Breno Leitao + */ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "utils.h" + +/* Bitness-agnostic defines for user_regs_struct fields. */ +#define user_syscall_nr gpr[0] +#define user_arg0 gpr[3] +#define user_arg1 gpr[4] +#define user_arg2 gpr[5] +#define user_arg3 gpr[6] +#define user_arg4 gpr[7] +#define user_arg5 gpr[8] +#define user_ip nip + +#define PTRACE_SYSEMU 0x1d + +static int nerrs; + +static void wait_trap(pid_t chld) +{ + siginfo_t si; + + if (waitid(P_PID, chld, &si, WEXITED|WSTOPPED) != 0) + err(1, "waitid"); + if (si.si_pid != chld) + errx(1, "got unexpected pid in event\n"); + if (si.si_code != CLD_TRAPPED) + errx(1, "got unexpected event type %d\n", si.si_code); +} + +static void test_ptrace_syscall_restart(void) +{ + int status; + struct pt_regs regs; + pid_t chld; + + printf("[RUN]\tptrace-induced syscall restart\n"); + + chld = fork(); + if (chld < 0) + err(1, "fork"); + + /* + * Child process is running 4 syscalls after ptrace. + * + * 1) getpid() + * 2) gettid() + * 3) tgkill() -> Send SIGSTOP + * 4) gettid() -> Where the tests will happen essentially + */ + if (chld == 0) { + if (ptrace(PTRACE_TRACEME, 0, 0, 0) != 0) + err(1, "PTRACE_TRACEME"); + + pid_t pid = getpid(), tid = syscall(SYS_gettid); + + printf("\tChild will make one syscall\n"); + syscall(SYS_tgkill, pid, tid, SIGSTOP); + + syscall(SYS_gettid, 10, 11, 12, 13, 14, 15); + _exit(0); + } + /* Parent process below */ + + /* Wait for SIGSTOP sent by tgkill above. 
*/ + if (waitpid(chld, &status, 0) != chld || !WIFSTOPPED(status)) + err(1, "waitpid"); + + printf("[RUN]\tSYSEMU\n"); + if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0) + err(1, "PTRACE_SYSEMU"); + wait_trap(chld); + + if (ptrace(PTRACE_GETREGS, chld, 0, ®s) != 0) + err(1, "PTRACE_GETREGS"); + + /* + * Ptrace trapped prior to executing the syscall, thus r3 still has + * the syscall number instead of the sys_gettid() result + */ + if (regs.user_syscall_nr != SYS_gettid || + regs.user_arg0 != 10 || regs.user_arg1 != 11 || + regs.user_arg2 != 12 || regs.user_arg3 != 13 || + regs.user_arg4 != 14 || regs.user_arg5 != 15) { + printf("[FAIL]\tInitial args are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", + (unsigned long)regs.user_syscall_nr, + (unsigned long)regs.user_arg0, + (unsigned long)regs.user_arg1, + (unsigned long)regs.user_arg2, + (unsigned long)regs.user_arg3, + (unsigned long)regs.user_arg4, + (unsigned long)regs.user_arg5); + nerrs++; + } else { + printf("[OK]\tInitial nr and args are correct\n"); } + + printf("[RUN]\tRestart the syscall (ip = 0x%lx)\n", + (unsigned long)regs.user_ip); + + /* + * Rewind to retry the same syscall again. This will basically test + * the rewind process together with PTRACE_SETREGS and PTRACE_GETREGS. + */ + regs.user_ip -= 4; + if (ptrace(PTRACE_SETREGS, chld, 0, ®s) != 0) + err(1, "PTRACE_SETREGS"); + + if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0) + err(1, "PTRACE_SYSEMU"); + wait_trap(chld); + + if (ptrace(PTRACE_GETREGS, chld, 0, ®s) != 0) + err(1, "PTRACE_GETREGS"); + + if (regs.user_syscall_nr != SYS_gettid || + regs.user_arg0 != 10 || regs.user_arg1 != 11 || + regs.user_arg2 != 12 || regs.user_arg3 != 13 || + regs.user_arg4 != 14 || regs.user_arg5 != 15) { + printf("[FAIL]\tRestart nr or args are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", + (unsigned long)regs.user_syscall_nr, + (unsigned long)regs.user_arg0, + (unsigned long)regs.user_arg1, + (unsigned long)regs.user_arg2, + (unsigned long)regs.user_arg3, + (unsigned long)regs.user_arg4, + (unsigned long)regs.user_arg5); + nerrs++; + } else { + printf("[OK]\tRestarted nr and args are correct\n"); + } + + printf("[RUN]\tChange nr and args and restart the syscall (ip = 0x%lx)\n", + (unsigned long)regs.user_ip); + + /* + * Inject a new syscall (getpid) in the same place the previous + * syscall (gettid), rewind and re-execute. + */ + regs.user_syscall_nr = SYS_getpid; + regs.user_arg0 = 20; + regs.user_arg1 = 21; + regs.user_arg2 = 22; + regs.user_arg3 = 23; + regs.user_arg4 = 24; + regs.user_arg5 = 25; + regs.user_ip -= 4; + + if (ptrace(PTRACE_SETREGS, chld, 0, ®s) != 0) + err(1, "PTRACE_SETREGS"); + + if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0) + err(1, "PTRACE_SYSEMU"); + wait_trap(chld); + + if (ptrace(PTRACE_GETREGS, chld, 0, ®s) != 0) + err(1, "PTRACE_GETREGS"); + + /* Check that ptrace stopped at the new syscall that was + * injected, and guarantee that it haven't executed, i.e, user_args + * contain the arguments and not the syscall return value, for + * instance. 
+ */ + if (regs.user_syscall_nr != SYS_getpid + || regs.user_arg0 != 20 || regs.user_arg1 != 21 + || regs.user_arg2 != 22 || regs.user_arg3 != 23 + || regs.user_arg4 != 24 || regs.user_arg5 != 25) { + + printf("[FAIL]\tRestart nr or args are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", + (unsigned long)regs.user_syscall_nr, + (unsigned long)regs.user_arg0, + (unsigned long)regs.user_arg1, + (unsigned long)regs.user_arg2, + (unsigned long)regs.user_arg3, + (unsigned long)regs.user_arg4, + (unsigned long)regs.user_arg5); + nerrs++; + } else { + printf("[OK]\tReplacement nr and args are correct\n"); + } + + if (ptrace(PTRACE_CONT, chld, 0, 0) != 0) + err(1, "PTRACE_CONT"); + + if (waitpid(chld, &status, 0) != chld) + err(1, "waitpid"); + + /* Guarantee that the process executed properly, returning 0 */ + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + printf("[FAIL]\tChild failed\n"); + nerrs++; + } else { + printf("[OK]\tChild exited cleanly\n"); + } +} + +int ptrace_syscall(void) +{ + test_ptrace_syscall_restart(); + + return nerrs; +} + +int main(void) +{ + return test_harness(ptrace_syscall, "ptrace_syscall"); +} From 62dea077f56728979e40a741c8a0941dca7290e8 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 25 Sep 2018 11:29:33 -0300 Subject: [PATCH 067/221] powerpc/powernv: Mark function as __noreturn There is a mismatch between function pnv_platform_error_reboot() definition and declaration regarding function modifiers. In the declaration part, it contains the function attribute __noreturn, while function definition itself lacks it. This was reported by sparse tool as an error: arch/powerpc/platforms/powernv/opal.c:538:6: error: symbol 'pnv_platform_error_reboot' redeclared with different type (originally declared at arch/powerpc/platforms/powernv/powernv.h:11) - different modifiers I checked and the function is already being considered as being 'noreturn' by the compiler, thus, I understand this patch does not change any code being generated. Signed-off-by: Breno Leitao Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 62c291e23dbe..06bf532fa000 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -535,7 +535,7 @@ static int opal_recover_mce(struct pt_regs *regs, return recovered; } -void pnv_platform_error_reboot(struct pt_regs *regs, const char *msg) +void __noreturn pnv_platform_error_reboot(struct pt_regs *regs, const char *msg) { panic_flush_kmsg_start(); From 5c784c8414fba11b62e12439f11e109fb5751f38 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 16 Aug 2018 14:21:07 -0300 Subject: [PATCH 068/221] powerpc/tm: Remove msr_tm_active() Currently msr_tm_active() is a wrapper around MSR_TM_ACTIVE() if CONFIG_PPC_TRANSACTIONAL_MEM is set, or it is just a function that returns false if CONFIG_PPC_TRANSACTIONAL_MEM is not set. This function is not necessary, since MSR_TM_ACTIVE() just do the same and could be used, removing the dualism and simplifying the code. This patchset remove every instance of msr_tm_active() and replaced it by MSR_TM_ACTIVE(). 
Signed-off-by: Breno Leitao Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/reg.h | 7 ++++++- arch/powerpc/kernel/process.c | 21 +++++++++------------ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index e5b314ed054e..640a4d818772 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -118,11 +118,16 @@ #define MSR_TS_S __MASK(MSR_TS_S_LG) /* Transaction Suspended */ #define MSR_TS_T __MASK(MSR_TS_T_LG) /* Transaction Transactional */ #define MSR_TS_MASK (MSR_TS_T | MSR_TS_S) /* Transaction State bits */ -#define MSR_TM_ACTIVE(x) (((x) & MSR_TS_MASK) != 0) /* Transaction active? */ #define MSR_TM_RESV(x) (((x) & MSR_TS_MASK) == MSR_TS_MASK) /* Reserved */ #define MSR_TM_TRANSACTIONAL(x) (((x) & MSR_TS_MASK) == MSR_TS_T) #define MSR_TM_SUSPENDED(x) (((x) & MSR_TS_MASK) == MSR_TS_S) +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +#define MSR_TM_ACTIVE(x) (((x) & MSR_TS_MASK) != 0) /* Transaction active? */ +#else +#define MSR_TM_ACTIVE(x) 0 +#endif + #if defined(CONFIG_PPC_BOOK3S_64) #define MSR_64BIT MSR_SF diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 913c5725cdb2..ec264a6f0eb3 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -102,24 +102,18 @@ static void check_if_tm_restore_required(struct task_struct *tsk) } } -static inline bool msr_tm_active(unsigned long msr) -{ - return MSR_TM_ACTIVE(msr); -} - static bool tm_active_with_fp(struct task_struct *tsk) { - return msr_tm_active(tsk->thread.regs->msr) && + return MSR_TM_ACTIVE(tsk->thread.regs->msr) && (tsk->thread.ckpt_regs.msr & MSR_FP); } static bool tm_active_with_altivec(struct task_struct *tsk) { - return msr_tm_active(tsk->thread.regs->msr) && + return MSR_TM_ACTIVE(tsk->thread.regs->msr) && (tsk->thread.ckpt_regs.msr & MSR_VEC); } #else -static inline bool msr_tm_active(unsigned long msr) { return false; } static inline void check_if_tm_restore_required(struct task_struct *tsk) { } static inline bool tm_active_with_fp(struct task_struct *tsk) { return false; } static inline bool tm_active_with_altivec(struct task_struct *tsk) { return false; } @@ -247,7 +241,8 @@ void enable_kernel_fp(void) * giveup as this would save to the 'live' structure not the * checkpointed structure. */ - if(!msr_tm_active(cpumsr) && msr_tm_active(current->thread.regs->msr)) + if (!MSR_TM_ACTIVE(cpumsr) && + MSR_TM_ACTIVE(current->thread.regs->msr)) return; __giveup_fpu(current); } @@ -311,7 +306,8 @@ void enable_kernel_altivec(void) * giveup as this would save to the 'live' structure not the * checkpointed structure. */ - if(!msr_tm_active(cpumsr) && msr_tm_active(current->thread.regs->msr)) + if (!MSR_TM_ACTIVE(cpumsr) && + MSR_TM_ACTIVE(current->thread.regs->msr)) return; __giveup_altivec(current); } @@ -397,7 +393,8 @@ void enable_kernel_vsx(void) * giveup as this would save to the 'live' structure not the * checkpointed structure. 
*/ - if(!msr_tm_active(cpumsr) && msr_tm_active(current->thread.regs->msr)) + if (!MSR_TM_ACTIVE(cpumsr) && + MSR_TM_ACTIVE(current->thread.regs->msr)) return; __giveup_vsx(current); } @@ -530,7 +527,7 @@ void restore_math(struct pt_regs *regs) { unsigned long msr; - if (!msr_tm_active(regs->msr) && + if (!MSR_TM_ACTIVE(regs->msr) && !current->thread.load_fp && !loadvec(current->thread)) return; From 51303113e32fd92d327b3c441c45e235642fa69c Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 7 Aug 2018 10:35:00 -0300 Subject: [PATCH 069/221] powerpc/tm: Print 64-bits MSR On a kernel TM Bad thing program exception, the Machine State Register (MSR) is not being properly displayed. The exception code dumps a 32-bits value but MSR is a 64 bits register for all platforms that have HTM enabled. This patch dumps the MSR value as a 64-bits value instead of 32 bits. In order to do so, the 'reason' variable could not be used, since it trimmed MSR to 32-bits (int). Signed-off-by: Breno Leitao Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/traps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index a29575215198..5ef85ab49dc3 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1433,7 +1433,7 @@ void program_check_exception(struct pt_regs *regs) goto bail; } else { printk(KERN_EMERG "Unexpected TM Bad Thing exception " - "at %lx (msr 0x%x)\n", regs->nip, reason); + "at %lx (msr 0x%lx)\n", regs->nip, regs->msr); die("Unrecoverable exception", regs, SIGABRT); } } From 8a03e81cb14712f986c5d5fa42a30ff2cbe3237c Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Wed, 26 Sep 2018 14:24:30 +0200 Subject: [PATCH 070/221] powerpc/64s: consolidate MCE counter increment. The code in machine_check_exception excludes 64s hvmode when incrementing the MCE counter only to call opal_machine_check to increment it specifically for this case. Remove the exclusion and special case. Fixes: a43c1590426c ("powerpc/pseries: Flush SLB contents on SLB MCE errors.") Signed-off-by: Michal Suchanek Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/traps.c | 4 +--- arch/powerpc/platforms/powernv/opal.c | 2 -- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 5ef85ab49dc3..fd58749b4d6b 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -741,9 +741,7 @@ void machine_check_exception(struct pt_regs *regs) if (!nested) nmi_enter(); - /* 64s accounts the mce in machine_check_early when in HVMODE */ - if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64) || !cpu_has_feature(CPU_FTR_HVMODE)) - __this_cpu_inc(irq_stat.mce_exceptions); + __this_cpu_inc(irq_stat.mce_exceptions); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 06bf532fa000..a4641515956f 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -578,8 +578,6 @@ int opal_machine_check(struct pt_regs *regs) { struct machine_check_event evt; - __this_cpu_inc(irq_stat.mce_exceptions); - if (!get_mce_event(&evt, MCE_EVENT_RELEASE)) return 0; From 01b9870ea614518d48b6d2c787284b3a7d181a7c Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sun, 23 Sep 2018 08:12:08 +0000 Subject: [PATCH 071/221] powerpc: Remove duplicated include from pci_32.c Remove duplicated include. 
Signed-off-by: YueHaibing Reviewed-by: Stephen Rothwell Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/pci_32.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index d63b488d34d7..4da8ed576229 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include From 5bd9b4445d1f02639cb5e5b151ef40d0a5dc4b47 Mon Sep 17 00:00:00 2001 From: Petr Vorel Date: Wed, 26 Sep 2018 16:10:56 +0200 Subject: [PATCH 072/221] powerpc/config: Enable CONFIG_PRINTK_TIME for 64bit configs which use for CONFIG_LOG_BUF_SHIFT the same or higher value than the default (currently 17). Signed-off-by: Petr Vorel Signed-off-by: Michael Ellerman --- arch/powerpc/configs/g5_defconfig | 1 + arch/powerpc/configs/maple_defconfig | 1 + arch/powerpc/configs/powernv_defconfig | 1 + arch/powerpc/configs/ppc64_defconfig | 1 + arch/powerpc/configs/ps3_defconfig | 1 + arch/powerpc/configs/pseries_defconfig | 1 + 6 files changed, 6 insertions(+) diff --git a/arch/powerpc/configs/g5_defconfig b/arch/powerpc/configs/g5_defconfig index 67c39f4acede..f686cc1eac0b 100644 --- a/arch/powerpc/configs/g5_defconfig +++ b/arch/powerpc/configs/g5_defconfig @@ -262,3 +262,4 @@ CONFIG_CRYPTO_SERPENT=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m # CONFIG_CRYPTO_HW is not set +CONFIG_PRINTK_TIME=y diff --git a/arch/powerpc/configs/maple_defconfig b/arch/powerpc/configs/maple_defconfig index 59e47ec85336..f71eddafb02f 100644 --- a/arch/powerpc/configs/maple_defconfig +++ b/arch/powerpc/configs/maple_defconfig @@ -112,3 +112,4 @@ CONFIG_PPC_EARLY_DEBUG=y CONFIG_CRYPTO_ECB=m CONFIG_CRYPTO_PCBC=m # CONFIG_CRYPTO_HW is not set +CONFIG_PRINTK_TIME=y diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig index b035d909e681..ef2ef98d3f28 100644 --- a/arch/powerpc/configs/powernv_defconfig +++ b/arch/powerpc/configs/powernv_defconfig @@ -353,3 +353,4 @@ CONFIG_VIRTUALIZATION=y CONFIG_KVM_BOOK3S_64=m CONFIG_KVM_BOOK3S_64_HV=m CONFIG_VHOST_NET=m +CONFIG_PRINTK_TIME=y diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index 1b4753d4070b..f2515674a1e2 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -368,3 +368,4 @@ CONFIG_VIRTUALIZATION=y CONFIG_KVM_BOOK3S_64=m CONFIG_KVM_BOOK3S_64_HV=m CONFIG_VHOST_NET=m +CONFIG_PRINTK_TIME=y diff --git a/arch/powerpc/configs/ps3_defconfig b/arch/powerpc/configs/ps3_defconfig index 187e2f7c12c8..cf8d55f67272 100644 --- a/arch/powerpc/configs/ps3_defconfig +++ b/arch/powerpc/configs/ps3_defconfig @@ -171,3 +171,4 @@ CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_SALSA20=m CONFIG_CRYPTO_LZO=m +CONFIG_PRINTK_TIME=y diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index 0dd5cf7b566d..5e09a40cbcbf 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -325,3 +325,4 @@ CONFIG_VIRTUALIZATION=y CONFIG_KVM_BOOK3S_64=m CONFIG_KVM_BOOK3S_64_HV=m CONFIG_VHOST_NET=m +CONFIG_PRINTK_TIME=y From 306b1c06172424d496656ae5b668ebfdd54ed506 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Thu, 27 Sep 2018 15:05:15 +1000 Subject: [PATCH 073/221] powerpc/tm: Reformat comments The comments in this file don't conform to the coding style so take them to "Comment Formatting Re-Education Camp". 
Suggested-by: Michael "Camp Drill Sergeant" Ellerman Signed-off-by: Michael Neuling [mpe: Reflow some comments and add full stops, fix spelling of Sergeant.] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/tm.S | 67 +++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S index 50e5cff10d0f..e6b527e5c90c 100644 --- a/arch/powerpc/kernel/tm.S +++ b/arch/powerpc/kernel/tm.S @@ -92,7 +92,8 @@ _GLOBAL(tm_abort) blr EXPORT_SYMBOL_GPL(tm_abort); -/* void tm_reclaim(struct thread_struct *thread, +/* + * void tm_reclaim(struct thread_struct *thread, * uint8_t cause) * * - Performs a full reclaim. This destroys outstanding @@ -163,26 +164,27 @@ _GLOBAL(tm_reclaim) */ TRECLAIM(R4) /* Cause in r4 */ - /* ******************** GPRs ******************** */ - /* Stash the checkpointed r13 away in the scratch SPR and get the real - * paca + /* + * ******************** GPRs ******************** + * Stash the checkpointed r13 in the scratch SPR and get the real paca. */ SET_SCRATCH0(r13) GET_PACA(r13) - /* Stash the checkpointed r1 away in paca tm_scratch and get the real - * stack pointer back + /* + * Stash the checkpointed r1 away in paca->tm_scratch and get the real + * stack pointer back into r1. */ std r1, PACATMSCRATCH(r13) ld r1, PACAR1(r13) - /* Store the PPR in r11 and reset to decent value */ std r11, GPR11(r1) /* Temporary stash */ /* Reset MSR RI so we can take SLB faults again */ li r11, MSR_RI mtmsrd r11, 1 + /* Store the PPR in r11 and reset to decent value */ mfspr r11, SPRN_PPR HMT_MEDIUM @@ -195,14 +197,15 @@ _GLOBAL(tm_reclaim) addi r7, r12, PT_CKPT_REGS /* Thread's ckpt_regs */ - /* Make r7 look like an exception frame so that we - * can use the neat GPRx(n) macros. r7 is NOT a pt_regs ptr! + /* + * Make r7 look like an exception frame so that we can use the neat + * GPRx(n) macros. r7 is NOT a pt_regs ptr! */ subi r7, r7, STACK_FRAME_OVERHEAD /* Sync the userland GPRs 2-12, 14-31 to thread->regs: */ SAVE_GPR(0, r7) /* user r0 */ - SAVE_GPR(2, r7) /* user r2 */ + SAVE_GPR(2, r7) /* user r2 */ SAVE_4GPRS(3, r7) /* user r3-r6 */ SAVE_GPR(8, r7) /* user r8 */ SAVE_GPR(9, r7) /* user r9 */ @@ -223,7 +226,8 @@ _GLOBAL(tm_reclaim) /* ******************** NIP ******************** */ mfspr r3, SPRN_TFHAR std r3, _NIP(r7) /* Returns to failhandler */ - /* The checkpointed NIP is ignored when rescheduling/rechkpting, + /* + * The checkpointed NIP is ignored when rescheduling/rechkpting, * but is used in signal return to 'wind back' to the abort handler. */ @@ -246,12 +250,13 @@ _GLOBAL(tm_reclaim) std r3, THREAD_TM_TAR(r12) std r4, THREAD_TM_DSCR(r12) - /* MSR and flags: We don't change CRs, and we don't need to alter - * MSR. + /* + * MSR and flags: We don't change CRs, and we don't need to alter MSR. */ - /* ******************** FPR/VR/VSRs ************ + /* + * ******************** FPR/VR/VSRs ************ * After reclaiming, capture the checkpointed FPRs/VRs. * * We enabled VEC/FP/VSX in the msr above, so we can execute these @@ -277,7 +282,8 @@ _GLOBAL(tm_reclaim) stfd fr0,FPSTATE_FPSCR(r7) - /* TM regs, incl TEXASR -- these live in thread_struct. Note they've + /* + * TM regs, incl TEXASR -- these live in thread_struct. Note they've * been updated by the treclaim, to explain to userland the failure * cause (aborted). 
*/ @@ -313,7 +319,7 @@ _GLOBAL(tm_reclaim) blr - /* + /* * void __tm_recheckpoint(struct thread_struct *thread) * - Restore the checkpointed register state saved by tm_reclaim * when we switch_to a process. @@ -329,7 +335,8 @@ _GLOBAL(__tm_recheckpoint) std r2, STK_GOT(r1) stdu r1, -TM_FRAME_SIZE(r1) - /* We've a struct pt_regs at [r1+STACK_FRAME_OVERHEAD]. + /* + * We've a struct pt_regs at [r1+STACK_FRAME_OVERHEAD]. * This is used for backing up the NVGPRs: */ SAVE_NVGPRS(r1) @@ -338,8 +345,9 @@ _GLOBAL(__tm_recheckpoint) addi r7, r3, PT_CKPT_REGS /* Thread's ckpt_regs */ - /* Make r7 look like an exception frame so that we - * can use the neat GPRx(n) macros. r7 is now NOT a pt_regs ptr! + /* + * Make r7 look like an exception frame so that we can use the neat + * GPRx(n) macros. r7 is now NOT a pt_regs ptr! */ subi r7, r7, STACK_FRAME_OVERHEAD @@ -407,14 +415,15 @@ restore_gprs: REST_NVGPRS(r7) /* GPR14-31 */ - /* Load up PPR and DSCR here so we don't run with user values for long - */ + /* Load up PPR and DSCR here so we don't run with user values for long */ mtspr SPRN_DSCR, r5 mtspr SPRN_PPR, r6 - /* Do final sanity check on TEXASR to make sure FS is set. Do this + /* + * Do final sanity check on TEXASR to make sure FS is set. Do this * here before we load up the userspace r1 so any bugs we hit will get - * a call chain */ + * a call chain. + */ mfspr r5, SPRN_TEXASR srdi r5, r5, 16 li r6, (TEXASR_FS)@h @@ -422,8 +431,9 @@ restore_gprs: 1: tdeqi r6, 0 EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0 - /* Do final sanity check on MSR to make sure we are not transactional - * or suspended + /* + * Do final sanity check on MSR to make sure we are not transactional + * or suspended. */ mfmsr r6 li r5, (MSR_TS_MASK)@higher @@ -439,8 +449,8 @@ restore_gprs: REST_GPR(6, r7) /* - * Store r1 and r5 on the stack so that we can access them - * after we clear MSR RI. + * Store r1 and r5 on the stack so that we can access them after we + * clear MSR RI. */ REST_GPR(5, r7) @@ -470,7 +480,8 @@ restore_gprs: HMT_MEDIUM - /* Our transactional state has now changed. + /* + * Our transactional state has now changed. * * Now just get out of here. Transactional (current) state will be * updated once restore is called on the return path in the _switch-ed From 803d690e68f0c5230183f1a42c7d50a41d16e380 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 13 Aug 2018 13:19:52 +0000 Subject: [PATCH 074/221] powerpc/mm: Don't report hugepage tables as memory leaks when using kmemleak When a process allocates a hugepage, the following leak is reported by kmemleak. This is a false positive which is due to the pointer to the table being stored in the PGD as physical memory address and not virtual memory pointer. unreferenced object 0xc30f8200 (size 512): comm "mmap", pid 374, jiffies 4872494 (age 627.630s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] huge_pte_alloc+0xdc/0x1f8 [<9e0df1e1>] hugetlb_fault+0x560/0x8f8 [<7938ec6c>] follow_hugetlb_page+0x14c/0x44c [] __get_user_pages+0x1c4/0x3dc [] __mm_populate+0xac/0x140 [<3215421e>] vm_mmap_pgoff+0xb4/0xb8 [] ksys_mmap_pgoff+0xcc/0x1fc [<4fcd760f>] ret_from_syscall+0x0/0x38 See commit a984506c542e2 ("powerpc/mm: Don't report PUDs as memory leaks when using kmemleak") for detailed explanation. To fix that, this patch tells kmemleak to ignore the allocated hugepage table. 
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hugetlbpage.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 9504641bd4d9..a7226ed9cae6 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -112,6 +113,8 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, for (i = i - 1 ; i >= 0; i--, hpdp--) *hpdp = __hugepd(0); kmem_cache_free(cachep, new); + } else { + kmemleak_ignore(new); } spin_unlock(ptl); return 0; From d90fe2acd9b2900790da01354dbca48dba37c20d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 28 Sep 2018 15:39:20 +0000 Subject: [PATCH 075/221] powerpc: Wire up memtest Add call to early_memtest() so that kernel compiled with CONFIG_MEMTEST really perform memtest at startup when requested via 'memtest' boot parameter. Tested-by: Daniel Axtens Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- Documentation/admin-guide/kernel-parameters.txt | 2 +- arch/powerpc/kernel/setup-common.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 64a3bf54b974..1ab0797e4db6 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2404,7 +2404,7 @@ seconds. Use this parameter to check at some other rate. 0 disables periodic checking. - memtest= [KNL,X86,ARM] Enable memtest + memtest= [KNL,X86,ARM,PPC] Enable memtest Format: default : 0 Specifies the number of memtest passes to be diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 93fa0c99681e..9ca9db707bcb 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -966,6 +967,8 @@ void __init setup_arch(char **cmdline_p) initmem_init(); + early_memtest(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT); + #ifdef CONFIG_DUMMY_CONSOLE conswitchp = &dummy_con; #endif From f5e284803a7206d43e26f9ffcae5de9626d95e37 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Mon, 1 Oct 2018 16:21:51 +1000 Subject: [PATCH 076/221] powerpc/nohash: fix undefined behaviour when testing page size support When enumerating page size definitions to check hardware support, we construct a constant which is (1U << (def->shift - 10)). However, the array of page size definitions is only initalised for various MMU_PAGE_* constants, so it contains a number of 0-initialised elements with def->shift == 0. 
This means we end up shifting by a very large number, which gives the following UBSan splat: ================================================================================ UBSAN: Undefined behaviour in /home/dja/dev/linux/linux/arch/powerpc/mm/tlb_nohash.c:506:21 shift exponent 4294967286 is too large for 32-bit type 'unsigned int' CPU: 0 PID: 0 Comm: swapper Not tainted 4.19.0-rc3-00045-ga604f927b012-dirty #6 Call Trace: [c00000000101bc20] [c000000000a13d54] .dump_stack+0xa8/0xec (unreliable) [c00000000101bcb0] [c0000000004f20a8] .ubsan_epilogue+0x18/0x64 [c00000000101bd30] [c0000000004f2b10] .__ubsan_handle_shift_out_of_bounds+0x110/0x1a4 [c00000000101be20] [c000000000d21760] .early_init_mmu+0x1b4/0x5a0 [c00000000101bf10] [c000000000d1ba28] .early_setup+0x100/0x130 [c00000000101bf90] [c000000000000528] start_here_multiplatform+0x68/0x80 ================================================================================ Fix this by first checking if the element exists (shift != 0) before constructing the constant. Signed-off-by: Daniel Axtens Signed-off-by: Michael Ellerman --- arch/powerpc/mm/tlb_nohash.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index 15fe5f0c8665..ae5d568e267f 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -503,6 +503,9 @@ static void setup_page_sizes(void) for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { struct mmu_psize_def *def = &mmu_psize_defs[psize]; + if (!def->shift) + continue; + if (tlb1ps & (1U << (def->shift - 10))) { def->flags |= MMU_PAGE_SIZE_DIRECT; From bad96de8d31ba65dc26645af5550135315ea0b19 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Oct 2018 15:10:24 +0900 Subject: [PATCH 077/221] powerpc: remove leftover code of old GCC version checks Clean up the leftover of commit f2910f0e6835 ("powerpc: remove old GCC version checks"). Signed-off-by: Masahiro Yamada Acked-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/Makefile | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 81552c7b46eb..974103254aed 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -422,10 +422,6 @@ else endif endif -# Use the file '.tmp_gas_check' for binutils tests, as gas won't output -# to stdout and these checks are run even on install targets. -TOUT := .tmp_gas_check - # Check toolchain versions: # - gcc-4.6 is the minimum kernel-wide version so nothing required. checkbin: @@ -436,7 +432,3 @@ checkbin: echo -n '*** Please use a different binutils version.' ; \ false ; \ fi - - -CLEAN_FILES += $(TOUT) - From 7ead15a1442b25e12a6f0791a7c7a5a72d1f3a0c Mon Sep 17 00:00:00 2001 From: Mark Hairgrove Date: Wed, 3 Oct 2018 11:51:32 -0700 Subject: [PATCH 078/221] powerpc/powernv/npu: Reduce eieio usage when issuing ATSD invalidates There are two types of ATSDs issued to the NPU: invalidates targeting a specific virtual address and invalidates targeting the whole address space. In both cases prior to this change, the sequence was: for each NPU - Write the target address to the XTS_ATSD_AVA register - EIEIO - Write the launch value to issue the ATSD First, a target address is not required when invalidating the whole address space, so that write and the EIEIO have been removed. The AP (size) field in the launch is not needed either. Second, for per-address invalidates the above sequence is inefficient in the common case of multiple NPUs because an EIEIO is issued per NPU. 
This unnecessarily forces the launches of later ATSDs to be ordered with the launches of earlier ones. The new sequence only issues a single EIEIO: for each NPU - Write the target address to the XTS_ATSD_AVA register EIEIO for each NPU - Write the launch value to issue the ATSD Performance results were gathered using a microbenchmark which creates a 1G allocation then uses mprotect with PROT_NONE to trigger invalidates in strides across the allocation. With only a single NPU active (one GPU) the difference is in the noise for both types of invalidates (+/-1%). With two NPUs active (on a 6-GPU system) the effect is more noticeable: mprotect rate (GB/s) Stride Before After Speedup 64K 5.9 6.5 10% 1M 31.2 33.4 7% 2M 36.3 38.7 7% 4M 322.6 356.7 11% Signed-off-by: Mark Hairgrove Reviewed-by: Alistair Popple Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/npu-dma.c | 105 +++++++++++------------ 1 file changed, 51 insertions(+), 54 deletions(-) diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 8006c54a91e3..c8f438a9c762 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -454,79 +454,76 @@ static void put_mmio_atsd_reg(struct npu *npu, int reg) } /* MMIO ATSD register offsets */ -#define XTS_ATSD_AVA 1 -#define XTS_ATSD_STAT 2 +#define XTS_ATSD_LAUNCH 0 +#define XTS_ATSD_AVA 1 +#define XTS_ATSD_STAT 2 -static void mmio_launch_invalidate(struct mmio_atsd_reg *mmio_atsd_reg, - unsigned long launch, unsigned long va) +static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize, + bool flush) { - struct npu *npu = mmio_atsd_reg->npu; - int reg = mmio_atsd_reg->reg; + unsigned long launch = 0; - __raw_writeq_be(va, npu->mmio_atsd_regs[reg] + XTS_ATSD_AVA); - eieio(); - __raw_writeq_be(launch, npu->mmio_atsd_regs[reg]); + if (psize == MMU_PAGE_COUNT) { + /* IS set to invalidate entire matching PID */ + launch |= PPC_BIT(12); + } else { + /* AP set to invalidate region of psize */ + launch |= (u64)mmu_get_ap(psize) << PPC_BITLSHIFT(17); + } + + /* PRS set to process-scoped */ + launch |= PPC_BIT(13); + + /* PID */ + launch |= pid << PPC_BITLSHIFT(38); + + /* No flush */ + launch |= !flush << PPC_BITLSHIFT(39); + + return launch; +} + +static void mmio_atsd_regs_write(struct mmio_atsd_reg + mmio_atsd_reg[NV_MAX_NPUS], unsigned long offset, + unsigned long val) +{ + struct npu *npu; + int i, reg; + + for (i = 0; i <= max_npu2_index; i++) { + reg = mmio_atsd_reg[i].reg; + if (reg < 0) + continue; + + npu = mmio_atsd_reg[i].npu; + __raw_writeq_be(val, npu->mmio_atsd_regs[reg] + offset); + } } static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], unsigned long pid, bool flush) { - int i; - unsigned long launch; + unsigned long launch = get_atsd_launch_val(pid, MMU_PAGE_COUNT, flush); - for (i = 0; i <= max_npu2_index; i++) { - if (mmio_atsd_reg[i].reg < 0) - continue; - - /* IS set to invalidate matching PID */ - launch = PPC_BIT(12); - - /* PRS set to process-scoped */ - launch |= PPC_BIT(13); - - /* AP */ - launch |= (u64) - mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17); - - /* PID */ - launch |= pid << PPC_BITLSHIFT(38); - - /* No flush */ - launch |= !flush << PPC_BITLSHIFT(39); - - /* Invalidating the entire process doesn't use a va */ - mmio_launch_invalidate(&mmio_atsd_reg[i], launch, 0); - } + /* Invalidating the entire process doesn't use a va */ + mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch); } static void 
mmio_invalidate_va(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], unsigned long va, unsigned long pid, bool flush) { - int i; unsigned long launch; - for (i = 0; i <= max_npu2_index; i++) { - if (mmio_atsd_reg[i].reg < 0) - continue; + launch = get_atsd_launch_val(pid, mmu_virtual_psize, flush); - /* IS set to invalidate target VA */ - launch = 0; + /* Write all VAs first */ + mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, va); - /* PRS set to process scoped */ - launch |= PPC_BIT(13); + /* Issue one barrier for all address writes */ + eieio(); - /* AP */ - launch |= (u64) - mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17); - - /* PID */ - launch |= pid << PPC_BITLSHIFT(38); - - /* No flush */ - launch |= !flush << PPC_BITLSHIFT(39); - - mmio_launch_invalidate(&mmio_atsd_reg[i], launch, va); - } + /* Launch */ + mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch); } #define mn_to_npu_context(x) container_of(x, struct npu_context, mn) From 3689c37d23fceb786e74b1a71501f3583223ab39 Mon Sep 17 00:00:00 2001 From: Mark Hairgrove Date: Wed, 3 Oct 2018 11:51:33 -0700 Subject: [PATCH 079/221] powerpc/powernv/npu: Use size-based ATSD invalidates Prior to this change only two types of ATSDs were issued to the NPU: invalidates targeting a single page and invalidates targeting the whole address space. The crossover point happened at the configurable atsd_threshold which defaulted to 2M. Invalidates that size or smaller would issue per-page invalidates for the whole range. The NPU supports more invalidation sizes however: 64K, 2M, 1G, and all. These invalidates target addresses aligned to their size. 2M is a common invalidation size for GPU-enabled applications because that is a GPU page size, so reducing the number of invalidates by 32x in that case is a clear improvement. ATSD latency is high in general so now we always issue a single invalidate rather than multiple. This will over-invalidate in some cases, but for any invalidation size over 2M it matches or improves the prior behavior. There's also an improvement for single-page invalidates since the prior version issued two invalidates for that case instead of one. With this change all issued ATSDs now perform a flush, so the flush parameter has been removed from all the helpers. To show the benefit here are some performance numbers from a microbenchmark which creates a 1G allocation then uses mprotect with PROT_NONE to trigger invalidates in strides across the allocation. One NPU (1 GPU): mprotect rate (GB/s) Stride Before After Speedup 64K 5.3 5.6 5% 1M 39.3 57.4 46% 2M 49.7 82.6 66% 4M 286.6 285.7 0% Two NPUs (6 GPUs): mprotect rate (GB/s) Stride Before After Speedup 64K 6.5 7.4 13% 1M 33.4 67.9 103% 2M 38.7 93.1 141% 4M 356.7 354.6 -1% Anything over 2M is roughly the same as before since both cases issue a single ATSD. 
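The range-to-size conversion this patch introduces can be sketched as a standalone C program (SZ_* and ALIGN_DOWN are defined locally here for illustration, and the function returns a label rather than the MMU_PAGE_* values used by the real driver code):

	#include <stdio.h>

	#define SZ_64K	0x10000UL
	#define SZ_2M	0x200000UL
	#define SZ_1G	0x40000000UL
	#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))

	/* Pick the smallest supported invalidation size that covers [start, start + size). */
	static const char *pick_atsd_size(unsigned long start, unsigned long size)
	{
		unsigned long end = start + size - 1;

		if (size == SZ_64K)
			return "64K";
		if (ALIGN_DOWN(start, SZ_2M) == ALIGN_DOWN(end, SZ_2M))
			return "2M";
		if (ALIGN_DOWN(start, SZ_1G) == ALIGN_DOWN(end, SZ_1G))
			return "1G";
		return "entire PID";	/* fall back to a full-address-space ATSD */
	}

	int main(void)
	{
		printf("%s\n", pick_atsd_size(0x200000, SZ_64K));	/* 64K */
		printf("%s\n", pick_atsd_size(0x200000, SZ_2M));	/* 2M */
		printf("%s\n", pick_atsd_size(0x1ff000, SZ_2M));	/* straddles a 2M boundary, falls back to 1G */
		return 0;
	}

The design choice is to accept some over-invalidation: ATSD latency dominates, so one larger invalidate is preferred to issuing several smaller ones.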
Signed-off-by: Mark Hairgrove Reviewed-By: Alistair Popple Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/npu-dma.c | 103 ++++++++++++----------- 1 file changed, 55 insertions(+), 48 deletions(-) diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index c8f438a9c762..e4c0fabf37ac 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -458,8 +459,7 @@ static void put_mmio_atsd_reg(struct npu *npu, int reg) #define XTS_ATSD_AVA 1 #define XTS_ATSD_STAT 2 -static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize, - bool flush) +static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize) { unsigned long launch = 0; @@ -477,8 +477,7 @@ static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize, /* PID */ launch |= pid << PPC_BITLSHIFT(38); - /* No flush */ - launch |= !flush << PPC_BITLSHIFT(39); + /* Leave "No flush" (bit 39) 0 so every ATSD performs a flush */ return launch; } @@ -501,23 +500,22 @@ static void mmio_atsd_regs_write(struct mmio_atsd_reg } static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], - unsigned long pid, bool flush) + unsigned long pid) { - unsigned long launch = get_atsd_launch_val(pid, MMU_PAGE_COUNT, flush); + unsigned long launch = get_atsd_launch_val(pid, MMU_PAGE_COUNT); /* Invalidating the entire process doesn't use a va */ mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch); } -static void mmio_invalidate_va(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], - unsigned long va, unsigned long pid, bool flush) +static void mmio_invalidate_range(struct mmio_atsd_reg + mmio_atsd_reg[NV_MAX_NPUS], unsigned long pid, + unsigned long start, unsigned long psize) { - unsigned long launch; - - launch = get_atsd_launch_val(pid, mmu_virtual_psize, flush); + unsigned long launch = get_atsd_launch_val(pid, psize); /* Write all VAs first */ - mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, va); + mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, start); /* Issue one barrier for all address writes */ eieio(); @@ -609,14 +607,36 @@ static void release_atsd_reg(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS]) } /* - * Invalidate either a single address or an entire PID depending on - * the value of va. + * Invalidate a virtual address range */ -static void mmio_invalidate(struct npu_context *npu_context, int va, - unsigned long address, bool flush) +static void mmio_invalidate(struct npu_context *npu_context, + unsigned long start, unsigned long size) { struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS]; unsigned long pid = npu_context->mm->context.id; + unsigned long atsd_start = 0; + unsigned long end = start + size - 1; + int atsd_psize = MMU_PAGE_COUNT; + + /* + * Convert the input range into one of the supported sizes. If the range + * doesn't fit, use the next larger supported size. Invalidation latency + * is high, so over-invalidation is preferred to issuing multiple + * invalidates. + * + * A 4K page size isn't supported by NPU/GPU ATS, so that case is + * ignored. 
+ */ + if (size == SZ_64K) { + atsd_start = start; + atsd_psize = MMU_PAGE_64K; + } else if (ALIGN_DOWN(start, SZ_2M) == ALIGN_DOWN(end, SZ_2M)) { + atsd_start = ALIGN_DOWN(start, SZ_2M); + atsd_psize = MMU_PAGE_2M; + } else if (ALIGN_DOWN(start, SZ_1G) == ALIGN_DOWN(end, SZ_1G)) { + atsd_start = ALIGN_DOWN(start, SZ_1G); + atsd_psize = MMU_PAGE_1G; + } if (npu_context->nmmu_flush) /* @@ -631,23 +651,25 @@ static void mmio_invalidate(struct npu_context *npu_context, int va, * an invalidate. */ acquire_atsd_reg(npu_context, mmio_atsd_reg); - if (va) - mmio_invalidate_va(mmio_atsd_reg, address, pid, flush); + + if (atsd_psize == MMU_PAGE_COUNT) + mmio_invalidate_pid(mmio_atsd_reg, pid); else - mmio_invalidate_pid(mmio_atsd_reg, pid, flush); + mmio_invalidate_range(mmio_atsd_reg, pid, atsd_start, + atsd_psize); mmio_invalidate_wait(mmio_atsd_reg); - if (flush) { - /* - * The GPU requires two flush ATSDs to ensure all entries have - * been flushed. We use PID 0 as it will never be used for a - * process on the GPU. - */ - mmio_invalidate_pid(mmio_atsd_reg, 0, true); - mmio_invalidate_wait(mmio_atsd_reg); - mmio_invalidate_pid(mmio_atsd_reg, 0, true); - mmio_invalidate_wait(mmio_atsd_reg); - } + + /* + * The GPU requires two flush ATSDs to ensure all entries have been + * flushed. We use PID 0 as it will never be used for a process on the + * GPU. + */ + mmio_invalidate_pid(mmio_atsd_reg, 0); + mmio_invalidate_wait(mmio_atsd_reg); + mmio_invalidate_pid(mmio_atsd_reg, 0); + mmio_invalidate_wait(mmio_atsd_reg); + release_atsd_reg(mmio_atsd_reg); } @@ -664,7 +686,7 @@ static void pnv_npu2_mn_release(struct mmu_notifier *mn, * There should be no more translation requests for this PID, but we * need to ensure any entries for it are removed from the TLB. */ - mmio_invalidate(npu_context, 0, 0, true); + mmio_invalidate(npu_context, 0, ~0UL); } static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn, @@ -673,8 +695,7 @@ static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn, pte_t pte) { struct npu_context *npu_context = mn_to_npu_context(mn); - - mmio_invalidate(npu_context, 1, address, true); + mmio_invalidate(npu_context, address, PAGE_SIZE); } static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, @@ -682,21 +703,7 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, unsigned long start, unsigned long end) { struct npu_context *npu_context = mn_to_npu_context(mn); - unsigned long address; - - if (end - start > atsd_threshold) { - /* - * Just invalidate the entire PID if the address range is too - * large. - */ - mmio_invalidate(npu_context, 0, 0, true); - } else { - for (address = start; address < end; address += PAGE_SIZE) - mmio_invalidate(npu_context, 1, address, false); - - /* Do the flush only on the final addess == end */ - mmio_invalidate(npu_context, 1, address, true); - } + mmio_invalidate(npu_context, start, end - start); } static const struct mmu_notifier_ops nv_nmmu_notifier_ops = { From f86ad3e0194b6964a058dc223ca80bf81b419cf0 Mon Sep 17 00:00:00 2001 From: Mark Hairgrove Date: Wed, 3 Oct 2018 11:51:34 -0700 Subject: [PATCH 080/221] powerpc/powernv/npu: Remove atsd_threshold debugfs setting This threshold is no longer used now that all invalidates issue a single ATSD to each active NPU. 
Signed-off-by: Mark Hairgrove Reviewed-by: Alistair Popple Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/npu-dma.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index e4c0fabf37ac..6f60e0931922 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -42,14 +41,6 @@ */ static DEFINE_SPINLOCK(npu_context_lock); -/* - * When an address shootdown range exceeds this threshold we invalidate the - * entire TLB on the GPU for the given PID rather than each specific address in - * the range. - */ -static uint64_t atsd_threshold = 2 * 1024 * 1024; -static struct dentry *atsd_threshold_dentry; - /* * Other types of TCE cache invalidation are not functional in the * hardware. @@ -966,11 +957,6 @@ int pnv_npu2_init(struct pnv_phb *phb) static int npu_index; uint64_t rc = 0; - if (!atsd_threshold_dentry) { - atsd_threshold_dentry = debugfs_create_x64("atsd_threshold", - 0600, powerpc_debugfs_root, &atsd_threshold); - } - phb->npu.nmmu_flush = of_property_read_bool(phb->hose->dn, "ibm,nmmu-flush"); for_each_child_of_node(phb->hose->dn, dn) { From 8b78fdb045de60a4eb35460092bbd3cffa925353 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Tue, 2 Oct 2018 09:01:04 +1000 Subject: [PATCH 081/221] powerpc/time: Use clockevents_register_device(), fixing an issue with large decrementer We currently cap the decrementer clockevent at 4 seconds, even on systems with large decrementer support. Fix this by converting the code to use clockevents_register_device() which calculates the upper bound based on the max_delta passed in. Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/time.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 70f145e02487..6a1f0a084ca3 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -984,10 +984,10 @@ static void register_decrementer_clockevent(int cpu) *dec = decrementer_clockevent; dec->cpumask = cpumask_of(cpu); + clockevents_config_and_register(dec, ppc_tb_freq, 2, decrementer_max); + printk_once(KERN_DEBUG "clockevent: %s mult[%x] shift[%d] cpu[%d]\n", dec->name, dec->mult, dec->shift, cpu); - - clockevents_register_device(dec); } static void enable_large_decrementer(void) @@ -1035,18 +1035,7 @@ static void __init set_decrementer_max(void) static void __init init_decrementer_clockevent(void) { - int cpu = smp_processor_id(); - - clockevents_calc_mult_shift(&decrementer_clockevent, ppc_tb_freq, 4); - - decrementer_clockevent.max_delta_ns = - clockevent_delta2ns(decrementer_max, &decrementer_clockevent); - decrementer_clockevent.max_delta_ticks = decrementer_max; - decrementer_clockevent.min_delta_ns = - clockevent_delta2ns(2, &decrementer_clockevent); - decrementer_clockevent.min_delta_ticks = 2; - - register_decrementer_clockevent(cpu); + register_decrementer_clockevent(smp_processor_id()); } void secondary_cpu_time_init(void) From 817593604e5b244dc55344d298d96122457c20bd Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Tue, 2 Oct 2018 09:01:05 +1000 Subject: [PATCH 082/221] powerpc/time: Add set_state_oneshot_stopped decrementer callback If CONFIG_PPC_WATCHDOG is enabled we always cap the decrementer to 0x7fffffff: if (IS_ENABLED(CONFIG_PPC_WATCHDOG)) set_dec(0x7fffffff); else 
set_dec(decrementer_max); If there are no future events, we don't reprogram the decrementer after this and we end up with 0x7fffffff even on a large decrementer capable system. As suggested by Nick, add a set_state_oneshot_stopped callback so we program the decrementer with decrementer_max if there are no future events. Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/time.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 6a1f0a084ca3..40868f3ee113 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -111,6 +111,7 @@ struct clock_event_device decrementer_clockevent = { .rating = 200, .irq = 0, .set_next_event = decrementer_set_next_event, + .set_state_oneshot_stopped = decrementer_shutdown, .set_state_shutdown = decrementer_shutdown, .tick_resume = decrementer_shutdown, .features = CLOCK_EVT_FEAT_ONESHOT | From bc276ecba132caccb1fda5863a652c15def2b8c6 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 27 Aug 2018 13:03:01 +1000 Subject: [PATCH 083/221] powerpc/64s/hash: Do not use PPC_INVALIDATE_ERAT on CPUs before POWER9 PPC_INVALIDATE_ERAT is slbia IH=7 which is a new variant introduced with POWER9, and the result is undefined on earlier CPUs. Commits 7b9f71f974 ("powerpc/64s: POWER9 machine check handler") and d4748276ae ("powerpc/64s: Improve local TLB flush for boot and MCE on POWER9") caused POWER7/8 code to use this instruction. Remove it. An ERAT flush can be made by invalidating the SLB, but before POWER9 that requires a flush and rebolt. Fixes: 7b9f71f974 ("powerpc/64s: POWER9 machine check handler") Fixes: d4748276ae ("powerpc/64s: Improve local TLB flush for boot and MCE on POWER9") Cc: stable@vger.kernel.org # v4.11+ Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/mce_power.c | 7 +++++++ arch/powerpc/mm/hash_native_64.c | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index 2016b58d564f..6b800eec31f2 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -89,6 +89,13 @@ void flush_and_reload_slb(void) static void flush_erat(void) { +#ifdef CONFIG_PPC_BOOK3S_64 + if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) { + flush_and_reload_slb(); + return; + } +#endif + /* PPC_INVALIDATE_ERAT can only be used on ISA v3 and newer */ asm volatile(PPC_INVALIDATE_ERAT : : :"memory"); } diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index 729f02df8290..aaa28fd918fe 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -115,6 +115,8 @@ static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) tlbiel_hash_set_isa300(0, is, 0, 2, 1); asm volatile("ptesync": : :"memory"); + + asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); } void hash__tlbiel_all(unsigned int action) @@ -140,8 +142,6 @@ void hash__tlbiel_all(unsigned int action) tlbiel_all_isa206(POWER7_TLB_SETS, is); else WARN(1, "%s called on pre-POWER7 CPU\n", __func__); - - asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); } static inline unsigned long ___tlbie(unsigned long vpn, int psize, From 053c5a753e951c5dd1729af2cf4d8107f2e6e09b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 27 Aug 2018 13:03:02 +1000 Subject: [PATCH 084/221] powerpc/64s/radix: Explicitly flush ERAT with local LPID invalidation Local radix TLB flush operations that operate on
congruence classes have explicit ERAT flushes for POWER9. The process scoped LPID flush did not have a flush, so add it. Signed-off-by: Nicholas Piggin Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/mm/tlb-radix.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index fef3e1eb3a19..4e798f33c530 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -366,6 +366,7 @@ static inline void _tlbiel_lpid_guest(unsigned long lpid, unsigned long ric) __tlbiel_lpid_guest(lpid, set, RIC_FLUSH_TLB); asm volatile("ptesync": : :"memory"); + asm volatile(PPC_INVALIDATE_ERAT : : :"memory"); } From 0792a2c8e0bbda3605b8d42c6b9635be7b19982a Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Tue, 11 Sep 2018 20:18:44 -0400 Subject: [PATCH 085/221] macintosh: Use common code to access RTC Now that the 68k Mac port has adopted the via-pmu driver, the same RTC code can be shared between m68k and powerpc. Replace duplicated code in arch/powerpc and arch/m68k with common RTC accessors for Cuda and PMU. Drop the problematic WARN_ON which was introduced in commit 22db552b50fa ("powerpc/powermac: Fix rtc read/write functions"). Tested-by: Stan Johnson Signed-off-by: Finn Thain Cc: Geert Uytterhoeven Cc: Arnd Bergmann Acked-by: Geert Uytterhoeven Signed-off-by: Michael Ellerman --- arch/m68k/mac/misc.c | 75 ++------------- arch/powerpc/platforms/powermac/time.c | 126 ++++--------------------- drivers/macintosh/via-cuda.c | 35 +++++++ drivers/macintosh/via-pmu.c | 33 +++++++ include/linux/cuda.h | 4 + include/linux/pmu.h | 4 + 6 files changed, 106 insertions(+), 171 deletions(-) diff --git a/arch/m68k/mac/misc.c b/arch/m68k/mac/misc.c index 1b083c500b9a..ebb3b6d169ea 100644 --- a/arch/m68k/mac/misc.c +++ b/arch/m68k/mac/misc.c @@ -37,35 +37,6 @@ static void (*rom_reset)(void); #ifdef CONFIG_ADB_CUDA -static time64_t cuda_read_time(void) -{ - struct adb_request req; - time64_t time; - - if (cuda_request(&req, NULL, 2, CUDA_PACKET, CUDA_GET_TIME) < 0) - return 0; - while (!req.complete) - cuda_poll(); - - time = (u32)((req.reply[3] << 24) | (req.reply[4] << 16) | - (req.reply[5] << 8) | req.reply[6]); - - return time - RTC_OFFSET; -} - -static void cuda_write_time(time64_t time) -{ - struct adb_request req; - u32 data = lower_32_bits(time + RTC_OFFSET); - - if (cuda_request(&req, NULL, 6, CUDA_PACKET, CUDA_SET_TIME, - (data >> 24) & 0xFF, (data >> 16) & 0xFF, - (data >> 8) & 0xFF, data & 0xFF) < 0) - return; - while (!req.complete) - cuda_poll(); -} - static __u8 cuda_read_pram(int offset) { struct adb_request req; @@ -91,33 +62,6 @@ static void cuda_write_pram(int offset, __u8 data) #endif /* CONFIG_ADB_CUDA */ #ifdef CONFIG_ADB_PMU -static time64_t pmu_read_time(void) -{ - struct adb_request req; - time64_t time; - - if (pmu_request(&req, NULL, 1, PMU_READ_RTC) < 0) - return 0; - pmu_wait_complete(&req); - - time = (u32)((req.reply[0] << 24) | (req.reply[1] << 16) | - (req.reply[2] << 8) | req.reply[3]); - - return time - RTC_OFFSET; -} - -static void pmu_write_time(time64_t time) -{ - struct adb_request req; - u32 data = lower_32_bits(time + RTC_OFFSET); - - if (pmu_request(&req, NULL, 5, PMU_SET_RTC, - (data >> 24) & 0xFF, (data >> 16) & 0xFF, - (data >> 8) & 0xFF, data & 0xFF) < 0) - return; - pmu_wait_complete(&req); -} - static __u8 pmu_read_pram(int offset) { struct adb_request req; @@ -295,13 +239,17 @@ static time64_t via_read_time(void) * is basically any machine with Mac II-style ADB. 
*/ -static void via_write_time(time64_t time) +static void via_set_rtc_time(struct rtc_time *tm) { union { __u8 cdata[4]; __u32 idata; } data; __u8 temp; + time64_t time; + + time = mktime64(tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday, + tm->tm_hour, tm->tm_min, tm->tm_sec); /* Clear the write protect bit */ @@ -641,12 +589,12 @@ int mac_hwclk(int op, struct rtc_time *t) #ifdef CONFIG_ADB_CUDA case MAC_ADB_EGRET: case MAC_ADB_CUDA: - now = cuda_read_time(); + now = cuda_get_time(); break; #endif #ifdef CONFIG_ADB_PMU case MAC_ADB_PB2: - now = pmu_read_time(); + now = pmu_get_time(); break; #endif default: @@ -665,24 +613,21 @@ int mac_hwclk(int op, struct rtc_time *t) __func__, t->tm_year + 1900, t->tm_mon + 1, t->tm_mday, t->tm_hour, t->tm_min, t->tm_sec); - now = mktime64(t->tm_year + 1900, t->tm_mon + 1, t->tm_mday, - t->tm_hour, t->tm_min, t->tm_sec); - switch (macintosh_config->adb_type) { case MAC_ADB_IOP: case MAC_ADB_II: case MAC_ADB_PB1: - via_write_time(now); + via_set_rtc_time(t); break; #ifdef CONFIG_ADB_CUDA case MAC_ADB_EGRET: case MAC_ADB_CUDA: - cuda_write_time(now); + cuda_set_rtc_time(t); break; #endif #ifdef CONFIG_ADB_PMU case MAC_ADB_PB2: - pmu_write_time(now); + pmu_set_rtc_time(t); break; #endif default: diff --git a/arch/powerpc/platforms/powermac/time.c b/arch/powerpc/platforms/powermac/time.c index f92c1918fb56..f157e3d071f2 100644 --- a/arch/powerpc/platforms/powermac/time.c +++ b/arch/powerpc/platforms/powermac/time.c @@ -44,13 +44,6 @@ #define DBG(x...) #endif -/* - * Offset between Unix time (1970-based) and Mac time (1904-based). Cuda and PMU - * times wrap in 2040. If we need to handle later times, the read_time functions - * need to be changed to interpret wrapped times as post-2040. - */ -#define RTC_OFFSET 2082844800 - /* * Calibrate the decrementer frequency with the VIA timer 1. 
*/ @@ -90,98 +83,6 @@ long __init pmac_time_init(void) return delta; } -#ifdef CONFIG_ADB_CUDA -static time64_t cuda_get_time(void) -{ - struct adb_request req; - time64_t now; - - if (cuda_request(&req, NULL, 2, CUDA_PACKET, CUDA_GET_TIME) < 0) - return 0; - while (!req.complete) - cuda_poll(); - if (req.reply_len != 7) - printk(KERN_ERR "cuda_get_time: got %d byte reply\n", - req.reply_len); - now = (u32)((req.reply[3] << 24) + (req.reply[4] << 16) + - (req.reply[5] << 8) + req.reply[6]); - /* it's either after year 2040, or the RTC has gone backwards */ - WARN_ON(now < RTC_OFFSET); - - return now - RTC_OFFSET; -} - -#define cuda_get_rtc_time(tm) rtc_time64_to_tm(cuda_get_time(), (tm)) - -static int cuda_set_rtc_time(struct rtc_time *tm) -{ - u32 nowtime; - struct adb_request req; - - nowtime = lower_32_bits(rtc_tm_to_time64(tm) + RTC_OFFSET); - if (cuda_request(&req, NULL, 6, CUDA_PACKET, CUDA_SET_TIME, - nowtime >> 24, nowtime >> 16, nowtime >> 8, - nowtime) < 0) - return -ENXIO; - while (!req.complete) - cuda_poll(); - if ((req.reply_len != 3) && (req.reply_len != 7)) - printk(KERN_ERR "cuda_set_rtc_time: got %d byte reply\n", - req.reply_len); - return 0; -} - -#else -#define cuda_get_time() 0 -#define cuda_get_rtc_time(tm) -#define cuda_set_rtc_time(tm) 0 -#endif - -#ifdef CONFIG_ADB_PMU -static time64_t pmu_get_time(void) -{ - struct adb_request req; - time64_t now; - - if (pmu_request(&req, NULL, 1, PMU_READ_RTC) < 0) - return 0; - pmu_wait_complete(&req); - if (req.reply_len != 4) - printk(KERN_ERR "pmu_get_time: got %d byte reply from PMU\n", - req.reply_len); - now = (u32)((req.reply[0] << 24) + (req.reply[1] << 16) + - (req.reply[2] << 8) + req.reply[3]); - - /* it's either after year 2040, or the RTC has gone backwards */ - WARN_ON(now < RTC_OFFSET); - - return now - RTC_OFFSET; -} - -#define pmu_get_rtc_time(tm) rtc_time64_to_tm(pmu_get_time(), (tm)) - -static int pmu_set_rtc_time(struct rtc_time *tm) -{ - u32 nowtime; - struct adb_request req; - - nowtime = lower_32_bits(rtc_tm_to_time64(tm) + RTC_OFFSET); - if (pmu_request(&req, NULL, 5, PMU_SET_RTC, nowtime >> 24, - nowtime >> 16, nowtime >> 8, nowtime) < 0) - return -ENXIO; - pmu_wait_complete(&req); - if (req.reply_len != 0) - printk(KERN_ERR "pmu_set_rtc_time: %d byte reply from PMU\n", - req.reply_len); - return 0; -} - -#else -#define pmu_get_time() 0 -#define pmu_get_rtc_time(tm) -#define pmu_set_rtc_time(tm) 0 -#endif - #ifdef CONFIG_PMAC_SMU static time64_t smu_get_time(void) { @@ -191,11 +92,6 @@ static time64_t smu_get_time(void) return 0; return rtc_tm_to_time64(&tm); } - -#else -#define smu_get_time() 0 -#define smu_get_rtc_time(tm, spin) -#define smu_set_rtc_time(tm, spin) 0 #endif /* Can't be __init, it's called when suspending and resuming */ @@ -203,12 +99,18 @@ time64_t pmac_get_boot_time(void) { /* Get the time from the RTC, used only at boot time */ switch (sys_ctrler) { +#ifdef CONFIG_ADB_CUDA case SYS_CTRLER_CUDA: return cuda_get_time(); +#endif +#ifdef CONFIG_ADB_PMU case SYS_CTRLER_PMU: return pmu_get_time(); +#endif +#ifdef CONFIG_PMAC_SMU case SYS_CTRLER_SMU: return smu_get_time(); +#endif default: return 0; } @@ -218,15 +120,21 @@ void pmac_get_rtc_time(struct rtc_time *tm) { /* Get the time from the RTC, used only at boot time */ switch (sys_ctrler) { +#ifdef CONFIG_ADB_CUDA case SYS_CTRLER_CUDA: - cuda_get_rtc_time(tm); + rtc_time64_to_tm(cuda_get_time(), tm); break; +#endif +#ifdef CONFIG_ADB_PMU case SYS_CTRLER_PMU: - pmu_get_rtc_time(tm); + rtc_time64_to_tm(pmu_get_time(), tm); break; 
+#endif +#ifdef CONFIG_PMAC_SMU case SYS_CTRLER_SMU: smu_get_rtc_time(tm, 1); break; +#endif default: ; } @@ -235,12 +143,18 @@ void pmac_get_rtc_time(struct rtc_time *tm) int pmac_set_rtc_time(struct rtc_time *tm) { switch (sys_ctrler) { +#ifdef CONFIG_ADB_CUDA case SYS_CTRLER_CUDA: return cuda_set_rtc_time(tm); +#endif +#ifdef CONFIG_ADB_PMU case SYS_CTRLER_PMU: return pmu_set_rtc_time(tm); +#endif +#ifdef CONFIG_PMAC_SMU case SYS_CTRLER_SMU: return smu_set_rtc_time(tm, 1); +#endif default: return -ENODEV; } diff --git a/drivers/macintosh/via-cuda.c b/drivers/macintosh/via-cuda.c index 98dd702eb867..bbec6ac0a966 100644 --- a/drivers/macintosh/via-cuda.c +++ b/drivers/macintosh/via-cuda.c @@ -766,3 +766,38 @@ cuda_input(unsigned char *buf, int nb) buf, nb, false); } } + +/* Offset between Unix time (1970-based) and Mac time (1904-based) */ +#define RTC_OFFSET 2082844800 + +time64_t cuda_get_time(void) +{ + struct adb_request req; + u32 now; + + if (cuda_request(&req, NULL, 2, CUDA_PACKET, CUDA_GET_TIME) < 0) + return 0; + while (!req.complete) + cuda_poll(); + if (req.reply_len != 7) + pr_err("%s: got %d byte reply\n", __func__, req.reply_len); + now = (req.reply[3] << 24) + (req.reply[4] << 16) + + (req.reply[5] << 8) + req.reply[6]; + return (time64_t)now - RTC_OFFSET; +} + +int cuda_set_rtc_time(struct rtc_time *tm) +{ + u32 now; + struct adb_request req; + + now = lower_32_bits(rtc_tm_to_time64(tm) + RTC_OFFSET); + if (cuda_request(&req, NULL, 6, CUDA_PACKET, CUDA_SET_TIME, + now >> 24, now >> 16, now >> 8, now) < 0) + return -ENXIO; + while (!req.complete) + cuda_poll(); + if ((req.reply_len != 3) && (req.reply_len != 7)) + pr_err("%s: got %d byte reply\n", __func__, req.reply_len); + return 0; +} diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c index d72c450aebe5..60f57e2abf21 100644 --- a/drivers/macintosh/via-pmu.c +++ b/drivers/macintosh/via-pmu.c @@ -1737,6 +1737,39 @@ pmu_enable_irled(int on) pmu_wait_complete(&req); } +/* Offset between Unix time (1970-based) and Mac time (1904-based) */ +#define RTC_OFFSET 2082844800 + +time64_t pmu_get_time(void) +{ + struct adb_request req; + u32 now; + + if (pmu_request(&req, NULL, 1, PMU_READ_RTC) < 0) + return 0; + pmu_wait_complete(&req); + if (req.reply_len != 4) + pr_err("%s: got %d byte reply\n", __func__, req.reply_len); + now = (req.reply[0] << 24) + (req.reply[1] << 16) + + (req.reply[2] << 8) + req.reply[3]; + return (time64_t)now - RTC_OFFSET; +} + +int pmu_set_rtc_time(struct rtc_time *tm) +{ + u32 now; + struct adb_request req; + + now = lower_32_bits(rtc_tm_to_time64(tm) + RTC_OFFSET); + if (pmu_request(&req, NULL, 5, PMU_SET_RTC, + now >> 24, now >> 16, now >> 8, now) < 0) + return -ENXIO; + pmu_wait_complete(&req); + if (req.reply_len != 0) + pr_err("%s: got %d byte reply\n", __func__, req.reply_len); + return 0; +} + void pmu_restart(void) { diff --git a/include/linux/cuda.h b/include/linux/cuda.h index 056867f09a01..45bfe9d61271 100644 --- a/include/linux/cuda.h +++ b/include/linux/cuda.h @@ -8,6 +8,7 @@ #ifndef _LINUX_CUDA_H #define _LINUX_CUDA_H +#include #include @@ -16,4 +17,7 @@ extern int cuda_request(struct adb_request *req, void (*done)(struct adb_request *), int nbytes, ...); extern void cuda_poll(void); +extern time64_t cuda_get_time(void); +extern int cuda_set_rtc_time(struct rtc_time *tm); + #endif /* _LINUX_CUDA_H */ diff --git a/include/linux/pmu.h b/include/linux/pmu.h index 9ac8fc60ad49..52453a24a24f 100644 --- a/include/linux/pmu.h +++ b/include/linux/pmu.h @@ -9,6 +9,7 @@ #ifndef 
_LINUX_PMU_H #define _LINUX_PMU_H +#include #include @@ -36,6 +37,9 @@ static inline void pmu_resume(void) extern void pmu_enable_irled(int on); +extern time64_t pmu_get_time(void); +extern int pmu_set_rtc_time(struct rtc_time *tm); + extern void pmu_restart(void); extern void pmu_shutdown(void); extern void pmu_unlock(void); From 2341629eadc4a40aa46103c7f1ff5f38459688d3 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Tue, 11 Sep 2018 20:18:44 -0400 Subject: [PATCH 086/221] macintosh/adb: Rework printk output again Avoid the KERN_CONT problem by avoiding message fragments. The problem arises during async ADB bus probing, when ADB messages may get mixed up with other messages. See also, commit 4bcc595ccd80 ("printk: reinstate KERN_CONT for printing continuation lines"). Remove a number of printk() continuation lines by logging handler changes in adb_try_handler_change() instead. This patch addresses the problematic use of "\n" at the beginning of pr_cont() messages, which got overlooked in commit f2be6295684b ("macintosh/adb: Properly mark continued kernel messages"). That commit also changed printk(KERN_DEBUG ...) to pr_debug(...), which hinders work on low-level ADB driver bugs. Revert that change. Cc: Andreas Schwab Tested-by: Stan Johnson Signed-off-by: Finn Thain Signed-off-by: Michael Ellerman --- drivers/macintosh/adb.c | 8 +++--- drivers/macintosh/adbhid.c | 53 +++++++++++++++----------------------- 2 files changed, 26 insertions(+), 35 deletions(-) diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c index 76e98f0f7a3e..e49d1f287a17 100644 --- a/drivers/macintosh/adb.c +++ b/drivers/macintosh/adb.c @@ -203,15 +203,15 @@ static int adb_scan_bus(void) } /* Now fill in the handler_id field of the adb_handler entries. */ - pr_debug("adb devices:\n"); for (i = 1; i < 16; i++) { if (adb_handler[i].original_address == 0) continue; adb_request(&req, NULL, ADBREQ_SYNC | ADBREQ_REPLY, 1, (i << 4) | 0xf); adb_handler[i].handler_id = req.reply[2]; - pr_debug(" [%d]: %d %x\n", i, adb_handler[i].original_address, - adb_handler[i].handler_id); + printk(KERN_DEBUG "adb device [%d]: %d 0x%X\n", i, + adb_handler[i].original_address, + adb_handler[i].handler_id); devmask |= 1 << i; } return devmask; @@ -579,6 +579,8 @@ adb_try_handler_change(int address, int new_id) mutex_lock(&adb_handler_mutex); ret = try_handler_change(address, new_id); mutex_unlock(&adb_handler_mutex); + if (ret) + pr_debug("adb handler change: [%d] 0x%X\n", address, new_id); return ret; } EXPORT_SYMBOL(adb_try_handler_change); diff --git a/drivers/macintosh/adbhid.c b/drivers/macintosh/adbhid.c index a261892c03b3..75482eeab2c4 100644 --- a/drivers/macintosh/adbhid.c +++ b/drivers/macintosh/adbhid.c @@ -757,6 +757,7 @@ adbhid_input_register(int id, int default_id, int original_handler_id, struct input_dev *input_dev; int err; int i; + char *keyboard_type; if (adbhid[id]) { pr_err("Trying to reregister ADB HID on ID %d\n", id); @@ -798,24 +799,23 @@ adbhid_input_register(int id, int default_id, int original_handler_id, memcpy(hid->keycode, adb_to_linux_keycodes, sizeof(adb_to_linux_keycodes)); - pr_info("Detected ADB keyboard, type "); switch (original_handler_id) { default: - pr_cont(".\n"); + keyboard_type = ""; input_dev->id.version = ADB_KEYBOARD_UNKNOWN; break; case 0x01: case 0x02: case 0x03: case 0x06: case 0x08: case 0x0C: case 0x10: case 0x18: case 0x1B: case 0x1C: case 0xC0: case 0xC3: case 0xC6: - pr_cont("ANSI.\n"); + keyboard_type = "ANSI"; input_dev->id.version = ADB_KEYBOARD_ANSI; break; case 0x04: case 
0x05: case 0x07: case 0x09: case 0x0D: case 0x11: case 0x14: case 0x19: case 0x1D: case 0xC1: case 0xC4: case 0xC7: - pr_cont("ISO, swapping keys.\n"); + keyboard_type = "ISO, swapping keys"; input_dev->id.version = ADB_KEYBOARD_ISO; i = hid->keycode[10]; hid->keycode[10] = hid->keycode[50]; @@ -824,10 +824,11 @@ adbhid_input_register(int id, int default_id, int original_handler_id, case 0x12: case 0x15: case 0x16: case 0x17: case 0x1A: case 0x1E: case 0xC2: case 0xC5: case 0xC8: case 0xC9: - pr_cont("JIS.\n"); + keyboard_type = "JIS"; input_dev->id.version = ADB_KEYBOARD_JIS; break; } + pr_info("Detected ADB keyboard, type %s.\n", keyboard_type); for (i = 0; i < 128; i++) if (hid->keycode[i]) @@ -972,16 +973,13 @@ adbhid_probe(void) ->get it to send separate codes for left and right shift, control, option keys */ #if 0 /* handler 5 doesn't send separate codes for R modifiers */ - if (adb_try_handler_change(id, 5)) - printk("ADB keyboard at %d, handler set to 5\n", id); - else + if (!adb_try_handler_change(id, 5)) #endif - if (adb_try_handler_change(id, 3)) - printk("ADB keyboard at %d, handler set to 3\n", id); - else - printk("ADB keyboard at %d, handler 1\n", id); + adb_try_handler_change(id, 3); adb_get_infos(id, &default_id, &cur_handler_id); + printk(KERN_DEBUG "ADB keyboard at %d has handler 0x%X\n", + id, cur_handler_id); reg |= adbhid_input_reregister(id, default_id, org_handler_id, cur_handler_id, 0); } @@ -999,48 +997,44 @@ adbhid_probe(void) for (i = 0; i < mouse_ids.nids; i++) { int id = mouse_ids.id[i]; int mouse_kind; + char *desc = "standard"; adb_get_infos(id, &default_id, &org_handler_id); if (adb_try_handler_change(id, 4)) { - printk("ADB mouse at %d, handler set to 4", id); mouse_kind = ADBMOUSE_EXTENDED; } else if (adb_try_handler_change(id, 0x2F)) { - printk("ADB mouse at %d, handler set to 0x2F", id); mouse_kind = ADBMOUSE_MICROSPEED; } else if (adb_try_handler_change(id, 0x42)) { - printk("ADB mouse at %d, handler set to 0x42", id); mouse_kind = ADBMOUSE_TRACKBALLPRO; } else if (adb_try_handler_change(id, 0x66)) { - printk("ADB mouse at %d, handler set to 0x66", id); mouse_kind = ADBMOUSE_MICROSPEED; } else if (adb_try_handler_change(id, 0x5F)) { - printk("ADB mouse at %d, handler set to 0x5F", id); mouse_kind = ADBMOUSE_MICROSPEED; } else if (adb_try_handler_change(id, 3)) { - printk("ADB mouse at %d, handler set to 3", id); mouse_kind = ADBMOUSE_MS_A3; } else if (adb_try_handler_change(id, 2)) { - printk("ADB mouse at %d, handler set to 2", id); mouse_kind = ADBMOUSE_STANDARD_200; } else { - printk("ADB mouse at %d, handler 1", id); mouse_kind = ADBMOUSE_STANDARD_100; } if ((mouse_kind == ADBMOUSE_TRACKBALLPRO) || (mouse_kind == ADBMOUSE_MICROSPEED)) { + desc = "Microspeed/MacPoint or compatible"; init_microspeed(id); } else if (mouse_kind == ADBMOUSE_MS_A3) { + desc = "Mouse Systems A3 Mouse or compatible"; init_ms_a3(id); } else if (mouse_kind == ADBMOUSE_EXTENDED) { + desc = "extended"; /* * Register 1 is usually used for device * identification. 
Here, we try to identify @@ -1054,32 +1048,36 @@ adbhid_probe(void) (req.reply[1] == 0x9a) && ((req.reply[2] == 0x21) || (req.reply[2] == 0x20))) { mouse_kind = ADBMOUSE_TRACKBALL; + desc = "trackman/mouseman"; init_trackball(id); } else if ((req.reply_len >= 4) && (req.reply[1] == 0x74) && (req.reply[2] == 0x70) && (req.reply[3] == 0x61) && (req.reply[4] == 0x64)) { mouse_kind = ADBMOUSE_TRACKPAD; + desc = "trackpad"; init_trackpad(id); } else if ((req.reply_len >= 4) && (req.reply[1] == 0x4b) && (req.reply[2] == 0x4d) && (req.reply[3] == 0x4c) && (req.reply[4] == 0x31)) { mouse_kind = ADBMOUSE_TURBOMOUSE5; + desc = "TurboMouse 5"; init_turbomouse(id); } else if ((req.reply_len == 9) && (req.reply[1] == 0x4b) && (req.reply[2] == 0x4f) && (req.reply[3] == 0x49) && (req.reply[4] == 0x54)) { if (adb_try_handler_change(id, 0x42)) { - pr_cont("\nADB MacAlly 2-button mouse at %d, handler set to 0x42", id); mouse_kind = ADBMOUSE_MACALLY2; + desc = "MacAlly 2-button"; } } } - pr_cont("\n"); adb_get_infos(id, &default_id, &cur_handler_id); + printk(KERN_DEBUG "ADB mouse (%s) at %d has handler 0x%X\n", + desc, id, cur_handler_id); reg |= adbhid_input_reregister(id, default_id, org_handler_id, cur_handler_id, mouse_kind); } @@ -1092,12 +1090,10 @@ init_trackpad(int id) struct adb_request req; unsigned char r1_buffer[8]; - pr_cont(" (trackpad)"); - adb_request(&req, NULL, ADBREQ_SYNC | ADBREQ_REPLY, 1, ADB_READREG(id,1)); if (req.reply_len < 8) - pr_cont("bad length for reg. 1\n"); + pr_err("%s: bad length for reg. 1\n", __func__); else { memcpy(r1_buffer, &req.reply[1], 8); @@ -1145,8 +1141,6 @@ init_trackball(int id) { struct adb_request req; - pr_cont(" (trackman/mouseman)"); - adb_request(&req, NULL, ADBREQ_SYNC, 3, ADB_WRITEREG(id,1), 00,0x81); @@ -1177,8 +1171,6 @@ init_turbomouse(int id) { struct adb_request req; - pr_cont(" (TurboMouse 5)"); - adb_request(&req, NULL, ADBREQ_SYNC, 1, ADB_FLUSH(id)); adb_request(&req, NULL, ADBREQ_SYNC, 1, ADB_FLUSH(3)); @@ -1213,8 +1205,6 @@ init_microspeed(int id) { struct adb_request req; - pr_cont(" (Microspeed/MacPoint or compatible)"); - adb_request(&req, NULL, ADBREQ_SYNC, 1, ADB_FLUSH(id)); /* This will initialize mice using the Microspeed, MacPoint and @@ -1253,7 +1243,6 @@ init_ms_a3(int id) { struct adb_request req; - pr_cont(" (Mouse Systems A3 Mouse, or compatible)"); adb_request(&req, NULL, ADBREQ_SYNC, 3, ADB_WRITEREG(id, 0x2), 0x00, From b52dce87389385aad54f5680329b41257ee0f7ac Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Tue, 11 Sep 2018 20:18:44 -0400 Subject: [PATCH 087/221] macintosh/via-macii: Synchronous bus reset Make the reset operation synchronous, like the other ADB drivers. The reset request is static data but callers may not know that. This way the struct is not in use when the reset method returns. Tested-by: Stan Johnson Signed-off-by: Finn Thain Signed-off-by: Michael Ellerman --- drivers/macintosh/via-macii.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/macintosh/via-macii.c b/drivers/macintosh/via-macii.c index cf6f7d52d6be..36a4f49e79b5 100644 --- a/drivers/macintosh/via-macii.c +++ b/drivers/macintosh/via-macii.c @@ -331,7 +331,8 @@ static int macii_reset_bus(void) return 0; /* Command = 0, Address = ignored */ - adb_request(&req, NULL, 0, 1, ADB_BUSRESET); + adb_request(&req, NULL, ADBREQ_NOSEND, 1, ADB_BUSRESET); + macii_send_request(&req, 1); /* Don't want any more requests during the Global Reset low time. 
*/ udelay(3000); From 5f93d7081a47e1972031ccf57c4b2779eee162fb Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Tue, 11 Sep 2018 20:18:44 -0400 Subject: [PATCH 088/221] macintosh/via-macii: Remove BUG_ON assertions The BUG_ON assertions I added to the via-macii driver over a decade ago haven't fired AFAIK. Some can never fire (by inspection). One assertion checks for a NULL pointer, but that would merely substitute a BUG crash for an Oops crash. Remove the pointless BUG_ON assertions and replace the others with a WARN_ON and an array bounds check. Tested-by: Stan Johnson Signed-off-by: Finn Thain Signed-off-by: Michael Ellerman --- drivers/macintosh/via-macii.c | 49 +++++------------------------------ 1 file changed, 7 insertions(+), 42 deletions(-) diff --git a/drivers/macintosh/via-macii.c b/drivers/macintosh/via-macii.c index 36a4f49e79b5..7e0e32fa7eb2 100644 --- a/drivers/macintosh/via-macii.c +++ b/drivers/macintosh/via-macii.c @@ -120,23 +120,6 @@ static int srq_asserted; /* have to poll for the device that asserted it */ static int command_byte; /* the most recent command byte transmitted */ static int autopoll_devs; /* bits set are device addresses to be polled */ -/* Sanity check for request queue. Doesn't check for cycles. */ -static int request_is_queued(struct adb_request *req) { - struct adb_request *cur; - unsigned long flags; - local_irq_save(flags); - cur = current_req; - while (cur) { - if (cur == req) { - local_irq_restore(flags); - return 1; - } - cur = cur->next; - } - local_irq_restore(flags); - return 0; -} - /* Check for MacII style ADB */ static int macii_probe(void) { @@ -213,8 +196,6 @@ static void macii_queue_poll(void) else next_device = ffs(autopoll_devs) - 1; - BUG_ON(request_is_queued(&req)); - adb_request(&req, NULL, ADBREQ_NOSEND, 1, ADB_READREG(next_device, 0)); @@ -237,18 +218,13 @@ static int macii_send_request(struct adb_request *req, int sync) int err; unsigned long flags; - BUG_ON(request_is_queued(req)); - local_irq_save(flags); err = macii_write(req); local_irq_restore(flags); - if (!err && sync) { - while (!req->complete) { + if (!err && sync) + while (!req->complete) macii_poll(); - } - BUG_ON(request_is_queued(req)); - } return err; } @@ -327,9 +303,6 @@ static int macii_reset_bus(void) { static struct adb_request req; - if (request_is_queued(&req)) - return 0; - /* Command = 0, Address = ignored */ adb_request(&req, NULL, ADBREQ_NOSEND, 1, ADB_BUSRESET); macii_send_request(&req, 1); @@ -347,10 +320,6 @@ static void macii_start(void) req = current_req; - BUG_ON(req == NULL); - - BUG_ON(macii_state != idle); - /* Now send it. Be careful though, that first byte of the request * is actually ADB_PACKET; the real data begins at index 1! * And req->nbytes is the number of bytes of real data plus one. 
@@ -388,7 +357,6 @@ static void macii_start(void) static irqreturn_t macii_interrupt(int irq, void *arg) { int x; - static int entered; struct adb_request *req; if (!arg) { @@ -399,8 +367,6 @@ static irqreturn_t macii_interrupt(int irq, void *arg) return IRQ_NONE; } - BUG_ON(entered++); - last_status = status; status = via[B] & (ST_MASK|CTLR_IRQ); @@ -409,7 +375,7 @@ static irqreturn_t macii_interrupt(int irq, void *arg) if (reading_reply) { reply_ptr = current_req->reply; } else { - BUG_ON(current_req != NULL); + WARN_ON(current_req); reply_ptr = reply_buf; } @@ -474,8 +440,8 @@ static irqreturn_t macii_interrupt(int irq, void *arg) case reading: x = via[SR]; - BUG_ON((status & ST_MASK) == ST_CMD || - (status & ST_MASK) == ST_IDLE); + WARN_ON((status & ST_MASK) == ST_CMD || + (status & ST_MASK) == ST_IDLE); /* Bus timeout with SRQ sequence: * data is "XX FF" while CTLR_IRQ is "L L" @@ -502,8 +468,8 @@ static irqreturn_t macii_interrupt(int irq, void *arg) } } - if (macii_state == reading) { - BUG_ON(reply_len > 15); + if (macii_state == reading && + reply_len < ARRAY_SIZE(reply_buf)) { reply_ptr++; *reply_ptr = x; reply_len++; @@ -546,6 +512,5 @@ static irqreturn_t macii_interrupt(int irq, void *arg) break; } - entered--; return IRQ_HANDLED; } From 5ce6185c2ef4e5bcb268f0fdbfca90e787e1bf6d Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Tue, 11 Sep 2018 20:18:44 -0400 Subject: [PATCH 089/221] macintosh/via-macii: Simplify locking Modifying the request queue or changing the current state requires mutual exclusion. Use local_irq_disable() consistently for this rather than disabling the ADB interrupt. This simplifies the locking scheme and brings via-macii into line with the other ADB drivers. Tested-by: Stan Johnson Signed-off-by: Finn Thain Signed-off-by: Michael Ellerman --- drivers/macintosh/via-macii.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/macintosh/via-macii.c b/drivers/macintosh/via-macii.c index 7e0e32fa7eb2..6ed9ac91aca1 100644 --- a/drivers/macintosh/via-macii.c +++ b/drivers/macintosh/via-macii.c @@ -216,22 +216,23 @@ static void macii_queue_poll(void) static int macii_send_request(struct adb_request *req, int sync) { int err; - unsigned long flags; - local_irq_save(flags); err = macii_write(req); - local_irq_restore(flags); + if (err) + return err; - if (!err && sync) + if (sync) while (!req->complete) macii_poll(); - return err; + return 0; } /* Send an ADB request (append to request queue) */ static int macii_write(struct adb_request *req) { + unsigned long flags; + if (req->nbytes < 2 || req->data[0] != ADB_PACKET || req->nbytes > 15) { req->complete = 1; return -EINVAL; @@ -242,6 +243,8 @@ static int macii_write(struct adb_request *req) req->complete = 0; req->reply_len = 0; + local_irq_save(flags); + if (current_req != NULL) { last_req->next = req; last_req = req; @@ -250,6 +253,9 @@ static int macii_write(struct adb_request *req) last_req = req; if (macii_state == idle) macii_start(); } + + local_irq_restore(flags); + return 0; } @@ -293,9 +299,7 @@ static inline int need_autopoll(void) { /* Prod the chip without interrupts */ static void macii_poll(void) { - disable_irq(IRQ_MAC_ADB); macii_interrupt(0, NULL); - enable_irq(IRQ_MAC_ADB); } /* Reset the bus */ @@ -358,13 +362,18 @@ static irqreturn_t macii_interrupt(int irq, void *arg) { int x; struct adb_request *req; + unsigned long flags; + + local_irq_save(flags); if (!arg) { /* Clear the SR IRQ flag when polling. 
*/ if (via[IFR] & SR_INT) via[IFR] = SR_INT; - else + else { + local_irq_restore(flags); return IRQ_NONE; + } } last_status = status; @@ -512,5 +521,6 @@ static irqreturn_t macii_interrupt(int irq, void *arg) break; } + local_irq_restore(flags); return IRQ_HANDLED; } From 351e5ad327d078386144af9a34346eaeb3e1ea1e Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Tue, 11 Sep 2018 20:18:44 -0400 Subject: [PATCH 090/221] macintosh/via-macii, macintosh/adb-iop: Modernize printk calls Add missing severity level to log messages. Signed-off-by: Finn Thain Signed-off-by: Michael Ellerman --- drivers/macintosh/adb-iop.c | 2 +- drivers/macintosh/via-macii.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/macintosh/adb-iop.c b/drivers/macintosh/adb-iop.c index ca623e6446e4..3a1e995ecc0e 100644 --- a/drivers/macintosh/adb-iop.c +++ b/drivers/macintosh/adb-iop.c @@ -208,7 +208,7 @@ int adb_iop_probe(void) int adb_iop_init(void) { - printk("adb: IOP ISM driver v0.4 for Unified ADB.\n"); + pr_info("adb: IOP ISM driver v0.4 for Unified ADB\n"); iop_listen(ADB_IOP, ADB_CHAN, adb_iop_listen, "ADB"); return 0; } diff --git a/drivers/macintosh/via-macii.c b/drivers/macintosh/via-macii.c index 6ed9ac91aca1..a38f57ba50cb 100644 --- a/drivers/macintosh/via-macii.c +++ b/drivers/macintosh/via-macii.c @@ -127,7 +127,7 @@ static int macii_probe(void) via = via1; - printk("adb: Mac II ADB Driver v1.0 for Unified ADB\n"); + pr_info("adb: Mac II ADB Driver v1.0 for Unified ADB\n"); return 0; } From 47fd2060660e62b169990a6fcd9eb61bc1a85c5c Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Tue, 11 Sep 2018 20:18:44 -0400 Subject: [PATCH 091/221] macintosh/via-macii, macintosh/adb-iop: Clean up whitespace Signed-off-by: Finn Thain Signed-off-by: Michael Ellerman --- drivers/macintosh/adb-iop.c | 48 +++--- drivers/macintosh/via-macii.c | 296 +++++++++++++++++----------------- 2 files changed, 179 insertions(+), 165 deletions(-) diff --git a/drivers/macintosh/adb-iop.c b/drivers/macintosh/adb-iop.c index 3a1e995ecc0e..fca31640e3ef 100644 --- a/drivers/macintosh/adb-iop.c +++ b/drivers/macintosh/adb-iop.c @@ -20,13 +20,13 @@ #include #include -#include -#include +#include +#include #include #include #include -#include +#include /*#define DEBUG_ADB_IOP*/ @@ -38,9 +38,9 @@ static unsigned char *reply_ptr; #endif static enum adb_iop_state { - idle, - sending, - awaiting_reply + idle, + sending, + awaiting_reply } adb_iop_state; static void adb_iop_start(void); @@ -66,7 +66,8 @@ static void adb_iop_end_req(struct adb_request *req, int state) { req->complete = 1; current_req = req->next; - if (req->done) (*req->done)(req); + if (req->done) + (*req->done)(req); adb_iop_state = state; } @@ -100,7 +101,7 @@ static void adb_iop_complete(struct iop_msg *msg) static void adb_iop_listen(struct iop_msg *msg) { - struct adb_iopmsg *amsg = (struct adb_iopmsg *) msg->message; + struct adb_iopmsg *amsg = (struct adb_iopmsg *)msg->message; struct adb_request *req; unsigned long flags; #ifdef DEBUG_ADB_IOP @@ -113,9 +114,9 @@ static void adb_iop_listen(struct iop_msg *msg) #ifdef DEBUG_ADB_IOP printk("adb_iop_listen %p: rcvd packet, %d bytes: %02X %02X", req, - (uint) amsg->count + 2, (uint) amsg->flags, (uint) amsg->cmd); + (uint)amsg->count + 2, (uint)amsg->flags, (uint)amsg->cmd); for (i = 0; i < amsg->count; i++) - printk(" %02X", (uint) amsg->data[i]); + printk(" %02X", (uint)amsg->data[i]); printk("\n"); #endif @@ -168,14 +169,15 @@ static void adb_iop_start(void) /* get the packet to send */ req = current_req; - 
if (!req) return; + if (!req) + return; local_irq_save(flags); #ifdef DEBUG_ADB_IOP printk("adb_iop_start %p: sending packet, %d bytes:", req, req->nbytes); - for (i = 0 ; i < req->nbytes ; i++) - printk(" %02X", (uint) req->data[i]); + for (i = 0; i < req->nbytes; i++) + printk(" %02X", (uint)req->data[i]); printk("\n"); #endif @@ -196,13 +198,14 @@ static void adb_iop_start(void) /* Now send it. The IOP manager will call adb_iop_complete */ /* when the packet has been sent. */ - iop_send_message(ADB_IOP, ADB_CHAN, req, - sizeof(amsg), (__u8 *) &amsg, adb_iop_complete); + iop_send_message(ADB_IOP, ADB_CHAN, req, sizeof(amsg), (__u8 *)&amsg, + adb_iop_complete); } int adb_iop_probe(void) { - if (!iop_ism_present) return -ENODEV; + if (!iop_ism_present) + return -ENODEV; return 0; } @@ -218,10 +221,12 @@ int adb_iop_send_request(struct adb_request *req, int sync) int err; err = adb_iop_write(req); - if (err) return err; + if (err) + return err; if (sync) { - while (!req->complete) adb_iop_poll(); + while (!req->complete) + adb_iop_poll(); } return 0; } @@ -251,7 +256,9 @@ static int adb_iop_write(struct adb_request *req) } local_irq_restore(flags); - if (adb_iop_state == idle) adb_iop_start(); + + if (adb_iop_state == idle) + adb_iop_start(); return 0; } @@ -263,7 +270,8 @@ int adb_iop_autopoll(int devs) void adb_iop_poll(void) { - if (adb_iop_state == idle) adb_iop_start(); + if (adb_iop_state == idle) + adb_iop_start(); iop_ism_irq_poll(ADB_IOP); } diff --git a/drivers/macintosh/via-macii.c b/drivers/macintosh/via-macii.c index a38f57ba50cb..ac824d7b2dcf 100644 --- a/drivers/macintosh/via-macii.c +++ b/drivers/macintosh/via-macii.c @@ -12,7 +12,7 @@ * * 1999-08-02 (jmt) - Initial rewrite for Unified ADB. * 2000-03-29 Tony Mantler - * - Big overhaul, should actually work now. + * - Big overhaul, should actually work now. * 2006-12-31 Finn Thain - Another overhaul. 
* * Suggested reading: @@ -23,7 +23,7 @@ * Apple's "ADB Analyzer" bus sniffer is invaluable: * ftp://ftp.apple.com/developer/Tool_Chest/Devices_-_Hardware/Apple_Desktop_Bus/ */ - + #include #include #include @@ -77,7 +77,7 @@ static volatile unsigned char *via; #define ST_ODD 0x20 /* ADB state: odd data byte */ #define ST_IDLE 0x30 /* ADB state: idle, nothing to send */ -static int macii_init_via(void); +static int macii_init_via(void); static void macii_start(void); static irqreturn_t macii_interrupt(int irq, void *arg); static void macii_queue_poll(void); @@ -123,7 +123,8 @@ static int autopoll_devs; /* bits set are device addresses to be polled */ /* Check for MacII style ADB */ static int macii_probe(void) { - if (macintosh_config->adb_type != MAC_ADB_II) return -ENODEV; + if (macintosh_config->adb_type != MAC_ADB_II) + return -ENODEV; via = via1; @@ -136,15 +137,17 @@ int macii_init(void) { unsigned long flags; int err; - + local_irq_save(flags); - + err = macii_init_via(); - if (err) goto out; + if (err) + goto out; err = request_irq(IRQ_MAC_ADB, macii_interrupt, 0, "ADB", macii_interrupt); - if (err) goto out; + if (err) + goto out; macii_state = idle; out: @@ -152,7 +155,7 @@ int macii_init(void) return err; } -/* initialize the hardware */ +/* initialize the hardware */ static int macii_init_via(void) { unsigned char x; @@ -162,7 +165,7 @@ static int macii_init_via(void) /* Set up state: idle */ via[B] |= ST_IDLE; - last_status = via[B] & (ST_MASK|CTLR_IRQ); + last_status = via[B] & (ST_MASK | CTLR_IRQ); /* Shift register on input */ via[ACR] = (via[ACR] & ~SR_CTRL) | SR_EXT; @@ -188,7 +191,8 @@ static void macii_queue_poll(void) int next_device; static struct adb_request req; - if (!autopoll_devs) return; + if (!autopoll_devs) + return; device_mask = (1 << (((command_byte & 0xF0) >> 4) + 1)) - 1; if (autopoll_devs & ~device_mask) @@ -196,8 +200,7 @@ static void macii_queue_poll(void) else next_device = ffs(autopoll_devs) - 1; - adb_request(&req, NULL, ADBREQ_NOSEND, 1, - ADB_READREG(next_device, 0)); + adb_request(&req, NULL, ADBREQ_NOSEND, 1, ADB_READREG(next_device, 0)); req.sent = 0; req.complete = 0; @@ -237,7 +240,7 @@ static int macii_write(struct adb_request *req) req->complete = 1; return -EINVAL; } - + req->next = NULL; req->sent = 0; req->complete = 0; @@ -251,7 +254,8 @@ static int macii_write(struct adb_request *req) } else { current_req = req; last_req = req; - if (macii_state == idle) macii_start(); + if (macii_state == idle) + macii_start(); } local_irq_restore(flags); @@ -269,7 +273,8 @@ static int macii_autopoll(int devs) /* bit 1 == device 1, and so on. */ autopoll_devs = devs & 0xFFFE; - if (!autopoll_devs) return 0; + if (!autopoll_devs) + return 0; local_irq_save(flags); @@ -286,7 +291,8 @@ static int macii_autopoll(int devs) return err; } -static inline int need_autopoll(void) { +static inline int need_autopoll(void) +{ /* Was the last command Talk Reg 0 * and is the target on the autopoll list? */ @@ -306,7 +312,7 @@ static void macii_poll(void) static int macii_reset_bus(void) { static struct adb_request req; - + /* Command = 0, Address = ignored */ adb_request(&req, NULL, ADBREQ_NOSEND, 1, ADB_BUSRESET); macii_send_request(&req, 1); @@ -349,7 +355,7 @@ static void macii_start(void) * to be activity on the ADB bus. The chip will poll to achieve this. * * The basic ADB state machine was left unchanged from the original MacII code - * by Alan Cox, which was based on the CUDA driver for PowerMac. 
+ * by Alan Cox, which was based on the CUDA driver for PowerMac. * The syntax of the ADB status lines is totally different on MacII, * though. MacII uses the states Command -> Even -> Odd -> Even ->...-> Idle * for sending and Idle -> Even -> Odd -> Even ->...-> Idle for receiving. @@ -377,147 +383,147 @@ static irqreturn_t macii_interrupt(int irq, void *arg) } last_status = status; - status = via[B] & (ST_MASK|CTLR_IRQ); + status = via[B] & (ST_MASK | CTLR_IRQ); switch (macii_state) { - case idle: - if (reading_reply) { - reply_ptr = current_req->reply; - } else { - WARN_ON(current_req); - reply_ptr = reply_buf; - } + case idle: + if (reading_reply) { + reply_ptr = current_req->reply; + } else { + WARN_ON(current_req); + reply_ptr = reply_buf; + } - x = via[SR]; + x = via[SR]; - if ((status & CTLR_IRQ) && (x == 0xFF)) { - /* Bus timeout without SRQ sequence: - * data is "FF" while CTLR_IRQ is "H" - */ - reply_len = 0; - srq_asserted = 0; - macii_state = read_done; - } else { - macii_state = reading; - *reply_ptr = x; - reply_len = 1; - } - - /* set ADB state = even for first data byte */ - via[B] = (via[B] & ~ST_MASK) | ST_EVEN; - break; - - case sending: - req = current_req; - if (data_index >= req->nbytes) { - req->sent = 1; - macii_state = idle; - - if (req->reply_expected) { - reading_reply = 1; - } else { - req->complete = 1; - current_req = req->next; - if (req->done) (*req->done)(req); - - if (current_req) - macii_start(); - else - if (need_autopoll()) - macii_autopoll(autopoll_devs); - } - - if (macii_state == idle) { - /* reset to shift in */ - via[ACR] &= ~SR_OUT; - x = via[SR]; - /* set ADB state idle - might get SRQ */ - via[B] = (via[B] & ~ST_MASK) | ST_IDLE; - } - } else { - via[SR] = req->data[data_index++]; - - if ( (via[B] & ST_MASK) == ST_CMD ) { - /* just sent the command byte, set to EVEN */ - via[B] = (via[B] & ~ST_MASK) | ST_EVEN; - } else { - /* invert state bits, toggle ODD/EVEN */ - via[B] ^= ST_MASK; - } - } - break; - - case reading: - x = via[SR]; - WARN_ON((status & ST_MASK) == ST_CMD || - (status & ST_MASK) == ST_IDLE); - - /* Bus timeout with SRQ sequence: - * data is "XX FF" while CTLR_IRQ is "L L" - * End of packet without SRQ sequence: - * data is "XX...YY 00" while CTLR_IRQ is "L...H L" - * End of packet SRQ sequence: - * data is "XX...YY 00" while CTLR_IRQ is "L...L L" - * (where XX is the first response byte and - * YY is the last byte of valid response data.) 
+ if ((status & CTLR_IRQ) && (x == 0xFF)) { + /* Bus timeout without SRQ sequence: + * data is "FF" while CTLR_IRQ is "H" */ - + reply_len = 0; srq_asserted = 0; - if (!(status & CTLR_IRQ)) { - if (x == 0xFF) { - if (!(last_status & CTLR_IRQ)) { - macii_state = read_done; - reply_len = 0; - srq_asserted = 1; - } - } else if (x == 0x00) { - macii_state = read_done; - if (!(last_status & CTLR_IRQ)) - srq_asserted = 1; - } - } + macii_state = read_done; + } else { + macii_state = reading; + *reply_ptr = x; + reply_len = 1; + } - if (macii_state == reading && - reply_len < ARRAY_SIZE(reply_buf)) { - reply_ptr++; - *reply_ptr = x; - reply_len++; - } - - /* invert state bits, toggle ODD/EVEN */ - via[B] ^= ST_MASK; - break; - - case read_done: - x = via[SR]; - - if (reading_reply) { - reading_reply = 0; - req = current_req; - req->reply_len = reply_len; - req->complete = 1; - current_req = req->next; - if (req->done) (*req->done)(req); - } else if (reply_len && autopoll_devs) - adb_input(reply_buf, reply_len, 0); + /* set ADB state = even for first data byte */ + via[B] = (via[B] & ~ST_MASK) | ST_EVEN; + break; + case sending: + req = current_req; + if (data_index >= req->nbytes) { + req->sent = 1; macii_state = idle; - /* SRQ seen before, initiate poll now */ - if (srq_asserted) - macii_queue_poll(); + if (req->reply_expected) { + reading_reply = 1; + } else { + req->complete = 1; + current_req = req->next; + if (req->done) + (*req->done)(req); - if (current_req) - macii_start(); - else - if (need_autopoll()) + if (current_req) + macii_start(); + else if (need_autopoll()) macii_autopoll(autopoll_devs); + } - if (macii_state == idle) + if (macii_state == idle) { + /* reset to shift in */ + via[ACR] &= ~SR_OUT; + x = via[SR]; + /* set ADB state idle - might get SRQ */ via[B] = (via[B] & ~ST_MASK) | ST_IDLE; - break; + } + } else { + via[SR] = req->data[data_index++]; - default: + if ((via[B] & ST_MASK) == ST_CMD) { + /* just sent the command byte, set to EVEN */ + via[B] = (via[B] & ~ST_MASK) | ST_EVEN; + } else { + /* invert state bits, toggle ODD/EVEN */ + via[B] ^= ST_MASK; + } + } + break; + + case reading: + x = via[SR]; + WARN_ON((status & ST_MASK) == ST_CMD || + (status & ST_MASK) == ST_IDLE); + + /* Bus timeout with SRQ sequence: + * data is "XX FF" while CTLR_IRQ is "L L" + * End of packet without SRQ sequence: + * data is "XX...YY 00" while CTLR_IRQ is "L...H L" + * End of packet SRQ sequence: + * data is "XX...YY 00" while CTLR_IRQ is "L...L L" + * (where XX is the first response byte and + * YY is the last byte of valid response data.) 
+ */ + + srq_asserted = 0; + if (!(status & CTLR_IRQ)) { + if (x == 0xFF) { + if (!(last_status & CTLR_IRQ)) { + macii_state = read_done; + reply_len = 0; + srq_asserted = 1; + } + } else if (x == 0x00) { + macii_state = read_done; + if (!(last_status & CTLR_IRQ)) + srq_asserted = 1; + } + } + + if (macii_state == reading && + reply_len < ARRAY_SIZE(reply_buf)) { + reply_ptr++; + *reply_ptr = x; + reply_len++; + } + + /* invert state bits, toggle ODD/EVEN */ + via[B] ^= ST_MASK; + break; + + case read_done: + x = via[SR]; + + if (reading_reply) { + reading_reply = 0; + req = current_req; + req->reply_len = reply_len; + req->complete = 1; + current_req = req->next; + if (req->done) + (*req->done)(req); + } else if (reply_len && autopoll_devs) + adb_input(reply_buf, reply_len, 0); + + macii_state = idle; + + /* SRQ seen before, initiate poll now */ + if (srq_asserted) + macii_queue_poll(); + + if (current_req) + macii_start(); + else if (need_autopoll()) + macii_autopoll(autopoll_devs); + + if (macii_state == idle) + via[B] = (via[B] & ~ST_MASK) | ST_IDLE; + break; + + default: break; } From 7241d26e8175e95290a6549a470c330dbfc63442 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 13 Oct 2018 09:45:12 +0000 Subject: [PATCH 092/221] powerpc/64: properly initialise the stackprotector canary on SMP. commit 06ec27aea9fc ("powerpc/64: add stack protector support") doesn't initialise the stack canary on SMP secondary CPU's paca, leading to the following false positive report from the stack protector. smp: Bringing up secondary CPUs ... Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: __schedule+0x978/0xa80 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.19.0-rc7-next-20181010-autotest-autotest #1 Call Trace: [c000001fed5b3bf0] [c000000000a0ef3c] dump_stack+0xb0/0xf4 (unreliable) [c000001fed5b3c30] [c0000000000f9d68] panic+0x140/0x308 [c000001fed5b3cc0] [c0000000000f9844] __stack_chk_fail+0x24/0x30 [c000001fed5b3d20] [c000000000a2c3a8] __schedule+0x978/0xa80 [c000001fed5b3e00] [c000000000a2c9b4] schedule_idle+0x34/0x60 [c000001fed5b3e30] [c00000000013d344] do_idle+0x224/0x3d0 [c000001fed5b3ec0] [c00000000013d6e0] cpu_startup_entry+0x30/0x50 [c000001fed5b3ef0] [c000000000047f34] start_secondary+0x4d4/0x520 [c000001fed5b3f90] [c00000000000b370] start_secondary_prolog+0x10/0x14 This patch properly initialises the stack_canary of the secondary idle tasks. 
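For readers unfamiliar with the mechanism, here is a minimal user-space sketch (invented names, not kernel code) of why an uninitialised per-CPU reference canary produces exactly this kind of false positive: the compiler copies a reference value into each stack frame and compares it on function exit, so if the reference for a secondary CPU is never set up, the comparison fails even though nothing corrupted the stack.

  /* Toy model of a stack-protector check; reference_canary[] stands in
   * for the per-CPU reference value (paca->canary on ppc64). */
  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>

  static uint64_t reference_canary[2];

  static void stack_chk(int cpu, uint64_t frame_canary)
  {
          if (frame_canary != reference_canary[cpu]) {
                  fprintf(stderr, "stack-protector: stack corrupted? (cpu %d)\n", cpu);
                  exit(1);
          }
          printf("cpu %d: canary ok\n", cpu);
  }

  int main(void)
  {
          uint64_t task_canary = 0x1234567890abcdefULL; /* value each frame copies in */

          reference_canary[0] = task_canary;   /* boot CPU: initialised */
          /* reference_canary[1] deliberately left at 0: the missing init */

          stack_chk(0, task_canary);           /* passes */
          stack_chk(1, task_canary);           /* false positive, as in the panic above */
          return 0;
  }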
Reported-by: Abdul Haleem Fixes: 06ec27aea9fc ("powerpc/64: add stack protector support") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/smp.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 61c1fadbc644..e774d3bf3a03 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -810,9 +811,16 @@ static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle) { struct thread_info *ti = task_thread_info(idle); +#ifdef CONFIG_STACKPROTECTOR + idle->stack_canary = get_random_canary(); +#endif + #ifdef CONFIG_PPC64 paca_ptrs[cpu]->__current = idle; paca_ptrs[cpu]->kstack = (unsigned long)ti + THREAD_SIZE - STACK_FRAME_OVERHEAD; +#ifdef CONFIG_STACKPROTECTOR + paca_ptrs[cpu]->canary = idle->stack_canary; +#endif #endif ti->cpu = cpu; secondary_ti = current_set[cpu] = ti; From 3b35bd48b8a06e02a25af84baba782876b8a6572 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 6 Oct 2018 16:51:12 +0000 Subject: [PATCH 093/221] powerpc/process: Fix sparse address space warnings This patch fixes the following warnings, which are leftovers from when __get_user() was replaced by probe_kernel_address(). arch/powerpc/kernel/process.c:1287:22: warning: incorrect type in argument 2 (different address spaces) arch/powerpc/kernel/process.c:1287:22: expected void const *src arch/powerpc/kernel/process.c:1287:22: got unsigned int [noderef] * arch/powerpc/kernel/process.c:1319:21: warning: incorrect type in argument 2 (different address spaces) arch/powerpc/kernel/process.c:1319:21: expected void const *src arch/powerpc/kernel/process.c:1319:21: got unsigned int [noderef] * Fixes: 7b051f665c32d ("powerpc: Use probe_kernel_address in show_instructions") Reviewed-by: Murilo Opsfelder Araujo Signed-off-by: Christophe Leroy [mpe: Split out of larger patch] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/process.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index d9d4eb2ea6c9..c7af42052041 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1281,7 +1281,7 @@ static void show_instructions(struct pt_regs *regs) #endif if (!__kernel_text_address(pc) || - probe_kernel_address((unsigned int __user *)pc, instr)) { + probe_kernel_address((const void *)pc, instr)) { pr_cont("XXXXXXXX "); } else { if (regs->nip == pc) @@ -1323,7 +1323,7 @@ void show_user_instructions(struct pt_regs *regs) pr_info("%s[%d]: code: ", current->comm, current->pid); } - if (probe_kernel_address((unsigned int __user *)pc, instr)) { + if (probe_kernel_address((const void *)pc, instr)) { pr_cont("XXXXXXXX "); } else { if (regs->nip == pc) From c9386bfd37d37f29588de9ea9add455510049c33 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 9 Oct 2018 16:46:25 +1100 Subject: [PATCH 094/221] powerpc/process: Add missing include of stacktrace.h As spotted by sparse: arch/powerpc/kernel/process.c:1302:6: warning: symbol 'show_user_instructions' was not declared. Should it be static? 
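As a small illustration of this warning class (hypothetical file and function names, not the kernel's): sparse flags any external-linkage definition whose prototype is not visible in the same translation unit, and including the header that declares it, as this patch does, silences the warning.

  /* decl.h - declares the function with external linkage (hypothetical) */
  void show_user_widgets(int count);

  /* impl.c - if the '#include "decl.h"' below were missing, sparse would
   * report: symbol 'show_user_widgets' was not declared. Should it be static?
   * With the include in place the definition matches a visible prototype. */
  #include <stdio.h>
  #include "decl.h"

  void show_user_widgets(int count)
  {
          printf("%d widgets\n", count);
  }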
Fixes: 88b0fe1757359 ("powerpc: Add show_user_instructions()") Reviewed-by: Murilo Opsfelder Araujo Signed-off-by: Christophe Leroy [mpe: Split out of larger patch] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/process.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index c7af42052041..3396c419abf2 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include From fb2d9505c0dbd4f5e00db70f7ca0ca7a3d75ca63 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 6 Oct 2018 16:51:14 +0000 Subject: [PATCH 095/221] powerpc/process: Fix interleaved output in show_user_instructions() When two processes crash at the same time, we sometimes encounter interleaving in the middle of a line: init[1]: segfault (11) at 0 nip 0 lr 0 code 1 init[1]: code: XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX init[74]: segfault (11) at 10a74 nip 1000c198 lr 100078c8 code 1 in sh[10000000+14000] XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX init[1]: code: XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX init[74]: code: 90010024 bf61000c 91490a7c 3fa01002 3be00000 7d3e4b78 3bbd0c20 3b600000 init[74]: code: 3b9d0040 7c7fe02e 2f830000 419e0028 <89230000> 2f890000 41be001c 4b7f6e79 This patch fixes it by preparing complete lines in a buffer and printing it at once. Fixes: 88b0fe1757359 ("powerpc: Add show_user_instructions()") Reviewed-by: Murilo Opsfelder Araujo Signed-off-by: Christophe Leroy [mpe: Use seq_buf_printf() not seq_buf_puts() which doesn't NULL terminate] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/process.c | 39 ++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 3396c419abf2..050f1136f587 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -1300,7 +1301,9 @@ static void show_instructions(struct pt_regs *regs) void show_user_instructions(struct pt_regs *regs) { unsigned long pc; - int i; + int n = instructions_to_print; + struct seq_buf s; + char buf[96]; /* enough for 8 times 9 + 2 chars */ pc = regs->nip - (instructions_to_print * 3 / 4 * sizeof(int)); @@ -1314,29 +1317,27 @@ void show_user_instructions(struct pt_regs *regs) return; } - pr_info("%s[%d]: code: ", current->comm, current->pid); + seq_buf_init(&s, buf, sizeof(buf)); - for (i = 0; i < instructions_to_print; i++) { - int instr; + while (n) { + int i; - if (!(i % 8) && (i > 0)) { - pr_cont("\n"); - pr_info("%s[%d]: code: ", current->comm, current->pid); + seq_buf_clear(&s); + + for (i = 0; i < 8 && n; i++, n--, pc += sizeof(int)) { + int instr; + + if (probe_kernel_address((const void *)pc, instr)) { + seq_buf_printf(&s, "XXXXXXXX "); + continue; + } + seq_buf_printf(&s, regs->nip == pc ? 
"<%08x> " : "%08x ", instr); } - if (probe_kernel_address((const void *)pc, instr)) { - pr_cont("XXXXXXXX "); - } else { - if (regs->nip == pc) - pr_cont("<%08x> ", instr); - else - pr_cont("%08x ", instr); - } - - pc += sizeof(int); + if (!seq_buf_has_overflowed(&s)) + pr_info("%s[%d]: code: %s\n", current->comm, + current->pid, s.buffer); } - - pr_cont("\n"); } struct regbit { From df13102f82f1c8d0a1f43505275bf18246d7f9a0 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 6 Oct 2018 16:51:16 +0000 Subject: [PATCH 096/221] powerpc/process: Constify the number of insns printed by show instructions functions. instructions_to_print var is assigned value 16 and there is no way to change it. This patch replaces it by a constant. Reviewed-by: Murilo Opsfelder Araujo Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/process.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 050f1136f587..0ed8d0968515 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1258,17 +1258,16 @@ struct task_struct *__switch_to(struct task_struct *prev, return last; } -static int instructions_to_print = 16; +#define NR_INSN_TO_PRINT 16 static void show_instructions(struct pt_regs *regs) { int i; - unsigned long pc = regs->nip - (instructions_to_print * 3 / 4 * - sizeof(int)); + unsigned long pc = regs->nip - (NR_INSN_TO_PRINT * 3 / 4 * sizeof(int)); printk("Instruction dump:"); - for (i = 0; i < instructions_to_print; i++) { + for (i = 0; i < NR_INSN_TO_PRINT; i++) { int instr; if (!(i % 8)) @@ -1301,17 +1300,17 @@ static void show_instructions(struct pt_regs *regs) void show_user_instructions(struct pt_regs *regs) { unsigned long pc; - int n = instructions_to_print; + int n = NR_INSN_TO_PRINT; struct seq_buf s; char buf[96]; /* enough for 8 times 9 + 2 chars */ - pc = regs->nip - (instructions_to_print * 3 / 4 * sizeof(int)); + pc = regs->nip - (NR_INSN_TO_PRINT * 3 / 4 * sizeof(int)); /* * Make sure the NIP points at userspace, not kernel text/data or * elsewhere. */ - if (!__access_ok(pc, instructions_to_print * sizeof(int), USER_DS)) { + if (!__access_ok(pc, NR_INSN_TO_PRINT * sizeof(int), USER_DS)) { pr_info("%s[%d]: Bad NIP, not dumping instructions.\n", current->comm, current->pid); return; From 6233b6da0c00a9768bdab1c502115b960929889a Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Tue, 9 Oct 2018 16:50:38 +1030 Subject: [PATCH 097/221] powerpc/perf: Quiet IMC PMU registration message On a Power9 box we get a few screens full of these on boot. Drop them to pr_debug. 
[ 5.993645] nest_centaur6_imc performance monitor hardware support registered
[ 5.993728] nest_centaur7_imc performance monitor hardware support registered
[ 5.996510] core_imc performance monitor hardware support registered
[ 5.996569] nest_mba0_imc performance monitor hardware support registered
[ 5.996631] nest_mba1_imc performance monitor hardware support registered
[ 5.996685] nest_mba2_imc performance monitor hardware support registered

Signed-off-by: Joel Stanley
Reviewed-by: Madhavan Srinivasan
Reviewed-by: Stewart Smith
Signed-off-by: Michael Ellerman
---
 arch/powerpc/perf/imc-pmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 1fafc32b12a0..6954636b16d1 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1392,7 +1392,7 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id
 	if (ret)
 		goto err_free_cpuhp_mem;
 
-	pr_info("%s performance monitor hardware support registered\n",
+	pr_debug("%s performance monitor hardware support registered\n",
 						pmu_ptr->pmu.name);
 
 	return 0;

From 014704e6f54189a203cc14c7c0bb411b940241bc Mon Sep 17 00:00:00 2001
From: Dan Carpenter
Date: Mon, 1 Oct 2018 19:44:58 +0300
Subject: [PATCH 098/221] powerpc: Fix signedness bug in update_flash_db()

The "count < sizeof(struct os_area_db)" comparison is type promoted to
size_t so negative values of "count" are treated as very high values
and we accidentally return success instead of a negative error code.

This doesn't really change runtime much but it fixes a static checker
warning.

Signed-off-by: Dan Carpenter
Acked-by: Geoff Levand
Signed-off-by: Michael Ellerman
---
 arch/powerpc/platforms/ps3/os-area.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/ps3/os-area.c b/arch/powerpc/platforms/ps3/os-area.c
index cdbfc5cfd6f3..f5387ad82279 100644
--- a/arch/powerpc/platforms/ps3/os-area.c
+++ b/arch/powerpc/platforms/ps3/os-area.c
@@ -664,7 +664,7 @@ static int update_flash_db(void)
 	db_set_64(db, &os_area_db_id_rtc_diff, saved_params.rtc_diff);
 
 	count = os_area_flash_write(db, sizeof(struct os_area_db), pos);
-	if (count < sizeof(struct os_area_db)) {
+	if (count < 0 || count < sizeof(struct os_area_db)) {
 		pr_debug("%s: os_area_flash_write failed %zd\n", __func__,
 			 count);
 		error = count < 0 ? count : -EIO;

From aea447141c7e7824b81b49acd1bc785506fba46e Mon Sep 17 00:00:00 2001
From: Joel Stanley
Date: Mon, 17 Sep 2018 17:16:21 +0930
Subject: [PATCH 099/221] powerpc: Disable -Wbuiltin-requires-header when setjmp is used

The powerpc kernel uses setjmp which causes a warning when building
with clang:

  In file included from arch/powerpc/xmon/xmon.c:51:
  ./arch/powerpc/include/asm/setjmp.h:15:13: error: declaration of
  built-in function 'setjmp' requires inclusion of the header <setjmp.h>
  [-Werror,-Wbuiltin-requires-header]
  extern long setjmp(long *);
              ^
  ./arch/powerpc/include/asm/setjmp.h:16:13: error: declaration of
  built-in function 'longjmp' requires inclusion of the header <setjmp.h>
  [-Werror,-Wbuiltin-requires-header]
  extern void longjmp(long *, long);
              ^

This *is* the header and we're not using the built-in setjmp but
rather the one in arch/powerpc/kernel/misc.S. As the compiler warning
does not make sense, disable it for the files where setjmp is used.
Signed-off-by: Joel Stanley Reviewed-by: Nick Desaulniers [mpe: Move subdir-ccflags in xmon/Makefile to not clobber -Werror] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/Makefile | 3 +++ arch/powerpc/xmon/Makefile | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 85ffa488dfb5..fb70e9b6fa67 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -5,6 +5,9 @@ CFLAGS_ptrace.o += -DUTS_MACHINE='"$(UTS_MACHINE)"' +# Disable clang warning for using setjmp without setjmp.h header +CFLAGS_crash.o += $(call cc-disable-warning, builtin-requires-header) + subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror ifdef CONFIG_PPC64 diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile index 93cc1f1b8b61..9d7d8e6d705c 100644 --- a/arch/powerpc/xmon/Makefile +++ b/arch/powerpc/xmon/Makefile @@ -1,7 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for xmon -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror +# Disable clang warning for using setjmp without setjmp.h header +subdir-ccflags-y := $(call cc-disable-warning, builtin-requires-header) + +subdir-ccflags-$(CONFIG_PPC_WERROR) += -Werror GCOV_PROFILE := n UBSAN_SANITIZE := n From b27e5f939b6d50e1b83e9febfee1c3f8de7789d7 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Wed, 10 Oct 2018 16:23:02 +1100 Subject: [PATCH 100/221] powerpc/rtasd: Improve unknown error logging Currently when we get an unknown RTAS event it prints the type as "Unknown" and no other useful information. Add the raw type code to the log message so that we have something to work off. Signed-off-by: Oliver O'Halloran Reviewed-by: Vasant Hegde Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/rtasd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index 23b88b923f06..c1378661b12f 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -150,8 +150,10 @@ static void printk_log_rtas(char *buf, int len) } else { struct rtas_error_log *errlog = (struct rtas_error_log *)buf; - printk(RTAS_DEBUG "event: %d, Type: %s, Severity: %d\n", - error_log_cnt, rtas_event_type(rtas_error_type(errlog)), + printk(RTAS_DEBUG "event: %d, Type: %s (%d), Severity: %d\n", + error_log_cnt, + rtas_event_type(rtas_error_type(errlog)), + rtas_error_type(errlog), rtas_error_severity(errlog)); } } From 719736e1cc12b2fc28eba2122893a449eee66d08 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 9 Oct 2018 17:39:46 +0200 Subject: [PATCH 101/221] powerpc: remove redundant 'default n' from Kconfig-s 'default n' is the default value for any bool or tristate Kconfig setting so there is no need to write it explicitly. Also since commit f467c5640c29 ("kconfig: only write '# CONFIG_FOO is not set' for visible symbols") the Kconfig behavior is the same regardless of 'default n' being present or not: ... One side effect of (and the main motivation for) this change is making the following two definitions behave exactly the same: config FOO bool config FOO bool default n With this change, neither of these will generate a '# CONFIG_FOO is not set' line (assuming FOO isn't selected/implied). That might make it clearer to people that a bare 'default n' is redundant. ... 
Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 14 -------------- arch/powerpc/Kconfig.debug | 6 ------ arch/powerpc/platforms/40x/Kconfig | 9 --------- arch/powerpc/platforms/44x/Kconfig | 22 ---------------------- arch/powerpc/platforms/82xx/Kconfig | 1 - arch/powerpc/platforms/Kconfig | 21 --------------------- arch/powerpc/platforms/Kconfig.cputype | 4 ---- arch/powerpc/platforms/cell/Kconfig | 3 --- arch/powerpc/platforms/maple/Kconfig | 1 - arch/powerpc/platforms/pasemi/Kconfig | 1 - arch/powerpc/platforms/powernv/Kconfig | 1 - arch/powerpc/platforms/ps3/Kconfig | 2 -- arch/powerpc/platforms/pseries/Kconfig | 2 -- arch/powerpc/sysdev/Kconfig | 5 ----- arch/powerpc/sysdev/xive/Kconfig | 3 --- 15 files changed, 95 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 602eea723624..1888636c9eb6 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -286,12 +286,10 @@ config ARCH_MAY_HAVE_PC_FDC config PPC_UDBG_16550 bool - default n config GENERIC_TBSYNC bool default y if PPC32 && SMP - default n config AUDIT_ARCH bool @@ -310,13 +308,11 @@ config EPAPR_BOOT bool help Used to allow a board to specify it wants an ePAPR compliant wrapper. - default n config DEFAULT_UIMAGE bool help Used to allow a board to specify it wants a uImage built by default - default n config ARCH_HIBERNATION_POSSIBLE bool @@ -330,11 +326,9 @@ config ARCH_SUSPEND_POSSIBLE config PPC_DCR_NATIVE bool - default n config PPC_DCR_MMIO bool - default n config PPC_DCR bool @@ -345,7 +339,6 @@ config PPC_OF_PLATFORM_PCI bool depends on PCI depends on PPC64 # not supported on 32 bits yet - default n config ARCH_SUPPORTS_DEBUG_PAGEALLOC depends on PPC32 || PPC_BOOK3S_64 @@ -448,14 +441,12 @@ config PPC_TRANSACTIONAL_MEM depends on SMP select ALTIVEC select VSX - default n ---help--- Support user-mode Transactional Memory on POWERPC. 
config LD_HEAD_STUB_CATCH bool "Reserve 256 bytes to cope with linker stubs in HEAD text" if EXPERT depends on PPC64 - default n help Very large kernels can cause linker branch stubs to be generated by code in head_64.S, which moves the head text sections out of their @@ -558,7 +549,6 @@ config RELOCATABLE config RELOCATABLE_TEST bool "Test relocatable kernel" depends on (PPC64 && RELOCATABLE) - default n help This runs the relocatable kernel at the address it was initially loaded at, which tends to be non-zero and therefore test the @@ -770,7 +760,6 @@ config PPC_SUBPAGE_PROT config PPC_COPRO_BASE bool - default n config SCHED_SMT bool "SMT (Hyperthreading) scheduler support" @@ -893,7 +882,6 @@ config PPC_INDIRECT_PCI bool depends on PCI default y if 40x || 44x - default n config EISA bool @@ -990,7 +978,6 @@ source "drivers/pcmcia/Kconfig" config HAS_RAPIDIO bool - default n config RAPIDIO tristate "RapidIO support" @@ -1013,7 +1000,6 @@ endmenu config NONSTATIC_KERNEL bool - default n menu "Advanced setup" depends on PPC32 diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index fd63cd914a74..f4961fbcb48d 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -2,7 +2,6 @@ config PPC_DISABLE_WERROR bool "Don't build arch/powerpc code with -Werror" - default n help This option tells the compiler NOT to build the code under arch/powerpc with the -Werror flag (which means warnings @@ -56,7 +55,6 @@ config PPC_EMULATED_STATS config CODE_PATCHING_SELFTEST bool "Run self-tests of the code-patching code" depends on DEBUG_KERNEL - default n config JUMP_LABEL_FEATURE_CHECKS bool "Enable use of jump label for cpu/mmu_has_feature()" @@ -70,7 +68,6 @@ config JUMP_LABEL_FEATURE_CHECKS config JUMP_LABEL_FEATURE_CHECK_DEBUG bool "Do extra check on feature fixup calls" depends on DEBUG_KERNEL && JUMP_LABEL_FEATURE_CHECKS - default n help This tries to catch incorrect usage of cpu_has_feature() and mmu_has_feature() in the code. 
@@ -80,16 +77,13 @@ config JUMP_LABEL_FEATURE_CHECK_DEBUG config FTR_FIXUP_SELFTEST bool "Run self-tests of the feature-fixup code" depends on DEBUG_KERNEL - default n config MSI_BITMAP_SELFTEST bool "Run self-tests of the MSI bitmap code" depends on DEBUG_KERNEL - default n config PPC_IRQ_SOFT_MASK_DEBUG bool "Include extra checks for powerpc irq soft masking" - default n config XMON bool "Include xmon kernel debugger" diff --git a/arch/powerpc/platforms/40x/Kconfig b/arch/powerpc/platforms/40x/Kconfig index 60254a321a91..2a9d66254ffc 100644 --- a/arch/powerpc/platforms/40x/Kconfig +++ b/arch/powerpc/platforms/40x/Kconfig @@ -2,7 +2,6 @@ config ACADIA bool "Acadia" depends on 40x - default n select PPC40x_SIMPLE select 405EZ help @@ -11,7 +10,6 @@ config ACADIA config EP405 bool "EP405/EP405PC" depends on 40x - default n select 405GP select PCI help @@ -20,7 +18,6 @@ config EP405 config HOTFOOT bool "Hotfoot" depends on 40x - default n select PPC40x_SIMPLE select PCI help @@ -29,7 +26,6 @@ config HOTFOOT config KILAUEA bool "Kilauea" depends on 40x - default n select 405EX select PPC40x_SIMPLE select PPC4xx_PCI_EXPRESS @@ -41,7 +37,6 @@ config KILAUEA config MAKALU bool "Makalu" depends on 40x - default n select 405EX select PCI select PPC4xx_PCI_EXPRESS @@ -62,7 +57,6 @@ config WALNUT config XILINX_VIRTEX_GENERIC_BOARD bool "Generic Xilinx Virtex board" depends on 40x - default n select XILINX_VIRTEX_II_PRO select XILINX_VIRTEX_4_FX select XILINX_INTC @@ -80,7 +74,6 @@ config XILINX_VIRTEX_GENERIC_BOARD config OBS600 bool "OpenBlockS 600" depends on 40x - default n select 405EX select PPC40x_SIMPLE help @@ -90,7 +83,6 @@ config OBS600 config PPC40x_SIMPLE bool "Simple PowerPC 40x board support" depends on 40x - default n help This option enables the simple PowerPC 40x platform support. @@ -156,7 +148,6 @@ config IBM405_ERR51 config APM8018X bool "APM8018X" depends on 40x - default n select PPC40x_SIMPLE help This option enables support for the AppliedMicro APM8018X evaluation diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig index a6011422b861..f024efd5a4c2 100644 --- a/arch/powerpc/platforms/44x/Kconfig +++ b/arch/powerpc/platforms/44x/Kconfig @@ -2,7 +2,6 @@ config PPC_47x bool "Support for 47x variant" depends on 44x - default n select MPIC help This option enables support for the 47x family of processors and is @@ -11,7 +10,6 @@ config PPC_47x config BAMBOO bool "Bamboo" depends on 44x - default n select PPC44x_SIMPLE select 440EP select PCI @@ -21,7 +19,6 @@ config BAMBOO config BLUESTONE bool "Bluestone" depends on 44x - default n select PPC44x_SIMPLE select APM821xx select PCI_MSI @@ -44,7 +41,6 @@ config EBONY config SAM440EP bool "Sam440ep" depends on 44x - default n select 440EP select PCI help @@ -53,7 +49,6 @@ config SAM440EP config SEQUOIA bool "Sequoia" depends on 44x - default n select PPC44x_SIMPLE select 440EPX help @@ -62,7 +57,6 @@ config SEQUOIA config TAISHAN bool "Taishan" depends on 44x - default n select PPC44x_SIMPLE select 440GX select PCI @@ -73,7 +67,6 @@ config TAISHAN config KATMAI bool "Katmai" depends on 44x - default n select PPC44x_SIMPLE select 440SPe select PCI @@ -86,7 +79,6 @@ config KATMAI config RAINIER bool "Rainier" depends on 44x - default n select PPC44x_SIMPLE select 440GRX select PCI @@ -96,7 +88,6 @@ config RAINIER config WARP bool "PIKA Warp" depends on 44x - default n select 440EP help This option enables support for the PIKA Warp(tm) Appliance. 
The Warp @@ -109,7 +100,6 @@ config WARP config ARCHES bool "Arches" depends on 44x - default n select PPC44x_SIMPLE select 460EX # Odd since it uses 460GT but the effects are the same select PCI @@ -120,7 +110,6 @@ config ARCHES config CANYONLANDS bool "Canyonlands" depends on 44x - default n select 460EX select PCI select PPC4xx_PCI_EXPRESS @@ -134,7 +123,6 @@ config CANYONLANDS config GLACIER bool "Glacier" depends on 44x - default n select PPC44x_SIMPLE select 460EX # Odd since it uses 460GT but the effects are the same select PCI @@ -147,7 +135,6 @@ config GLACIER config REDWOOD bool "Redwood" depends on 44x - default n select PPC44x_SIMPLE select 460SX select PCI @@ -160,7 +147,6 @@ config REDWOOD config EIGER bool "Eiger" depends on 44x - default n select PPC44x_SIMPLE select 460SX select PCI @@ -172,7 +158,6 @@ config EIGER config YOSEMITE bool "Yosemite" depends on 44x - default n select PPC44x_SIMPLE select 440EP select PCI @@ -182,7 +167,6 @@ config YOSEMITE config ISS4xx bool "ISS 4xx Simulator" depends on (44x || 40x) - default n select 405GP if 40x select 440GP if 44x && !PPC_47x select PPC_FPU @@ -193,7 +177,6 @@ config ISS4xx config CURRITUCK bool "IBM Currituck (476fpe) Support" depends on PPC_47x - default n select SWIOTLB select 476FPE select PPC4xx_PCI_EXPRESS @@ -203,7 +186,6 @@ config CURRITUCK config FSP2 bool "IBM FSP2 (476fpe) Support" depends on PPC_47x - default n select 476FPE select IBM_EMAC_EMAC4 if IBM_EMAC select IBM_EMAC_RGMII if IBM_EMAC @@ -215,7 +197,6 @@ config FSP2 config AKEBONO bool "IBM Akebono (476gtr) Support" depends on PPC_47x - default n select SWIOTLB select 476FPE select PPC4xx_PCI_EXPRESS @@ -241,7 +222,6 @@ config AKEBONO config ICON bool "Icon" depends on 44x - default n select PPC44x_SIMPLE select 440SPe select PCI @@ -252,7 +232,6 @@ config ICON config XILINX_VIRTEX440_GENERIC_BOARD bool "Generic Xilinx Virtex 5 FXT board support" depends on 44x - default n select XILINX_VIRTEX_5_FXT select XILINX_INTC help @@ -280,7 +259,6 @@ config XILINX_ML510 config PPC44x_SIMPLE bool "Simple PowerPC 44x board support" depends on 44x - default n help This option enables the simple PowerPC 44x platform support. diff --git a/arch/powerpc/platforms/82xx/Kconfig b/arch/powerpc/platforms/82xx/Kconfig index 6e04099361b9..1947a88bc69f 100644 --- a/arch/powerpc/platforms/82xx/Kconfig +++ b/arch/powerpc/platforms/82xx/Kconfig @@ -51,7 +51,6 @@ endif config PQ2ADS bool - default n config 8260 bool diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig index 14ef17e10ec9..260a56b7602d 100644 --- a/arch/powerpc/platforms/Kconfig +++ b/arch/powerpc/platforms/Kconfig @@ -23,7 +23,6 @@ source "arch/powerpc/platforms/amigaone/Kconfig" config KVM_GUEST bool "KVM Guest support" - default n select EPAPR_PARAVIRT ---help--- This option enables various optimizations for running under the KVM @@ -34,7 +33,6 @@ config KVM_GUEST config EPAPR_PARAVIRT bool "ePAPR para-virtualization support" - default n help Enables ePAPR para-virtualization support for guests. @@ -74,7 +72,6 @@ config PPC_DT_CPU_FTRS config UDBG_RTAS_CONSOLE bool "RTAS based debug console" depends on PPC_RTAS - default n config PPC_SMP_MUXED_IPI bool @@ -86,16 +83,13 @@ config PPC_SMP_MUXED_IPI config IPIC bool - default n config MPIC bool - default n config MPIC_TIMER bool "MPIC Global Timer" depends on MPIC && FSL_SOC - default n help The MPIC global timer is a hardware timer inside the Freescale PIC complying with OpenPIC standard. 
When the @@ -107,7 +101,6 @@ config MPIC_TIMER config FSL_MPIC_TIMER_WAKEUP tristate "Freescale MPIC global timer wakeup driver" depends on FSL_SOC && MPIC_TIMER && PM - default n help The driver provides a way to wake up the system by MPIC timer. @@ -115,43 +108,35 @@ config FSL_MPIC_TIMER_WAKEUP config PPC_EPAPR_HV_PIC bool - default n select EPAPR_PARAVIRT config MPIC_WEIRD bool - default n config MPIC_MSGR bool "MPIC message register support" depends on MPIC - default n help Enables support for the MPIC message registers. These registers are used for inter-processor communication. config PPC_I8259 bool - default n config U3_DART bool depends on PPC64 - default n config PPC_RTAS bool - default n config RTAS_ERROR_LOGGING bool depends on PPC_RTAS - default n config PPC_RTAS_DAEMON bool depends on PPC_RTAS - default n config RTAS_PROC bool "Proc interface to RTAS" @@ -164,11 +149,9 @@ config RTAS_FLASH config MMIO_NVRAM bool - default n config MPIC_U3_HT_IRQS bool - default n config MPIC_BROKEN_REGREAD bool @@ -187,15 +170,12 @@ config EEH config PPC_MPC106 bool - default n config PPC_970_NAP bool - default n config PPC_P7_NAP bool - default n config PPC_INDIRECT_PIO bool @@ -295,7 +275,6 @@ config CPM2 config FSL_ULI1575 bool - default n select GENERIC_ISA_DMA help Supports for the ULI1575 PCIe south bridge that exists on some diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 495db17dcbca..f4e2c5729374 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 config PPC64 bool "64-bit kernel" - default n select ZLIB_DEFLATE help This option selects whether a 32-bit or a 64-bit kernel @@ -369,7 +368,6 @@ config PPC_MM_SLICES bool default y if PPC_BOOK3S_64 default y if PPC_8xx && HUGETLB_PAGE - default n config PPC_HAVE_PMU_SUPPORT bool @@ -383,7 +381,6 @@ config PPC_PERF_CTRS config FORCE_SMP # Allow platforms to force SMP=y by selecting this bool - default n select SMP config SMP @@ -424,7 +421,6 @@ config CHECK_CACHE_COHERENCY config PPC_DOORBELL bool - default n endmenu diff --git a/arch/powerpc/platforms/cell/Kconfig b/arch/powerpc/platforms/cell/Kconfig index 9f5958f16923..4b2f114f3116 100644 --- a/arch/powerpc/platforms/cell/Kconfig +++ b/arch/powerpc/platforms/cell/Kconfig @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 config PPC_CELL bool - default n config PPC_CELL_COMMON bool @@ -22,7 +21,6 @@ config PPC_CELL_NATIVE select IBM_EMAC_RGMII if IBM_EMAC select IBM_EMAC_ZMII if IBM_EMAC #test only select IBM_EMAC_TAH if IBM_EMAC #test only - default n config PPC_IBM_CELL_BLADE bool "IBM Cell Blade" @@ -54,7 +52,6 @@ config SPU_FS config SPU_BASE bool - default n select PPC_COPRO_BASE config CBE_RAS diff --git a/arch/powerpc/platforms/maple/Kconfig b/arch/powerpc/platforms/maple/Kconfig index 376d0be36b66..2601fac50354 100644 --- a/arch/powerpc/platforms/maple/Kconfig +++ b/arch/powerpc/platforms/maple/Kconfig @@ -13,7 +13,6 @@ config PPC_MAPLE select PPC_RTAS select MMIO_NVRAM select ATA_NONSTANDARD if ATA - default n help This option enables support for the Maple 970FX Evaluation Board. 
For more information, refer to diff --git a/arch/powerpc/platforms/pasemi/Kconfig b/arch/powerpc/platforms/pasemi/Kconfig index d458a791d35b..98e3bc22bebc 100644 --- a/arch/powerpc/platforms/pasemi/Kconfig +++ b/arch/powerpc/platforms/pasemi/Kconfig @@ -2,7 +2,6 @@ config PPC_PASEMI depends on PPC64 && PPC_BOOK3S && CPU_BIG_ENDIAN bool "PA Semi SoC-based platforms" - default n select MPIC select PCI select PPC_UDBG_16550 diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index 028ac941c05c..99083fe992d5 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -30,7 +30,6 @@ config OPAL_PRD config PPC_MEMTRACE bool "Enable removal of RAM from kernel mappings for tracing" depends on PPC_POWERNV && MEMORY_HOTREMOVE - default n help Enabling this option allows for the removal of memory (RAM) from the kernel mappings to be used for hardware tracing. diff --git a/arch/powerpc/platforms/ps3/Kconfig b/arch/powerpc/platforms/ps3/Kconfig index 6f7525555b19..24864b8aaf5d 100644 --- a/arch/powerpc/platforms/ps3/Kconfig +++ b/arch/powerpc/platforms/ps3/Kconfig @@ -49,7 +49,6 @@ config PS3_HTAB_SIZE config PS3_DYNAMIC_DMA depends on PPC_PS3 bool "PS3 Platform dynamic DMA page table management" - default n help This option will enable kernel support to take advantage of the per device dynamic DMA page table management provided by the Cell @@ -89,7 +88,6 @@ config PS3_SYS_MANAGER config PS3_REPOSITORY_WRITE bool "PS3 Repository write support" if PS3_ADVANCED depends on PPC_PS3 - default n help Enables support for writing to the PS3 System Repository. diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index 0c698fd6d491..39032d9b316c 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -28,7 +28,6 @@ config PPC_PSERIES config PPC_SPLPAR depends on PPC_PSERIES bool "Support for shared-processor logical partitions" - default n help Enabling this option will make the kernel run more efficiently on logically-partitioned pSeries systems which use shared @@ -99,7 +98,6 @@ config PPC_SMLPAR bool "Support for shared-memory logical partitions" depends on PPC_PSERIES select LPARCFG - default n help Select this option to enable shared memory partition support. 
With this option a system running in an LPAR can be given more diff --git a/arch/powerpc/sysdev/Kconfig b/arch/powerpc/sysdev/Kconfig index bcef2ac56479..e0dbec780fe9 100644 --- a/arch/powerpc/sysdev/Kconfig +++ b/arch/powerpc/sysdev/Kconfig @@ -6,19 +6,16 @@ config PPC4xx_PCI_EXPRESS bool depends on PCI && 4xx - default n config PPC4xx_HSTA_MSI bool depends on PCI_MSI depends on PCI && 4xx - default n config PPC4xx_MSI bool depends on PCI_MSI depends on PCI && 4xx - default n config PPC_MSI_BITMAP bool @@ -37,11 +34,9 @@ config PPC_SCOM config SCOM_DEBUGFS bool "Expose SCOM controllers via debugfs" depends on PPC_SCOM && DEBUG_FS - default n config GE_FPGA bool - default n config FSL_CORENET_RCPM bool diff --git a/arch/powerpc/sysdev/xive/Kconfig b/arch/powerpc/sysdev/xive/Kconfig index 70ee976e1de0..785c292d104b 100644 --- a/arch/powerpc/sysdev/xive/Kconfig +++ b/arch/powerpc/sysdev/xive/Kconfig @@ -1,17 +1,14 @@ # SPDX-License-Identifier: GPL-2.0 config PPC_XIVE bool - default n select PPC_SMP_MUXED_IPI select HARDIRQS_SW_RESEND config PPC_XIVE_NATIVE bool - default n select PPC_XIVE depends on PPC_POWERNV config PPC_XIVE_SPAPR bool - default n select PPC_XIVE From 5e9dcb6188a40e604e66dc30fab30c2be89aa1cc Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Wed, 10 Oct 2018 09:58:02 +1030 Subject: [PATCH 102/221] powerpc/boot: Expose Kconfig symbols to wrapper Currently the wrapper is built without including anything in $(src)/include/, which means there are no CONFIG_ symbols defined. This means the platform specific serial drivers were never enabled. We now copy the definitions into the boot directory, so any C file can now include autoconf.h to depend on configuration options. Fixes: 866bfc75f40e ("powerpc: conditionally compile platform-specific serial drivers") Signed-off-by: Joel Stanley [mpe: Fix to use $(objtree) to find autoconf.h] Signed-off-by: Michael Ellerman --- arch/powerpc/boot/.gitignore | 1 + arch/powerpc/boot/Makefile | 7 ++++++- arch/powerpc/boot/serial.c | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/boot/.gitignore b/arch/powerpc/boot/.gitignore index f92d0530ceb1..32034a0cc554 100644 --- a/arch/powerpc/boot/.gitignore +++ b/arch/powerpc/boot/.gitignore @@ -44,4 +44,5 @@ fdt_sw.c fdt_wip.c libfdt.h libfdt_internal.h +autoconf.h diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 0fb96c26136f..5d5ab6ee48e0 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -197,9 +197,14 @@ $(obj)/empty.c: $(obj)/zImage.coff.lds $(obj)/zImage.ps3.lds : $(obj)/%: $(srctree)/$(src)/%.S $(Q)cp $< $@ +$(obj)/serial.c: $(obj)/autoconf.h + +$(obj)/autoconf.h: $(obj)/%: $(objtree)/include/generated/% + $(Q)cp $< $@ + clean-files := $(zlib-) $(zlibheader-) $(zliblinuxheader-) \ $(zlib-decomp-) $(libfdt) $(libfdtheader) \ - empty.c zImage.coff.lds zImage.ps3.lds zImage.lds + autoconf.h empty.c zImage.coff.lds zImage.ps3.lds zImage.lds quiet_cmd_bootcc = BOOTCC $@ cmd_bootcc = $(BOOTCC) -Wp,-MD,$(depfile) $(BOOTCFLAGS) -c -o $@ $< diff --git a/arch/powerpc/boot/serial.c b/arch/powerpc/boot/serial.c index 48e3743faedf..f045f8494bf9 100644 --- a/arch/powerpc/boot/serial.c +++ b/arch/powerpc/boot/serial.c @@ -18,6 +18,7 @@ #include "stdio.h" #include "io.h" #include "ops.h" +#include "autoconf.h" static int serial_open(void) { From 1a855eaccf353f7ed1d51a3d4b3af727ccbd81ca Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Wed, 10 Oct 2018 09:58:03 +1030 Subject: [PATCH 103/221] powerpc/boot: Fix opal console in boot 
 wrapper

As of commit 10c77dba40ff ("powerpc/boot: Fix build failure in 32-bit
boot wrapper") the opal code is hidden behind CONFIG_PPC64_BOOT_WRAPPER,
but the boot wrapper avoids include/linux, so it does not get the
normal Kconfig flags.

We can drop the guard entirely as in commit f8e8e69cea49 ("powerpc/boot:
Only build OPAL code when necessary") the makefile only includes opal.c
in the build if CONFIG_PPC64_BOOT_WRAPPER is set.

Fixes: 10c77dba40ff ("powerpc/boot: Fix build failure in 32-bit boot wrapper")
Signed-off-by: Joel Stanley
Signed-off-by: Michael Ellerman
---
 arch/powerpc/boot/opal.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/arch/powerpc/boot/opal.c b/arch/powerpc/boot/opal.c
index 0272570d02de..dfb199ef5b94 100644
--- a/arch/powerpc/boot/opal.c
+++ b/arch/powerpc/boot/opal.c
@@ -13,8 +13,6 @@
 #include
 #include "../include/asm/opal-api.h"
 
-#ifdef CONFIG_PPC64_BOOT_WRAPPER
-
 /* Global OPAL struct used by opal-call.S */
 struct opal {
 	u64 base;
@@ -101,9 +99,3 @@ int opal_console_init(void *devp, struct serial_console_data *scdp)
 
 	return 0;
 }
-#else
-int opal_console_init(void *devp, struct serial_console_data *scdp)
-{
-	return -1;
-}
-#endif /* __powerpc64__ */

From e8e132e6885962582784b6fa16a80d07ea739c0f Mon Sep 17 00:00:00 2001
From: Joel Stanley
Date: Wed, 10 Oct 2018 13:15:22 +1030
Subject: [PATCH 104/221] powerpc/boot: Disable vector instructions

This will avoid auto-vectorisation when building with higher
optimisation levels. We don't know if the machine can support VSX and
even if it's present it's probably not going to be enabled at this
point in boot.

These flags were both added prior to GCC 4.6, which is the minimum
compiler version supported by upstream, thanks to Segher for the
details.

Signed-off-by: Joel Stanley
Signed-off-by: Michael Ellerman
---
 arch/powerpc/boot/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 5d5ab6ee48e0..a90197f90149 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -32,8 +32,8 @@ else
 endif
 
 BOOTCFLAGS := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
-		 -fno-strict-aliasing -Os -msoft-float -pipe \
-		 -fomit-frame-pointer -fno-builtin -fPIC -nostdinc \
+		 -fno-strict-aliasing -Os -msoft-float -mno-altivec -mno-vsx \
+		 -pipe -fomit-frame-pointer -fno-builtin -fPIC -nostdinc \
 		 -D$(compress-y)
 
 ifdef CONFIG_PPC64_BOOT_WRAPPER

From 747b21760822957750d41a430b4f3974852895f5 Mon Sep 17 00:00:00 2001
From: Joel Stanley
Date: Wed, 10 Oct 2018 13:15:23 +1030
Subject: [PATCH 105/221] powerpc/boot: Build boot wrapper with optimisations

The boot wrapper is currently built with -Os. By building with O2 we
can meaningfully reduce the time decompressing the kernel.

I tested by comparing 10 runs of each option in Qemu and on hardware.
The kernel is compressed with KERNEL_XZ built with GCC 8.2.0-7ubuntu1.
The values are counts of the timebase.

 Qemu TCG powernv Power8:
                        Os           O2           O3
   median      10221123889   6201518438   6568186825
   stddev       1361267211    429090641    657930076
   improvement           -       39.33%       35.74%

 Palmetto Power8:
                        Os           O2           O3
   median            50279        50599        35790
   stddev        992144533    627130655    623721078
   improvement           -       36.79%       37.13%

 Romulus Power9:
                        Os           O2           O3
   median        670312391    454733720    448881398
   stddev           157569       107276       108760
   improvement           -       32.16%       33.03%

TCG was quite noisy, with every few runs producing an outlier. Even
so, O2 is faster than O3. On hardware the numbers were less noisy and
O3 is slightly faster than O2.

The wrapper size increases when moving from Os.
Comparing zImage.epapr to the existing Os build using bloat-o-meter: Before=43401, After=56837 (13KB), chg +30.96% Before=43401, After=64305 (20KB), chg +48.16% I chose O2 for a balance between Qemu and hardware speed up. Signed-off-by: Joel Stanley Signed-off-by: Michael Ellerman --- arch/powerpc/boot/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index a90197f90149..b69a97fa2e09 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -32,7 +32,7 @@ else endif BOOTCFLAGS := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ - -fno-strict-aliasing -Os -msoft-float -mno-altivec -mno-vsx \ + -fno-strict-aliasing -O2 -msoft-float -mno-altivec -mno-vsx \ -pipe -fomit-frame-pointer -fno-builtin -fPIC -nostdinc \ -D$(compress-y) From f9bc28aedfb5bbd572d2d365f3095c1becd7209b Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Wed, 12 Sep 2018 11:23:20 +1000 Subject: [PATCH 106/221] powerpc/eeh: Fix possible null deref in eeh_dump_dev_log() If an error occurs during an unplug operation, it's possible for eeh_dump_dev_log() to be called when edev->pdn is null, which currently leads to dereferencing a null pointer. Handle this by skipping the error log for those devices. Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/eeh.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 6ebba3e48b01..c72767a5327a 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -169,6 +169,11 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) int n = 0, l = 0; char buffer[128]; + if (!pdn) { + pr_warn("EEH: Note: No error log for absent device.\n"); + return 0; + } + n += scnprintf(buf+n, len-n, "%04x:%02x:%02x.%01x\n", pdn->phb->global_number, pdn->busno, PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); From bcbe3730531239abd45ab6c6af4a18078b37dd47 Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Wed, 12 Sep 2018 11:23:21 +1000 Subject: [PATCH 107/221] powerpc/eeh: Fix null deref for devices removed during EEH If a device is removed during EEH processing (either by a driver's handler or as part of recovery), it can lead to a null dereference in eeh_pe_report_edev(). To handle this, skip devices that have been removed. Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/eeh_driver.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 67619b4b3f96..4115d353c349 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -281,6 +281,10 @@ static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn, struct pci_driver *driver; enum pci_ers_result new_result; + if (!edev->pdev) { + eeh_edev_info(edev, "no device"); + return; + } device_lock(&edev->pdev->dev); if (eeh_edev_actionable(edev)) { driver = eeh_pcid_get(edev->pdev); From 473af09b56dc4be68e4af33220ceca6be67aa60d Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Wed, 12 Sep 2018 11:23:22 +1000 Subject: [PATCH 108/221] powerpc/eeh: Fix use of EEH_PE_KEEP on wrong field eeh_add_to_parent_pe() sometimes removes the EEH_PE_KEEP flag, but it incorrectly removes it from pe->type, instead of pe->state. However, rather than clearing it from the correct field, remove it. 
Inspection of the code shows that it can't ever have had any effect
(even if it had been cleared from the correct field), because the
field is never tested after it is cleared by the statement in
question. The clear statement was added by commit 807a827d4e74
("powerpc/eeh: Keep PE during hotplug"), but it didn't explain why it
was necessary.

Signed-off-by: Sam Bobroff
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/eeh_pe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 1b238ecc553e..210d239a9395 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -379,7 +379,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 	while (parent) {
 		if (!(parent->type & EEH_PE_INVALID))
 			break;
-		parent->type &= ~(EEH_PE_INVALID | EEH_PE_KEEP);
+		parent->type &= ~EEH_PE_INVALID;
 		parent = parent->parent;
 	}

From bffc0176e7d0ae0f560aaa9b702dd9264433d780 Mon Sep 17 00:00:00 2001
From: Sam Bobroff
Date: Wed, 12 Sep 2018 11:23:23 +1000
Subject: [PATCH 109/221] powerpc/eeh: Cleanup EEH_POSTPONED_PROBE

Currently a flag, EEH_POSTPONED_PROBE, is used to prevent an incorrect
message "EEH: No capable adapters found" from being displayed during
the boot of powernv systems.

It is necessary because, on powernv, the call to eeh_probe_devices()
made from eeh_init() is too early and EEH can't yet be enabled. A
second call is made later from eeh_pnv_post_init(), which succeeds.

(On pseries, the first call succeeds because PCI devices are set up
early enough and no second call is made.)

This can be simplified by moving the early call to eeh_probe_devices()
from eeh_init() (where it's seen by both platforms) to
pSeries_final_fixup(), so that each platform only calls
eeh_probe_devices() once, at a point where it can succeed. This is
slightly later in the boot sequence, but still early enough and it is
now in the same place in the sequence for both platforms (the
pcibios_fixup hook).

The display of the message can be cleaned up as well, by moving it
into eeh_probe_devices().
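A rough sketch of the resulting flow (function bodies reduced to the call shape, "_sketch" names are illustrative, not the real implementations): both platforms now reach the probe exactly once, from their PCI fixup stage, and the probe itself reports whether EEH ended up enabled.

  /* Illustrative only - the real code lives in eeh.c, eeh-powernv.c
   * and pseries/pci.c. */
  static void eeh_probe_devices_sketch(void)
  {
          /* ... walk every PHB and probe its devices ... */

          if (eeh_enabled())
                  pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n");
          else
                  pr_info("EEH: No capable adapters found\n");
  }

  static void pSeries_final_fixup_sketch(void)    /* pseries pcibios fixup */
  {
          eeh_probe_devices_sketch();     /* late enough to succeed, called once */
  }

  static int pnv_eeh_post_init_sketch(void)       /* powernv post-init hook */
  {
          eeh_probe_devices_sketch();     /* the only call on powernv as well */
          return 0;
  }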
Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/eeh.h | 1 - arch/powerpc/kernel/eeh.c | 18 ++++++------------ arch/powerpc/platforms/powernv/eeh-powernv.c | 14 -------------- arch/powerpc/platforms/pseries/pci.c | 1 + 4 files changed, 7 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 219637ea69a1..147f0117e56f 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -43,7 +43,6 @@ struct pci_dn; #define EEH_VALID_PE_ZERO 0x10 /* PE#0 is valid */ #define EEH_ENABLE_IO_FOR_LOG 0x20 /* Enable IO for log */ #define EEH_EARLY_DUMP_LOG 0x40 /* Dump log immediately */ -#define EEH_POSTPONED_PROBE 0x80 /* Powernv may postpone device probe */ /* * Delay for PE reset, all in ms diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index c72767a5327a..8801ada4082b 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1041,6 +1041,11 @@ void eeh_probe_devices(void) pdn = hose->pci_data; traverse_pci_dn(pdn, eeh_ops->probe, NULL); } + if (eeh_enabled()) + pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n"); + else + pr_info("EEH: No capable adapters found\n"); + } /** @@ -1084,18 +1089,7 @@ static int eeh_init(void) eeh_dev_phb_init_dynamic(hose); /* Initialize EEH event */ - ret = eeh_event_init(); - if (ret) - return ret; - - eeh_probe_devices(); - - if (eeh_enabled()) - pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n"); - else if (!eeh_has_flag(EEH_POSTPONED_PROBE)) - pr_info("EEH: No capable adapters found\n"); - - return ret; + return eeh_event_init(); } core_initcall_sync(eeh_init); diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 3c1beae29f2d..d0764f2c0733 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -223,14 +223,6 @@ int pnv_eeh_post_init(void) eeh_probe_devices(); eeh_addr_cache_build(); - if (eeh_has_flag(EEH_POSTPONED_PROBE)) { - eeh_clear_flag(EEH_POSTPONED_PROBE); - if (eeh_enabled()) - pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n"); - else - pr_info("EEH: No capable adapters found\n"); - } - /* Register OPAL event notifier */ eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR)); if (eeh_event_irq < 0) { @@ -391,12 +383,6 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA) return NULL; - /* Skip if we haven't probed yet */ - if (phb->ioda.pe_rmap[config_addr] == IODA_INVALID_PE) { - eeh_add_flag(EEH_POSTPONED_PROBE); - return NULL; - } - /* Initialize eeh device */ edev->class_code = pdn->class_code; edev->mode &= 0xFFFFFF00; diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c index eab96637d6cf..41d8a4d1d02e 100644 --- a/arch/powerpc/platforms/pseries/pci.c +++ b/arch/powerpc/platforms/pseries/pci.c @@ -239,6 +239,7 @@ void __init pSeries_final_fixup(void) { pSeries_request_regions(); + eeh_probe_devices(); eeh_addr_cache_build(); #ifdef CONFIG_PCI_IOV From b95a46062bd44ae7d5685d467c39fb3181b05798 Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Wed, 12 Sep 2018 11:23:24 +1000 Subject: [PATCH 110/221] powerpc/eeh: Cleanup unused field in eeh_dev The 'bus' member of struct eeh_dev is assigned to once but never used, so remove it. 
Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/eeh.h | 1 - arch/powerpc/kernel/eeh_driver.c | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 147f0117e56f..703d1f96ee8b 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -147,7 +147,6 @@ struct eeh_dev { struct pci_dev *pdev; /* Associated PCI device */ bool in_error; /* Error flag for edev */ struct pci_dev *physfn; /* Associated SRIOV PF */ - struct pci_bus *bus; /* PCI bus for partial hotplug */ }; static inline struct pci_dn *eeh_dev_to_pdn(struct eeh_dev *edev) diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 4115d353c349..7766766bab57 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -543,7 +543,6 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) /* Remove it from PCI subsystem */ pr_debug("EEH: Removing %s without EEH sensitive driver\n", pci_name(dev)); - edev->bus = dev->bus; edev->mode |= EEH_DEV_DISCONNECTED; if (removed) (*removed)++; From bf773df9d12f73daaa60584b43d6deb21f9c9fc1 Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Wed, 12 Sep 2018 11:23:25 +1000 Subject: [PATCH 111/221] powerpc/eeh: Cleanup eeh_add_virt_device() Remove the unnecessary cast through void * on the first parameter and remove the unused second parameter (always NULL). Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/eeh_driver.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 7766766bab57..cc300eb9585c 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -469,10 +469,9 @@ static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev, return rc; } -static void *eeh_add_virt_device(void *data, void *userdata) +static void *eeh_add_virt_device(struct eeh_dev *edev) { struct pci_driver *driver; - struct eeh_dev *edev = (struct eeh_dev *)data; struct pci_dev *dev = eeh_dev_to_pci_dev(edev); struct pci_dn *pdn = eeh_dev_to_pdn(edev); @@ -743,7 +742,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, edev = list_first_entry(&pe->edevs, struct eeh_dev, list); eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL); if (pe->type & EEH_PE_VF) { - eeh_add_virt_device(edev, NULL); + eeh_add_virt_device(edev); } else { if (!driver_eeh_aware) eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); @@ -936,7 +935,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe) * recovered properly. */ list_for_each_entry_safe(edev, tmp, &rmv_data.edev_list, rmv_list) { - eeh_add_virt_device(edev, NULL); + eeh_add_virt_device(edev); list_del(&edev->rmv_list); } From 80e65b009413e3d36eb7f24cbcab49a201c3088d Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Wed, 12 Sep 2018 11:23:26 +1000 Subject: [PATCH 112/221] powerpc/eeh: Cleanup list_head field names Instances of struct eeh_pe are placed in a tree structure using the fields "child_list" and "child", so place these next to each other in the definition. The field "child" is a list entry, so remove the unnecessary and misleading use of the list initializer, LIST_HEAD(), on it. The eeh_dev struct contains two list entry fields, called "list" and "rmv_list". Rename them to "entry" and "rmv_entry" and, as above, stop initializing them with LIST_HEAD(). 
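To make the head/entry distinction concrete, here is a small kernel-style fragment (illustrative structures, not the real eeh ones) using the <linux/list.h> API: only the list head needs initialising, because linking an entry writes both of its pointers anyway.

  #include <linux/list.h>

  struct bucket {
          struct list_head items;   /* list HEAD: must be initialised */
  };

  struct item {
          int id;
          struct list_head entry;   /* list ENTRY: linked into a bucket */
  };

  static void setup_bucket(struct bucket *b)
  {
          INIT_LIST_HEAD(&b->items);        /* only the head needs this */
  }

  static void add_item(struct bucket *b, struct item *it)
  {
          /* list_add_tail() overwrites both pointers of 'entry', so a
           * LIST_HEAD()-style self-initialisation would be pointless. */
          list_add_tail(&it->entry, &b->items);
  }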
Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/eeh.h | 12 ++++++------ arch/powerpc/kernel/eeh_dev.c | 2 -- arch/powerpc/kernel/eeh_driver.c | 10 +++++----- arch/powerpc/kernel/eeh_pe.c | 11 +++++------ arch/powerpc/platforms/powernv/eeh-powernv.c | 2 +- arch/powerpc/platforms/pseries/msi.c | 3 ++- 6 files changed, 19 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 703d1f96ee8b..b48b08ed9be3 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -98,13 +98,13 @@ struct eeh_pe { atomic_t pass_dev_cnt; /* Count of passed through devs */ struct eeh_pe *parent; /* Parent PE */ void *data; /* PE auxillary data */ - struct list_head child_list; /* Link PE to the child list */ - struct list_head edevs; /* Link list of EEH devices */ - struct list_head child; /* Child PEs */ + struct list_head child_list; /* List of PEs below this PE */ + struct list_head child; /* Memb. child_list/eeh_phb_pe */ + struct list_head edevs; /* List of eeh_dev in this PE */ }; #define eeh_pe_for_each_dev(pe, edev, tmp) \ - list_for_each_entry_safe(edev, tmp, &pe->edevs, list) + list_for_each_entry_safe(edev, tmp, &pe->edevs, entry) #define eeh_for_each_pe(root, pe) \ for (pe = root; pe; pe = eeh_pe_next(pe, root)) @@ -141,8 +141,8 @@ struct eeh_dev { int aer_cap; /* Saved AER capability */ int af_cap; /* Saved AF capability */ struct eeh_pe *pe; /* Associated PE */ - struct list_head list; /* Form link list in the PE */ - struct list_head rmv_list; /* Record the removed edevs */ + struct list_head entry; /* Membership in eeh_pe.edevs */ + struct list_head rmv_entry; /* Membership in rmv_list */ struct pci_dn *pdn; /* Associated PCI device node */ struct pci_dev *pdev; /* Associated PCI device */ bool in_error; /* Error flag for edev */ diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c index a34e6912c15e..d8c90f3284b5 100644 --- a/arch/powerpc/kernel/eeh_dev.c +++ b/arch/powerpc/kernel/eeh_dev.c @@ -60,8 +60,6 @@ struct eeh_dev *eeh_dev_init(struct pci_dn *pdn) /* Associate EEH device with OF node */ pdn->edev = edev; edev->pdn = pdn; - INIT_LIST_HEAD(&edev->list); - INIT_LIST_HEAD(&edev->rmv_list); return edev; } diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index cc300eb9585c..7859af897058 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -404,7 +404,7 @@ static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata) * EEH device is created. */ if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) { - if (list_is_last(&edev->list, &edev->pe->edevs)) + if (list_is_last(&edev->entry, &edev->pe->edevs)) eeh_pe_restore_bars(edev->pe); return NULL; @@ -560,7 +560,7 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) pdn->pe_number = IODA_INVALID_PE; #endif if (rmv_data) - list_add(&edev->rmv_list, &rmv_data->edev_list); + list_add(&edev->rmv_entry, &rmv_data->edev_list); } else { pci_lock_rescan_remove(); pci_stop_and_remove_bus_device(dev); @@ -739,7 +739,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, * PE. We should disconnect it so the binding can be * rebuilt when adding PCI devices. 
*/ - edev = list_first_entry(&pe->edevs, struct eeh_dev, list); + edev = list_first_entry(&pe->edevs, struct eeh_dev, entry); eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL); if (pe->type & EEH_PE_VF) { eeh_add_virt_device(edev); @@ -934,9 +934,9 @@ void eeh_handle_normal_event(struct eeh_pe *pe) * For those hot removed VFs, we should add back them after PF get * recovered properly. */ - list_for_each_entry_safe(edev, tmp, &rmv_data.edev_list, rmv_list) { + list_for_each_entry_safe(edev, tmp, &rmv_data.edev_list, rmv_entry) { eeh_add_virt_device(edev); - list_del(&edev->rmv_list); + list_del(&edev->rmv_entry); } /* Tell all device drivers that they can resume operations */ diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 210d239a9395..7d6d93cd67e1 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -75,7 +75,6 @@ static struct eeh_pe *eeh_pe_alloc(struct pci_controller *phb, int type) pe->type = type; pe->phb = phb; INIT_LIST_HEAD(&pe->child_list); - INIT_LIST_HEAD(&pe->child); INIT_LIST_HEAD(&pe->edevs); pe->data = (void *)pe + ALIGN(sizeof(struct eeh_pe), @@ -360,7 +359,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) edev->pe = pe; /* Put the edev to PE */ - list_add_tail(&edev->list, &pe->edevs); + list_add_tail(&edev->entry, &pe->edevs); pr_debug("EEH: Add %04x:%02x:%02x.%01x to Bus PE#%x\n", pdn->phb->global_number, pdn->busno, @@ -369,7 +368,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) pe->addr); return 0; } else if (pe && (pe->type & EEH_PE_INVALID)) { - list_add_tail(&edev->list, &pe->edevs); + list_add_tail(&edev->entry, &pe->edevs); edev->pe = pe; /* * We're running to here because of PCI hotplug caused by @@ -429,7 +428,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) * link the EEH device accordingly. */ list_add_tail(&pe->child, &parent->child_list); - list_add_tail(&edev->list, &pe->edevs); + list_add_tail(&edev->entry, &pe->edevs); edev->pe = pe; pr_debug("EEH: Add %04x:%02x:%02x.%01x to " "Device PE#%x, Parent PE#%x\n", @@ -469,7 +468,7 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev) /* Remove the EEH device */ pe = eeh_dev_to_pe(edev); edev->pe = NULL; - list_del(&edev->list); + list_del(&edev->entry); /* * Check if the parent PE includes any EEH devices. 
@@ -945,7 +944,7 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe) return pe->bus; /* Retrieve the parent PCI bus of first (top) PCI device */ - edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, list); + edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, entry); pdev = eeh_dev_to_pci_dev(edev); if (pdev) return pdev->bus; diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index d0764f2c0733..a7e59dbf2696 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -1040,7 +1040,7 @@ static int pnv_eeh_reset_vf_pe(struct eeh_pe *pe, int option) int ret; /* The VF PE should have only one child device */ - edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, list); + edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, entry); pdn = eeh_dev_to_pdn(edev); if (!pdn) return -ENXIO; diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index b7496948129e..8011b4129e3a 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -203,7 +203,8 @@ static struct device_node *find_pe_dn(struct pci_dev *dev, int *total) /* Get the top level device in the PE */ edev = pdn_to_eeh_dev(PCI_DN(dn)); if (edev->pe) - edev = list_first_entry(&edev->pe->edevs, struct eeh_dev, list); + edev = list_first_entry(&edev->pe->edevs, struct eeh_dev, + entry); dn = pci_device_to_OF_node(edev->pdev); if (!dn) return NULL; From 1c5c533b149f02d0ce00fc7ab5810766398acc11 Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Wed, 12 Sep 2018 11:23:27 +1000 Subject: [PATCH 113/221] powerpc/eeh: Cleanup field names in eeh_rmv_data Change the name of the fields in eeh_rmv_data to clarify their usage. Change "edev_list" to "removed_vf_list" because it does not contain generic edevs, but rather only edevs that contain virtual functions (which need to be removed during recovery). Similarly, change "removed" to "removed_dev_count" because it is a count of any removed devices, not just those in the above list. Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/eeh_driver.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 7859af897058..ffe8293d1f06 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -35,8 +35,8 @@ #include struct eeh_rmv_data { - struct list_head edev_list; - int removed; + struct list_head removed_vf_list; + int removed_dev_count; }; static int eeh_result_priority(enum pci_ers_result result) @@ -502,7 +502,6 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) struct pci_driver *driver; struct pci_dev *dev = eeh_dev_to_pci_dev(edev); struct eeh_rmv_data *rmv_data = (struct eeh_rmv_data *)userdata; - int *removed = rmv_data ? &rmv_data->removed : NULL; /* * Actually, we should remove the PCI bridges as well. 
@@ -524,7 +523,7 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) if (eeh_dev_removed(edev)) return NULL; - if (removed) { + if (rmv_data) { if (eeh_pe_passed(edev->pe)) return NULL; driver = eeh_pcid_get(dev); @@ -543,8 +542,8 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) pr_debug("EEH: Removing %s without EEH sensitive driver\n", pci_name(dev)); edev->mode |= EEH_DEV_DISCONNECTED; - if (removed) - (*removed)++; + if (rmv_data) + rmv_data->removed_dev_count++; if (edev->physfn) { #ifdef CONFIG_PCI_IOV @@ -560,7 +559,7 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) pdn->pe_number = IODA_INVALID_PE; #endif if (rmv_data) - list_add(&edev->rmv_entry, &rmv_data->edev_list); + list_add(&edev->rmv_entry, &rmv_data->removed_vf_list); } else { pci_lock_rescan_remove(); pci_stop_and_remove_bus_device(dev); @@ -729,7 +728,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, * the device up before the scripts have taken it down, * potentially weird things happen. */ - if (!driver_eeh_aware || rmv_data->removed) { + if (!driver_eeh_aware || rmv_data->removed_dev_count) { pr_info("EEH: Sleep 5s ahead of %s hotplug\n", (driver_eeh_aware ? "partial" : "complete")); ssleep(5); @@ -791,7 +790,8 @@ void eeh_handle_normal_event(struct eeh_pe *pe) struct eeh_pe *tmp_pe; int rc = 0; enum pci_ers_result result = PCI_ERS_RESULT_NONE; - struct eeh_rmv_data rmv_data = {LIST_HEAD_INIT(rmv_data.edev_list), 0}; + struct eeh_rmv_data rmv_data = + {LIST_HEAD_INIT(rmv_data.removed_vf_list), 0}; bus = eeh_pe_bus_get(pe); if (!bus) { @@ -934,7 +934,8 @@ void eeh_handle_normal_event(struct eeh_pe *pe) * For those hot removed VFs, we should add back them after PF get * recovered properly. */ - list_for_each_entry_safe(edev, tmp, &rmv_data.edev_list, rmv_entry) { + list_for_each_entry_safe(edev, tmp, &rmv_data.removed_vf_list, + rmv_entry) { eeh_add_virt_device(edev); list_del(&edev->rmv_entry); } From 9a3eda266fb55fee0e19f2afbbc57607fa379d23 Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Wed, 12 Sep 2018 11:23:28 +1000 Subject: [PATCH 114/221] powerpc/eeh: Cleanup logic in eeh_rmv_from_parent_pe() Move the call to eeh_dev_to_pe() up, so that later it's clear that "pe" isn't NULL. 
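Condensed from the diff below, the reordering amounts to:

    /* before: test the member, look the PE up only later */
    if (!edev->pe) { /* bail out */ }
    ...
    pe = eeh_dev_to_pe(edev);

    /* after: look it up first, so every later use of pe is visibly non-NULL */
    pe = eeh_dev_to_pe(edev);
    if (!pe) { /* bail out */ }
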
Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/eeh_pe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 7d6d93cd67e1..78f125d24bd0 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -456,7 +456,8 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev) int cnt; struct pci_dn *pdn = eeh_dev_to_pdn(edev); - if (!edev->pe) { + pe = eeh_dev_to_pe(edev); + if (!pe) { pr_debug("%s: No PE found for device %04x:%02x:%02x.%01x\n", __func__, pdn->phb->global_number, pdn->busno, @@ -466,7 +467,6 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev) } /* Remove the EEH device */ - pe = eeh_dev_to_pe(edev); edev->pe = NULL; list_del(&edev->entry); From 54644927a01d53b69e0e27e4778d329049c1c335 Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Wed, 12 Sep 2018 11:23:29 +1000 Subject: [PATCH 115/221] powerpc/eeh: Cleanup eeh_enabled() Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/eeh.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index b48b08ed9be3..247f09ce44de 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -241,11 +241,7 @@ static inline bool eeh_has_flag(int flag) static inline bool eeh_enabled(void) { - if (eeh_has_flag(EEH_FORCE_DISABLED) || - !eeh_has_flag(EEH_ENABLED)) - return false; - - return true; + return eeh_has_flag(EEH_ENABLED) && !eeh_has_flag(EEH_FORCE_DISABLED); } static inline void eeh_serialize_lock(unsigned long *flags) From eed4bdbeecd0b59d3e487d1a2b726d51810015ab Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Wed, 12 Sep 2018 11:23:30 +1000 Subject: [PATCH 116/221] powerpc/eeh: Cleanup unnecessary eeh_pe_state_mark_with_cfg() The function eeh_pe_state_mark_with_cfg() just performs the work of eeh_pe_state_mark() and then, conditionally, the work of eeh_pe_state_clear(). However it is only ever called with a constant state such that the condition is always true, so replace it by direct calls. 
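Condensed from the eeh.c hunks below, the replacement at both reset call sites (pcie_hot_reset and pcie_warm_reset) is:

    /* before: wrapper whose conditional clear always applied here */
    eeh_pe_state_mark_with_cfg(pe, EEH_PE_ISOLATED);

    /* after: the two steps written out directly */
    eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
    eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED);
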
Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/eeh.c | 6 ++++-- arch/powerpc/kernel/eeh_pe.c | 22 ---------------------- 2 files changed, 4 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 8801ada4082b..d5d0390f1d30 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -830,7 +830,8 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat eeh_pe_state_clear(pe, EEH_PE_ISOLATED); break; case pcie_hot_reset: - eeh_pe_state_mark_with_cfg(pe, EEH_PE_ISOLATED); + eeh_pe_state_mark(pe, EEH_PE_ISOLATED); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE); eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev); if (!(pe->type & EEH_PE_VF)) @@ -838,7 +839,8 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat eeh_ops->reset(pe, EEH_RESET_HOT); break; case pcie_warm_reset: - eeh_pe_state_mark_with_cfg(pe, EEH_PE_ISOLATED); + eeh_pe_state_mark(pe, EEH_PE_ISOLATED); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE); eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev); if (!(pe->type & EEH_PE_VF)) diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 78f125d24bd0..2b376718237f 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -670,28 +670,6 @@ void eeh_pe_state_clear(struct eeh_pe *pe, int state) eeh_pe_traverse(pe, __eeh_pe_state_clear, &state); } -/** - * eeh_pe_state_mark_with_cfg - Mark PE state with unblocked config space - * @pe: PE - * @state: PE state to be set - * - * Set specified flag to PE and its child PEs. The PCI config space - * of some PEs is blocked automatically when EEH_PE_ISOLATED is set, - * which isn't needed in some situations. The function allows to set - * the specified flag to indicated PEs without blocking their PCI - * config space. - */ -void eeh_pe_state_mark_with_cfg(struct eeh_pe *pe, int state) -{ - eeh_pe_traverse(pe, __eeh_pe_state_mark, &state); - if (!(state & EEH_PE_ISOLATED)) - return; - - /* Clear EEH_PE_CFG_BLOCKED, which might be set just now */ - state = EEH_PE_CFG_BLOCKED; - eeh_pe_traverse(pe, __eeh_pe_state_clear, &state); -} - /* * Some PCI bridges (e.g. PLX bridges) have primary/secondary * buses assigned explicitly by firmware, and we probably have From e762bb891a294af00b83f54062dae4e24565edf8 Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Wed, 12 Sep 2018 11:23:31 +1000 Subject: [PATCH 117/221] powerpc/eeh: Cleanup eeh_pe_state_mark() Currently, eeh_pe_state_mark() marks a PE (and it's children) with a state and then performs additional processing if that state included EEH_PE_ISOLATED. The state parameter is always a constant at the call site, so rearrange eeh_pe_state_mark() into two functions and just call the appropriate one at each site. 
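Condensed from the hunks below, call sites end up in one of two shapes: plain isolation becomes a single eeh_pe_mark_isolated() call, and the one site that combined states becomes two calls:

    /* isolation only */
    eeh_pe_state_mark(pe, EEH_PE_ISOLATED);        /* before */
    eeh_pe_mark_isolated(pe);                      /* after */

    /* isolation combined with another state (eeh_handle_special_event()) */
    eeh_pe_state_mark(pe, EEH_PE_ISOLATED | EEH_PE_RECOVERING);   /* before */
    eeh_pe_state_mark(pe, EEH_PE_RECOVERING);                     /* after */
    eeh_pe_mark_isolated(pe);
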
Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/ppc-pci.h | 1 + arch/powerpc/kernel/eeh.c | 8 +-- arch/powerpc/kernel/eeh_driver.c | 10 ++- arch/powerpc/kernel/eeh_pe.c | 74 +++++++++----------- arch/powerpc/platforms/powernv/eeh-powernv.c | 8 +-- drivers/pci/hotplug/pnv_php.c | 2 +- 6 files changed, 48 insertions(+), 55 deletions(-) diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h index 726288048652..f67da277d652 100644 --- a/arch/powerpc/include/asm/ppc-pci.h +++ b/arch/powerpc/include/asm/ppc-pci.h @@ -58,6 +58,7 @@ void eeh_save_bars(struct eeh_dev *edev); int rtas_write_config(struct pci_dn *, int where, int size, u32 val); int rtas_read_config(struct pci_dn *, int where, int size, u32 *val); void eeh_pe_state_mark(struct eeh_pe *pe, int state); +void eeh_pe_mark_isolated(struct eeh_pe *pe); void eeh_pe_state_clear(struct eeh_pe *pe, int state); void eeh_pe_state_mark_with_cfg(struct eeh_pe *pe, int state); void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode); diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index d5d0390f1d30..12e5311d06ed 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -404,7 +404,7 @@ static int eeh_phb_check_failure(struct eeh_pe *pe) } /* Isolate the PHB and send event */ - eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED); + eeh_pe_mark_isolated(phb_pe); eeh_serialize_unlock(flags); pr_err("EEH: PHB#%x failure detected, location: %s\n", @@ -563,7 +563,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev) * with other functions on this device, and functions under * bridges. */ - eeh_pe_state_mark(pe, EEH_PE_ISOLATED); + eeh_pe_mark_isolated(pe); eeh_serialize_unlock(flags); /* Most EEH events are due to device driver bugs. 
Having @@ -830,7 +830,7 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat eeh_pe_state_clear(pe, EEH_PE_ISOLATED); break; case pcie_hot_reset: - eeh_pe_state_mark(pe, EEH_PE_ISOLATED); + eeh_pe_mark_isolated(pe); eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE); eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev); @@ -839,7 +839,7 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat eeh_ops->reset(pe, EEH_RESET_HOT); break; case pcie_warm_reset: - eeh_pe_state_mark(pe, EEH_PE_ISOLATED); + eeh_pe_mark_isolated(pe); eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE); eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev); diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index ffe8293d1f06..c827617613c1 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -1029,7 +1029,7 @@ void eeh_handle_special_event(void) phb_pe = eeh_phb_pe_get(hose); if (!phb_pe) continue; - eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED); + eeh_pe_mark_isolated(phb_pe); } eeh_serialize_unlock(flags); @@ -1044,11 +1044,9 @@ void eeh_handle_special_event(void) /* Purge all events of the PHB */ eeh_remove_event(pe, true); - if (rc == EEH_NEXT_ERR_DEAD_PHB) - eeh_pe_state_mark(pe, EEH_PE_ISOLATED); - else - eeh_pe_state_mark(pe, - EEH_PE_ISOLATED | EEH_PE_RECOVERING); + if (rc != EEH_NEXT_ERR_DEAD_PHB) + eeh_pe_state_mark(pe, EEH_PE_RECOVERING); + eeh_pe_mark_isolated(pe); eeh_serialize_unlock(flags); diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 2b376718237f..e43dcefbe73f 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -539,44 +539,6 @@ void eeh_pe_update_time_stamp(struct eeh_pe *pe) } } -/** - * __eeh_pe_state_mark - Mark the state for the PE - * @data: EEH PE - * @flag: state - * - * The function is used to mark the indicated state for the given - * PE. Also, the associated PCI devices will be put into IO frozen - * state as well. - */ -static void *__eeh_pe_state_mark(struct eeh_pe *pe, void *flag) -{ - int state = *((int *)flag); - struct eeh_dev *edev, *tmp; - struct pci_dev *pdev; - - /* Keep the state of permanently removed PE intact */ - if (pe->state & EEH_PE_REMOVED) - return NULL; - - pe->state |= state; - - /* Offline PCI devices if applicable */ - if (!(state & EEH_PE_ISOLATED)) - return NULL; - - eeh_pe_for_each_dev(pe, edev, tmp) { - pdev = eeh_dev_to_pci_dev(edev); - if (pdev) - pdev->error_state = pci_channel_io_frozen; - } - - /* Block PCI config access if required */ - if (pe->state & EEH_PE_CFG_RESTRICTED) - pe->state |= EEH_PE_CFG_BLOCKED; - - return NULL; -} - /** * eeh_pe_state_mark - Mark specified state for PE and its associated device * @pe: EEH PE @@ -585,12 +547,44 @@ static void *__eeh_pe_state_mark(struct eeh_pe *pe, void *flag) * is used to mark appropriate state for the affected PEs and the * associated devices. 
*/ -void eeh_pe_state_mark(struct eeh_pe *pe, int state) +void eeh_pe_state_mark(struct eeh_pe *root, int state) { - eeh_pe_traverse(pe, __eeh_pe_state_mark, &state); + struct eeh_pe *pe; + + eeh_for_each_pe(root, pe) + if (!(pe->state & EEH_PE_REMOVED)) + pe->state |= state; } EXPORT_SYMBOL_GPL(eeh_pe_state_mark); +/** + * eeh_pe_mark_isolated + * @pe: EEH PE + * + * Record that a PE has been isolated by marking the PE and it's children as + * EEH_PE_ISOLATED (and EEH_PE_CFG_BLOCKED, if required) and their PCI devices + * as pci_channel_io_frozen. + */ +void eeh_pe_mark_isolated(struct eeh_pe *root) +{ + struct eeh_pe *pe; + struct eeh_dev *edev; + struct pci_dev *pdev; + + eeh_pe_state_mark(root, EEH_PE_ISOLATED); + eeh_for_each_pe(root, pe) { + list_for_each_entry(edev, &pe->edevs, entry) { + pdev = eeh_dev_to_pci_dev(edev); + if (pdev) + pdev->error_state = pci_channel_io_frozen; + } + /* Block PCI config access if required */ + if (pe->state & EEH_PE_CFG_RESTRICTED) + pe->state |= EEH_PE_CFG_BLOCKED; + } +} +EXPORT_SYMBOL_GPL(eeh_pe_mark_isolated); + static void *__eeh_pe_dev_mode_mark(struct eeh_dev *edev, void *flag) { int mode = *((int *)flag); diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index a7e59dbf2696..fd1db9f286f1 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -590,7 +590,7 @@ static int pnv_eeh_get_phb_state(struct eeh_pe *pe) EEH_STATE_MMIO_ENABLED | EEH_STATE_DMA_ENABLED); } else if (!(pe->state & EEH_PE_ISOLATED)) { - eeh_pe_state_mark(pe, EEH_PE_ISOLATED); + eeh_pe_mark_isolated(pe); pnv_eeh_get_phb_diag(pe); if (eeh_has_flag(EEH_EARLY_DUMP_LOG)) @@ -692,7 +692,7 @@ static int pnv_eeh_get_pe_state(struct eeh_pe *pe) if (phb->freeze_pe) phb->freeze_pe(phb, pe->addr); - eeh_pe_state_mark(pe, EEH_PE_ISOLATED); + eeh_pe_mark_isolated(pe); pnv_eeh_get_phb_diag(pe); if (eeh_has_flag(EEH_EARLY_DUMP_LOG)) @@ -1597,7 +1597,7 @@ static int pnv_eeh_next_error(struct eeh_pe **pe) if ((ret == EEH_NEXT_ERR_FROZEN_PE || ret == EEH_NEXT_ERR_FENCED_PHB) && !((*pe)->state & EEH_PE_ISOLATED)) { - eeh_pe_state_mark(*pe, EEH_PE_ISOLATED); + eeh_pe_mark_isolated(*pe); pnv_eeh_get_phb_diag(*pe); if (eeh_has_flag(EEH_EARLY_DUMP_LOG)) @@ -1626,7 +1626,7 @@ static int pnv_eeh_next_error(struct eeh_pe **pe) } /* We possibly migrate to another PE */ - eeh_pe_state_mark(*pe, EEH_PE_ISOLATED); + eeh_pe_mark_isolated(*pe); } /* diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c index 3276a5e4c430..b5ba26d14a9a 100644 --- a/drivers/pci/hotplug/pnv_php.c +++ b/drivers/pci/hotplug/pnv_php.c @@ -736,7 +736,7 @@ static irqreturn_t pnv_php_interrupt(int irq, void *data) pe = edev ? edev->pe : NULL; if (pe) { eeh_serialize_lock(&flags); - eeh_pe_state_mark(pe, EEH_PE_ISOLATED); + eeh_pe_mark_isolated(pe); eeh_serialize_unlock(flags); eeh_pe_set_option(pe, EEH_OPT_FREEZE_PE); } From fef7f905523fb96b431e5e73487a689c10c77875 Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Wed, 12 Sep 2018 11:23:32 +1000 Subject: [PATCH 118/221] powerpc/eeh: Cleanup eeh_ops.wait_state() The wait_state member of eeh_ops does not need to be platform dependent; it's just logic around eeh_ops.get_state(). Therefore, merge the two (slightly different!) platform versions into a new function, eeh_wait_state() and remove the eeh_ops member. While doing this, also correct: * The wait logic, so that it never waits longer than max_wait. 
* The wait logic, so that it never waits less than EEH_STATE_MIN_WAIT_TIME. * One call site where the result is treated like a bit field before it's checked for negative error values. * In pseries_eeh_get_state(), rename the "state" parameter to "delay" because that's what it is. Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/eeh.h | 4 +- arch/powerpc/kernel/eeh.c | 9 ++- arch/powerpc/kernel/eeh_driver.c | 2 +- arch/powerpc/kernel/eeh_pe.c | 51 +++++++++++++++ arch/powerpc/platforms/powernv/eeh-powernv.c | 38 ----------- arch/powerpc/platforms/pseries/eeh_pseries.c | 66 ++------------------ 6 files changed, 62 insertions(+), 108 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 247f09ce44de..8b596d096ebe 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -205,9 +205,8 @@ struct eeh_ops { void* (*probe)(struct pci_dn *pdn, void *data); int (*set_option)(struct eeh_pe *pe, int option); int (*get_pe_addr)(struct eeh_pe *pe); - int (*get_state)(struct eeh_pe *pe, int *state); + int (*get_state)(struct eeh_pe *pe, int *delay); int (*reset)(struct eeh_pe *pe, int option); - int (*wait_state)(struct eeh_pe *pe, int max_wait); int (*get_log)(struct eeh_pe *pe, int severity, char *drv_log, unsigned long len); int (*configure_bridge)(struct eeh_pe *pe); int (*err_inject)(struct eeh_pe *pe, int type, int func, @@ -264,6 +263,7 @@ typedef void *(*eeh_edev_traverse_func)(struct eeh_dev *edev, void *flag); typedef void *(*eeh_pe_traverse_func)(struct eeh_pe *pe, void *flag); void eeh_set_pe_aux_size(int size); int eeh_phb_pe_create(struct pci_controller *phb); +int eeh_wait_state(struct eeh_pe *pe, int max_wait); struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb); struct eeh_pe *eeh_pe_next(struct eeh_pe *pe, struct eeh_pe *root); struct eeh_pe *eeh_pe_get(struct pci_controller *phb, diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 12e5311d06ed..6cae6b56ffd6 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -681,7 +681,7 @@ int eeh_pci_enable(struct eeh_pe *pe, int function) /* Check if the request is finished successfully */ if (active_flag) { - rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC); + rc = eeh_wait_state(pe, PCI_BUS_RESET_WAIT_MSEC); if (rc < 0) return rc; @@ -920,16 +920,15 @@ int eeh_pe_reset_full(struct eeh_pe *pe) break; /* Wait until the PE is in a functioning state */ - state = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC); - if (eeh_state_active(state)) - break; - + state = eeh_wait_state(pe, PCI_BUS_RESET_WAIT_MSEC); if (state < 0) { pr_warn("%s: Unrecoverable slot failure on PHB#%x-PE#%x", __func__, pe->phb->global_number, pe->addr); ret = -ENOTRECOVERABLE; break; } + if (eeh_state_active(state)) + break; /* Set error in case this is our last attempt */ ret = -EIO; diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index c827617613c1..e7f757cd839b 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -836,7 +836,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe) /* Get the current PCI slot state. This can take a long time, * sometimes over 300 seconds for certain systems. 
*/ - rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); + rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { pr_warn("EEH: Permanent failure\n"); goto hard_fail; diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index e43dcefbe73f..6fa2032e0594 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -108,6 +108,57 @@ int eeh_phb_pe_create(struct pci_controller *phb) return 0; } +/** + * eeh_wait_state - Wait for PE state + * @pe: EEH PE + * @max_wait: maximal period in millisecond + * + * Wait for the state of associated PE. It might take some time + * to retrieve the PE's state. + */ +int eeh_wait_state(struct eeh_pe *pe, int max_wait) +{ + int ret; + int mwait; + + /* + * According to PAPR, the state of PE might be temporarily + * unavailable. Under the circumstance, we have to wait + * for indicated time determined by firmware. The maximal + * wait time is 5 minutes, which is acquired from the original + * EEH implementation. Also, the original implementation + * also defined the minimal wait time as 1 second. + */ +#define EEH_STATE_MIN_WAIT_TIME (1000) +#define EEH_STATE_MAX_WAIT_TIME (300 * 1000) + + while (1) { + ret = eeh_ops->get_state(pe, &mwait); + + if (ret != EEH_STATE_UNAVAILABLE) + return ret; + + if (max_wait <= 0) { + pr_warn("%s: Timeout when getting PE's state (%d)\n", + __func__, max_wait); + return EEH_STATE_NOT_SUPPORT; + } + + if (mwait < EEH_STATE_MIN_WAIT_TIME) { + pr_warn("%s: Firmware returned bad wait value %d\n", + __func__, mwait); + mwait = EEH_STATE_MIN_WAIT_TIME; + } else if (mwait > EEH_STATE_MAX_WAIT_TIME) { + pr_warn("%s: Firmware returned too long wait value %d\n", + __func__, mwait); + mwait = EEH_STATE_MAX_WAIT_TIME; + } + + msleep(min(mwait, max_wait)); + max_wait -= mwait; + } +} + /** * eeh_phb_pe_get - Retrieve PHB PE based on the given PHB * @phb: PCI controller diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index fd1db9f286f1..abc0be7507c8 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -1133,43 +1133,6 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option) return pnv_eeh_bridge_reset(bus->self, option); } -/** - * pnv_eeh_wait_state - Wait for PE state - * @pe: EEH PE - * @max_wait: maximal period in millisecond - * - * Wait for the state of associated PE. It might take some time - * to retrieve the PE's state. - */ -static int pnv_eeh_wait_state(struct eeh_pe *pe, int max_wait) -{ - int ret; - int mwait; - - while (1) { - ret = pnv_eeh_get_state(pe, &mwait); - - /* - * If the PE's state is temporarily unavailable, - * we have to wait for the specified time. Otherwise, - * the PE's state will be returned immediately. 
- */ - if (ret != EEH_STATE_UNAVAILABLE) - return ret; - - if (max_wait <= 0) { - pr_warn("%s: Timeout getting PE#%x's state (%d)\n", - __func__, pe->addr, max_wait); - return EEH_STATE_NOT_SUPPORT; - } - - max_wait -= mwait; - msleep(mwait); - } - - return EEH_STATE_NOT_SUPPORT; -} - /** * pnv_eeh_get_log - Retrieve error log * @pe: EEH PE @@ -1688,7 +1651,6 @@ static struct eeh_ops pnv_eeh_ops = { .get_pe_addr = pnv_eeh_get_pe_addr, .get_state = pnv_eeh_get_state, .reset = pnv_eeh_reset, - .wait_state = pnv_eeh_wait_state, .get_log = pnv_eeh_get_log, .configure_bridge = pnv_eeh_configure_bridge, .err_inject = pnv_eeh_err_inject, diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index 823cb27efa8b..c9e5ca4afb26 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -438,7 +438,7 @@ static int pseries_eeh_get_pe_addr(struct eeh_pe *pe) /** * pseries_eeh_get_state - Retrieve PE state * @pe: EEH PE - * @state: return value + * @delay: suggested time to wait if state is unavailable * * Retrieve the state of the specified PE. On RTAS compliant * pseries platform, there already has one dedicated RTAS function @@ -448,7 +448,7 @@ static int pseries_eeh_get_pe_addr(struct eeh_pe *pe) * RTAS calls for the purpose, we need to try the new one and back * to the old one if the new one couldn't work properly. */ -static int pseries_eeh_get_state(struct eeh_pe *pe, int *state) +static int pseries_eeh_get_state(struct eeh_pe *pe, int *delay) { int config_addr; int ret; @@ -499,7 +499,8 @@ static int pseries_eeh_get_state(struct eeh_pe *pe, int *state) break; case 5: if (rets[2]) { - if (state) *state = rets[2]; + if (delay) + *delay = rets[2]; result = EEH_STATE_UNAVAILABLE; } else { result = EEH_STATE_NOT_SUPPORT; @@ -553,64 +554,6 @@ static int pseries_eeh_reset(struct eeh_pe *pe, int option) return ret; } -/** - * pseries_eeh_wait_state - Wait for PE state - * @pe: EEH PE - * @max_wait: maximal period in millisecond - * - * Wait for the state of associated PE. It might take some time - * to retrieve the PE's state. - */ -static int pseries_eeh_wait_state(struct eeh_pe *pe, int max_wait) -{ - int ret; - int mwait; - - /* - * According to PAPR, the state of PE might be temporarily - * unavailable. Under the circumstance, we have to wait - * for indicated time determined by firmware. The maximal - * wait time is 5 minutes, which is acquired from the original - * EEH implementation. Also, the original implementation - * also defined the minimal wait time as 1 second. - */ -#define EEH_STATE_MIN_WAIT_TIME (1000) -#define EEH_STATE_MAX_WAIT_TIME (300 * 1000) - - while (1) { - ret = pseries_eeh_get_state(pe, &mwait); - - /* - * If the PE's state is temporarily unavailable, - * we have to wait for the specified time. Otherwise, - * the PE's state will be returned immediately. 
- */ - if (ret != EEH_STATE_UNAVAILABLE) - return ret; - - if (max_wait <= 0) { - pr_warn("%s: Timeout when getting PE's state (%d)\n", - __func__, max_wait); - return EEH_STATE_NOT_SUPPORT; - } - - if (mwait <= 0) { - pr_warn("%s: Firmware returned bad wait value %d\n", - __func__, mwait); - mwait = EEH_STATE_MIN_WAIT_TIME; - } else if (mwait > EEH_STATE_MAX_WAIT_TIME) { - pr_warn("%s: Firmware returned too long wait value %d\n", - __func__, mwait); - mwait = EEH_STATE_MAX_WAIT_TIME; - } - - max_wait -= mwait; - msleep(mwait); - } - - return EEH_STATE_NOT_SUPPORT; -} - /** * pseries_eeh_get_log - Retrieve error log * @pe: EEH PE @@ -849,7 +792,6 @@ static struct eeh_ops pseries_eeh_ops = { .get_pe_addr = pseries_eeh_get_pe_addr, .get_state = pseries_eeh_get_state, .reset = pseries_eeh_reset, - .wait_state = pseries_eeh_wait_state, .get_log = pseries_eeh_get_log, .configure_bridge = pseries_eeh_configure_bridge, .err_inject = NULL, From b90484ec1137424f606832a22f24d6cfc62a1427 Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Wed, 12 Sep 2018 11:23:33 +1000 Subject: [PATCH 119/221] powerpc/eeh: Cleanup control flow in eeh_handle_normal_event() Rather than mixing "if (state)" blocks and gotos, convert entirely to "if (state)" blocks to make the state machine behaviour clearer. Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/eeh_driver.c | 202 +++++++++++++++---------------- 1 file changed, 97 insertions(+), 105 deletions(-) diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index e7f757cd839b..9446248eb6b8 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -808,10 +808,8 @@ void eeh_handle_normal_event(struct eeh_pe *pe) pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n", pe->phb->global_number, pe->addr, pe->freeze_count); - goto hard_fail; + result = PCI_ERS_RESULT_DISCONNECT; } - pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n", - pe->freeze_count, eeh_max_freezes); /* Walk the various device drivers attached to this slot through * a reset sequence, giving each an opportunity to do what it needs @@ -823,31 +821,39 @@ void eeh_handle_normal_event(struct eeh_pe *pe) * the error. Override the result if necessary to have partially * hotplug for this case. */ - pr_info("EEH: Notify device drivers to shutdown\n"); - eeh_set_channel_state(pe, pci_channel_io_frozen); - eeh_set_irq_state(pe, false); - eeh_pe_report("error_detected(IO frozen)", pe, eeh_report_error, - &result); - if ((pe->type & EEH_PE_PHB) && - result != PCI_ERS_RESULT_NONE && - result != PCI_ERS_RESULT_NEED_RESET) - result = PCI_ERS_RESULT_NEED_RESET; + if (result != PCI_ERS_RESULT_DISCONNECT) { + pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n", + pe->freeze_count, eeh_max_freezes); + pr_info("EEH: Notify device drivers to shutdown\n"); + eeh_set_channel_state(pe, pci_channel_io_frozen); + eeh_set_irq_state(pe, false); + eeh_pe_report("error_detected(IO frozen)", pe, + eeh_report_error, &result); + if ((pe->type & EEH_PE_PHB) && + result != PCI_ERS_RESULT_NONE && + result != PCI_ERS_RESULT_NEED_RESET) + result = PCI_ERS_RESULT_NEED_RESET; + } /* Get the current PCI slot state. This can take a long time, * sometimes over 300 seconds for certain systems. 
*/ - rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); - if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { - pr_warn("EEH: Permanent failure\n"); - goto hard_fail; + if (result != PCI_ERS_RESULT_DISCONNECT) { + rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); + if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { + pr_warn("EEH: Permanent failure\n"); + result = PCI_ERS_RESULT_DISCONNECT; + } } /* Since rtas may enable MMIO when posting the error log, * don't post the error log until after all dev drivers * have been informed. */ - pr_info("EEH: Collect temporary log\n"); - eeh_slot_error_detail(pe, EEH_LOG_TEMP); + if (result != PCI_ERS_RESULT_DISCONNECT) { + pr_info("EEH: Collect temporary log\n"); + eeh_slot_error_detail(pe, EEH_LOG_TEMP); + } /* If all device drivers were EEH-unaware, then shut * down all of the device drivers, and hope they @@ -859,7 +865,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe) if (rc) { pr_warn("%s: Unable to reset, err=%d\n", __func__, rc); - goto hard_fail; + result = PCI_ERS_RESULT_DISCONNECT; } } @@ -868,9 +874,9 @@ void eeh_handle_normal_event(struct eeh_pe *pe) pr_info("EEH: Enable I/O for affected devices\n"); rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); - if (rc < 0) - goto hard_fail; - if (rc) { + if (rc < 0) { + result = PCI_ERS_RESULT_DISCONNECT; + } else if (rc) { result = PCI_ERS_RESULT_NEED_RESET; } else { pr_info("EEH: Notify device drivers to resume I/O\n"); @@ -884,9 +890,9 @@ void eeh_handle_normal_event(struct eeh_pe *pe) pr_info("EEH: Enabled DMA for affected devices\n"); rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA); - if (rc < 0) - goto hard_fail; - if (rc) { + if (rc < 0) { + result = PCI_ERS_RESULT_DISCONNECT; + } else if (rc) { result = PCI_ERS_RESULT_NEED_RESET; } else { /* @@ -899,12 +905,6 @@ void eeh_handle_normal_event(struct eeh_pe *pe) } } - /* If any device has a hard failure, then shut off everything. */ - if (result == PCI_ERS_RESULT_DISCONNECT) { - pr_warn("EEH: Device driver gave up\n"); - goto hard_fail; - } - /* If any device called out for a reset, then reset the slot */ if (result == PCI_ERS_RESULT_NEED_RESET) { pr_info("EEH: Reset without hotplug activity\n"); @@ -912,89 +912,81 @@ void eeh_handle_normal_event(struct eeh_pe *pe) if (rc) { pr_warn("%s: Cannot reset, err=%d\n", __func__, rc); - goto hard_fail; + result = PCI_ERS_RESULT_DISCONNECT; + } else { + result = PCI_ERS_RESULT_NONE; + eeh_set_channel_state(pe, pci_channel_io_normal); + eeh_set_irq_state(pe, true); + eeh_pe_report("slot_reset", pe, eeh_report_reset, + &result); + } + } + + if ((result == PCI_ERS_RESULT_RECOVERED) || + (result == PCI_ERS_RESULT_NONE)) { + /* + * For those hot removed VFs, we should add back them after PF + * get recovered properly. + */ + list_for_each_entry_safe(edev, tmp, &rmv_data.removed_vf_list, + rmv_entry) { + eeh_add_virt_device(edev); + list_del(&edev->rmv_entry); } - pr_info("EEH: Notify device drivers " - "the completion of reset\n"); - result = PCI_ERS_RESULT_NONE; + /* Tell all device drivers that they can resume operations */ + pr_info("EEH: Notify device driver to resume\n"); eeh_set_channel_state(pe, pci_channel_io_normal); eeh_set_irq_state(pe, true); - eeh_pe_report("slot_reset", pe, eeh_report_reset, &result); - } + eeh_pe_report("resume", pe, eeh_report_resume, NULL); + eeh_for_each_pe(pe, tmp_pe) { + eeh_pe_for_each_dev(tmp_pe, edev, tmp) { + edev->mode &= ~EEH_DEV_NO_HANDLER; + edev->in_error = false; + } + } - /* All devices should claim they have recovered by now. 
*/ - if ((result != PCI_ERS_RESULT_RECOVERED) && - (result != PCI_ERS_RESULT_NONE)) { - pr_warn("EEH: Not recovered\n"); - goto hard_fail; - } + pr_info("EEH: Recovery successful.\n"); + } else { + /* + * About 90% of all real-life EEH failures in the field + * are due to poorly seated PCI cards. Only 10% or so are + * due to actual, failed cards. + */ + pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n" + "Please try reseating or replacing it\n", + pe->phb->global_number, pe->addr); - /* - * For those hot removed VFs, we should add back them after PF get - * recovered properly. - */ - list_for_each_entry_safe(edev, tmp, &rmv_data.removed_vf_list, - rmv_entry) { - eeh_add_virt_device(edev); - list_del(&edev->rmv_entry); - } + eeh_slot_error_detail(pe, EEH_LOG_PERM); - /* Tell all device drivers that they can resume operations */ - pr_info("EEH: Notify device driver to resume\n"); - eeh_set_channel_state(pe, pci_channel_io_normal); - eeh_set_irq_state(pe, true); - eeh_pe_report("resume", pe, eeh_report_resume, NULL); - eeh_for_each_pe(pe, tmp_pe) { - eeh_pe_for_each_dev(tmp_pe, edev, tmp) { - edev->mode &= ~EEH_DEV_NO_HANDLER; - edev->in_error = false; + /* Notify all devices that they're about to go down. */ + eeh_set_channel_state(pe, pci_channel_io_perm_failure); + eeh_set_irq_state(pe, false); + eeh_pe_report("error_detected(permanent failure)", pe, + eeh_report_failure, NULL); + + /* Mark the PE to be removed permanently */ + eeh_pe_state_mark(pe, EEH_PE_REMOVED); + + /* + * Shut down the device drivers for good. We mark + * all removed devices correctly to avoid access + * the their PCI config any more. + */ + if (pe->type & EEH_PE_VF) { + eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); + eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); + } else { + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); + eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); + + pci_lock_rescan_remove(); + pci_hp_remove_devices(bus); + pci_unlock_rescan_remove(); + /* The passed PE should no longer be used */ + return; } } - - pr_info("EEH: Recovery successful.\n"); - goto final; - -hard_fail: - /* - * About 90% of all real-life EEH failures in the field - * are due to poorly seated PCI cards. Only 10% or so are - * due to actual, failed cards. - */ - pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n" - "Please try reseating or replacing it\n", - pe->phb->global_number, pe->addr); - - eeh_slot_error_detail(pe, EEH_LOG_PERM); - - /* Notify all devices that they're about to go down. */ - eeh_set_channel_state(pe, pci_channel_io_perm_failure); - eeh_set_irq_state(pe, false); - eeh_pe_report("error_detected(permanent failure)", pe, - eeh_report_failure, NULL); - - /* Mark the PE to be removed permanently */ - eeh_pe_state_mark(pe, EEH_PE_REMOVED); - - /* - * Shut down the device drivers for good. We mark - * all removed devices correctly to avoid access - * the their PCI config any more. 
- */ - if (pe->type & EEH_PE_VF) { - eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); - eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); - } else { - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); - eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); - - pci_lock_rescan_remove(); - pci_hp_remove_devices(bus); - pci_unlock_rescan_remove(); - /* The passed PE should no longer be used */ - return; - } -final: eeh_pe_state_clear(pe, EEH_PE_RECOVERING); } From b45e9d761ba2d60044b610297e3ef9f947ac157f Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 9 Oct 2018 21:59:13 +0800 Subject: [PATCH 120/221] powerpc/pseries/memory-hotplug: Fix return value type of find_aa_index The variable 'aa_index' is defined as an unsigned value in update_lmb_associativity_index(), but find_aa_index() may return -1 when dlpar_clone_property() fails. So change find_aa_index() to return a bool, which indicates whether 'aa_index' was found or not. Fixes: c05a5a40969e ("powerpc/pseries: Dynamic add entires to associativity lookup array") Signed-off-by: YueHaibing Reviewed-by: Nathan Fontenot nfont@linux.vnet.ibm.com> [mpe: Tweak changelog, rename is_found to just found] Signed-off-by: Michael Ellerman --- .../platforms/pseries/hotplug-memory.c | 61 +++++++++---------- 1 file changed, 28 insertions(+), 33 deletions(-) diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 9a15d39995e5..2b796da822c2 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -101,11 +101,12 @@ static struct property *dlpar_clone_property(struct property *prop, return new_prop; } -static u32 find_aa_index(struct device_node *dr_node, - struct property *ala_prop, const u32 *lmb_assoc) +static bool find_aa_index(struct device_node *dr_node, + struct property *ala_prop, + const u32 *lmb_assoc, u32 *aa_index) { - u32 *assoc_arrays; - u32 aa_index; + u32 *assoc_arrays, new_prop_size; + struct property *new_prop; int aa_arrays, aa_array_entries, aa_array_sz; int i, index; @@ -121,46 +122,39 @@ static u32 find_aa_index(struct device_node *dr_node, aa_array_entries = be32_to_cpu(assoc_arrays[1]); aa_array_sz = aa_array_entries * sizeof(u32); - aa_index = -1; for (i = 0; i < aa_arrays; i++) { index = (i * aa_array_entries) + 2; if (memcmp(&assoc_arrays[index], &lmb_assoc[1], aa_array_sz)) continue; - aa_index = i; - break; + *aa_index = i; + return true; } - if (aa_index == -1) { - struct property *new_prop; - u32 new_prop_size; + new_prop_size = ala_prop->length + aa_array_sz; + new_prop = dlpar_clone_property(ala_prop, new_prop_size); + if (!new_prop) + return false; - new_prop_size = ala_prop->length + aa_array_sz; - new_prop = dlpar_clone_property(ala_prop, new_prop_size); - if (!new_prop) - return -1; + assoc_arrays = new_prop->value; - assoc_arrays = new_prop->value; + /* increment the number of entries in the lookup array */ + assoc_arrays[0] = cpu_to_be32(aa_arrays + 1); - /* increment the number of entries in the lookup array */ - assoc_arrays[0] = cpu_to_be32(aa_arrays + 1); + /* copy the new associativity into the lookup array */ + index = aa_arrays * aa_array_entries + 2; + memcpy(&assoc_arrays[index], &lmb_assoc[1], aa_array_sz); - /* copy the new associativity into the lookup array */ - index = aa_arrays * aa_array_entries + 2; - memcpy(&assoc_arrays[index], &lmb_assoc[1], aa_array_sz); + of_update_property(dr_node, new_prop); - of_update_property(dr_node, new_prop); - - /* - * The associativity lookup array index for this lmb is - * 
number of entries - 1 since we added its associativity - * to the end of the lookup array. - */ - aa_index = be32_to_cpu(assoc_arrays[0]) - 1; - } - - return aa_index; + /* + * The associativity lookup array index for this lmb is + * number of entries - 1 since we added its associativity + * to the end of the lookup array. + */ + *aa_index = be32_to_cpu(assoc_arrays[0]) - 1; + return true; } static int update_lmb_associativity_index(struct drmem_lmb *lmb) @@ -169,6 +163,7 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb) struct property *ala_prop; const u32 *lmb_assoc; u32 aa_index; + bool found; parent = of_find_node_by_path("/"); if (!parent) @@ -200,11 +195,11 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb) return -ENODEV; } - aa_index = find_aa_index(dr_node, ala_prop, lmb_assoc); + found = find_aa_index(dr_node, ala_prop, lmb_assoc, &aa_index); dlpar_free_cc_nodes(lmb_node); - if (aa_index < 0) { + if (!found) { pr_err("Could not find LMB associativity\n"); return -1; } From 960e30029863db95ec79a71009272d4661db5991 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Thu, 11 Oct 2018 13:13:03 +1030 Subject: [PATCH 121/221] powerpc/Makefile: Fix PPC_BOOK3S_64 ASFLAGS Ever since commit 15a3204d24a3 ("powerpc/64s: Set assembler machine type to POWER4") we force -mpower4 to be passed to the assembler irrespective of the CFLAGS used (for Book3s 64). When building a powerpc64 kernel with clang, clang will not add -many to the assembler flags, so any instructions that the compiler has generated that are not available on power4 will cause an error: /usr/bin/as -a64 -mppc64 -mlittle-endian -mpower8 \ -I ./arch/powerpc/include -I ./arch/powerpc/include/generated \ -I ./include -I ./arch/powerpc/include/uapi \ -I ./arch/powerpc/include/generated/uapi -I ./include/uapi \ -I ./include/generated/uapi -I arch/powerpc -I arch/powerpc \ -maltivec -mpower4 -o init/do_mounts.o /tmp/do_mounts-3b0a3d.s /tmp/do_mounts-51ce54.s:748: Error: unrecognized opcode: `isel' GCC does include -many, so the GCC driven gas call will succeed: as -v -I ./arch/powerpc/include -I ./arch/powerpc/include/generated -I ./include -I ./arch/powerpc/include/uapi -I ./arch/powerpc/include/generated/uapi -I ./include/uapi -I ./include/generated/uapi -I arch/powerpc -I arch/powerpc -a64 -mpower8 -many -mlittle -maltivec -mpower4 -o init/do_mounts.o Note that isel is power7 and above for IBM CPUs. GCC only generates it for Power9 and above, but the above test was run against the clang generated assembly. Peter Bergner explains: When using -many -mpower4, gas will first try and find a matching power4 mnemonic and failing that, it will then allow any valid mnemonic that gas knows about. GCC's use of -many predates me though. IIRC, Alan looked at trying to remove it, but I forget why he didn't. Could be either a gcc or gas issue at the time. I'm not sure whether issue still exists or not. He and I have modified how gas works internally a fair amount since he tried removing gcc use of -many. I will also note that when using -many, gas will choose the first mnemonic that matches in the mnemonic table and we have (mostly) sorted the table so that server mnemonics show up earlier in the table than other mnemonics, so they'll be seen/chosen first. By explicitly setting -many we can build with Clang and GCC while retaining the -mpower4 option. 
Signed-off-by: Joel Stanley Signed-off-by: Michael Ellerman --- arch/powerpc/Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 974103254aed..6c4f8a099bbb 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -249,7 +249,11 @@ cpu-as-$(CONFIG_4xx) += -Wa,-m405 cpu-as-$(CONFIG_ALTIVEC) += $(call as-option,-Wa$(comma)-maltivec) cpu-as-$(CONFIG_E200) += -Wa,-me200 cpu-as-$(CONFIG_E500) += -Wa,-me500 -cpu-as-$(CONFIG_PPC_BOOK3S_64) += -Wa,-mpower4 + +# When using '-many -mpower4' gas will first try and find a matching power4 +# mnemonic and failing that it will allow any valid mnemonic that GAS knows +# about. GCC will pass -many to GAS when assembling, clang does not. +cpu-as-$(CONFIG_PPC_BOOK3S_64) += -Wa,-mpower4 -Wa,-many cpu-as-$(CONFIG_PPC_E500MC) += $(call as-option,-Wa$(comma)-me500mc) KBUILD_AFLAGS += $(cpu-as-y) From 65b9fdadfc4d87e2577b791fb3495cd39c93d8c0 Mon Sep 17 00:00:00 2001 From: Michael Bringmann Date: Tue, 9 Oct 2018 15:12:14 -0500 Subject: [PATCH 122/221] powerpc/pseries/mobility: Extend start/stop topology update scope The powerpc mobility code may receive RTAS requests to perform PRRN (Platform Resource Reassignment Notification) topology changes at any time, including during LPAR migration operations. In some configurations where the affinity of CPUs or memory is being changed on that platform, the PRRN requests may apply or refer to outdated information prior to the complete update of the device-tree. This patch changes the duration for which topology updates are suppressed during LPAR migrations from just the rtas_ibm_suspend_me() / 'ibm,suspend-me' call(s) to cover the entire migration_store() operation to allow all changes to the device-tree to be applied prior to accepting and applying any PRRN requests. For tracking purposes, pr_info notices are added to the functions start_topology_update() and stop_topology_update() of 'numa.c'. Signed-off-by: Michael Bringmann Reviewed-by: Nathan Fontenot Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/rtas.c | 2 -- arch/powerpc/mm/numa.c | 6 ++++++ arch/powerpc/platforms/pseries/mobility.c | 5 +++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 2c7ed31c736e..e02ac377ed64 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -982,7 +982,6 @@ int rtas_ibm_suspend_me(u64 handle) } cpu_hotplug_disable(); - stop_topology_update(); /* Call function on all CPUs. One of us will make the * rtas call @@ -995,7 +994,6 @@ int rtas_ibm_suspend_me(u64 handle) if (atomic_read(&data.error) != 0) printk(KERN_ERR "Error doing global join\n"); - start_topology_update(); cpu_hotplug_enable(); /* Take down CPUs not online prior to suspend */ diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 055b211b7126..693ae1c1acba 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1521,6 +1521,10 @@ int start_topology_update(void) } } + pr_info("Starting topology update%s%s\n", + (prrn_enabled ? " prrn_enabled" : ""), + (vphn_enabled ? 
" vphn_enabled" : "")); + return rc; } @@ -1542,6 +1546,8 @@ int stop_topology_update(void) rc = del_timer_sync(&topology_timer); } + pr_info("Stopping topology update\n"); + return rc; } diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index 6f27d00505cf..88925f8ca8a0 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -360,6 +360,8 @@ static ssize_t migration_store(struct class *class, if (rc) return rc; + stop_topology_update(); + do { rc = rtas_ibm_suspend_me(streamid); if (rc == -EAGAIN) @@ -370,6 +372,9 @@ static ssize_t migration_store(struct class *class, return rc; post_mobility_fixup(); + + start_topology_update(); + return count; } From ed9e84a4d703243a232e6549a13dedfaf0d5d2d8 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Fri, 12 Oct 2018 13:14:06 +1030 Subject: [PATCH 123/221] powerpc: Use SWITCH_FRAME_SIZE for prom and rtas entry Commit 6c1719942e19 ("powerpc/of: Remove useless register save/restore when calling OF back") removed the saving of srr0 and srr1 when calling into OpenFirmware. Commit e31aa453bbc4 ("powerpc: Use LOAD_REG_IMMEDIATE only for constants on 64-bit") did the same for rtas. This means we don't need to save the extra stack space and can use the common SWITCH_FRAME_SIZE. There were already no users of _SRR0 and _SRR1 so we can remove them too. Link: https://github.com/linuxppc/linux/issues/83 Signed-off-by: Joel Stanley Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/asm-offsets.c | 9 --------- arch/powerpc/kernel/entry_64.S | 10 +++++----- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index a6d70fd2e499..2eb4923f8468 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -280,11 +280,6 @@ int main(void) /* Interrupt register frame */ DEFINE(INT_FRAME_SIZE, STACK_INT_FRAME_SIZE); DEFINE(SWITCH_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs)); -#ifdef CONFIG_PPC64 - /* Create extra stack space for SRR0 and SRR1 when calling prom/rtas. */ - DEFINE(PROM_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs) + 16); - DEFINE(RTAS_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs) + 16); -#endif /* CONFIG_PPC64 */ STACK_PT_REGS_OFFSET(GPR0, gpr[0]); STACK_PT_REGS_OFFSET(GPR1, gpr[1]); STACK_PT_REGS_OFFSET(GPR2, gpr[2]); @@ -328,10 +323,6 @@ int main(void) STACK_PT_REGS_OFFSET(_ESR, dsisr); #else /* CONFIG_PPC64 */ STACK_PT_REGS_OFFSET(SOFTE, softe); - - /* These _only_ to be used with {PROM,RTAS}_FRAME_SIZE!!! */ - DEFINE(_SRR0, STACK_FRAME_OVERHEAD+sizeof(struct pt_regs)); - DEFINE(_SRR1, STACK_FRAME_OVERHEAD+sizeof(struct pt_regs)+8); #endif /* CONFIG_PPC64 */ #if defined(CONFIG_PPC32) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index ed6f6c7f4264..7db00ee6be48 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -1124,7 +1124,7 @@ _ASM_NOKPROBE_SYMBOL(fast_exception_return); _GLOBAL(enter_rtas) mflr r0 std r0,16(r1) - stdu r1,-RTAS_FRAME_SIZE(r1) /* Save SP and create stack space. */ + stdu r1,-SWITCH_FRAME_SIZE(r1) /* Save SP and create stack space. */ /* Because RTAS is running in 32b mode, it clobbers the high order half * of all registers that it saves. 
We therefore save those registers @@ -1256,7 +1256,7 @@ rtas_restore_regs: ld r8,_DSISR(r1) mtdsisr r8 - addi r1,r1,RTAS_FRAME_SIZE /* Unstack our frame */ + addi r1,r1,SWITCH_FRAME_SIZE /* Unstack our frame */ ld r0,16(r1) /* get return address */ mtlr r0 @@ -1267,7 +1267,7 @@ rtas_restore_regs: _GLOBAL(enter_prom) mflr r0 std r0,16(r1) - stdu r1,-PROM_FRAME_SIZE(r1) /* Save SP and create stack space */ + stdu r1,-SWITCH_FRAME_SIZE(r1) /* Save SP and create stack space */ /* Because PROM is running in 32b mode, it clobbers the high order half * of all registers that it saves. We therefore save those registers @@ -1324,8 +1324,8 @@ _GLOBAL(enter_prom) REST_10GPRS(22, r1) ld r4,_CCR(r1) mtcr r4 - - addi r1,r1,PROM_FRAME_SIZE + + addi r1,r1,SWITCH_FRAME_SIZE ld r0,16(r1) mtlr r0 blr From 50530f5eac0c023cfc313d7ed342d4f1731becdb Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 12 Oct 2018 13:58:52 +1100 Subject: [PATCH 124/221] powerpc/xmon: Show the stack protector canary in xmon This is helpful for debugging stack protector crashes. Signed-off-by: Michael Ellerman --- arch/powerpc/xmon/xmon.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index c70d17c9a6ba..d139741f26fe 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2418,6 +2418,9 @@ static void dump_one_paca(int cpu) DUMP(p, __current, "%-*px"); DUMP(p, kstack, "%#-*llx"); printf(" %-*s = 0x%016llx\n", 25, "kstack_base", p->kstack & ~(THREAD_SIZE - 1)); +#ifdef CONFIG_STACKPROTECTOR + DUMP(p, canary, "%#-*lx"); +#endif DUMP(p, saved_r1, "%#-*llx"); DUMP(p, trap_save, "%#-*x"); DUMP(p, irq_soft_mask, "%#-*x"); From bf6cbd0c87f30d0e4401be91a8161ce11079027a Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 12 Oct 2018 22:09:09 +1100 Subject: [PATCH 125/221] powerpc: Fix stackprotector detection for non-glibc toolchains If GCC is not built with glibc support then we must explicitly tell it which register to use for TLS mode stack protector, otherwise it will error out and the cc-option check will fail. Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1888636c9eb6..3d008115fe18 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -180,7 +180,8 @@ config PPC select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_CBPF_JIT if !PPC64 - select HAVE_STACKPROTECTOR if $(cc-option,-mstack-protector-guard=tls) + select HAVE_STACKPROTECTOR if PPC64 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r13) + select HAVE_STACKPROTECTOR if PPC32 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r2) select HAVE_CONTEXT_TRACKING if PPC64 select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_STACKOVERFLOW From 425752c63b6f3fed7b5a9cba2b8101a92cf36995 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 11 Oct 2018 11:03:01 +0530 Subject: [PATCH 126/221] powerpc: Detect the presence of big-cores via "ibm, thread-groups" On IBM POWER9, the device tree exposes a property array identifed by "ibm,thread-groups" which will indicate which groups of threads share a particular set of resources. As of today we only have one form of grouping identifying the group of threads in the core that share the L1 cache, translation cache and instruction data flow. 
This patch adds helper functions to parse the contents of "ibm,thread-groups" and populate a per-cpu variable to cache information about siblings of each CPU that share the L1, traslation cache and instruction data-flow. It also defines a new global variable named "has_big_cores" which indicates if the cores on this configuration have multiple groups of threads that share L1 cache. For each online CPU, it maintains a cpu_smallcore_mask, which indicates the online siblings which share the L1-cache with it. Signed-off-by: Gautham R. Shenoy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/cputhreads.h | 2 + arch/powerpc/include/asm/smp.h | 11 ++ arch/powerpc/kernel/smp.c | 222 ++++++++++++++++++++++++++ 3 files changed, 235 insertions(+) diff --git a/arch/powerpc/include/asm/cputhreads.h b/arch/powerpc/include/asm/cputhreads.h index d71a90924f3b..deb99fd6e060 100644 --- a/arch/powerpc/include/asm/cputhreads.h +++ b/arch/powerpc/include/asm/cputhreads.h @@ -23,11 +23,13 @@ extern int threads_per_core; extern int threads_per_subcore; extern int threads_shift; +extern bool has_big_cores; extern cpumask_t threads_core_mask; #else #define threads_per_core 1 #define threads_per_subcore 1 #define threads_shift 0 +#define has_big_cores 0 #define threads_core_mask (*get_cpu_mask(0)) #endif diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index 95b66a0c639b..41695745032c 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -100,6 +100,7 @@ static inline void set_hard_smp_processor_id(int cpu, int phys) DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map); DECLARE_PER_CPU(cpumask_var_t, cpu_l2_cache_map); DECLARE_PER_CPU(cpumask_var_t, cpu_core_map); +DECLARE_PER_CPU(cpumask_var_t, cpu_smallcore_map); static inline struct cpumask *cpu_sibling_mask(int cpu) { @@ -116,6 +117,11 @@ static inline struct cpumask *cpu_l2_cache_mask(int cpu) return per_cpu(cpu_l2_cache_map, cpu); } +static inline struct cpumask *cpu_smallcore_mask(int cpu) +{ + return per_cpu(cpu_smallcore_map, cpu); +} + extern int cpu_to_core_id(int cpu); /* Since OpenPIC has only 4 IPIs, we use slightly different message numbers. @@ -166,6 +172,11 @@ static inline const struct cpumask *cpu_sibling_mask(int cpu) return cpumask_of(cpu); } +static inline const struct cpumask *cpu_smallcore_mask(int cpu) +{ + return cpumask_of(cpu); +} + #endif /* CONFIG_SMP */ #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index e774d3bf3a03..8d245ff059c9 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -75,14 +75,32 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 }; #endif struct thread_info *secondary_ti; +bool has_big_cores; DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); +DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map); DEFINE_PER_CPU(cpumask_var_t, cpu_l2_cache_map); DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map); EXPORT_PER_CPU_SYMBOL(cpu_core_map); +EXPORT_SYMBOL_GPL(has_big_cores); + +#define MAX_THREAD_LIST_SIZE 8 +#define THREAD_GROUP_SHARE_L1 1 +struct thread_groups { + unsigned int property; + unsigned int nr_groups; + unsigned int threads_per_group; + unsigned int thread_list[MAX_THREAD_LIST_SIZE]; +}; + +/* + * On big-cores system, cpu_l1_cache_map for each CPU corresponds to + * the set its siblings that share the L1-cache. 
+ */ +DEFINE_PER_CPU(cpumask_var_t, cpu_l1_cache_map); /* SMP operations for this machine */ struct smp_ops_t *smp_ops; @@ -675,6 +693,185 @@ static void set_cpus_unrelated(int i, int j, } #endif +/* + * parse_thread_groups: Parses the "ibm,thread-groups" device tree + * property for the CPU device node @dn and stores + * the parsed output in the thread_groups + * structure @tg if the ibm,thread-groups[0] + * matches @property. + * + * @dn: The device node of the CPU device. + * @tg: Pointer to a thread group structure into which the parsed + * output of "ibm,thread-groups" is stored. + * @property: The property of the thread-group that the caller is + * interested in. + * + * ibm,thread-groups[0..N-1] array defines which group of threads in + * the CPU-device node can be grouped together based on the property. + * + * ibm,thread-groups[0] tells us the property based on which the + * threads are being grouped together. If this value is 1, it implies + * that the threads in the same group share L1, translation cache. + * + * ibm,thread-groups[1] tells us how many such thread groups exist. + * + * ibm,thread-groups[2] tells us the number of threads in each such + * group. + * + * ibm,thread-groups[3..N-1] is the list of threads identified by + * "ibm,ppc-interrupt-server#s" arranged as per their membership in + * the grouping. + * + * Example: If ibm,thread-groups = [1,2,4,5,6,7,8,9,10,11,12] it + * implies that there are 2 groups of 4 threads each, where each group + * of threads share L1, translation cache. + * + * The "ibm,ppc-interrupt-server#s" of the first group is {5,6,7,8} + * and the "ibm,ppc-interrupt-server#s" of the second group is {9, 10, + * 11, 12} structure + * + * Returns 0 on success, -EINVAL if the property does not exist, + * -ENODATA if property does not have a value, and -EOVERFLOW if the + * property data isn't large enough. + */ +static int parse_thread_groups(struct device_node *dn, + struct thread_groups *tg, + unsigned int property) +{ + int i; + u32 thread_group_array[3 + MAX_THREAD_LIST_SIZE]; + u32 *thread_list; + size_t total_threads; + int ret; + + ret = of_property_read_u32_array(dn, "ibm,thread-groups", + thread_group_array, 3); + if (ret) + return ret; + + tg->property = thread_group_array[0]; + tg->nr_groups = thread_group_array[1]; + tg->threads_per_group = thread_group_array[2]; + if (tg->property != property || + tg->nr_groups < 1 || + tg->threads_per_group < 1) + return -ENODATA; + + total_threads = tg->nr_groups * tg->threads_per_group; + + ret = of_property_read_u32_array(dn, "ibm,thread-groups", + thread_group_array, + 3 + total_threads); + if (ret) + return ret; + + thread_list = &thread_group_array[3]; + + for (i = 0 ; i < total_threads; i++) + tg->thread_list[i] = thread_list[i]; + + return 0; +} + +/* + * get_cpu_thread_group_start : Searches the thread group in tg->thread_list + * that @cpu belongs to. + * + * @cpu : The logical CPU whose thread group is being searched. + * @tg : The thread-group structure of the CPU node which @cpu belongs + * to. + * + * Returns the index to tg->thread_list that points to the the start + * of the thread_group that @cpu belongs to. + * + * Returns -1 if cpu doesn't belong to any of the groups pointed to by + * tg->thread_list. 
+ */ +static int get_cpu_thread_group_start(int cpu, struct thread_groups *tg) +{ + int hw_cpu_id = get_hard_smp_processor_id(cpu); + int i, j; + + for (i = 0; i < tg->nr_groups; i++) { + int group_start = i * tg->threads_per_group; + + for (j = 0; j < tg->threads_per_group; j++) { + int idx = group_start + j; + + if (tg->thread_list[idx] == hw_cpu_id) + return group_start; + } + } + + return -1; +} + +static int init_cpu_l1_cache_map(int cpu) + +{ + struct device_node *dn = of_get_cpu_node(cpu, NULL); + struct thread_groups tg = {.property = 0, + .nr_groups = 0, + .threads_per_group = 0}; + int first_thread = cpu_first_thread_sibling(cpu); + int i, cpu_group_start = -1, err = 0; + + if (!dn) + return -ENODATA; + + err = parse_thread_groups(dn, &tg, THREAD_GROUP_SHARE_L1); + if (err) + goto out; + + zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu), + GFP_KERNEL, + cpu_to_node(cpu)); + + cpu_group_start = get_cpu_thread_group_start(cpu, &tg); + + if (unlikely(cpu_group_start == -1)) { + WARN_ON_ONCE(1); + err = -ENODATA; + goto out; + } + + for (i = first_thread; i < first_thread + threads_per_core; i++) { + int i_group_start = get_cpu_thread_group_start(i, &tg); + + if (unlikely(i_group_start == -1)) { + WARN_ON_ONCE(1); + err = -ENODATA; + goto out; + } + + if (i_group_start == cpu_group_start) + cpumask_set_cpu(i, per_cpu(cpu_l1_cache_map, cpu)); + } + +out: + of_node_put(dn); + return err; +} + +static int init_big_cores(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + int err = init_cpu_l1_cache_map(cpu); + + if (err) + return err; + + zalloc_cpumask_var_node(&per_cpu(cpu_smallcore_map, cpu), + GFP_KERNEL, + cpu_to_node(cpu)); + } + + has_big_cores = true; + return 0; +} + void __init smp_prepare_cpus(unsigned int max_cpus) { unsigned int cpu; @@ -713,6 +910,12 @@ void __init smp_prepare_cpus(unsigned int max_cpus) cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid)); cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid)); + init_big_cores(); + if (has_big_cores) { + cpumask_set_cpu(boot_cpuid, + cpu_smallcore_mask(boot_cpuid)); + } + if (smp_ops && smp_ops->probe) smp_ops->probe(); } @@ -1003,10 +1206,28 @@ static void remove_cpu_from_masks(int cpu) set_cpus_unrelated(cpu, i, cpu_core_mask); set_cpus_unrelated(cpu, i, cpu_l2_cache_mask); set_cpus_unrelated(cpu, i, cpu_sibling_mask); + if (has_big_cores) + set_cpus_unrelated(cpu, i, cpu_smallcore_mask); } } #endif +static inline void add_cpu_to_smallcore_masks(int cpu) +{ + struct cpumask *this_l1_cache_map = per_cpu(cpu_l1_cache_map, cpu); + int i, first_thread = cpu_first_thread_sibling(cpu); + + if (!has_big_cores) + return; + + cpumask_set_cpu(cpu, cpu_smallcore_mask(cpu)); + + for (i = first_thread; i < first_thread + threads_per_core; i++) { + if (cpu_online(i) && cpumask_test_cpu(i, this_l1_cache_map)) + set_cpus_related(i, cpu, cpu_smallcore_mask); + } +} + static void add_cpu_to_masks(int cpu) { int first_thread = cpu_first_thread_sibling(cpu); @@ -1023,6 +1244,7 @@ static void add_cpu_to_masks(int cpu) if (cpu_online(i)) set_cpus_related(i, cpu, cpu_sibling_mask); + add_cpu_to_smallcore_masks(cpu); /* * Copy the thread sibling mask into the cache sibling mask * and mark any CPUs that share an L2 with this CPU. From 8e8a31d7fd54d68fc9c6c1e69f52ccdaf43b01ea Mon Sep 17 00:00:00 2001 From: "Gautham R. 
Shenoy" Date: Thu, 11 Oct 2018 11:03:02 +0530 Subject: [PATCH 127/221] powerpc: Use cpu_smallcore_sibling_mask at SMT level on bigcores POWER9 SMT8 cores consist of two groups of threads, where threads in each group shares L1-cache. The scheduler is not aware of this distinction as the current sched-domain hierarchy has all the threads of the core defined at the SMT domain. SMT [Thread siblings of the SMT8 core] DIE [CPUs in the same die] NUMA [All the CPUs in the system] Due to this, we can observe run-to-run variance when we run a multi-threaded benchmark bound to a single core based on how the scheduler spreads the software threads across the two groups in the core. We fix this in this patch by defining each group of threads which share L1-cache to be the SMT level. The group of threads in the SMT8 core is defined to be the CACHE level. The sched-domain hierarchy after this patch will be : SMT [Thread siblings in the core that share L1 cache] CACHE [Thread siblings that are in the SMT8 core] DIE [CPUs in the same die] NUMA [All the CPUs in the system] Signed-off-by: Gautham R. Shenoy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/smp.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 8d245ff059c9..8e3a5da24d59 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1274,6 +1274,7 @@ static bool shared_caches; void start_secondary(void *unused) { unsigned int cpu = smp_processor_id(); + struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask; mmgrab(&init_mm); current->active_mm = &init_mm; @@ -1299,11 +1300,13 @@ void start_secondary(void *unused) /* Update topology CPU masks */ add_cpu_to_masks(cpu); + if (has_big_cores) + sibling_mask = cpu_smallcore_mask; /* * Check for any shared caches. Note that this must be done on a * per-core basis because one core in the pair might be disabled. */ - if (!cpumask_equal(cpu_l2_cache_mask(cpu), cpu_sibling_mask(cpu))) + if (!cpumask_equal(cpu_l2_cache_mask(cpu), sibling_mask(cpu))) shared_caches = true; set_numa_node(numa_cpu_lookup_table[cpu]); @@ -1370,6 +1373,13 @@ static const struct cpumask *shared_cache_mask(int cpu) return cpu_l2_cache_mask(cpu); } +#ifdef CONFIG_SCHED_SMT +static const struct cpumask *smallcore_smt_mask(int cpu) +{ + return cpu_smallcore_mask(cpu); +} +#endif + static struct sched_domain_topology_level power9_topology[] = { #ifdef CONFIG_SCHED_SMT { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, @@ -1397,6 +1407,13 @@ void __init smp_cpus_done(unsigned int max_cpus) shared_proc_topology_init(); dump_numa_cpu_topology(); +#ifdef CONFIG_SCHED_SMT + if (has_big_cores) { + pr_info("Using small cores at SMT level\n"); + power9_topology[0].mask = smallcore_smt_mask; + powerpc_topology[0].mask = smallcore_smt_mask; + } +#endif /* * If any CPU detects that it's sharing a cache with another CPU then * use the deeper topology that is aware of this sharing. From 500fe5f550ec6dbc1d7d3154d8eccc7cb834d434 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 11 Oct 2018 11:03:03 +0530 Subject: [PATCH 128/221] powerpc/cacheinfo: Report the correct shared_cpu_map on big-cores Currently on POWER9 SMT8 cores systems, in sysfs, we report the shared_cache_map for L1 caches (both data and instruction) to be the cpu-ids of the threads in SMT8 cores. This is incorrect since on POWER9 SMT8 cores there are two groups of threads, each of which shares its own L1 cache. 
This patch addresses this by reporting the shared_cpu_map correctly in sysfs for L1 caches. Before the patch /sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_map : 000000ff /sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_map : 000000ff /sys/devices/system/cpu/cpu1/cache/index0/shared_cpu_map : 000000ff /sys/devices/system/cpu/cpu1/cache/index1/shared_cpu_map : 000000ff After the patch /sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_map : 00000055 /sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_map : 00000055 /sys/devices/system/cpu/cpu1/cache/index0/shared_cpu_map : 000000aa /sys/devices/system/cpu/cpu1/cache/index1/shared_cpu_map : 000000aa Signed-off-by: Gautham R. Shenoy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/cacheinfo.c | 37 +++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c index a8f20e5928e1..be57bd07596d 100644 --- a/arch/powerpc/kernel/cacheinfo.c +++ b/arch/powerpc/kernel/cacheinfo.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include "cacheinfo.h" @@ -627,17 +629,48 @@ static ssize_t level_show(struct kobject *k, struct kobj_attribute *attr, char * static struct kobj_attribute cache_level_attr = __ATTR(level, 0444, level_show, NULL); +static unsigned int index_dir_to_cpu(struct cache_index_dir *index) +{ + struct kobject *index_dir_kobj = &index->kobj; + struct kobject *cache_dir_kobj = index_dir_kobj->parent; + struct kobject *cpu_dev_kobj = cache_dir_kobj->parent; + struct device *dev = kobj_to_dev(cpu_dev_kobj); + + return dev->id; +} + +/* + * On big-core systems, each core has two groups of CPUs each of which + * has its own L1-cache. The thread-siblings which share l1-cache with + * @cpu can be obtained via cpu_smallcore_mask(). + */ +static const struct cpumask *get_big_core_shared_cpu_map(int cpu, struct cache *cache) +{ + if (cache->level == 1) + return cpu_smallcore_mask(cpu); + + return &cache->shared_cpu_map; +} + static ssize_t shared_cpu_map_show(struct kobject *k, struct kobj_attribute *attr, char *buf) { struct cache_index_dir *index; struct cache *cache; - int ret; + const struct cpumask *mask; + int ret, cpu; index = kobj_to_cache_index_dir(k); cache = index->cache; + if (has_big_cores) { + cpu = index_dir_to_cpu(index); + mask = get_big_core_shared_cpu_map(cpu, cache); + } else { + mask = &cache->shared_cpu_map; + } + ret = scnprintf(buf, PAGE_SIZE - 1, "%*pb\n", - cpumask_pr_args(&cache->shared_cpu_map)); + cpumask_pr_args(mask)); buf[ret++] = '\n'; buf[ret] = '\0'; return ret; From dfd718a2ed1f678e66749ffe41bdeafedf3f4314 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Mon, 1 Oct 2018 16:10:39 +0530 Subject: [PATCH 129/221] powerpc/rtas: Fix a potential race between CPU-Offline & Migration Live Partition Migrations require all the present CPUs to execute the H_JOIN call, and hence rtas_ibm_suspend_me() onlines any offline CPUs before initiating the migration for this purpose. The commit 85a88cabad57 ("powerpc/pseries: Disable CPU hotplug across migrations") disables any CPU-hotplug operations once all the offline CPUs are brought online to prevent any further state change. Once the CPU-Hotplug operation is disabled, the code assumes that all the CPUs are online. 
However, there is a minor window in rtas_ibm_suspend_me() between onlining the offline CPUs and disabling CPU-Hotplug when a concurrent CPU-offline operation initiated by userspace can succeed, thereby nullifying the aforementioned assumption. In this unlikely case, these offlined CPUs will not call H_JOIN, resulting in a system hang. Fix this by verifying that all the present CPUs are actually online after CPU-Hotplug has been disabled, failing which we restore the state of the offline CPUs in rtas_ibm_suspend_me() and return -EBUSY. Cc: Nathan Fontenot Cc: Tyrel Datwyler Suggested-by: Michael Ellerman Signed-off-by: Gautham R. Shenoy Reviewed-by: Nathan Fontenot Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/rtas.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index e02ac377ed64..de35bd8f047f 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -983,6 +983,14 @@ int rtas_ibm_suspend_me(u64 handle) cpu_hotplug_disable(); + /* Check if we raced with a CPU-Offline Operation */ + if (unlikely(!cpumask_equal(cpu_present_mask, cpu_online_mask))) { + pr_err("%s: Raced against a concurrent CPU-Offline\n", + __func__); + atomic_set(&data.error, -EBUSY); + goto out_hotplug_enable; + } + /* Call function on all CPUs. One of us will make the * rtas call */ @@ -994,6 +1002,7 @@ int rtas_ibm_suspend_me(u64 handle) if (atomic_read(&data.error) != 0) printk(KERN_ERR "Error doing global join\n"); +out_hotplug_enable: cpu_hotplug_enable(); /* Take down CPUs not online prior to suspend */ From 86c391bd5f47101acf1f3e0abd9fe0616ae2a4fb Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 9 Oct 2018 13:51:33 +0000 Subject: [PATCH 130/221] powerpc/32: Add ioremap_wt() and ioremap_coherent() Other arches have ioremap_wt() to map IO areas write-through.
Implement it on PPC as well in order to avoid drivers using __ioremap(_PAGE_WRITETHRU) Also implement ioremap_coherent() to avoid drivers using __ioremap(_PAGE_COHERENT) Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/io.h | 9 +++++++++ arch/powerpc/mm/pgtable_32.c | 16 ++++++++++++++++ arch/powerpc/mm/pgtable_64.c | 10 ++++++++++ 3 files changed, 35 insertions(+) diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index e0331e754568..cdccab3938db 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -3,6 +3,9 @@ #ifdef __KERNEL__ #define ARCH_HAS_IOREMAP_WC +#ifdef CONFIG_PPC32 +#define ARCH_HAS_IOREMAP_WT +#endif /* * This program is free software; you can redistribute it and/or @@ -746,6 +749,10 @@ static inline void iosync(void) * * * ioremap_wc enables write combining * + * * ioremap_wt enables write through + * + * * ioremap_coherent maps coherent cached memory + * * * iounmap undoes such a mapping and can be hooked * * * __ioremap_at (and the pending __iounmap_at) are low level functions to @@ -767,6 +774,8 @@ extern void __iomem *ioremap(phys_addr_t address, unsigned long size); extern void __iomem *ioremap_prot(phys_addr_t address, unsigned long size, unsigned long flags); extern void __iomem *ioremap_wc(phys_addr_t address, unsigned long size); +void __iomem *ioremap_wt(phys_addr_t address, unsigned long size); +void __iomem *ioremap_coherent(phys_addr_t address, unsigned long size); #define ioremap_nocache(addr, size) ioremap((addr), (size)) #define ioremap_uc(addr, size) ioremap((addr), (size)) #define ioremap_cache(addr, size) \ diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 120a49bfb9c6..4c3adde09d95 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -89,6 +89,22 @@ ioremap_wc(phys_addr_t addr, unsigned long size) } EXPORT_SYMBOL(ioremap_wc); +void __iomem * +ioremap_wt(phys_addr_t addr, unsigned long size) +{ + return __ioremap_caller(addr, size, _PAGE_WRITETHRU, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_wt); + +void __iomem * +ioremap_coherent(phys_addr_t addr, unsigned long size) +{ + return __ioremap_caller(addr, size, _PAGE_COHERENT, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_coherent); + void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) { diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index e15e63079ba8..c0f356d9b135 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -222,6 +222,16 @@ void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size) return __ioremap_caller(addr, size, flags, caller); } +void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size) +{ + unsigned long flags = pgprot_val(pgprot_cached(__pgprot(0))); + void *caller = __builtin_return_address(0); + + if (ppc_md.ioremap) + return ppc_md.ioremap(addr, size, flags, caller); + return __ioremap_caller(addr, size, flags, caller); +} + void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) { From e04e39507c3c3da9cba31ee2e52f51b10b6350d0 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 9 Oct 2018 13:51:35 +0000 Subject: [PATCH 131/221] drivers/video/fbdev: use ioremap_wc/wt() instead of __ioremap() _PAGE_NO_CACHE is a platform specific flag. 
In addition, this flag is misleading because one would think it requests a noncached page whereas a noncached page is _PAGE_NO_CACHE | _PAGE_GUARDED _PAGE_NO_CACHE alone means write combined noncached page, so lets use ioremap_wc() instead. _PAGE_WRITETHRU is also platform specific flag. Use ioremap_wt() instead. Signed-off-by: Christophe Leroy Acked-by: Daniel Vetter Acked-by: Bartlomiej Zolnierkiewicz Signed-off-by: Michael Ellerman --- drivers/video/fbdev/chipsfb.c | 3 +-- drivers/video/fbdev/controlfb.c | 5 +---- drivers/video/fbdev/platinumfb.c | 5 +---- drivers/video/fbdev/valkyriefb.c | 12 ++++++------ 4 files changed, 9 insertions(+), 16 deletions(-) diff --git a/drivers/video/fbdev/chipsfb.c b/drivers/video/fbdev/chipsfb.c index f103665cad43..40182ed85648 100644 --- a/drivers/video/fbdev/chipsfb.c +++ b/drivers/video/fbdev/chipsfb.c @@ -27,7 +27,6 @@ #include #include #include -#include #ifdef CONFIG_PMAC_BACKLIGHT #include @@ -401,7 +400,7 @@ static int chipsfb_pci_init(struct pci_dev *dp, const struct pci_device_id *ent) #endif /* CONFIG_PMAC_BACKLIGHT */ #ifdef CONFIG_PPC - p->screen_base = __ioremap(addr, 0x200000, _PAGE_NO_CACHE); + p->screen_base = ioremap_wc(addr, 0x200000); #else p->screen_base = ioremap(addr, 0x200000); #endif diff --git a/drivers/video/fbdev/controlfb.c b/drivers/video/fbdev/controlfb.c index 8d14b29aafea..9cb0ef7ac29e 100644 --- a/drivers/video/fbdev/controlfb.c +++ b/drivers/video/fbdev/controlfb.c @@ -48,9 +48,7 @@ #include #include #include -#include #include -#include #include #include "macmodes.h" @@ -715,8 +713,7 @@ static int __init control_of_init(struct device_node *dp) goto error_out; } /* map at most 8MB for the frame buffer */ - p->frame_buffer = __ioremap(p->frame_buffer_phys, 0x800000, - _PAGE_WRITETHRU); + p->frame_buffer = ioremap_wt(p->frame_buffer_phys, 0x800000); if (!p->control_regs_phys || !request_mem_region(p->control_regs_phys, p->control_regs_size, diff --git a/drivers/video/fbdev/platinumfb.c b/drivers/video/fbdev/platinumfb.c index 377d3399a3ad..bf6b7fb83cf4 100644 --- a/drivers/video/fbdev/platinumfb.c +++ b/drivers/video/fbdev/platinumfb.c @@ -32,9 +32,7 @@ #include #include #include -#include #include -#include #include "macmodes.h" #include "platinumfb.h" @@ -577,8 +575,7 @@ static int platinumfb_probe(struct platform_device* odev) /* frame buffer - map only 4MB */ pinfo->frame_buffer_phys = pinfo->rsrc_fb.start; - pinfo->frame_buffer = __ioremap(pinfo->rsrc_fb.start, 0x400000, - _PAGE_WRITETHRU); + pinfo->frame_buffer = ioremap_wt(pinfo->rsrc_fb.start, 0x400000); pinfo->base_frame_buffer = pinfo->frame_buffer; /* registers */ diff --git a/drivers/video/fbdev/valkyriefb.c b/drivers/video/fbdev/valkyriefb.c index 275fb98236d3..d51c3a8009cb 100644 --- a/drivers/video/fbdev/valkyriefb.c +++ b/drivers/video/fbdev/valkyriefb.c @@ -54,13 +54,11 @@ #include #include #include -#include #ifdef CONFIG_MAC #include #else #include #endif -#include #include "macmodes.h" #include "valkyriefb.h" @@ -318,7 +316,7 @@ static void __init valkyrie_choose_mode(struct fb_info_valkyrie *p) int __init valkyriefb_init(void) { struct fb_info_valkyrie *p; - unsigned long frame_buffer_phys, cmap_regs_phys, flags; + unsigned long frame_buffer_phys, cmap_regs_phys; int err; char *option = NULL; @@ -337,7 +335,6 @@ int __init valkyriefb_init(void) /* Hardcoded addresses... welcome to 68k Macintosh country :-) */ frame_buffer_phys = 0xf9000000; cmap_regs_phys = 0x50f24000; - flags = IOMAP_NOCACHE_SER; /* IOMAP_WRITETHROUGH?? 
*/ #else /* ppc (!CONFIG_MAC) */ { struct device_node *dp; @@ -354,7 +351,6 @@ int __init valkyriefb_init(void) frame_buffer_phys = r.start; cmap_regs_phys = r.start + 0x304000; - flags = _PAGE_WRITETHRU; } #endif /* ppc (!CONFIG_MAC) */ @@ -369,7 +365,11 @@ int __init valkyriefb_init(void) } p->total_vram = 0x100000; p->frame_buffer_phys = frame_buffer_phys; - p->frame_buffer = __ioremap(frame_buffer_phys, p->total_vram, flags); +#ifdef CONFIG_MAC + p->frame_buffer = ioremap_nocache(frame_buffer_phys, p->total_vram); +#else + p->frame_buffer = ioremap_wt(frame_buffer_phys, p->total_vram); +#endif p->cmap_regs_phys = cmap_regs_phys; p->cmap_regs = ioremap(p->cmap_regs_phys, 0x1000); p->valkyrie_regs_phys = cmap_regs_phys+0x6000; From ed18e423a3d9b2dc9db801358b754e722fcabaff Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 9 Oct 2018 13:51:37 +0000 Subject: [PATCH 132/221] drivers/block/z2ram: use ioremap_wt() instead of __ioremap(_PAGE_WRITETHRU) _PAGE_WRITETHRU is a target specific flag. Prefer generic functions. Acked-by: Geert Uytterhoeven Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- drivers/block/z2ram.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index d0c5bc4e0703..cfbd70520eeb 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -190,8 +190,7 @@ static int z2_open(struct block_device *bdev, fmode_t mode) vfree(vmalloc (size)); } - vaddr = (unsigned long) __ioremap (paddr, size, - _PAGE_WRITETHRU); + vaddr = (unsigned long)ioremap_wt(paddr, size); #else vaddr = (unsigned long)z_remap_nocache_nonser(paddr, size); From 402a5698b4462a3dcfcf9bdafabed7f63c9be7d8 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 9 Oct 2018 13:51:39 +0000 Subject: [PATCH 133/221] soc/fsl/qbman: use ioremap_cache() instead of ioremap_prot(0) ioremap_prot() with flag set to 0 relies on a hack in __ioremap_caller() which adds PAGE_KERNEL flags when the handed flags don't look like a valid set of flags (ie don't include _PAGE_PRESENT) The intention being to map cached memory, use ioremap_cache() instead. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- drivers/soc/fsl/qbman/qman_ccsr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/fsl/qbman/qman_ccsr.c b/drivers/soc/fsl/qbman/qman_ccsr.c index 79cba58387a5..0fbb201346c7 100644 --- a/drivers/soc/fsl/qbman/qman_ccsr.c +++ b/drivers/soc/fsl/qbman/qman_ccsr.c @@ -418,7 +418,7 @@ static size_t fqd_sz, pfdr_sz; static int zero_priv_mem(phys_addr_t addr, size_t sz) { /* map as cacheable, non-guarded */ - void __iomem *tmpp = ioremap_prot(addr, sz, 0); + void __iomem *tmpp = ioremap_cache(addr, sz); if (!tmpp) return -ENOMEM; From aa91796ec46339f2ed53da311bd3ea77a3e4dfe1 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 9 Oct 2018 13:51:41 +0000 Subject: [PATCH 134/221] powerpc: don't use ioremap_prot() nor __ioremap() unless really needed. In many places, ioremap_prot() and __ioremap() can be replaced with higher level functions like ioremap(), ioremap_coherent(), ioremap_cache(), ioremap_wc() ... 
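As a rough illustration of the conversion pattern (a hypothetical driver snippet, not one of the hunks below), the low-level calls that spell out PTE flags become helpers whose names state the caching attribute:

    #include <linux/errno.h>
    #include <linux/io.h>

    static void __iomem *regs, *sram;

    static int example_map(phys_addr_t reg_phys, phys_addr_t sram_phys,
                           unsigned long size)
    {
            /* was: __ioremap(reg_phys, size, _PAGE_NO_CACHE);
             *      ioremap_prot(sram_phys, size, _PAGE_COHERENT);
             */
            regs = ioremap_wc(reg_phys, size);        /* non-cached, write-combined */
            sram = ioremap_coherent(sram_phys, size); /* cached, coherent */

            return (regs && sram) ? 0 : -ENOMEM;
    }

ioremap_cache() covers the remaining __ioremap(..., 0) / ioremap_prot(..., 0) callers converted below.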
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/btext.c | 2 +- arch/powerpc/kernel/crash_dump.c | 2 +- arch/powerpc/platforms/85xx/smp.c | 4 ++-- arch/powerpc/platforms/pasemi/dma_lib.c | 2 +- arch/powerpc/platforms/ps3/spu.c | 3 +-- arch/powerpc/sysdev/fsl_85xx_cache_sram.c | 8 ++++---- 6 files changed, 10 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c index b2072d5bbf2b..b4241ed1456e 100644 --- a/arch/powerpc/kernel/btext.c +++ b/arch/powerpc/kernel/btext.c @@ -163,7 +163,7 @@ void btext_map(void) offset = ((unsigned long) dispDeviceBase) - base; size = dispDeviceRowBytes * dispDeviceRect[3] + offset + dispDeviceRect[0]; - vbase = __ioremap(base, size, pgprot_val(pgprot_noncached_wc(__pgprot(0)))); + vbase = ioremap_wc(base, size); if (!vbase) return; logicalDisplayBase = vbase + offset; diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c index d10ad258d41a..bbdc4706c159 100644 --- a/arch/powerpc/kernel/crash_dump.c +++ b/arch/powerpc/kernel/crash_dump.c @@ -110,7 +110,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, vaddr = __va(paddr); csize = copy_oldmem_vaddr(vaddr, buf, csize, offset, userbuf); } else { - vaddr = __ioremap(paddr, PAGE_SIZE, 0); + vaddr = ioremap_cache(paddr, PAGE_SIZE); csize = copy_oldmem_vaddr(vaddr, buf, csize, offset, userbuf); iounmap(vaddr); } diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c index 7e966f4cf19a..fff72425727a 100644 --- a/arch/powerpc/platforms/85xx/smp.c +++ b/arch/powerpc/platforms/85xx/smp.c @@ -216,8 +216,8 @@ static int smp_85xx_start_cpu(int cpu) /* Map the spin table */ if (ioremappable) - spin_table = ioremap_prot(*cpu_rel_addr, - sizeof(struct epapr_spin_table), _PAGE_COHERENT); + spin_table = ioremap_coherent(*cpu_rel_addr, + sizeof(struct epapr_spin_table)); else spin_table = phys_to_virt(*cpu_rel_addr); diff --git a/arch/powerpc/platforms/pasemi/dma_lib.c b/arch/powerpc/platforms/pasemi/dma_lib.c index c80f72c370ae..53384eb42a76 100644 --- a/arch/powerpc/platforms/pasemi/dma_lib.c +++ b/arch/powerpc/platforms/pasemi/dma_lib.c @@ -576,7 +576,7 @@ int pasemi_dma_init(void) res.start = 0xfd800000; res.end = res.start + 0x1000; } - dma_status = __ioremap(res.start, resource_size(&res), 0); + dma_status = ioremap_cache(res.start, resource_size(&res)); pci_dev_put(iob_pdev); for (i = 0; i < MAX_TXCH; i++) diff --git a/arch/powerpc/platforms/ps3/spu.c b/arch/powerpc/platforms/ps3/spu.c index b54850845466..7746c2a3c509 100644 --- a/arch/powerpc/platforms/ps3/spu.c +++ b/arch/powerpc/platforms/ps3/spu.c @@ -215,8 +215,7 @@ static int __init setup_areas(struct spu *spu) goto fail_ioremap; } - spu->local_store = (__force void *)ioremap_prot(spu->local_store_phys, - LS_SIZE, pgprot_val(pgprot_noncached_wc(__pgprot(0)))); + spu->local_store = (__force void *)ioremap_wc(spu->local_store_phys, LS_SIZE); if (!spu->local_store) { pr_debug("%s:%d: ioremap local_store failed\n", diff --git a/arch/powerpc/sysdev/fsl_85xx_cache_sram.c b/arch/powerpc/sysdev/fsl_85xx_cache_sram.c index 00ccf3e4fcb4..15cbdd4fde06 100644 --- a/arch/powerpc/sysdev/fsl_85xx_cache_sram.c +++ b/arch/powerpc/sysdev/fsl_85xx_cache_sram.c @@ -107,11 +107,11 @@ int __init instantiate_cache_sram(struct platform_device *dev, goto out_free; } - cache_sram->base_virt = ioremap_prot(cache_sram->base_phys, - cache_sram->size, _PAGE_COHERENT | PAGE_KERNEL); + cache_sram->base_virt = ioremap_coherent(cache_sram->base_phys, + 
cache_sram->size); if (!cache_sram->base_virt) { - dev_err(&dev->dev, "%pOF: ioremap_prot failed\n", - dev->dev.of_node); + dev_err(&dev->dev, "%pOF: ioremap_coherent failed\n", + dev->dev.of_node); ret = -ENOMEM; goto out_release; } From 56f3c1413f5cce0c8f4d6f1ab79d790da5aa61af Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 9 Oct 2018 13:51:43 +0000 Subject: [PATCH 135/221] powerpc/mm: properly set PAGE_KERNEL flags in ioremap() Set PAGE_KERNEL directly in the caller and do not rely on a hack adding PAGE_KERNEL flags when _PAGE_PRESENT is not set. As already done for PPC64, use pgprot_cache() helpers instead of _PAGE_XXX flags in PPC32 ioremap() derived functions. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/pgtable.h | 2 ++ arch/powerpc/kernel/isa-bridge.c | 6 ++--- arch/powerpc/kernel/pci_64.c | 2 +- arch/powerpc/mm/pgtable_32.c | 28 ++++++++++------------- arch/powerpc/mm/pgtable_64.c | 10 +++----- arch/powerpc/platforms/4xx/ocm.c | 7 ++---- drivers/pcmcia/electra_cf.c | 2 +- 7 files changed, 24 insertions(+), 33 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index b321c82b3624..5b82e44c4231 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -197,6 +197,8 @@ extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addre #if _PAGE_WRITETHRU != 0 #define pgprot_cached_wthru(prot) (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | \ _PAGE_COHERENT | _PAGE_WRITETHRU)) +#else +#define pgprot_cached_wthru(prot) pgprot_noncached(prot) #endif #define pgprot_cached_noncoherent(prot) \ diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c index 1df6c74aa731..072e384f8c86 100644 --- a/arch/powerpc/kernel/isa-bridge.c +++ b/arch/powerpc/kernel/isa-bridge.c @@ -110,14 +110,14 @@ static void pci_process_ISA_OF_ranges(struct device_node *isa_node, size = 0x10000; __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE, - size, pgprot_val(pgprot_noncached(__pgprot(0)))); + size, pgprot_val(pgprot_noncached(PAGE_KERNEL))); return; inval_range: printk(KERN_ERR "no ISA IO ranges or unexpected isa range, " "mapping 64k\n"); __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE, - 0x10000, pgprot_val(pgprot_noncached(__pgprot(0)))); + 0x10000, pgprot_val(pgprot_noncached(PAGE_KERNEL))); } @@ -253,7 +253,7 @@ void __init isa_bridge_init_non_pci(struct device_node *np) */ isa_io_base = ISA_IO_BASE; __ioremap_at(pbase, (void *)ISA_IO_BASE, - size, pgprot_val(pgprot_noncached(__pgprot(0)))); + size, pgprot_val(pgprot_noncached(PAGE_KERNEL))); pr_debug("ISA: Non-PCI bridge is %pOF\n", np); } diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index dff28f903512..64bb4dd2b8f1 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -159,7 +159,7 @@ static int pcibios_map_phb_io_space(struct pci_controller *hose) /* Establish the mapping */ if (__ioremap_at(phys_page, area->addr, size_page, - pgprot_val(pgprot_noncached(__pgprot(0)))) == NULL) + pgprot_val(pgprot_noncached(PAGE_KERNEL))) == NULL) return -ENOMEM; /* Fixup hose IO resource */ diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 4c3adde09d95..6a81a2446c47 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -76,32 +76,36 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) void __iomem * ioremap(phys_addr_t addr, 
unsigned long size) { - return __ioremap_caller(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED, - __builtin_return_address(0)); + unsigned long flags = pgprot_val(pgprot_noncached(PAGE_KERNEL)); + + return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap); void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size) { - return __ioremap_caller(addr, size, _PAGE_NO_CACHE, - __builtin_return_address(0)); + unsigned long flags = pgprot_val(pgprot_noncached_wc(PAGE_KERNEL)); + + return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_wc); void __iomem * ioremap_wt(phys_addr_t addr, unsigned long size) { - return __ioremap_caller(addr, size, _PAGE_WRITETHRU, - __builtin_return_address(0)); + unsigned long flags = pgprot_val(pgprot_cached_wthru(PAGE_KERNEL)); + + return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_wt); void __iomem * ioremap_coherent(phys_addr_t addr, unsigned long size) { - return __ioremap_caller(addr, size, _PAGE_COHERENT, - __builtin_return_address(0)); + unsigned long flags = pgprot_val(pgprot_cached(PAGE_KERNEL)); + + return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_coherent); @@ -134,14 +138,6 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, unsigned long flags, phys_addr_t p; int err; - /* Make sure we have the base flags */ - if ((flags & _PAGE_PRESENT) == 0) - flags |= pgprot_val(PAGE_KERNEL); - - /* Non-cacheable page cannot be coherent */ - if (flags & _PAGE_NO_CACHE) - flags &= ~_PAGE_COHERENT; - /* * Choose an address to map it to. * Once the vmalloc system is running, we use it. diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index c0f356d9b135..1f1bb40555a8 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -118,10 +118,6 @@ void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size, { unsigned long i; - /* Make sure we have the base flags */ - if ((flags & _PAGE_PRESENT) == 0) - flags |= pgprot_val(PAGE_KERNEL); - /* We don't support the 4K PFN hack with ioremap */ if (flags & H_PAGE_4K_PFN) return NULL; @@ -204,7 +200,7 @@ void __iomem * __ioremap(phys_addr_t addr, unsigned long size, void __iomem * ioremap(phys_addr_t addr, unsigned long size) { - unsigned long flags = pgprot_val(pgprot_noncached(__pgprot(0))); + unsigned long flags = pgprot_val(pgprot_noncached(PAGE_KERNEL)); void *caller = __builtin_return_address(0); if (ppc_md.ioremap) @@ -214,7 +210,7 @@ void __iomem * ioremap(phys_addr_t addr, unsigned long size) void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size) { - unsigned long flags = pgprot_val(pgprot_noncached_wc(__pgprot(0))); + unsigned long flags = pgprot_val(pgprot_noncached_wc(PAGE_KERNEL)); void *caller = __builtin_return_address(0); if (ppc_md.ioremap) @@ -224,7 +220,7 @@ void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size) void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size) { - unsigned long flags = pgprot_val(pgprot_cached(__pgprot(0))); + unsigned long flags = pgprot_val(pgprot_cached(PAGE_KERNEL)); void *caller = __builtin_return_address(0); if (ppc_md.ioremap) diff --git a/arch/powerpc/platforms/4xx/ocm.c b/arch/powerpc/platforms/4xx/ocm.c index 69d9f60d9fe5..f5bbd4563342 100644 --- a/arch/powerpc/platforms/4xx/ocm.c +++ b/arch/powerpc/platforms/4xx/ocm.c @@ -113,7 +113,6 @@ static void __init ocm_init_node(int count, struct device_node *node) int 
len; struct resource rsrc; - int ioflags; ocm = ocm_get_node(count); @@ -179,9 +178,8 @@ static void __init ocm_init_node(int count, struct device_node *node) /* ioremap the non-cached region */ if (ocm->nc.memtotal) { - ioflags = _PAGE_NO_CACHE | _PAGE_GUARDED | _PAGE_EXEC; ocm->nc.virt = __ioremap(ocm->nc.phys, ocm->nc.memtotal, - ioflags); + _PAGE_EXEC | PAGE_KERNEL_NCG); if (!ocm->nc.virt) { printk(KERN_ERR @@ -195,9 +193,8 @@ static void __init ocm_init_node(int count, struct device_node *node) /* ioremap the cached region */ if (ocm->c.memtotal) { - ioflags = _PAGE_EXEC; ocm->c.virt = __ioremap(ocm->c.phys, ocm->c.memtotal, - ioflags); + _PAGE_EXEC | PAGE_KERNEL); if (!ocm->c.virt) { printk(KERN_ERR diff --git a/drivers/pcmcia/electra_cf.c b/drivers/pcmcia/electra_cf.c index 9671ded549f0..34d6c1a0971e 100644 --- a/drivers/pcmcia/electra_cf.c +++ b/drivers/pcmcia/electra_cf.c @@ -230,7 +230,7 @@ static int electra_cf_probe(struct platform_device *ofdev) if (!cf->mem_base || !cf->io_virt || !cf->gpio_base || (__ioremap_at(io.start, cf->io_virt, cf->io_size, - pgprot_val(pgprot_noncached(__pgprot(0)))) == NULL)) { + pgprot_val(pgprot_noncached(PAGE_KERNEL))) == NULL)) { dev_err(device, "can't ioremap ranges\n"); status = -ENOMEM; goto fail1; From c766ee72235d09b0080f77474085fc17d6ae2fb1 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 9 Oct 2018 13:51:45 +0000 Subject: [PATCH 136/221] powerpc: handover page flags with a pgprot_t parameter In order to avoid multiple conversions, handover directly a pgprot_t to map_kernel_page() as already done for radix. Do the same for __ioremap_caller() and __ioremap_at(). Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/pgtable.h | 2 +- arch/powerpc/include/asm/book3s/64/hash.h | 3 +- arch/powerpc/include/asm/book3s/64/pgtable.h | 7 ++-- arch/powerpc/include/asm/fixmap.h | 2 +- arch/powerpc/include/asm/io.h | 4 +-- arch/powerpc/include/asm/machdep.h | 2 +- arch/powerpc/include/asm/nohash/32/pgtable.h | 2 +- arch/powerpc/include/asm/nohash/64/pgtable.h | 3 +- arch/powerpc/kernel/io-workarounds.c | 4 +-- arch/powerpc/kernel/isa-bridge.c | 6 ++-- arch/powerpc/kernel/pci_64.c | 2 +- arch/powerpc/lib/code-patching.c | 3 +- arch/powerpc/mm/8xx_mmu.c | 3 +- arch/powerpc/mm/dma-noncoherent.c | 2 +- arch/powerpc/mm/mem.c | 4 +-- arch/powerpc/mm/pgtable-book3e.c | 9 ++--- arch/powerpc/mm/pgtable-hash64.c | 7 ++-- arch/powerpc/mm/pgtable_32.c | 37 +++++++++----------- arch/powerpc/mm/pgtable_64.c | 37 ++++++++++---------- drivers/pcmcia/electra_cf.c | 2 +- 20 files changed, 64 insertions(+), 77 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 751cf931bb3f..7a9f0ed599ff 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -292,7 +292,7 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 3 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 3 }) -int map_kernel_page(unsigned long va, phys_addr_t pa, int flags); +int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); /* Generic accessors to PTE bits */ static inline int pte_write(pte_t pte) { return !!(pte_val(pte) & _PAGE_RW);} diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index fcf8b10a209f..247aff9cc6ba 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ 
b/arch/powerpc/include/asm/book3s/64/hash.h @@ -201,8 +201,7 @@ static inline void hpte_do_hugepage_flush(struct mm_struct *mm, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -extern int hash__map_kernel_page(unsigned long ea, unsigned long pa, - unsigned long flags); +int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot); extern int __meminit hash__vmemmap_create_mapping(unsigned long start, unsigned long page_size, unsigned long phys); diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index c68cbbff3429..eae6e1030523 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1030,17 +1030,16 @@ extern struct page *pgd_page(pgd_t pgd); #define pgd_ERROR(e) \ pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) -static inline int map_kernel_page(unsigned long ea, unsigned long pa, - unsigned long flags) +static inline int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) { if (radix_enabled()) { #if defined(CONFIG_PPC_RADIX_MMU) && defined(DEBUG_VM) unsigned long page_size = 1 << mmu_psize_defs[mmu_io_psize].shift; WARN((page_size != PAGE_SIZE), "I/O page size != PAGE_SIZE"); #endif - return radix__map_kernel_page(ea, pa, __pgprot(flags), PAGE_SIZE); + return radix__map_kernel_page(ea, pa, prot, PAGE_SIZE); } - return hash__map_kernel_page(ea, pa, flags); + return hash__map_kernel_page(ea, pa, prot); } static inline int __meminit vmemmap_create_mapping(unsigned long start, diff --git a/arch/powerpc/include/asm/fixmap.h b/arch/powerpc/include/asm/fixmap.h index 41cc15c14eee..b9fbed84ddca 100644 --- a/arch/powerpc/include/asm/fixmap.h +++ b/arch/powerpc/include/asm/fixmap.h @@ -72,7 +72,7 @@ enum fixed_addresses { static inline void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) { - map_kernel_page(fix_to_virt(idx), phys, pgprot_val(flags)); + map_kernel_page(fix_to_virt(idx), phys, flags); } #endif /* !__ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index cdccab3938db..0a034519957d 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -786,12 +786,12 @@ extern void iounmap(volatile void __iomem *addr); extern void __iomem *__ioremap(phys_addr_t, unsigned long size, unsigned long flags); extern void __iomem *__ioremap_caller(phys_addr_t, unsigned long size, - unsigned long flags, void *caller); + pgprot_t prot, void *caller); extern void __iounmap(volatile void __iomem *addr); extern void __iomem * __ioremap_at(phys_addr_t pa, void *ea, - unsigned long size, unsigned long flags); + unsigned long size, pgprot_t prot); extern void __iounmap_at(void *ea, unsigned long size); /* diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index b4831f1338db..8311869005fa 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -35,7 +35,7 @@ struct machdep_calls { char *name; #ifdef CONFIG_PPC64 void __iomem * (*ioremap)(phys_addr_t addr, unsigned long size, - unsigned long flags, void *caller); + pgprot_t prot, void *caller); void (*iounmap)(volatile void __iomem *token); #ifdef CONFIG_PM diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index a507a65b0866..a7f44498ab6f 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -323,7 +323,7 @@ static inline int pte_young(pte_t pte) 
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 3 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 3 }) -int map_kernel_page(unsigned long va, phys_addr_t pa, int flags); +int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); #endif /* !__ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 7cd6809f4d33..513b6e9e62c6 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -327,8 +327,7 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) #define __swp_entry_to_pte(x) __pte((x).val) -extern int map_kernel_page(unsigned long ea, unsigned long pa, - unsigned long flags); +int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot); extern int __meminit vmemmap_create_mapping(unsigned long start, unsigned long page_size, unsigned long phys); diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c index aa9f1b8261db..7e89d02a84e1 100644 --- a/arch/powerpc/kernel/io-workarounds.c +++ b/arch/powerpc/kernel/io-workarounds.c @@ -153,10 +153,10 @@ static const struct ppc_pci_io iowa_pci_io = { #ifdef CONFIG_PPC_INDIRECT_MMIO static void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size, - unsigned long flags, void *caller) + pgprot_t prot, void *caller) { struct iowa_bus *bus; - void __iomem *res = __ioremap_caller(addr, size, flags, caller); + void __iomem *res = __ioremap_caller(addr, size, prot, caller); int busno; bus = iowa_pci_find(0, (unsigned long)addr); diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c index 072e384f8c86..fda3ae48480c 100644 --- a/arch/powerpc/kernel/isa-bridge.c +++ b/arch/powerpc/kernel/isa-bridge.c @@ -110,14 +110,14 @@ static void pci_process_ISA_OF_ranges(struct device_node *isa_node, size = 0x10000; __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE, - size, pgprot_val(pgprot_noncached(PAGE_KERNEL))); + size, pgprot_noncached(PAGE_KERNEL)); return; inval_range: printk(KERN_ERR "no ISA IO ranges or unexpected isa range, " "mapping 64k\n"); __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE, - 0x10000, pgprot_val(pgprot_noncached(PAGE_KERNEL))); + 0x10000, pgprot_noncached(PAGE_KERNEL)); } @@ -253,7 +253,7 @@ void __init isa_bridge_init_non_pci(struct device_node *np) */ isa_io_base = ISA_IO_BASE; __ioremap_at(pbase, (void *)ISA_IO_BASE, - size, pgprot_val(pgprot_noncached(PAGE_KERNEL))); + size, pgprot_noncached(PAGE_KERNEL)); pr_debug("ISA: Non-PCI bridge is %pOF\n", np); } diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index 64bb4dd2b8f1..9d8c10d55407 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -159,7 +159,7 @@ static int pcibios_map_phb_io_space(struct pci_controller *hose) /* Establish the mapping */ if (__ioremap_at(phys_page, area->addr, size_page, - pgprot_val(pgprot_noncached(PAGE_KERNEL))) == NULL) + pgprot_noncached(PAGE_KERNEL)) == NULL) return -ENOMEM; /* Fixup hose IO resource */ diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 5ffee298745f..89502cbccb1b 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -98,8 +98,7 @@ static int map_patch_area(void *addr, unsigned long text_poke_addr) else pfn = __pa_symbol(addr) >> PAGE_SHIFT; - err = map_kernel_page(text_poke_addr, (pfn << PAGE_SHIFT), 
- pgprot_val(PAGE_KERNEL)); + err = map_kernel_page(text_poke_addr, (pfn << PAGE_SHIFT), PAGE_KERNEL); pr_devel("Mapped addr %lx with pfn %lx:%d\n", text_poke_addr, pfn, err); if (err) diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c index cf77d755246d..9137361d687d 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/8xx_mmu.c @@ -91,11 +91,10 @@ static void __init mmu_mapin_immr(void) { unsigned long p = PHYS_IMMR_BASE; unsigned long v = VIRT_IMMR_BASE; - unsigned long f = pgprot_val(PAGE_KERNEL_NCG); int offset; for (offset = 0; offset < IMMR_SIZE; offset += PAGE_SIZE) - map_kernel_page(v + offset, p + offset, f); + map_kernel_page(v + offset, p + offset, PAGE_KERNEL_NCG); } /* Address of instructions to patch */ diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c index 382528475433..b6e7b5952ab5 100644 --- a/arch/powerpc/mm/dma-noncoherent.c +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -228,7 +228,7 @@ __dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, gfp_t do { SetPageReserved(page); map_kernel_page(vaddr, page_to_phys(page), - pgprot_val(pgprot_noncached(PAGE_KERNEL))); + pgprot_noncached(PAGE_KERNEL)); page++; vaddr += PAGE_SIZE; } while (size -= PAGE_SIZE); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 04ccb274a620..cb421aeb7674 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -309,11 +309,11 @@ void __init paging_init(void) unsigned long end = __fix_to_virt(FIX_HOLE); for (; v < end; v += PAGE_SIZE) - map_kernel_page(v, 0, 0); /* XXX gross */ + map_kernel_page(v, 0, __pgprot(0)); /* XXX gross */ #endif #ifdef CONFIG_HIGHMEM - map_kernel_page(PKMAP_BASE, 0, 0); /* XXX gross */ + map_kernel_page(PKMAP_BASE, 0, __pgprot(0)); /* XXX gross */ pkmap_page_table = virt_to_kpte(PKMAP_BASE); kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); diff --git a/arch/powerpc/mm/pgtable-book3e.c b/arch/powerpc/mm/pgtable-book3e.c index a2298930f990..e0ccf36714b2 100644 --- a/arch/powerpc/mm/pgtable-book3e.c +++ b/arch/powerpc/mm/pgtable-book3e.c @@ -42,7 +42,7 @@ int __meminit vmemmap_create_mapping(unsigned long start, * thus must have the low bits clear */ for (i = 0; i < page_size; i += PAGE_SIZE) - BUG_ON(map_kernel_page(start + i, phys, flags)); + BUG_ON(map_kernel_page(start + i, phys, __pgprot(flags))); return 0; } @@ -70,7 +70,7 @@ static __ref void *early_alloc_pgtable(unsigned long size) * map_kernel_page adds an entry to the ioremap page table * and adds an entry to the HPT, possibly bolting it */ -int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) +int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) { pgd_t *pgdp; pud_t *pudp; @@ -89,8 +89,6 @@ int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) ptep = pte_alloc_kernel(pmdp, ea); if (!ptep) return -ENOMEM; - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, - __pgprot(flags))); } else { pgdp = pgd_offset_k(ea); #ifndef __PAGETABLE_PUD_FOLDED @@ -113,9 +111,8 @@ int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) pmd_populate_kernel(&init_mm, pmdp, ptep); } ptep = pte_offset_kernel(pmdp, ea); - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, - __pgprot(flags))); } + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot)); smp_wmb(); return 0; diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index 692bfc9e372c..c08d49046a96 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ 
b/arch/powerpc/mm/pgtable-hash64.c @@ -142,7 +142,7 @@ void hash__vmemmap_remove_mapping(unsigned long start, * map_kernel_page adds an entry to the ioremap page table * and adds an entry to the HPT, possibly bolting it */ -int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) +int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) { pgd_t *pgdp; pud_t *pudp; @@ -161,8 +161,7 @@ int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flag ptep = pte_alloc_kernel(pmdp, ea); if (!ptep) return -ENOMEM; - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, - __pgprot(flags))); + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot)); } else { /* * If the mm subsystem is not fully up, we cannot create a @@ -170,7 +169,7 @@ int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flag * entry in the hardware page table. * */ - if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags, + if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, pgprot_val(prot), mmu_io_psize, mmu_kernel_ssize)) { printk(KERN_ERR "Failed to do bolted mapping IO " "memory at %016lx !\n", pa); diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 6a81a2446c47..0bbc7b7d8a05 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -76,36 +76,36 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) void __iomem * ioremap(phys_addr_t addr, unsigned long size) { - unsigned long flags = pgprot_val(pgprot_noncached(PAGE_KERNEL)); + pgprot_t prot = pgprot_noncached(PAGE_KERNEL); - return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); + return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap); void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size) { - unsigned long flags = pgprot_val(pgprot_noncached_wc(PAGE_KERNEL)); + pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL); - return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); + return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_wc); void __iomem * ioremap_wt(phys_addr_t addr, unsigned long size) { - unsigned long flags = pgprot_val(pgprot_cached_wthru(PAGE_KERNEL)); + pgprot_t prot = pgprot_cached_wthru(PAGE_KERNEL); - return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); + return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_wt); void __iomem * ioremap_coherent(phys_addr_t addr, unsigned long size) { - unsigned long flags = pgprot_val(pgprot_cached(PAGE_KERNEL)); + pgprot_t prot = pgprot_cached(PAGE_KERNEL); - return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); + return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_coherent); @@ -120,19 +120,18 @@ ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) flags &= ~(_PAGE_USER | _PAGE_EXEC); flags |= _PAGE_PRIVILEGED; - return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); + return __ioremap_caller(addr, size, __pgprot(flags), __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_prot); void __iomem * __ioremap(phys_addr_t addr, unsigned long size, unsigned long flags) { - return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); + return __ioremap_caller(addr, size, __pgprot(flags), __builtin_return_address(0)); } void __iomem * -__ioremap_caller(phys_addr_t addr, unsigned 
long size, unsigned long flags, - void *caller) +__ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller) { unsigned long v, i; phys_addr_t p; @@ -195,7 +194,7 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, unsigned long flags, err = 0; for (i = 0; i < size && err == 0; i += PAGE_SIZE) - err = map_kernel_page(v+i, p+i, flags); + err = map_kernel_page(v + i, p + i, prot); if (err) { if (slab_is_available()) vunmap((void *)v); @@ -221,7 +220,7 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap); -int map_kernel_page(unsigned long va, phys_addr_t pa, int flags) +int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot) { pmd_t *pd; pte_t *pg; @@ -237,9 +236,8 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, int flags) * hash table */ BUG_ON((pte_val(*pg) & (_PAGE_PRESENT | _PAGE_HASHPTE)) && - flags); - set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT, - __pgprot(flags))); + pgprot_val(prot)); + set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT, prot)); } smp_wmb(); return err; @@ -250,7 +248,7 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, int flags) */ static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) { - unsigned long v, s, f; + unsigned long v, s; phys_addr_t p; int ktext; @@ -260,8 +258,7 @@ static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) for (; s < top; s += PAGE_SIZE) { ktext = ((char *)v >= _stext && (char *)v < etext) || ((char *)v >= _sinittext && (char *)v < _einittext); - f = ktext ? pgprot_val(PAGE_KERNEL_TEXT) : pgprot_val(PAGE_KERNEL); - map_kernel_page(v, p, f); + map_kernel_page(v, p, ktext ? PAGE_KERNEL_TEXT : PAGE_KERNEL); #ifdef CONFIG_PPC_STD_MMU_32 if (ktext) hash_preload(&init_mm, v, 0, 0x300); diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 1f1bb40555a8..b0f4a4b4f62b 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -113,13 +113,12 @@ unsigned long ioremap_bot = IOREMAP_BASE; * __ioremap_at - Low level function to establish the page tables * for an IO mapping */ -void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size, - unsigned long flags) +void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_t prot) { unsigned long i; /* We don't support the 4K PFN hack with ioremap */ - if (flags & H_PAGE_4K_PFN) + if (pgprot_val(prot) & H_PAGE_4K_PFN) return NULL; WARN_ON(pa & ~PAGE_MASK); @@ -127,7 +126,7 @@ void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size, WARN_ON(size & ~PAGE_MASK); for (i = 0; i < size; i += PAGE_SIZE) - if (map_kernel_page((unsigned long)ea+i, pa+i, flags)) + if (map_kernel_page((unsigned long)ea + i, pa + i, prot)) return NULL; return (void __iomem *)ea; @@ -148,7 +147,7 @@ void __iounmap_at(void *ea, unsigned long size) } void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size, - unsigned long flags, void *caller) + pgprot_t prot, void *caller) { phys_addr_t paligned; void __iomem *ret; @@ -178,11 +177,11 @@ void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size, return NULL; area->phys_addr = paligned; - ret = __ioremap_at(paligned, area->addr, size, flags); + ret = __ioremap_at(paligned, area->addr, size, prot); if (!ret) vunmap(area->addr); } else { - ret = __ioremap_at(paligned, (void *)ioremap_bot, size, flags); + ret = __ioremap_at(paligned, (void *)ioremap_bot, size, prot); if (ret) ioremap_bot += size; } @@ -195,37 +194,37 @@ void __iomem * 
__ioremap_caller(phys_addr_t addr, unsigned long size, void __iomem * __ioremap(phys_addr_t addr, unsigned long size, unsigned long flags) { - return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); + return __ioremap_caller(addr, size, __pgprot(flags), __builtin_return_address(0)); } void __iomem * ioremap(phys_addr_t addr, unsigned long size) { - unsigned long flags = pgprot_val(pgprot_noncached(PAGE_KERNEL)); + pgprot_t prot = pgprot_noncached(PAGE_KERNEL); void *caller = __builtin_return_address(0); if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, flags, caller); - return __ioremap_caller(addr, size, flags, caller); + return ppc_md.ioremap(addr, size, prot, caller); + return __ioremap_caller(addr, size, prot, caller); } void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size) { - unsigned long flags = pgprot_val(pgprot_noncached_wc(PAGE_KERNEL)); + pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL); void *caller = __builtin_return_address(0); if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, flags, caller); - return __ioremap_caller(addr, size, flags, caller); + return ppc_md.ioremap(addr, size, prot, caller); + return __ioremap_caller(addr, size, prot, caller); } void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size) { - unsigned long flags = pgprot_val(pgprot_cached(PAGE_KERNEL)); + pgprot_t prot = pgprot_cached(PAGE_KERNEL); void *caller = __builtin_return_address(0); if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, flags, caller); - return __ioremap_caller(addr, size, flags, caller); + return ppc_md.ioremap(addr, size, prot, caller); + return __ioremap_caller(addr, size, prot, caller); } void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, @@ -246,8 +245,8 @@ void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, flags |= _PAGE_PRIVILEGED; if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, flags, caller); - return __ioremap_caller(addr, size, flags, caller); + return ppc_md.ioremap(addr, size, __pgprot(flags), caller); + return __ioremap_caller(addr, size, __pgprot(flags), caller); } diff --git a/drivers/pcmcia/electra_cf.c b/drivers/pcmcia/electra_cf.c index 34d6c1a0971e..b31abe35ed2c 100644 --- a/drivers/pcmcia/electra_cf.c +++ b/drivers/pcmcia/electra_cf.c @@ -230,7 +230,7 @@ static int electra_cf_probe(struct platform_device *ofdev) if (!cf->mem_base || !cf->io_virt || !cf->gpio_base || (__ioremap_at(io.start, cf->io_virt, cf->io_size, - pgprot_val(pgprot_noncached(PAGE_KERNEL))) == NULL)) { + pgprot_noncached(PAGE_KERNEL)) == NULL)) { dev_err(device, "can't ioremap ranges\n"); status = -ENOMEM; goto fail1; From d81e6f8b7c6600c891cd133312061d23d4e6690c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 9 Oct 2018 13:51:47 +0000 Subject: [PATCH 137/221] powerpc/mm: don't use _PAGE_EXEC in book3s/32 book3s/32 doesn't define _PAGE_EXEC, so no need to use it. All other platforms define _PAGE_EXEC so no need to check it is not NUL when not book3s/32. 
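As a quick illustration of why the _PAGE_EXEC term can simply disappear from the book3s/32 mask (this snippet is not from the patch; the bit values are invented for the example), OR-ing a flag whose value is 0 into a mask changes nothing:

#include <assert.h>
#include <stdint.h>

/* Invented flag values modelled loosely on book3s/32, where there is no
 * separate hardware execute bit, so the exec flag is effectively 0. */
#define MODEL_PAGE_DIRTY     0x080
#define MODEL_PAGE_ACCESSED  0x100
#define MODEL_PAGE_RW        0x400
#define MODEL_PAGE_EXEC      0x000   /* not implemented on this MMU */

int main(void)
{
        uint32_t entry = MODEL_PAGE_DIRTY | MODEL_PAGE_RW;
        uint32_t with_exec    = entry & (MODEL_PAGE_DIRTY | MODEL_PAGE_ACCESSED |
                                         MODEL_PAGE_RW | MODEL_PAGE_EXEC);
        uint32_t without_exec = entry & (MODEL_PAGE_DIRTY | MODEL_PAGE_ACCESSED |
                                         MODEL_PAGE_RW);

        /* With the exec flag defined as 0, the two masks are identical. */
        assert(with_exec == without_exec);
        return 0;
}

The same reasoning is why the preprocessor guard in the diff that follows can shrink from "defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0" to a plain CONFIG_PPC_BOOK3S test.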
Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/pgtable.h | 2 +- arch/powerpc/mm/pgtable.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 7a9f0ed599ff..3127cc529aa1 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -234,7 +234,7 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, int psize) { unsigned long set = pte_val(entry) & - (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC); + (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW); unsigned long clr = ~pte_val(entry) & _PAGE_RO; pte_update(ptep, clr, set); diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index aee04b209b51..f97d9c3760e3 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -73,7 +73,7 @@ static struct page *maybe_pte_to_page(pte_t pte) return page; } -#if defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 +#ifdef CONFIG_PPC_BOOK3S /* Server-style MMU handles coherency when hashing if HW exec permission * is supposed per page (currently 64-bit only). If not, then, we always @@ -106,7 +106,7 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, return pte; } -#else /* defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 */ +#else /* CONFIG_PPC_BOOK3S */ /* Embedded type MMU with HW exec support. This is a bit more complicated * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so @@ -179,7 +179,7 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, return __pte(pte_val(pte) | _PAGE_EXEC); } -#endif /* !(defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0) */ +#endif /* CONFIG_PPC_BOOK3S */ /* * set_pte stores a linux PTE into the linux page table. 
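The next patch moves pte_mkwrite(), pte_mkdirty(), pte_mkyoung() and pte_wrprotect() into the per-width nohash headers because the 32-bit variants have to juggle the _PAGE_RO and _PAGE_HWWRITE bits in addition to _PAGE_RW, while the 64-bit variants toggle a single bit. A rough user-space model of that difference (illustrative only; the bit values here are invented, not the platform definitions):

#include <stdint.h>
#include <stdio.h>

typedef uint32_t pte_model_t;

/* Invented bit values for illustration; the real ones are per-platform. */
#define M_PAGE_RO       0x001
#define M_PAGE_RW       0x002
#define M_PAGE_HWWRITE  0x004

/* Mirrors the shape of the 32-bit nohash helpers: two bits to manage. */
static pte_model_t model32_wrprotect(pte_model_t pte)
{
        return (pte & ~(M_PAGE_RW | M_PAGE_HWWRITE)) | M_PAGE_RO;
}

static pte_model_t model32_mkwrite(pte_model_t pte)
{
        return (pte & ~M_PAGE_RO) | M_PAGE_RW;
}

/* Mirrors the shape of the 64-bit nohash helper: one bit to clear. */
static pte_model_t model64_wrprotect(pte_model_t pte)
{
        return pte & ~M_PAGE_RW;
}

int main(void)
{
        pte_model_t pte = M_PAGE_RW | M_PAGE_HWWRITE;

        printf("32-bit wrprotect: %#x\n", model32_wrprotect(pte)); /* RO set, RW/HWWRITE clear */
        printf("32-bit mkwrite:   %#x\n", model32_mkwrite(M_PAGE_RO));
        printf("64-bit wrprotect: %#x\n", model64_wrprotect(M_PAGE_RW));
        return 0;
}

Keeping each flavour next to the bits it manipulates is what lets later patches call these helpers from generic code without sprinkling _PAGE_XXX around.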
From aa9cd505e39d63034243c9c1ea8b0a984a6b201f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 9 Oct 2018 13:51:50 +0000 Subject: [PATCH 138/221] powerpc/mm: move some nohash pte helpers in nohash/[32:64]/pgtable.h In order to allow their use in nohash/32/pgtable.h, we have to move the following helpers in nohash/[32:64]/pgtable.h: - pte_mkwrite() - pte_mkdirty() - pte_mkyoung() - pte_wrprotect() Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/32/pgtable.h | 28 ++++++++++++++++++++ arch/powerpc/include/asm/nohash/64/pgtable.h | 20 ++++++++++++++ arch/powerpc/include/asm/nohash/pgtable.h | 28 -------------------- 3 files changed, 48 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index a7f44498ab6f..4373f8c44b6d 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -136,6 +136,34 @@ extern int icache_44x_need_flush; #define pte_clear(mm, addr, ptep) \ do { pte_update(ptep, ~0, 0); } while (0) +static inline pte_t pte_mkwrite(pte_t pte) +{ + pte_basic_t ptev; + + ptev = pte_val(pte) & ~_PAGE_RO; + ptev |= _PAGE_RW; + return __pte(ptev); +} + +static inline pte_t pte_mkdirty(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_DIRTY); +} + +static inline pte_t pte_mkyoung(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_ACCESSED); +} + +static inline pte_t pte_wrprotect(pte_t pte) +{ + pte_basic_t ptev; + + ptev = pte_val(pte) & ~(_PAGE_RW | _PAGE_HWWRITE); + ptev |= _PAGE_RO; + return __pte(ptev); +} + #define pmd_none(pmd) (!pmd_val(pmd)) #define pmd_bad(pmd) (pmd_val(pmd) & _PMD_BAD) #define pmd_present(pmd) (pmd_val(pmd) & _PMD_PRESENT_MASK) diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 513b6e9e62c6..72dac522aa66 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -94,6 +94,26 @@ #ifndef __ASSEMBLY__ /* pte_clear moved to later in this file */ +static inline pte_t pte_mkwrite(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_RW); +} + +static inline pte_t pte_mkdirty(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_DIRTY); +} + +static inline pte_t pte_mkyoung(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_ACCESSED); +} + +static inline pte_t pte_wrprotect(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_RW); +} + #define PMD_BAD_BITS (PTE_TABLE_SIZE-1) #define PUD_BAD_BITS (PMD_TABLE_SIZE-1) diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 5b82e44c4231..c746e9e784cd 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -77,15 +77,6 @@ static inline unsigned long pte_pfn(pte_t pte) { return pte_val(pte) >> PTE_RPN_SHIFT; } /* Generic modifiers for PTE bits */ -static inline pte_t pte_wrprotect(pte_t pte) -{ - pte_basic_t ptev; - - ptev = pte_val(pte) & ~(_PAGE_RW | _PAGE_HWWRITE); - ptev |= _PAGE_RO; - return __pte(ptev); -} - static inline pte_t pte_mkclean(pte_t pte) { return __pte(pte_val(pte) & ~(_PAGE_DIRTY | _PAGE_HWWRITE)); @@ -96,25 +87,6 @@ static inline pte_t pte_mkold(pte_t pte) return __pte(pte_val(pte) & ~_PAGE_ACCESSED); } -static inline pte_t pte_mkwrite(pte_t pte) -{ - pte_basic_t ptev; - - ptev = pte_val(pte) & ~_PAGE_RO; - ptev |= _PAGE_RW; - return __pte(ptev); -} - -static inline pte_t pte_mkdirty(pte_t 
pte) -{ - return __pte(pte_val(pte) | _PAGE_DIRTY); -} - -static inline pte_t pte_mkyoung(pte_t pte) -{ - return __pte(pte_val(pte) | _PAGE_ACCESSED); -} - static inline pte_t pte_mkspecial(pte_t pte) { return __pte(pte_val(pte) | _PAGE_SPECIAL); From daba790242dfbdf6ef1bcabf3d6ed4c88cccf59a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 9 Oct 2018 13:51:52 +0000 Subject: [PATCH 139/221] powerpc/mm: add pte helpers to query and change pte flags In order to avoid using generic _PAGE_XXX flags in powerpc core functions, define helpers for all needed flags: - pte_mkuser() and pte_mkprivileged() to set/unset and/or unset/set _PAGE_USER and/or _PAGE_PRIVILEGED - pte_hashpte() to check if _PAGE_HASHPTE is set. - pte_ci() check if cache is inhibited (already existing on book3s/64) - pte_exprotect() to protect against execution - pte_exec() and pte_mkexec() to query and set page execution - pte_mkpte() to set _PAGE_PTE flag. - pte_hw_valid() to check _PAGE_PRESENT since pte_present does something different on book3s/64. On book3s/32 there is no exec protection, so pte_mkexec() and pte_exprotect() are nops and pte_exec() returns always true. Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/pgtable.h | 41 ++++++++++++++++++++ arch/powerpc/include/asm/book3s/64/pgtable.h | 35 +++++++++++++++++ arch/powerpc/include/asm/nohash/32/pgtable.h | 5 +++ arch/powerpc/include/asm/nohash/64/pgtable.h | 5 +++ arch/powerpc/include/asm/nohash/pgtable.h | 28 +++++++++++++ 5 files changed, 114 insertions(+) diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 3127cc529aa1..a6ca799e0eb5 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -301,6 +301,7 @@ static inline int pte_dirty(pte_t pte) { return !!(pte_val(pte) & _PAGE_DIRTY); static inline int pte_young(pte_t pte) { return !!(pte_val(pte) & _PAGE_ACCESSED); } static inline int pte_special(pte_t pte) { return !!(pte_val(pte) & _PAGE_SPECIAL); } static inline int pte_none(pte_t pte) { return (pte_val(pte) & ~_PTE_NONE_MASK) == 0; } +static inline bool pte_exec(pte_t pte) { return true; } static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); } static inline int pte_present(pte_t pte) @@ -308,6 +309,21 @@ static inline int pte_present(pte_t pte) return pte_val(pte) & _PAGE_PRESENT; } +static inline bool pte_hw_valid(pte_t pte) +{ + return pte_val(pte) & _PAGE_PRESENT; +} + +static inline bool pte_hashpte(pte_t pte) +{ + return !!(pte_val(pte) & _PAGE_HASHPTE); +} + +static inline bool pte_ci(pte_t pte) +{ + return !!(pte_val(pte) & _PAGE_NO_CACHE); +} + /* * We only find page table entry in the last level * Hence no need for other accessors @@ -354,6 +370,11 @@ static inline pte_t pte_wrprotect(pte_t pte) return __pte(pte_val(pte) & ~_PAGE_RW); } +static inline pte_t pte_exprotect(pte_t pte) +{ + return pte; +} + static inline pte_t pte_mkclean(pte_t pte) { return __pte(pte_val(pte) & ~_PAGE_DIRTY); @@ -364,6 +385,16 @@ static inline pte_t pte_mkold(pte_t pte) return __pte(pte_val(pte) & ~_PAGE_ACCESSED); } +static inline pte_t pte_mkexec(pte_t pte) +{ + return pte; +} + +static inline pte_t pte_mkpte(pte_t pte) +{ + return pte; +} + static inline pte_t pte_mkwrite(pte_t pte) { return __pte(pte_val(pte) | _PAGE_RW); @@ -389,6 +420,16 @@ static inline pte_t pte_mkhuge(pte_t pte) return pte; } +static inline pte_t 
pte_mkprivileged(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_USER); +} + +static inline pte_t pte_mkuser(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_USER); +} + static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index eae6e1030523..28a15c3450ff 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -519,6 +519,11 @@ static inline int pte_special(pte_t pte) return !!(pte_raw(pte) & cpu_to_be64(_PAGE_SPECIAL)); } +static inline bool pte_exec(pte_t pte) +{ + return !!(pte_raw(pte) & cpu_to_be64(_PAGE_EXEC)); +} + static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); } #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY @@ -587,6 +592,11 @@ static inline int pte_present(pte_t pte) return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID)); } +static inline bool pte_hw_valid(pte_t pte) +{ + return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT)); +} + #ifdef CONFIG_PPC_MEM_KEYS extern bool arch_pte_access_permitted(u64 pte, bool write, bool execute); #else @@ -646,6 +656,11 @@ static inline pte_t pte_wrprotect(pte_t pte) return __pte(pte_val(pte) & ~_PAGE_WRITE); } +static inline pte_t pte_exprotect(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_EXEC); +} + static inline pte_t pte_mkclean(pte_t pte) { return __pte(pte_val(pte) & ~_PAGE_DIRTY); @@ -656,6 +671,16 @@ static inline pte_t pte_mkold(pte_t pte) return __pte(pte_val(pte) & ~_PAGE_ACCESSED); } +static inline pte_t pte_mkexec(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_EXEC); +} + +static inline pte_t pte_mkpte(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_PTE); +} + static inline pte_t pte_mkwrite(pte_t pte) { /* @@ -689,6 +714,16 @@ static inline pte_t pte_mkdevmap(pte_t pte) return __pte(pte_val(pte) | _PAGE_SPECIAL|_PAGE_DEVMAP); } +static inline pte_t pte_mkprivileged(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_PRIVILEGED); +} + +static inline pte_t pte_mkuser(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_PRIVILEGED); +} + /* * This is potentially called with a pmd as the argument, in which case it's not * safe to check _PAGE_DEVMAP unless we also confirm that _PAGE_PTE is set. 
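Before the nohash side of this patch (continued just below), it is worth spelling out what the new pte_mkuser()/pte_mkprivileged() helpers buy: generic code no longer needs to know whether a platform expresses "kernel only" by setting _PAGE_PRIVILEGED (book3s/64) or by clearing _PAGE_USER (the other platforms). A minimal user-space sketch of the two conventions, with invented bit values:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Illustrative only: two ways a platform can encode "kernel only". */
#define B64_PAGE_PRIVILEGED  0x8     /* book3s/64 style: privilege bit set   */
#define NOH_PAGE_USER        0x4     /* nohash/book3s32 style: user bit set  */

static uint64_t b64_mkprivileged(uint64_t pte) { return pte | B64_PAGE_PRIVILEGED; }
static uint64_t noh_mkprivileged(uint64_t pte) { return pte & ~NOH_PAGE_USER; }

static bool b64_pte_user(uint64_t pte) { return !(pte & B64_PAGE_PRIVILEGED); }
static bool noh_pte_user(uint64_t pte) { return pte & NOH_PAGE_USER; }

int main(void)
{
        /* Whichever encoding is used, a pte run through mkprivileged()
         * no longer reads back as a user pte. */
        assert(!b64_pte_user(b64_mkprivileged(0)));
        assert(!noh_pte_user(noh_mkprivileged(NOH_PAGE_USER)));
        return 0;
}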
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index 4373f8c44b6d..6fecfd7854f5 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -164,6 +164,11 @@ static inline pte_t pte_wrprotect(pte_t pte) return __pte(ptev); } +static inline pte_t pte_mkexec(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_EXEC); +} + #define pmd_none(pmd) (!pmd_val(pmd)) #define pmd_bad(pmd) (pmd_val(pmd) & _PMD_BAD) #define pmd_present(pmd) (pmd_val(pmd) & _PMD_PRESENT_MASK) diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 72dac522aa66..b7d65d4b61be 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -114,6 +114,11 @@ static inline pte_t pte_wrprotect(pte_t pte) return __pte(pte_val(pte) & ~_PAGE_RW); } +static inline pte_t pte_mkexec(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_EXEC); +} + #define PMD_BAD_BITS (PTE_TABLE_SIZE-1) #define PUD_BAD_BITS (PMD_TABLE_SIZE-1) diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index c746e9e784cd..b256e38a047c 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -19,6 +19,9 @@ static inline int pte_read(pte_t pte) { return 1; } static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } static inline int pte_special(pte_t pte) { return pte_val(pte) & _PAGE_SPECIAL; } static inline int pte_none(pte_t pte) { return (pte_val(pte) & ~_PTE_NONE_MASK) == 0; } +static inline bool pte_hashpte(pte_t pte) { return false; } +static inline bool pte_ci(pte_t pte) { return pte_val(pte) & _PAGE_NO_CACHE; } +static inline bool pte_exec(pte_t pte) { return pte_val(pte) & _PAGE_EXEC; } static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); } #ifdef CONFIG_NUMA_BALANCING @@ -44,6 +47,11 @@ static inline int pte_present(pte_t pte) return pte_val(pte) & _PAGE_PRESENT; } +static inline bool pte_hw_valid(pte_t pte) +{ + return pte_val(pte) & _PAGE_PRESENT; +} + /* * We only find page table entry in the last level * Hence no need for other accessors @@ -77,6 +85,11 @@ static inline unsigned long pte_pfn(pte_t pte) { return pte_val(pte) >> PTE_RPN_SHIFT; } /* Generic modifiers for PTE bits */ +static inline pte_t pte_exprotect(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_EXEC); +} + static inline pte_t pte_mkclean(pte_t pte) { return __pte(pte_val(pte) & ~(_PAGE_DIRTY | _PAGE_HWWRITE)); @@ -87,6 +100,11 @@ static inline pte_t pte_mkold(pte_t pte) return __pte(pte_val(pte) & ~_PAGE_ACCESSED); } +static inline pte_t pte_mkpte(pte_t pte) +{ + return pte; +} + static inline pte_t pte_mkspecial(pte_t pte) { return __pte(pte_val(pte) | _PAGE_SPECIAL); @@ -97,6 +115,16 @@ static inline pte_t pte_mkhuge(pte_t pte) return __pte(pte_val(pte) | _PAGE_HUGE); } +static inline pte_t pte_mkprivileged(pte_t pte) +{ + return __pte((pte_val(pte) & ~_PAGE_USER) | _PAGE_PRIVILEGED); +} + +static inline pte_t pte_mkuser(pte_t pte) +{ + return __pte((pte_val(pte) & ~_PAGE_PRIVILEGED) | _PAGE_USER); +} + static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); From 34eb138ed74dc95285478903148a53bd034829be Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 9 Oct 2018 13:51:54 +0000 Subject: [PATCH 140/221] powerpc/mm: don't use 
_PAGE_EXEC for calling hash_preload() The 'access' parameter of hash_preload() is either 0 or _PAGE_EXEC. Among the two versions of hash_preload(), only the PPC64 one is doing something with this 'access' parameter. In order to remove the use of _PAGE_EXEC outside platform code, 'access' parameter is replaced by 'is_exec' which will be either true of false, and the PPC64 version of hash_preload() creates the access flag based on 'is_exec'. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hash_utils_64.c | 3 ++- arch/powerpc/mm/mem.c | 9 +++++---- arch/powerpc/mm/mmu_decl.h | 2 +- arch/powerpc/mm/pgtable_32.c | 2 +- arch/powerpc/mm/ppc_mmu_32.c | 2 +- 5 files changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 8ff03c7205a0..854edc3722e0 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1482,7 +1482,7 @@ static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) #endif void hash_preload(struct mm_struct *mm, unsigned long ea, - unsigned long access, unsigned long trap) + bool is_exec, unsigned long trap) { int hugepage_shift; unsigned long vsid; @@ -1490,6 +1490,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, pte_t *ptep; unsigned long flags; int rc, ssize, update_flags = 0; + unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? _PAGE_EXEC : 0); BUG_ON(REGION_ID(ea) != USER_REGION_ID); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index cb421aeb7674..dd949d6649a2 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -509,7 +509,8 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, * We don't need to worry about _PAGE_PRESENT here because we are * called with either mm->page_table_lock held or ptl lock held */ - unsigned long access, trap; + unsigned long trap; + bool is_exec; if (radix_enabled()) { prefetch((void *)address); @@ -531,16 +532,16 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL; switch (trap) { case 0x300: - access = 0UL; + is_exec = false; break; case 0x400: - access = _PAGE_EXEC; + is_exec = true; break; default: return; } - hash_preload(vma->vm_mm, address, access, trap); + hash_preload(vma->vm_mm, address, is_exec, trap); #endif /* CONFIG_PPC_STD_MMU */ #if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \ && defined(CONFIG_HUGETLB_PAGE) diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index e5d779eed181..dd7f9b951d25 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -82,7 +82,7 @@ static inline void _tlbivax_bcast(unsigned long address, unsigned int pid, #else /* CONFIG_PPC_MMU_NOHASH */ extern void hash_preload(struct mm_struct *mm, unsigned long ea, - unsigned long access, unsigned long trap); + bool is_exec, unsigned long trap); extern void _tlbie(unsigned long address); diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 0bbc7b7d8a05..01f348938328 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -261,7 +261,7 @@ static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) map_kernel_page(v, p, ktext ? 
PAGE_KERNEL_TEXT : PAGE_KERNEL); #ifdef CONFIG_PPC_STD_MMU_32 if (ktext) - hash_preload(&init_mm, v, 0, 0x300); + hash_preload(&init_mm, v, false, 0x300); #endif v += PAGE_SIZE; p += PAGE_SIZE; diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c index bea6c544e38f..38a793bfca37 100644 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ b/arch/powerpc/mm/ppc_mmu_32.c @@ -163,7 +163,7 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys, * Preload a translation in the hash table */ void hash_preload(struct mm_struct *mm, unsigned long ea, - unsigned long access, unsigned long trap) + bool is_exec, unsigned long trap) { pmd_t *pmd; From 26973fa5ac0e3b88d0d476caccfc10839b26098b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 9 Oct 2018 13:51:56 +0000 Subject: [PATCH 141/221] powerpc/mm: use pte helpers in generic code Get rid of platform specific _PAGE_XXXX in powerpc common code and use helpers instead. mm/dump_linuxpagetables.c will be handled separately Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/pgtable.h | 9 +++------ arch/powerpc/include/asm/nohash/32/pgtable.h | 12 +++++++---- arch/powerpc/include/asm/nohash/pgtable.h | 3 +-- arch/powerpc/mm/pgtable.c | 21 +++++++------------- arch/powerpc/mm/pgtable_32.c | 15 +++++++------- arch/powerpc/mm/pgtable_64.c | 14 ++++++------- arch/powerpc/xmon/xmon.c | 12 ++++++----- 7 files changed, 41 insertions(+), 45 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index a6ca799e0eb5..a0dc3a3eef33 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -331,17 +331,14 @@ static inline bool pte_ci(pte_t pte) #define pte_access_permitted pte_access_permitted static inline bool pte_access_permitted(pte_t pte, bool write) { - unsigned long pteval = pte_val(pte); /* * A read-only access is controlled by _PAGE_USER bit. 
* We have _PAGE_READ set for WRITE and EXECUTE */ - unsigned long need_pte_bits = _PAGE_PRESENT | _PAGE_USER; + if (!pte_present(pte) || !pte_user(pte) || !pte_read(pte)) + return false; - if (write) - need_pte_bits |= _PAGE_WRITE; - - if ((pteval & need_pte_bits) != need_pte_bits) + if (write && !pte_write(pte)) return false; return true; diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index 6fecfd7854f5..a4156da4a7a4 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -277,7 +277,10 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - pte_update(ptep, (_PAGE_RW | _PAGE_HWWRITE), _PAGE_RO); + unsigned long clr = ~pte_val(pte_wrprotect(__pte(~0))); + unsigned long set = pte_val(pte_wrprotect(__pte(0))); + + pte_update(ptep, clr, set); } static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) @@ -291,9 +294,10 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, int psize) { - unsigned long set = pte_val(entry) & - (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC); - unsigned long clr = ~pte_val(entry) & (_PAGE_RO | _PAGE_NA); + pte_t pte_set = pte_mkyoung(pte_mkdirty(pte_mkwrite(pte_mkexec(__pte(0))))); + pte_t pte_clr = pte_mkyoung(pte_mkdirty(pte_mkwrite(pte_mkexec(__pte(~0))))); + unsigned long set = pte_val(entry) & pte_val(pte_set); + unsigned long clr = ~pte_val(entry) & ~pte_val(pte_clr); pte_update(ptep, clr, set); diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index b256e38a047c..062d96233673 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -32,8 +32,7 @@ static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PA */ static inline int pte_protnone(pte_t pte) { - return (pte_val(pte) & - (_PAGE_PRESENT | _PAGE_USER)) == _PAGE_PRESENT; + return pte_present(pte) && !pte_user(pte); } static inline int pmd_protnone(pmd_t pmd) diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index f97d9c3760e3..ca4b1f7ac39d 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -44,20 +44,13 @@ static inline int is_exec_fault(void) static inline int pte_looks_normal(pte_t pte) { -#if defined(CONFIG_PPC_BOOK3S_64) - if ((pte_val(pte) & (_PAGE_PRESENT | _PAGE_SPECIAL)) == _PAGE_PRESENT) { + if (pte_present(pte) && !pte_special(pte)) { if (pte_ci(pte)) return 0; if (pte_user(pte)) return 1; } return 0; -#else - return (pte_val(pte) & - (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER | - _PAGE_PRIVILEGED)) == - (_PAGE_PRESENT | _PAGE_USER); -#endif } static struct page *maybe_pte_to_page(pte_t pte) @@ -117,7 +110,7 @@ static pte_t set_pte_filter(pte_t pte) struct page *pg; /* No exec permission in the first place, move on */ - if (!(pte_val(pte) & _PAGE_EXEC) || !pte_looks_normal(pte)) + if (!pte_exec(pte) || !pte_looks_normal(pte)) return pte; /* If you set _PAGE_EXEC on weird pages you're on your own */ @@ -137,7 +130,7 @@ static pte_t set_pte_filter(pte_t pte) } /* Else, we filter out _PAGE_EXEC */ - return __pte(pte_val(pte) & ~_PAGE_EXEC); + return pte_exprotect(pte); } static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, @@ -150,7 +143,7 @@ static pte_t 
set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, * if necessary. Also if _PAGE_EXEC is already set, same deal, * we just bail out */ - if (dirty || (pte_val(pte) & _PAGE_EXEC) || !is_exec_fault()) + if (dirty || pte_exec(pte) || !is_exec_fault()) return pte; #ifdef CONFIG_DEBUG_VM @@ -176,7 +169,7 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, set_bit(PG_arch_1, &pg->flags); bail: - return __pte(pte_val(pte) | _PAGE_EXEC); + return pte_mkexec(pte); } #endif /* CONFIG_PPC_BOOK3S */ @@ -191,10 +184,10 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, * Make sure hardware valid bit is not set. We don't do * tlb flush for this update. */ - VM_WARN_ON(pte_val(*ptep) & _PAGE_PRESENT); + VM_WARN_ON(pte_hw_valid(*ptep)); /* Add the pte bit when trying to set a pte */ - pte = __pte(pte_val(pte) | _PAGE_PTE); + pte = pte_mkpte(pte); /* Note: mm->context.id might not yet have been assigned as * this context might not have been activated yet when this diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 01f348938328..5877f5aa8f5d 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -112,15 +112,17 @@ EXPORT_SYMBOL(ioremap_coherent); void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) { + pte_t pte = __pte(flags); + /* writeable implies dirty for kernel addresses */ - if ((flags & (_PAGE_RW | _PAGE_RO)) != _PAGE_RO) - flags |= _PAGE_DIRTY | _PAGE_HWWRITE; + if (pte_write(pte)) + pte = pte_mkdirty(pte); /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ - flags &= ~(_PAGE_USER | _PAGE_EXEC); - flags |= _PAGE_PRIVILEGED; + pte = pte_exprotect(pte); + pte = pte_mkprivileged(pte); - return __ioremap_caller(addr, size, __pgprot(flags), __builtin_return_address(0)); + return __ioremap_caller(addr, size, pte_pgprot(pte), __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_prot); @@ -235,8 +237,7 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot) /* The PTE should never be already set nor present in the * hash table */ - BUG_ON((pte_val(*pg) & (_PAGE_PRESENT | _PAGE_HASHPTE)) && - pgprot_val(prot)); + BUG_ON((pte_present(*pg) | pte_hashpte(*pg)) && pgprot_val(prot)); set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT, prot)); } smp_wmb(); diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index b0f4a4b4f62b..fb1375c07e8c 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -230,23 +230,23 @@ void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size) void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) { + pte_t pte = __pte(flags); void *caller = __builtin_return_address(0); /* writeable implies dirty for kernel addresses */ - if (flags & _PAGE_WRITE) - flags |= _PAGE_DIRTY; + if (pte_write(pte)) + pte = pte_mkdirty(pte); /* we don't want to let _PAGE_EXEC leak out */ - flags &= ~_PAGE_EXEC; + pte = pte_exprotect(pte); /* * Force kernel mapping. 
*/ - flags &= ~_PAGE_USER; - flags |= _PAGE_PRIVILEGED; + pte = pte_mkprivileged(pte); if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, __pgprot(flags), caller); - return __ioremap_caller(addr, size, __pgprot(flags), caller); + return ppc_md.ioremap(addr, size, pte_pgprot(pte), caller); + return __ioremap_caller(addr, size, pte_pgprot(pte), caller); } diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index d139741f26fe..8345defa0e43 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2996,15 +2996,17 @@ static void show_task(struct task_struct *tsk) #ifdef CONFIG_PPC_BOOK3S_64 void format_pte(void *ptep, unsigned long pte) { + pte_t entry = __pte(pte); + printf("ptep @ 0x%016lx = 0x%016lx\n", (unsigned long)ptep, pte); printf("Maps physical address = 0x%016lx\n", pte & PTE_RPN_MASK); printf("Flags = %s%s%s%s%s\n", - (pte & _PAGE_ACCESSED) ? "Accessed " : "", - (pte & _PAGE_DIRTY) ? "Dirty " : "", - (pte & _PAGE_READ) ? "Read " : "", - (pte & _PAGE_WRITE) ? "Write " : "", - (pte & _PAGE_EXEC) ? "Exec " : ""); + pte_young(entry) ? "Accessed " : "", + pte_dirty(entry) ? "Dirty " : "", + pte_read(entry) ? "Read " : "", + pte_write(entry) ? "Write " : "", + pte_exec(entry) ? "Exec " : ""); } static void show_pte(unsigned long addr) From 97026b5a5ac26541b3d294146f5c941491a9e609 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 9 Oct 2018 13:51:58 +0000 Subject: [PATCH 142/221] powerpc/mm: Split dump_pagelinuxtables flag_array table To reduce the complexity of flag_array, and allow the removal of default 0 value of non existing flags, lets have one flag_array table for each platform family with only the really existing flags. Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/Makefile | 7 + arch/powerpc/mm/dump_linuxpagetables-8xx.c | 82 +++++++++ .../mm/dump_linuxpagetables-book3s64.c | 115 +++++++++++++ .../powerpc/mm/dump_linuxpagetables-generic.c | 82 +++++++++ arch/powerpc/mm/dump_linuxpagetables.c | 155 +----------------- arch/powerpc/mm/dump_linuxpagetables.h | 19 +++ 6 files changed, 307 insertions(+), 153 deletions(-) create mode 100644 arch/powerpc/mm/dump_linuxpagetables-8xx.c create mode 100644 arch/powerpc/mm/dump_linuxpagetables-book3s64.c create mode 100644 arch/powerpc/mm/dump_linuxpagetables-generic.c create mode 100644 arch/powerpc/mm/dump_linuxpagetables.h diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index cdf6a9960046..3c844bdd16c4 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -43,5 +43,12 @@ obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o obj-$(CONFIG_SPAPR_TCE_IOMMU) += mmu_context_iommu.o obj-$(CONFIG_PPC_PTDUMP) += dump_linuxpagetables.o +ifdef CONFIG_PPC_PTDUMP +obj-$(CONFIG_4xx) += dump_linuxpagetables-generic.o +obj-$(CONFIG_PPC_8xx) += dump_linuxpagetables-8xx.o +obj-$(CONFIG_PPC_BOOK3E_MMU) += dump_linuxpagetables-generic.o +obj-$(CONFIG_PPC_BOOK3S_32) += dump_linuxpagetables-generic.o +obj-$(CONFIG_PPC_BOOK3S_64) += dump_linuxpagetables-book3s64.o +endif obj-$(CONFIG_PPC_HTDUMP) += dump_hashpagetable.o obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o diff --git a/arch/powerpc/mm/dump_linuxpagetables-8xx.c b/arch/powerpc/mm/dump_linuxpagetables-8xx.c new file mode 100644 index 000000000000..33f52a97975b --- /dev/null +++ b/arch/powerpc/mm/dump_linuxpagetables-8xx.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * From split of dump_linuxpagetables.c + * Copyright 2016, 
Rashmica Gupta, IBM Corp. + * + */ +#include +#include + +#include "dump_linuxpagetables.h" + +static const struct flag_info flag_array[] = { + { + .mask = _PAGE_PRIVILEGED, + .val = 0, + .set = "user", + .clear = " ", + }, { + .mask = _PAGE_RO | _PAGE_NA, + .val = 0, + .set = "rw", + }, { + .mask = _PAGE_RO | _PAGE_NA, + .val = _PAGE_RO, + .set = "r ", + }, { + .mask = _PAGE_RO | _PAGE_NA, + .val = _PAGE_NA, + .set = " ", + }, { + .mask = _PAGE_EXEC, + .val = _PAGE_EXEC, + .set = " X ", + .clear = " ", + }, { + .mask = _PAGE_PRESENT, + .val = _PAGE_PRESENT, + .set = "present", + .clear = " ", + }, { + .mask = _PAGE_GUARDED, + .val = _PAGE_GUARDED, + .set = "guarded", + .clear = " ", + }, { + .mask = _PAGE_DIRTY, + .val = _PAGE_DIRTY, + .set = "dirty", + .clear = " ", + }, { + .mask = _PAGE_ACCESSED, + .val = _PAGE_ACCESSED, + .set = "accessed", + .clear = " ", + }, { + .mask = _PAGE_NO_CACHE, + .val = _PAGE_NO_CACHE, + .set = "no cache", + .clear = " ", + }, { + .mask = _PAGE_SPECIAL, + .val = _PAGE_SPECIAL, + .set = "special", + } +}; + +struct pgtable_level pg_level[5] = { + { + }, { /* pgd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pud */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pmd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pte */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, +}; diff --git a/arch/powerpc/mm/dump_linuxpagetables-book3s64.c b/arch/powerpc/mm/dump_linuxpagetables-book3s64.c new file mode 100644 index 000000000000..a637e612b205 --- /dev/null +++ b/arch/powerpc/mm/dump_linuxpagetables-book3s64.c @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * From split of dump_linuxpagetables.c + * Copyright 2016, Rashmica Gupta, IBM Corp. + * + */ +#include +#include + +#include "dump_linuxpagetables.h" + +static const struct flag_info flag_array[] = { + { + .mask = _PAGE_PRIVILEGED, + .val = 0, + .set = "user", + .clear = " ", + }, { + .mask = _PAGE_READ, + .val = _PAGE_READ, + .set = "r", + .clear = " ", + }, { + .mask = _PAGE_WRITE, + .val = _PAGE_WRITE, + .set = "w", + .clear = " ", + }, { + .mask = _PAGE_EXEC, + .val = _PAGE_EXEC, + .set = " X ", + .clear = " ", + }, { + .mask = _PAGE_PTE, + .val = _PAGE_PTE, + .set = "pte", + .clear = " ", + }, { + .mask = _PAGE_PRESENT, + .val = _PAGE_PRESENT, + .set = "present", + .clear = " ", + }, { + .mask = H_PAGE_HASHPTE, + .val = H_PAGE_HASHPTE, + .set = "hpte", + .clear = " ", + }, { + .mask = _PAGE_DIRTY, + .val = _PAGE_DIRTY, + .set = "dirty", + .clear = " ", + }, { + .mask = _PAGE_ACCESSED, + .val = _PAGE_ACCESSED, + .set = "accessed", + .clear = " ", + }, { + .mask = _PAGE_NON_IDEMPOTENT, + .val = _PAGE_NON_IDEMPOTENT, + .set = "non-idempotent", + .clear = " ", + }, { + .mask = _PAGE_TOLERANT, + .val = _PAGE_TOLERANT, + .set = "tolerant", + .clear = " ", + }, { + .mask = H_PAGE_BUSY, + .val = H_PAGE_BUSY, + .set = "busy", + }, { +#ifdef CONFIG_PPC_64K_PAGES + .mask = H_PAGE_COMBO, + .val = H_PAGE_COMBO, + .set = "combo", + }, { + .mask = H_PAGE_4K_PFN, + .val = H_PAGE_4K_PFN, + .set = "4K_pfn", + }, { +#else /* CONFIG_PPC_64K_PAGES */ + .mask = H_PAGE_F_GIX, + .val = H_PAGE_F_GIX, + .set = "f_gix", + .is_val = true, + .shift = H_PAGE_F_GIX_SHIFT, + }, { + .mask = H_PAGE_F_SECOND, + .val = H_PAGE_F_SECOND, + .set = "f_second", + }, { +#endif /* CONFIG_PPC_64K_PAGES */ + .mask = _PAGE_SPECIAL, + .val = _PAGE_SPECIAL, + .set = "special", + } +}; + +struct pgtable_level pg_level[5] = { + { + }, { /* pgd */ + .flag = flag_array, + .num = 
ARRAY_SIZE(flag_array), + }, { /* pud */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pmd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pte */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, +}; diff --git a/arch/powerpc/mm/dump_linuxpagetables-generic.c b/arch/powerpc/mm/dump_linuxpagetables-generic.c new file mode 100644 index 000000000000..1e3829ec1348 --- /dev/null +++ b/arch/powerpc/mm/dump_linuxpagetables-generic.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * From split of dump_linuxpagetables.c + * Copyright 2016, Rashmica Gupta, IBM Corp. + * + */ +#include +#include + +#include "dump_linuxpagetables.h" + +static const struct flag_info flag_array[] = { + { + .mask = _PAGE_USER, + .val = _PAGE_USER, + .set = "user", + .clear = " ", + }, { + .mask = _PAGE_RW, + .val = _PAGE_RW, + .set = "rw", + .clear = "r ", + }, { +#ifndef CONFIG_PPC_BOOK3S_32 + .mask = _PAGE_EXEC, + .val = _PAGE_EXEC, + .set = " X ", + .clear = " ", + }, { +#endif + .mask = _PAGE_PRESENT, + .val = _PAGE_PRESENT, + .set = "present", + .clear = " ", + }, { + .mask = _PAGE_GUARDED, + .val = _PAGE_GUARDED, + .set = "guarded", + .clear = " ", + }, { + .mask = _PAGE_DIRTY, + .val = _PAGE_DIRTY, + .set = "dirty", + .clear = " ", + }, { + .mask = _PAGE_ACCESSED, + .val = _PAGE_ACCESSED, + .set = "accessed", + .clear = " ", + }, { + .mask = _PAGE_WRITETHRU, + .val = _PAGE_WRITETHRU, + .set = "write through", + .clear = " ", + }, { + .mask = _PAGE_NO_CACHE, + .val = _PAGE_NO_CACHE, + .set = "no cache", + .clear = " ", + }, { + .mask = _PAGE_SPECIAL, + .val = _PAGE_SPECIAL, + .set = "special", + } +}; + +struct pgtable_level pg_level[5] = { + { + }, { /* pgd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pud */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pmd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pte */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, +}; diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c index 876e2a3c79f2..e60aa6d7456d 100644 --- a/arch/powerpc/mm/dump_linuxpagetables.c +++ b/arch/powerpc/mm/dump_linuxpagetables.c @@ -27,6 +27,8 @@ #include #include +#include "dump_linuxpagetables.h" + #ifdef CONFIG_PPC32 #define KERN_VIRT_START 0 #endif @@ -101,159 +103,6 @@ static struct addr_marker address_markers[] = { { -1, NULL }, }; -struct flag_info { - u64 mask; - u64 val; - const char *set; - const char *clear; - bool is_val; - int shift; -}; - -static const struct flag_info flag_array[] = { - { - .mask = _PAGE_USER | _PAGE_PRIVILEGED, - .val = _PAGE_USER, - .set = "user", - .clear = " ", - }, { - .mask = _PAGE_RW | _PAGE_RO | _PAGE_NA, - .val = _PAGE_RW, - .set = "rw", - }, { - .mask = _PAGE_RW | _PAGE_RO | _PAGE_NA, - .val = _PAGE_RO, - .set = "ro", - }, { -#if _PAGE_NA != 0 - .mask = _PAGE_RW | _PAGE_RO | _PAGE_NA, - .val = _PAGE_RO, - .set = "na", - }, { -#endif - .mask = _PAGE_EXEC, - .val = _PAGE_EXEC, - .set = " X ", - .clear = " ", - }, { - .mask = _PAGE_PTE, - .val = _PAGE_PTE, - .set = "pte", - .clear = " ", - }, { - .mask = _PAGE_PRESENT, - .val = _PAGE_PRESENT, - .set = "present", - .clear = " ", - }, { -#ifdef CONFIG_PPC_BOOK3S_64 - .mask = H_PAGE_HASHPTE, - .val = H_PAGE_HASHPTE, -#else - .mask = _PAGE_HASHPTE, - .val = _PAGE_HASHPTE, -#endif - .set = "hpte", - .clear = " ", - }, { -#ifndef CONFIG_PPC_BOOK3S_64 - .mask = _PAGE_GUARDED, - .val = _PAGE_GUARDED, - .set = "guarded", - .clear = " ", - }, { 
-#endif - .mask = _PAGE_DIRTY, - .val = _PAGE_DIRTY, - .set = "dirty", - .clear = " ", - }, { - .mask = _PAGE_ACCESSED, - .val = _PAGE_ACCESSED, - .set = "accessed", - .clear = " ", - }, { -#ifndef CONFIG_PPC_BOOK3S_64 - .mask = _PAGE_WRITETHRU, - .val = _PAGE_WRITETHRU, - .set = "write through", - .clear = " ", - }, { -#endif -#ifndef CONFIG_PPC_BOOK3S_64 - .mask = _PAGE_NO_CACHE, - .val = _PAGE_NO_CACHE, - .set = "no cache", - .clear = " ", - }, { -#else - .mask = _PAGE_NON_IDEMPOTENT, - .val = _PAGE_NON_IDEMPOTENT, - .set = "non-idempotent", - .clear = " ", - }, { - .mask = _PAGE_TOLERANT, - .val = _PAGE_TOLERANT, - .set = "tolerant", - .clear = " ", - }, { -#endif -#ifdef CONFIG_PPC_BOOK3S_64 - .mask = H_PAGE_BUSY, - .val = H_PAGE_BUSY, - .set = "busy", - }, { -#ifdef CONFIG_PPC_64K_PAGES - .mask = H_PAGE_COMBO, - .val = H_PAGE_COMBO, - .set = "combo", - }, { - .mask = H_PAGE_4K_PFN, - .val = H_PAGE_4K_PFN, - .set = "4K_pfn", - }, { -#else /* CONFIG_PPC_64K_PAGES */ - .mask = H_PAGE_F_GIX, - .val = H_PAGE_F_GIX, - .set = "f_gix", - .is_val = true, - .shift = H_PAGE_F_GIX_SHIFT, - }, { - .mask = H_PAGE_F_SECOND, - .val = H_PAGE_F_SECOND, - .set = "f_second", - }, { -#endif /* CONFIG_PPC_64K_PAGES */ -#endif - .mask = _PAGE_SPECIAL, - .val = _PAGE_SPECIAL, - .set = "special", - } -}; - -struct pgtable_level { - const struct flag_info *flag; - size_t num; - u64 mask; -}; - -static struct pgtable_level pg_level[] = { - { - }, { /* pgd */ - .flag = flag_array, - .num = ARRAY_SIZE(flag_array), - }, { /* pud */ - .flag = flag_array, - .num = ARRAY_SIZE(flag_array), - }, { /* pmd */ - .flag = flag_array, - .num = ARRAY_SIZE(flag_array), - }, { /* pte */ - .flag = flag_array, - .num = ARRAY_SIZE(flag_array), - }, -}; - static void dump_flag_info(struct pg_state *st, const struct flag_info *flag, u64 pte, int num) { diff --git a/arch/powerpc/mm/dump_linuxpagetables.h b/arch/powerpc/mm/dump_linuxpagetables.h new file mode 100644 index 000000000000..5d513636de73 --- /dev/null +++ b/arch/powerpc/mm/dump_linuxpagetables.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include + +struct flag_info { + u64 mask; + u64 val; + const char *set; + const char *clear; + bool is_val; + int shift; +}; + +struct pgtable_level { + const struct flag_info *flag; + size_t num; + u64 mask; +}; + +extern struct pgtable_level pg_level[5]; From 093d7ca22920c79ecdda87614ac4c5e3786068d6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 9 Oct 2018 13:52:00 +0000 Subject: [PATCH 143/221] powerpc/mm: drop unused page flags The following page flags in pte-common.h can be dropped: _PAGE_ENDIAN is only used in mm/fsl_booke_mmu.c and is defined in asm/nohash/32/pte-fsl-booke.h _PAGE_4K_PFN is nowhere defined nor used _PAGE_READ, _PAGE_WRITE and _PAGE_PTE are only defined and used in book3s/64 The following page flags in book3s/64/pgtable.h can be dropped as they are not used on this platform nor by common code. 
_PAGE_NA, _PAGE_RO, _PAGE_USER and _PAGE_PSIZE Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/pgtable.h | 10 +--------- arch/powerpc/include/asm/pte-common.h | 17 +---------------- 2 files changed, 2 insertions(+), 25 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 28a15c3450ff..a4d525a6e2bc 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -14,10 +14,6 @@ */ #define _PAGE_BIT_SWAP_TYPE 0 -#define _PAGE_NA 0 -#define _PAGE_RO 0 -#define _PAGE_USER 0 - #define _PAGE_EXEC 0x00001 /* execute permission */ #define _PAGE_WRITE 0x00002 /* write access allowed */ #define _PAGE_READ 0x00004 /* read access allowed */ @@ -122,10 +118,6 @@ #define _PAGE_KERNEL_RO (_PAGE_PRIVILEGED | _PAGE_READ) #define _PAGE_KERNEL_RWX (_PAGE_PRIVILEGED | _PAGE_DIRTY | \ _PAGE_RW | _PAGE_EXEC) -/* - * No page size encoding in the linux PTE - */ -#define _PAGE_PSIZE 0 /* * _PAGE_CHG_MASK masks of bits that are to be preserved across * pgprot changes @@ -149,7 +141,7 @@ * pages. We always set _PAGE_COHERENT when SMP is enabled or * the processor might need it for DMA coherency. */ -#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE) +#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED) #define _PAGE_BASE (_PAGE_BASE_NC) /* Permission masks used to generate the __P and __S table, diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index bef56141a549..5a5ba43bdf98 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h @@ -14,18 +14,12 @@ #ifndef _PAGE_EXEC #define _PAGE_EXEC 0 #endif -#ifndef _PAGE_ENDIAN -#define _PAGE_ENDIAN 0 -#endif #ifndef _PAGE_COHERENT #define _PAGE_COHERENT 0 #endif #ifndef _PAGE_WRITETHRU #define _PAGE_WRITETHRU 0 #endif -#ifndef _PAGE_4K_PFN -#define _PAGE_4K_PFN 0 -#endif #ifndef _PAGE_SAO #define _PAGE_SAO 0 #endif @@ -39,9 +33,6 @@ #define _PAGE_RW 0 #endif -#ifndef _PAGE_PTE -#define _PAGE_PTE 0 -#endif /* At least one of _PAGE_PRIVILEGED or _PAGE_USER must be defined */ #ifndef _PAGE_PRIVILEGED #define _PAGE_PRIVILEGED 0 @@ -122,7 +113,7 @@ static inline bool pte_user(pte_t pte) /* Mask of bits returned by pte_pgprot() */ #define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \ - _PAGE_WRITETHRU | _PAGE_ENDIAN | _PAGE_4K_PFN | \ + _PAGE_WRITETHRU | \ _PAGE_USER | _PAGE_ACCESSED | _PAGE_RO | _PAGE_NA | \ _PAGE_PRIVILEGED | \ _PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY | _PAGE_EXEC) @@ -208,12 +199,6 @@ static inline bool pte_user(pte_t pte) #define PAGE_AGP (PAGE_KERNEL_NC) #define HAVE_PAGE_AGP -#ifndef _PAGE_READ -/* if not defined, we should not find _PAGE_WRITE too */ -#define _PAGE_READ 0 -#define _PAGE_WRITE _PAGE_RW -#endif - #ifndef H_PAGE_4K_PFN #define H_PAGE_4K_PFN 0 #endif From f4805785f068a29f3be757d837cfc05903a8afe8 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 9 Oct 2018 13:52:02 +0000 Subject: [PATCH 144/221] powerpc/mm: move __P and __S tables in the common pgtable.h __P and __S flags are the same for all platform and should remain as is in the future, so avoid duplication. 
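For readers unfamiliar with these tables: generic mm code indexes __P (private mappings) and __S (shared mappings) with the vma's XWR permission bits to pick a pgprot, which is why a private writable mapping resolves to PAGE_COPY (copy-on-write) while a shared writable one resolves to PAGE_SHARED. A small stand-alone model of that lookup (the enum names are illustrative stand-ins, not the kernel's pgprot_t values):

#include <stdio.h>

/* Index is the XWR bit pattern: bit 0 = read, bit 1 = write, bit 2 = exec. */
enum prot { NONE, READONLY, COPY, SHARED, READONLY_X, COPY_X, SHARED_X };

static const enum prot P[8] = { NONE, READONLY, COPY, COPY,
                                READONLY_X, READONLY_X, COPY_X, COPY_X };
static const enum prot S[8] = { NONE, READONLY, SHARED, SHARED,
                                READONLY_X, READONLY_X, SHARED_X, SHARED_X };

int main(void)
{
        int xwr = 0x3;                      /* write | read, no exec */

        printf("private: %d\n", P[xwr]);    /* COPY: private writable => CoW */
        printf("shared:  %d\n", S[xwr]);    /* SHARED: genuinely writable    */
        return 0;
}

Since every powerpc flavour fills these sixteen slots the same way, hoisting the definitions into asm/pgtable.h removes two identical copies without changing any mapping's protections.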
Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/pgtable.h | 20 -------------------- arch/powerpc/include/asm/pgtable.h | 19 +++++++++++++++++++ arch/powerpc/include/asm/pte-common.h | 20 -------------------- 3 files changed, 19 insertions(+), 40 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index a4d525a6e2bc..6fbb45d11bf0 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -151,8 +151,6 @@ * Write permissions imply read permissions for now (we could make write-only * pages on BookE but we don't bother for now). Execute permission control is * possible on platforms that define _PAGE_EXEC - * - * Note due to the way vm flags are laid out, the bits are XWR */ #define PAGE_NONE __pgprot(_PAGE_BASE | _PAGE_PRIVILEGED) #define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_RW) @@ -162,24 +160,6 @@ #define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_READ) #define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC) -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY -#define __P100 PAGE_READONLY_X -#define __P101 PAGE_READONLY_X -#define __P110 PAGE_COPY_X -#define __P111 PAGE_COPY_X - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED -#define __S100 PAGE_READONLY_X -#define __S101 PAGE_READONLY_X -#define __S110 PAGE_SHARED_X -#define __S111 PAGE_SHARED_X - /* Permission masks used for kernel mappings */ #define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW) #define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 14c79a7dc855..fb4b85bba110 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -20,6 +20,25 @@ struct mm_struct; #include #endif /* !CONFIG_PPC_BOOK3S */ +/* Note due to the way vm flags are laid out, the bits are XWR */ +#define __P000 PAGE_NONE +#define __P001 PAGE_READONLY +#define __P010 PAGE_COPY +#define __P011 PAGE_COPY +#define __P100 PAGE_READONLY_X +#define __P101 PAGE_READONLY_X +#define __P110 PAGE_COPY_X +#define __P111 PAGE_COPY_X + +#define __S000 PAGE_NONE +#define __S001 PAGE_READONLY +#define __S010 PAGE_SHARED +#define __S011 PAGE_SHARED +#define __S100 PAGE_READONLY_X +#define __S101 PAGE_READONLY_X +#define __S110 PAGE_SHARED_X +#define __S111 PAGE_SHARED_X + #ifndef __ASSEMBLY__ #include diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index 5a5ba43bdf98..4860dae76dae 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h @@ -139,8 +139,6 @@ static inline bool pte_user(pte_t pte) * Write permissions imply read permissions for now (we could make write-only * pages on BookE but we don't bother for now). 
Execute permission control is * possible on platforms that define _PAGE_EXEC - * - * Note due to the way vm flags are laid out, the bits are XWR */ #define PAGE_NONE __pgprot(_PAGE_BASE | _PAGE_NA) #define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) @@ -153,24 +151,6 @@ static inline bool pte_user(pte_t pte) #define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RO | \ _PAGE_EXEC) -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY -#define __P100 PAGE_READONLY_X -#define __P101 PAGE_READONLY_X -#define __P110 PAGE_COPY_X -#define __P111 PAGE_COPY_X - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED -#define __S100 PAGE_READONLY_X -#define __S101 PAGE_READONLY_X -#define __S110 PAGE_SHARED_X -#define __S111 PAGE_SHARED_X - /* Permission masks used for kernel mappings */ #define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW) #define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ From b2133bd7a553ae22c5cd849e3229197d4517fdb0 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 9 Oct 2018 13:52:04 +0000 Subject: [PATCH 145/221] powerpc/book3s/32: do not include pte-common.h As done for book3s/64, add necessary flags/defines in book3s/32/pgtable.h and do not include pte-common.h It allows in the meantime to remove all related hash definitions from pte-common.h and to also remove _PAGE_EXEC default as _PAGE_EXEC is defined on all platforms except book3s/32. Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/pgtable.h | 102 ++++++++++++++++++- arch/powerpc/include/asm/pte-common.h | 16 +-- 2 files changed, 101 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index a0dc3a3eef33..0fbd4c642b51 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -8,7 +8,102 @@ #include /* And here we include common definitions */ -#include + +#define _PAGE_KERNEL_RO 0 +#define _PAGE_KERNEL_ROX 0 +#define _PAGE_KERNEL_RW (_PAGE_DIRTY | _PAGE_RW) +#define _PAGE_KERNEL_RWX (_PAGE_DIRTY | _PAGE_RW) + +#define _PAGE_HPTEFLAGS _PAGE_HASHPTE + +#ifndef __ASSEMBLY__ + +static inline bool pte_user(pte_t pte) +{ + return pte_val(pte) & _PAGE_USER; +} +#endif /* __ASSEMBLY__ */ + +/* + * Location of the PFN in the PTE. Most 32-bit platforms use the same + * as _PAGE_SHIFT here (ie, naturally aligned). + * Platform who don't just pre-define the value so we don't override it here. + */ +#define PTE_RPN_SHIFT (PAGE_SHIFT) + +/* + * The mask covered by the RPN must be a ULL on 32-bit platforms with + * 64-bit PTEs. + */ +#ifdef CONFIG_PTE_64BIT +#define PTE_RPN_MASK (~((1ULL << PTE_RPN_SHIFT) - 1)) +#else +#define PTE_RPN_MASK (~((1UL << PTE_RPN_SHIFT) - 1)) +#endif + +/* + * _PAGE_CHG_MASK masks of bits that are to be preserved across + * pgprot changes. + */ +#define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HASHPTE | _PAGE_DIRTY | \ + _PAGE_ACCESSED | _PAGE_SPECIAL) + +/* Mask of bits returned by pte_pgprot() */ +#define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \ + _PAGE_WRITETHRU | _PAGE_USER | _PAGE_ACCESSED | \ + _PAGE_RW | _PAGE_DIRTY) + +/* + * We define 2 sets of base prot bits, one for basic pages (ie, + * cacheable kernel and user pages) and one for non cacheable + * pages. 
We always set _PAGE_COHERENT when SMP is enabled or + * the processor might need it for DMA coherency. + */ +#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED) +#define _PAGE_BASE (_PAGE_BASE_NC | _PAGE_COHERENT) + +/* + * Permission masks used to generate the __P and __S table. + * + * Note:__pgprot is defined in arch/powerpc/include/asm/page.h + * + * Write permissions imply read permissions for now. + */ +#define PAGE_NONE __pgprot(_PAGE_BASE) +#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) +#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) +#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER) +#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER) +#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER) +#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER) + +/* Permission masks used for kernel mappings */ +#define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW) +#define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | _PAGE_NO_CACHE) +#define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ + _PAGE_NO_CACHE | _PAGE_GUARDED) +#define PAGE_KERNEL_X __pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX) +#define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO) +#define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX) + +/* + * Protection used for kernel text. We want the debuggers to be able to + * set breakpoints anywhere, so don't write protect the kernel text + * on platforms where such control is possible. + */ +#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) ||\ + defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE) +#define PAGE_KERNEL_TEXT PAGE_KERNEL_X +#else +#define PAGE_KERNEL_TEXT PAGE_KERNEL_ROX +#endif + +/* Make modules code happy. We don't set RO yet */ +#define PAGE_KERNEL_EXEC PAGE_KERNEL_X + +/* Advertise special mapping type for AGP */ +#define PAGE_AGP (PAGE_KERNEL_NC) +#define HAVE_PAGE_AGP #define PTE_INDEX_SIZE PTE_SHIFT #define PMD_INDEX_SIZE 0 @@ -219,7 +314,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - pte_update(ptep, (_PAGE_RW | _PAGE_HWWRITE), _PAGE_RO); + pte_update(ptep, _PAGE_RW, 0); } static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) @@ -235,9 +330,8 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, { unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW); - unsigned long clr = ~pte_val(entry) & _PAGE_RO; - pte_update(ptep, clr, set); + pte_update(ptep, 0, set); flush_tlb_page(vma, address); } diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index 4860dae76dae..3a8ec18ffd22 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h @@ -5,15 +5,9 @@ * Some bits are only used on some cpu families... 
Make sure that all * the undefined gets a sensible default */ -#ifndef _PAGE_HASHPTE -#define _PAGE_HASHPTE 0 -#endif #ifndef _PAGE_HWWRITE #define _PAGE_HWWRITE 0 #endif -#ifndef _PAGE_EXEC -#define _PAGE_EXEC 0 -#endif #ifndef _PAGE_COHERENT #define _PAGE_COHERENT 0 #endif @@ -68,11 +62,8 @@ #define _PAGE_KERNEL_RWX (_PAGE_PRIVILEGED | _PAGE_DIRTY | _PAGE_RW | \ _PAGE_HWWRITE | _PAGE_EXEC) #endif -#ifndef _PAGE_HPTEFLAGS -#define _PAGE_HPTEFLAGS _PAGE_HASHPTE -#endif #ifndef _PTE_NONE_MASK -#define _PTE_NONE_MASK _PAGE_HPTEFLAGS +#define _PTE_NONE_MASK 0 #endif #ifndef __ASSEMBLY__ @@ -108,7 +99,7 @@ static inline bool pte_user(pte_t pte) /* _PAGE_CHG_MASK masks of bits that are to be preserved across * pgprot changes */ -#define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \ +#define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_DIRTY | \ _PAGE_ACCESSED | _PAGE_SPECIAL) /* Mask of bits returned by pte_pgprot() */ @@ -125,8 +116,7 @@ static inline bool pte_user(pte_t pte) * the processor might need it for DMA coherency. */ #define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE) -#if defined(CONFIG_SMP) || defined(CONFIG_PPC_STD_MMU) || \ - defined(CONFIG_PPC_E500MC) +#if defined(CONFIG_SMP) || defined(CONFIG_PPC_E500MC) #define _PAGE_BASE (_PAGE_BASE_NC | _PAGE_COHERENT) #else #define _PAGE_BASE (_PAGE_BASE_NC) From e0f57031ca0b52f7db4bad57db4037a7daa546c3 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 9 Oct 2018 13:52:06 +0000 Subject: [PATCH 146/221] powerpc/mm: Move pte_user() into nohash/pgtable.h Now the pte-common.h is only for nohash platforms, lets move pte_user() helper out of pte-common.h to put it together with other helpers. Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/pgtable.h | 10 ++++++++++ arch/powerpc/include/asm/pte-common.h | 13 ------------- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 062d96233673..8de3b7eb88b0 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -51,6 +51,16 @@ static inline bool pte_hw_valid(pte_t pte) return pte_val(pte) & _PAGE_PRESENT; } +/* + * Don't just check for any non zero bits in __PAGE_USER, since for book3e + * and PTE_64BIT, PAGE_KERNEL_X contains _PAGE_BAP_SR which is also in + * _PAGE_USER. Need to explicitly match _PAGE_BAP_UR bit in that case too. + */ +static inline bool pte_user(pte_t pte) +{ + return (pte_val(pte) & (_PAGE_USER | _PAGE_PRIVILEGED)) == _PAGE_USER; +} + /* * We only find page table entry in the last level * Hence no need for other accessors diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index 3a8ec18ffd22..556a914ff845 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h @@ -66,19 +66,6 @@ #define _PTE_NONE_MASK 0 #endif -#ifndef __ASSEMBLY__ - -/* - * Don't just check for any non zero bits in __PAGE_USER, since for book3e - * and PTE_64BIT, PAGE_KERNEL_X contains _PAGE_BAP_SR which is also in - * _PAGE_USER. Need to explicitly match _PAGE_BAP_UR bit in that case too. - */ -static inline bool pte_user(pte_t pte) -{ - return (pte_val(pte) & (_PAGE_USER | _PAGE_PRIVILEGED)) == _PAGE_USER; -} -#endif /* __ASSEMBLY__ */ - /* Location of the PFN in the PTE. Most 32-bit platforms use the same * as _PAGE_SHIFT here (ie, naturally aligned). 
* Platform who don't just pre-define the value so we don't override it here From d82fd29c5a8caafdc88c6c59c18019a7a5975d99 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 9 Oct 2018 13:52:08 +0000 Subject: [PATCH 147/221] powerpc/mm: Distribute platform specific PAGE and PMD flags and definitions The base kernel PAGE_XXXX definition sets are more or less platform specific. Lets distribute them close to platform _PAGE_XXX flags definition, and customise them to their exact platform flags. Also defines _PAGE_PSIZE and _PTE_NONE_MASK for each platform allthough they are defined as 0. Do the same with _PMD flags like _PMD_USER and _PMD_PRESENT_MASK Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/32/pte-40x.h | 29 ++++++++ arch/powerpc/include/asm/nohash/32/pte-44x.h | 35 ++++++++++ arch/powerpc/include/asm/nohash/32/pte-8xx.h | 27 ++++++++ .../include/asm/nohash/32/pte-fsl-booke.h | 38 +++++++++++ arch/powerpc/include/asm/nohash/pte-book3e.h | 30 +++++++++ arch/powerpc/include/asm/pte-common.h | 66 ------------------- 6 files changed, 159 insertions(+), 66 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/pte-40x.h b/arch/powerpc/include/asm/nohash/32/pte-40x.h index bb4b3a4b92a0..2b48bc289a4d 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-40x.h +++ b/arch/powerpc/include/asm/nohash/32/pte-40x.h @@ -50,13 +50,42 @@ #define _PAGE_EXEC 0x200 /* hardware: EX permission */ #define _PAGE_ACCESSED 0x400 /* software: R: page referenced */ +/* No page size encoding in the linux PTE */ +#define _PAGE_PSIZE 0 + +#define _PAGE_KERNEL_RO 0 +#define _PAGE_KERNEL_ROX _PAGE_EXEC +#define _PAGE_KERNEL_RW (_PAGE_DIRTY | _PAGE_RW | _PAGE_HWWRITE) +#define _PAGE_KERNEL_RWX (_PAGE_DIRTY | _PAGE_RW | _PAGE_HWWRITE | _PAGE_EXEC) + #define _PMD_PRESENT 0x400 /* PMD points to page of PTEs */ +#define _PMD_PRESENT_MASK _PMD_PRESENT #define _PMD_BAD 0x802 #define _PMD_SIZE_4M 0x0c0 #define _PMD_SIZE_16M 0x0e0 +#define _PMD_USER 0 + +#define _PTE_NONE_MASK 0 /* Until my rework is finished, 40x still needs atomic PTE updates */ #define PTE_ATOMIC_UPDATES 1 +/* Mask of bits returned by pte_pgprot() */ +#define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_NO_CACHE | \ + _PAGE_WRITETHRU | _PAGE_USER | _PAGE_ACCESSED | \ + _PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY | _PAGE_EXEC) + +#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED) +#define _PAGE_BASE (_PAGE_BASE_NC) + +/* Permission masks used to generate the __P and __S table */ +#define PAGE_NONE __pgprot(_PAGE_BASE) +#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) +#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | _PAGE_EXEC) +#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER) +#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) +#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER) +#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_NOHASH_32_PTE_40x_H */ diff --git a/arch/powerpc/include/asm/nohash/32/pte-44x.h b/arch/powerpc/include/asm/nohash/32/pte-44x.h index f812c0272364..8d6b268a986f 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-44x.h +++ b/arch/powerpc/include/asm/nohash/32/pte-44x.h @@ -85,14 +85,49 @@ #define _PAGE_NO_CACHE 0x00000400 /* H: I bit */ #define _PAGE_WRITETHRU 0x00000800 /* H: W bit */ +/* No page size encoding in the linux PTE */ +#define _PAGE_PSIZE 0 + +#define _PAGE_KERNEL_RO 0 +#define _PAGE_KERNEL_ROX _PAGE_EXEC 
+#define _PAGE_KERNEL_RW (_PAGE_DIRTY | _PAGE_RW) +#define _PAGE_KERNEL_RWX (_PAGE_DIRTY | _PAGE_RW | _PAGE_EXEC) + +/* Mask of bits returned by pte_pgprot() */ +#define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \ + _PAGE_WRITETHRU | _PAGE_USER | _PAGE_ACCESSED | \ + _PAGE_RW | _PAGE_DIRTY | _PAGE_EXEC) + /* TODO: Add large page lowmem mapping support */ #define _PMD_PRESENT 0 #define _PMD_PRESENT_MASK (PAGE_MASK) #define _PMD_BAD (~PAGE_MASK) +#define _PMD_USER 0 /* ERPN in a PTE never gets cleared, ignore it */ #define _PTE_NONE_MASK 0xffffffff00000000ULL +/* + * We define 2 sets of base prot bits, one for basic pages (ie, + * cacheable kernel and user pages) and one for non cacheable + * pages. We always set _PAGE_COHERENT when SMP is enabled or + * the processor might need it for DMA coherency. + */ +#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED) +#if defined(CONFIG_SMP) +#define _PAGE_BASE (_PAGE_BASE_NC | _PAGE_COHERENT) +#else +#define _PAGE_BASE (_PAGE_BASE_NC) +#endif + +/* Permission masks used to generate the __P and __S table */ +#define PAGE_NONE __pgprot(_PAGE_BASE) +#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) +#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | _PAGE_EXEC) +#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER) +#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) +#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER) +#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_NOHASH_32_PTE_44x_H */ diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h index f04cb46ae8a1..d06fc45bd9ac 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h @@ -46,19 +46,46 @@ #define _PAGE_NA 0x0200 /* Supervisor NA, User no access */ #define _PAGE_RO 0x0600 /* Supervisor RO, User no access */ +#define _PAGE_KERNEL_RO (_PAGE_PRIVILEGED | _PAGE_RO) +#define _PAGE_KERNEL_ROX (_PAGE_PRIVILEGED | _PAGE_RO | _PAGE_EXEC) +#define _PAGE_KERNEL_RW (_PAGE_PRIVILEGED | _PAGE_DIRTY) +#define _PAGE_KERNEL_RWX (_PAGE_PRIVILEGED | _PAGE_DIRTY | _PAGE_EXEC) + +/* Mask of bits returned by pte_pgprot() */ +#define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_NO_CACHE | \ + _PAGE_ACCESSED | _PAGE_RO | _PAGE_NA | \ + _PAGE_PRIVILEGED | _PAGE_DIRTY | _PAGE_EXEC) + #define _PMD_PRESENT 0x0001 +#define _PMD_PRESENT_MASK _PMD_PRESENT #define _PMD_BAD 0x0fd0 #define _PMD_PAGE_MASK 0x000c #define _PMD_PAGE_8M 0x000c #define _PMD_PAGE_512K 0x0004 #define _PMD_USER 0x0020 /* APG 1 */ +#define _PTE_NONE_MASK 0 + /* Until my rework is finished, 8xx still needs atomic PTE updates */ #define PTE_ATOMIC_UPDATES 1 #ifdef CONFIG_PPC_16K_PAGES #define _PAGE_PSIZE _PAGE_HUGE +#else +#define _PAGE_PSIZE 0 #endif +#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE) +#define _PAGE_BASE (_PAGE_BASE_NC) + +/* Permission masks used to generate the __P and __S table */ +#define PAGE_NONE __pgprot(_PAGE_BASE | _PAGE_NA) +#define PAGE_SHARED __pgprot(_PAGE_BASE) +#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_EXEC) +#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_RO) +#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_RO | _PAGE_EXEC) +#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_RO) +#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_RO | _PAGE_EXEC) + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_NOHASH_32_PTE_8xx_H */ diff --git 
a/arch/powerpc/include/asm/nohash/32/pte-fsl-booke.h b/arch/powerpc/include/asm/nohash/32/pte-fsl-booke.h index d1ee24e9e137..1ecf60fe0909 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-fsl-booke.h +++ b/arch/powerpc/include/asm/nohash/32/pte-fsl-booke.h @@ -31,11 +31,49 @@ #define _PAGE_WRITETHRU 0x00400 /* H: W bit */ #define _PAGE_SPECIAL 0x00800 /* S: Special page */ +#define _PAGE_KERNEL_RO 0 +#define _PAGE_KERNEL_ROX _PAGE_EXEC +#define _PAGE_KERNEL_RW (_PAGE_DIRTY | _PAGE_RW) +#define _PAGE_KERNEL_RWX (_PAGE_DIRTY | _PAGE_RW | _PAGE_EXEC) + +/* No page size encoding in the linux PTE */ +#define _PAGE_PSIZE 0 + +/* Mask of bits returned by pte_pgprot() */ +#define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \ + _PAGE_WRITETHRU | _PAGE_USER | _PAGE_ACCESSED | \ + _PAGE_RW | _PAGE_DIRTY | _PAGE_EXEC) + #define _PMD_PRESENT 0 #define _PMD_PRESENT_MASK (PAGE_MASK) #define _PMD_BAD (~PAGE_MASK) +#define _PMD_USER 0 + +#define _PTE_NONE_MASK 0 #define PTE_WIMGE_SHIFT (6) +/* + * We define 2 sets of base prot bits, one for basic pages (ie, + * cacheable kernel and user pages) and one for non cacheable + * pages. We always set _PAGE_COHERENT when SMP is enabled or + * the processor might need it for DMA coherency. + */ +#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED) +#if defined(CONFIG_SMP) || defined(CONFIG_PPC_E500MC) +#define _PAGE_BASE (_PAGE_BASE_NC | _PAGE_COHERENT) +#else +#define _PAGE_BASE (_PAGE_BASE_NC) +#endif + +/* Permission masks used to generate the __P and __S table */ +#define PAGE_NONE __pgprot(_PAGE_BASE) +#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) +#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | _PAGE_EXEC) +#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER) +#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) +#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER) +#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_NOHASH_32_PTE_FSL_BOOKE_H */ diff --git a/arch/powerpc/include/asm/nohash/pte-book3e.h b/arch/powerpc/include/asm/nohash/pte-book3e.h index 12730b81cd98..58eef8cb569d 100644 --- a/arch/powerpc/include/asm/nohash/pte-book3e.h +++ b/arch/powerpc/include/asm/nohash/pte-book3e.h @@ -77,7 +77,37 @@ #define _PMD_PRESENT 0 #define _PMD_PRESENT_MASK (PAGE_MASK) #define _PMD_BAD (~PAGE_MASK) +#define _PMD_USER 0 +#else +#define _PTE_NONE_MASK 0 #endif +/* Mask of bits returned by pte_pgprot() */ +#define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \ + _PAGE_WRITETHRU | _PAGE_USER | _PAGE_ACCESSED | \ + _PAGE_PRIVILEGED | _PAGE_RW | _PAGE_DIRTY | _PAGE_EXEC) + +/* + * We define 2 sets of base prot bits, one for basic pages (ie, + * cacheable kernel and user pages) and one for non cacheable + * pages. We always set _PAGE_COHERENT when SMP is enabled or + * the processor might need it for DMA coherency. 
+ */ +#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE) +#if defined(CONFIG_SMP) +#define _PAGE_BASE (_PAGE_BASE_NC | _PAGE_COHERENT) +#else +#define _PAGE_BASE (_PAGE_BASE_NC) +#endif + +/* Permission masks used to generate the __P and __S table */ +#define PAGE_NONE __pgprot(_PAGE_BASE) +#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) +#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | _PAGE_EXEC) +#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER) +#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) +#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER) +#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_NOHASH_PTE_BOOK3E_H */ diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index 556a914ff845..cce60b3ba7d4 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h @@ -17,9 +17,6 @@ #ifndef _PAGE_SAO #define _PAGE_SAO 0 #endif -#ifndef _PAGE_PSIZE -#define _PAGE_PSIZE 0 -#endif /* _PAGE_RO and _PAGE_RW shall not be defined at the same time */ #ifndef _PAGE_RO #define _PAGE_RO 0 @@ -42,30 +39,6 @@ #define _PAGE_HUGE 0 #endif -#ifndef _PMD_PRESENT_MASK -#define _PMD_PRESENT_MASK _PMD_PRESENT -#endif -#ifndef _PMD_USER -#define _PMD_USER 0 -#endif -#ifndef _PAGE_KERNEL_RO -#define _PAGE_KERNEL_RO (_PAGE_PRIVILEGED | _PAGE_RO) -#endif -#ifndef _PAGE_KERNEL_ROX -#define _PAGE_KERNEL_ROX (_PAGE_PRIVILEGED | _PAGE_RO | _PAGE_EXEC) -#endif -#ifndef _PAGE_KERNEL_RW -#define _PAGE_KERNEL_RW (_PAGE_PRIVILEGED | _PAGE_DIRTY | _PAGE_RW | \ - _PAGE_HWWRITE) -#endif -#ifndef _PAGE_KERNEL_RWX -#define _PAGE_KERNEL_RWX (_PAGE_PRIVILEGED | _PAGE_DIRTY | _PAGE_RW | \ - _PAGE_HWWRITE | _PAGE_EXEC) -#endif -#ifndef _PTE_NONE_MASK -#define _PTE_NONE_MASK 0 -#endif - /* Location of the PFN in the PTE. Most 32-bit platforms use the same * as _PAGE_SHIFT here (ie, naturally aligned). * Platform who don't just pre-define the value so we don't override it here @@ -89,45 +62,6 @@ #define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_DIRTY | \ _PAGE_ACCESSED | _PAGE_SPECIAL) -/* Mask of bits returned by pte_pgprot() */ -#define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \ - _PAGE_WRITETHRU | \ - _PAGE_USER | _PAGE_ACCESSED | _PAGE_RO | _PAGE_NA | \ - _PAGE_PRIVILEGED | \ - _PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY | _PAGE_EXEC) - -/* - * We define 2 sets of base prot bits, one for basic pages (ie, - * cacheable kernel and user pages) and one for non cacheable - * pages. We always set _PAGE_COHERENT when SMP is enabled or - * the processor might need it for DMA coherency. - */ -#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE) -#if defined(CONFIG_SMP) || defined(CONFIG_PPC_E500MC) -#define _PAGE_BASE (_PAGE_BASE_NC | _PAGE_COHERENT) -#else -#define _PAGE_BASE (_PAGE_BASE_NC) -#endif - -/* Permission masks used to generate the __P and __S table, - * - * Note:__pgprot is defined in arch/powerpc/include/asm/page.h - * - * Write permissions imply read permissions for now (we could make write-only - * pages on BookE but we don't bother for now). 
Execute permission control is - * possible on platforms that define _PAGE_EXEC - */ -#define PAGE_NONE __pgprot(_PAGE_BASE | _PAGE_NA) -#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) -#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | \ - _PAGE_EXEC) -#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RO) -#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RO | \ - _PAGE_EXEC) -#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RO) -#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RO | \ - _PAGE_EXEC) - /* Permission masks used for kernel mappings */ #define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW) #define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ From 6c5d2d3fd376c6b4b8d8624b53dbdda966ce762b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 9 Oct 2018 13:52:10 +0000 Subject: [PATCH 148/221] powerpc/nohash/64: do not include pte-common.h nohash/64 only uses book3e PTE flags, so it doesn't need pte-common.h This also allows to drop PAGE_SAO and H_PAGE_4K_PFN from pte_common.h as they are only used by PPC64 Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/64/pgtable.h | 17 +++++++++- arch/powerpc/include/asm/nohash/pgtable.h | 28 ++++++++++++++++ arch/powerpc/include/asm/pte-common.h | 35 -------------------- 3 files changed, 44 insertions(+), 36 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index b7d65d4b61be..efd73d94c2fc 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -89,7 +89,22 @@ * Include the PTE bits definitions */ #include -#include + +#define _PAGE_HWWRITE 0 +#define _PAGE_SAO 0 +#define _PAGE_RO 0 +#define _PAGE_NA 0 +#define _PAGE_HUGE 0 + +#define PTE_RPN_MASK (~((1UL << PTE_RPN_SHIFT) - 1)) + +/* + * _PAGE_CHG_MASK masks of bits that are to be preserved across + * pgprot changes. + */ +#define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPECIAL) + +#define H_PAGE_4K_PFN 0 #ifndef __ASSEMBLY__ /* pte_clear moved to later in this file */ diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 8de3b7eb88b0..eeb4f891f362 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -8,6 +8,34 @@ #include #endif +/* Permission masks used for kernel mappings */ +#define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW) +#define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | _PAGE_NO_CACHE) +#define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ + _PAGE_NO_CACHE | _PAGE_GUARDED) +#define PAGE_KERNEL_X __pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX) +#define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO) +#define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX) + +/* + * Protection used for kernel text. We want the debuggers to be able to + * set breakpoints anywhere, so don't write protect the kernel text + * on platforms where such control is possible. + */ +#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) ||\ + defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE) +#define PAGE_KERNEL_TEXT PAGE_KERNEL_X +#else +#define PAGE_KERNEL_TEXT PAGE_KERNEL_ROX +#endif + +/* Make modules code happy. 
We don't set RO yet */ +#define PAGE_KERNEL_EXEC PAGE_KERNEL_X + +/* Advertise special mapping type for AGP */ +#define PAGE_AGP (PAGE_KERNEL_NC) +#define HAVE_PAGE_AGP + #ifndef __ASSEMBLY__ /* Generic accessors to PTE bits */ diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index cce60b3ba7d4..4d594039bca5 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h @@ -14,9 +14,6 @@ #ifndef _PAGE_WRITETHRU #define _PAGE_WRITETHRU 0 #endif -#ifndef _PAGE_SAO -#define _PAGE_SAO 0 -#endif /* _PAGE_RO and _PAGE_RW shall not be defined at the same time */ #ifndef _PAGE_RO #define _PAGE_RO 0 @@ -61,35 +58,3 @@ */ #define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_DIRTY | \ _PAGE_ACCESSED | _PAGE_SPECIAL) - -/* Permission masks used for kernel mappings */ -#define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW) -#define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ - _PAGE_NO_CACHE) -#define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ - _PAGE_NO_CACHE | _PAGE_GUARDED) -#define PAGE_KERNEL_X __pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX) -#define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO) -#define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX) - -/* Protection used for kernel text. We want the debuggers to be able to - * set breakpoints anywhere, so don't write protect the kernel text - * on platforms where such control is possible. - */ -#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) ||\ - defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE) -#define PAGE_KERNEL_TEXT PAGE_KERNEL_X -#else -#define PAGE_KERNEL_TEXT PAGE_KERNEL_ROX -#endif - -/* Make modules code happy. We don't set RO yet */ -#define PAGE_KERNEL_EXEC PAGE_KERNEL_X - -/* Advertise special mapping type for AGP */ -#define PAGE_AGP (PAGE_KERNEL_NC) -#define HAVE_PAGE_AGP - -#ifndef H_PAGE_4K_PFN -#define H_PAGE_4K_PFN 0 -#endif From a0da4bc166f21d88400ec91e01b815054561804e Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 9 Oct 2018 13:52:12 +0000 Subject: [PATCH 149/221] powerpc/mm: Allow platforms to redefine some helpers The 40xx defines _PAGE_HWWRITE while others don't. The 8xx defines _PAGE_RO instead of _PAGE_RW. The 8xx defines _PAGE_PRIVILEGED instead of _PAGE_USER. The 8xx defines _PAGE_HUGE and _PAGE_NA while others don't. Lets those platforms redefine pte_write(), pte_wrprotect() and pte_mkwrite() and get _PAGE_RO and _PAGE_HWWRITE off the common helpers. Lets the 8xx redefine pte_user(), pte_mkprivileged() and pte_mkuser() and get rid of _PAGE_PRIVILEGED and _PAGE_USER default values. Lets the 8xx redefine pte_mkhuge() and get rid of _PAGE_HUGE default value. 
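The override idiom this patch relies on is easy to miss in the diff: a platform header defines its own helper and then defines a macro with the same name, and the generic nohash header only supplies its fallback when that macro is absent. Below is a minimal, self-contained sketch of that idiom; the flag values and the PLATFORM_USES_RO_BIT switch are invented purely for illustration and are not the real kernel definitions.

#include <stdio.h>

typedef unsigned long pte_t;

#define _PAGE_RW 0x001UL		/* hypothetical "write allowed" bit */
#define _PAGE_RO 0x600UL		/* hypothetical 8xx-style "read only" field */

/* platform header (an 8xx-like platform that only has an RO field) */
#ifdef PLATFORM_USES_RO_BIT
static inline int pte_write(pte_t pte)
{
	return !(pte & _PAGE_RO);	/* writable when the RO field is clear */
}
#define pte_write pte_write		/* tell the generic header not to redefine it */
#endif

/* generic header: fallback used only when no platform override exists */
#ifndef pte_write
static inline int pte_write(pte_t pte)
{
	return (pte & _PAGE_RW) != 0;	/* default: an explicit RW bit */
}
#endif

int main(void)
{
	printf("writable: %d\n", pte_write(_PAGE_RW));
	return 0;
}

The same shape covers pte_wrprotect(), pte_mkwrite(), pte_user() and friends: 8xx and 40x supply their own versions, every other platform keeps the common one, and pte-common.h no longer has to invent zero-valued _PAGE_RO/_PAGE_HWWRITE defaults for platforms that never had those bits.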
Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/32/pgtable.h | 16 +++--- arch/powerpc/include/asm/nohash/32/pte-40x.h | 16 ++++++ arch/powerpc/include/asm/nohash/32/pte-8xx.h | 51 ++++++++++++++++++++ arch/powerpc/include/asm/nohash/64/pgtable.h | 4 -- arch/powerpc/include/asm/nohash/pgtable.h | 24 ++++++--- arch/powerpc/include/asm/pte-common.h | 24 --------- 6 files changed, 91 insertions(+), 44 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index a4156da4a7a4..ce9270a0ea42 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -136,14 +136,12 @@ extern int icache_44x_need_flush; #define pte_clear(mm, addr, ptep) \ do { pte_update(ptep, ~0, 0); } while (0) +#ifndef pte_mkwrite static inline pte_t pte_mkwrite(pte_t pte) { - pte_basic_t ptev; - - ptev = pte_val(pte) & ~_PAGE_RO; - ptev |= _PAGE_RW; - return __pte(ptev); + return __pte(pte_val(pte) | _PAGE_RW); } +#endif static inline pte_t pte_mkdirty(pte_t pte) { @@ -155,14 +153,12 @@ static inline pte_t pte_mkyoung(pte_t pte) return __pte(pte_val(pte) | _PAGE_ACCESSED); } +#ifndef pte_wrprotect static inline pte_t pte_wrprotect(pte_t pte) { - pte_basic_t ptev; - - ptev = pte_val(pte) & ~(_PAGE_RW | _PAGE_HWWRITE); - ptev |= _PAGE_RO; - return __pte(ptev); + return __pte(pte_val(pte) & ~_PAGE_RW); } +#endif static inline pte_t pte_mkexec(pte_t pte) { diff --git a/arch/powerpc/include/asm/nohash/32/pte-40x.h b/arch/powerpc/include/asm/nohash/32/pte-40x.h index 2b48bc289a4d..ab043b3e9b99 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-40x.h +++ b/arch/powerpc/include/asm/nohash/32/pte-40x.h @@ -87,5 +87,21 @@ #define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER) #define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) +#ifndef __ASSEMBLY__ +static inline pte_t pte_wrprotect(pte_t pte) +{ + return __pte(pte_val(pte) & ~(_PAGE_RW | _PAGE_HWWRITE)); +} + +#define pte_wrprotect pte_wrprotect + +static inline pte_t pte_mkclean(pte_t pte) +{ + return __pte(pte_val(pte) & ~(_PAGE_DIRTY | _PAGE_HWWRITE)); +} + +#define pte_mkclean pte_mkclean +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_NOHASH_32_PTE_40x_H */ diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h index d06fc45bd9ac..b899c3c877ac 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h @@ -87,5 +87,56 @@ #define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_RO) #define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_RO | _PAGE_EXEC) +#ifndef __ASSEMBLY__ +static inline pte_t pte_wrprotect(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_RO); +} + +#define pte_wrprotect pte_wrprotect + +static inline int pte_write(pte_t pte) +{ + return !(pte_val(pte) & _PAGE_RO); +} + +#define pte_write pte_write + +static inline pte_t pte_mkwrite(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_RO); +} + +#define pte_mkwrite pte_mkwrite + +static inline bool pte_user(pte_t pte) +{ + return !(pte_val(pte) & _PAGE_PRIVILEGED); +} + +#define pte_user pte_user + +static inline pte_t pte_mkprivileged(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_PRIVILEGED); +} + +#define pte_mkprivileged pte_mkprivileged + +static inline pte_t pte_mkuser(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_PRIVILEGED); +} + +#define pte_mkuser pte_mkuser + +static inline pte_t 
pte_mkhuge(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_HUGE); +} + +#define pte_mkhuge pte_mkhuge +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_NOHASH_32_PTE_8xx_H */ diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index efd73d94c2fc..dc6bb9da3f23 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -90,11 +90,7 @@ */ #include -#define _PAGE_HWWRITE 0 #define _PAGE_SAO 0 -#define _PAGE_RO 0 -#define _PAGE_NA 0 -#define _PAGE_HUGE 0 #define PTE_RPN_MASK (~((1UL << PTE_RPN_SHIFT) - 1)) diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index eeb4f891f362..04e9f0922ad4 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -39,10 +39,12 @@ #ifndef __ASSEMBLY__ /* Generic accessors to PTE bits */ +#ifndef pte_write static inline int pte_write(pte_t pte) { - return (pte_val(pte) & (_PAGE_RW | _PAGE_RO)) != _PAGE_RO; + return pte_val(pte) & _PAGE_RW; } +#endif static inline int pte_read(pte_t pte) { return 1; } static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } static inline int pte_special(pte_t pte) { return pte_val(pte) & _PAGE_SPECIAL; } @@ -84,10 +86,12 @@ static inline bool pte_hw_valid(pte_t pte) * and PTE_64BIT, PAGE_KERNEL_X contains _PAGE_BAP_SR which is also in * _PAGE_USER. Need to explicitly match _PAGE_BAP_UR bit in that case too. */ +#ifndef pte_user static inline bool pte_user(pte_t pte) { - return (pte_val(pte) & (_PAGE_USER | _PAGE_PRIVILEGED)) == _PAGE_USER; + return (pte_val(pte) & _PAGE_USER) == _PAGE_USER; } +#endif /* * We only find page table entry in the last level @@ -127,10 +131,12 @@ static inline pte_t pte_exprotect(pte_t pte) return __pte(pte_val(pte) & ~_PAGE_EXEC); } +#ifndef pte_mkclean static inline pte_t pte_mkclean(pte_t pte) { - return __pte(pte_val(pte) & ~(_PAGE_DIRTY | _PAGE_HWWRITE)); + return __pte(pte_val(pte) & ~_PAGE_DIRTY); } +#endif static inline pte_t pte_mkold(pte_t pte) { @@ -147,20 +153,26 @@ static inline pte_t pte_mkspecial(pte_t pte) return __pte(pte_val(pte) | _PAGE_SPECIAL); } +#ifndef pte_mkhuge static inline pte_t pte_mkhuge(pte_t pte) { - return __pte(pte_val(pte) | _PAGE_HUGE); + return __pte(pte_val(pte)); } +#endif +#ifndef pte_mkprivileged static inline pte_t pte_mkprivileged(pte_t pte) { - return __pte((pte_val(pte) & ~_PAGE_USER) | _PAGE_PRIVILEGED); + return __pte(pte_val(pte) & ~_PAGE_USER); } +#endif +#ifndef pte_mkuser static inline pte_t pte_mkuser(pte_t pte) { - return __pte((pte_val(pte) & ~_PAGE_PRIVILEGED) | _PAGE_USER); + return __pte(pte_val(pte) | _PAGE_USER); } +#endif static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index 4d594039bca5..1a2102f8b1e7 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h @@ -5,36 +5,12 @@ * Some bits are only used on some cpu families... 
Make sure that all * the undefined gets a sensible default */ -#ifndef _PAGE_HWWRITE -#define _PAGE_HWWRITE 0 -#endif #ifndef _PAGE_COHERENT #define _PAGE_COHERENT 0 #endif #ifndef _PAGE_WRITETHRU #define _PAGE_WRITETHRU 0 #endif -/* _PAGE_RO and _PAGE_RW shall not be defined at the same time */ -#ifndef _PAGE_RO -#define _PAGE_RO 0 -#else -#define _PAGE_RW 0 -#endif - -/* At least one of _PAGE_PRIVILEGED or _PAGE_USER must be defined */ -#ifndef _PAGE_PRIVILEGED -#define _PAGE_PRIVILEGED 0 -#else -#ifndef _PAGE_USER -#define _PAGE_USER 0 -#endif -#endif -#ifndef _PAGE_NA -#define _PAGE_NA 0 -#endif -#ifndef _PAGE_HUGE -#define _PAGE_HUGE 0 -#endif /* Location of the PFN in the PTE. Most 32-bit platforms use the same * as _PAGE_SHIFT here (ie, naturally aligned). From cbcbbf4afd6d38272bf0119d890caa5ebba6fe4c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 9 Oct 2018 13:52:14 +0000 Subject: [PATCH 150/221] powerpc/mm: Define platform default caches related flags Cache related flags like _PAGE_COHERENT and _PAGE_WRITETHRU are defined on most platforms. The platforms not defining them don't define any alternative. So we can give them a NUL value directly for those platforms directly. Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/32/pte-40x.h | 3 +++ arch/powerpc/include/asm/nohash/32/pte-8xx.h | 4 ++++ arch/powerpc/include/asm/pte-common.h | 11 ----------- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/pte-40x.h b/arch/powerpc/include/asm/nohash/32/pte-40x.h index ab043b3e9b99..7a8b3c94592f 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-40x.h +++ b/arch/powerpc/include/asm/nohash/32/pte-40x.h @@ -53,6 +53,9 @@ /* No page size encoding in the linux PTE */ #define _PAGE_PSIZE 0 +/* cache related flags non existing on 40x */ +#define _PAGE_COHERENT 0 + #define _PAGE_KERNEL_RO 0 #define _PAGE_KERNEL_ROX _PAGE_EXEC #define _PAGE_KERNEL_RW (_PAGE_DIRTY | _PAGE_RW | _PAGE_HWWRITE) diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h index b899c3c877ac..2b4669b3badb 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h @@ -46,6 +46,10 @@ #define _PAGE_NA 0x0200 /* Supervisor NA, User no access */ #define _PAGE_RO 0x0600 /* Supervisor RO, User no access */ +/* cache related flags non existing on 8xx */ +#define _PAGE_COHERENT 0 +#define _PAGE_WRITETHRU 0 + #define _PAGE_KERNEL_RO (_PAGE_PRIVILEGED | _PAGE_RO) #define _PAGE_KERNEL_ROX (_PAGE_PRIVILEGED | _PAGE_RO | _PAGE_EXEC) #define _PAGE_KERNEL_RW (_PAGE_PRIVILEGED | _PAGE_DIRTY) diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index 1a2102f8b1e7..ff01368a175a 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h @@ -1,17 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Included from asm/pgtable-*.h only ! */ -/* - * Some bits are only used on some cpu families... Make sure that all - * the undefined gets a sensible default - */ -#ifndef _PAGE_COHERENT -#define _PAGE_COHERENT 0 -#endif -#ifndef _PAGE_WRITETHRU -#define _PAGE_WRITETHRU 0 -#endif - /* Location of the PFN in the PTE. Most 32-bit platforms use the same * as _PAGE_SHIFT here (ie, naturally aligned). 
* Platform who don't just pre-define the value so we don't override it here From 5662315384fcb3d81125562124cf4e1743aa2c3a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 9 Oct 2018 13:52:16 +0000 Subject: [PATCH 151/221] powerpc/mm: Get rid of pte-common.h Do not include pte-common.h in nohash/32/pgtable.h As that was the last includer, get rid of pte-common.h Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/32/pgtable.h | 26 ++++++++++++++++++-- arch/powerpc/include/asm/pte-common.h | 25 ------------------- 2 files changed, 24 insertions(+), 27 deletions(-) delete mode 100644 arch/powerpc/include/asm/pte-common.h diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index ce9270a0ea42..f7b129a83054 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -128,8 +128,30 @@ extern int icache_44x_need_flush; #include #endif -/* And here we include common definitions */ -#include +/* + * Location of the PFN in the PTE. Most 32-bit platforms use the same + * as _PAGE_SHIFT here (ie, naturally aligned). + * Platform who don't just pre-define the value so we don't override it here. + */ +#ifndef PTE_RPN_SHIFT +#define PTE_RPN_SHIFT (PAGE_SHIFT) +#endif + +/* + * The mask covered by the RPN must be a ULL on 32-bit platforms with + * 64-bit PTEs. + */ +#if defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT) +#define PTE_RPN_MASK (~((1ULL << PTE_RPN_SHIFT) - 1)) +#else +#define PTE_RPN_MASK (~((1UL << PTE_RPN_SHIFT) - 1)) +#endif + +/* + * _PAGE_CHG_MASK masks of bits that are to be preserved across + * pgprot changes. + */ +#define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPECIAL) #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h deleted file mode 100644 index ff01368a175a..000000000000 --- a/arch/powerpc/include/asm/pte-common.h +++ /dev/null @@ -1,25 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Included from asm/pgtable-*.h only ! */ - -/* Location of the PFN in the PTE. Most 32-bit platforms use the same - * as _PAGE_SHIFT here (ie, naturally aligned). - * Platform who don't just pre-define the value so we don't override it here - */ -#ifndef PTE_RPN_SHIFT -#define PTE_RPN_SHIFT (PAGE_SHIFT) -#endif - -/* The mask covered by the RPN must be a ULL on 32-bit platforms with - * 64-bit PTEs - */ -#if defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT) -#define PTE_RPN_MASK (~((1ULL< Date: Tue, 9 Oct 2018 13:52:18 +0000 Subject: [PATCH 152/221] powerpc/8xx: change name of a few page flags to avoid confusion _PAGE_PRIVILEGED corresponds to the SH bit which doesn't protect against user access but only disables ASID verification on kernel accesses. User access is controlled with _PMD_USER flag. Name it _PAGE_SH instead of _PAGE_PRIVILEGED _PAGE_HUGE corresponds to the SPS bit which doesn't really tells that's it is a huge page but only that it is not a 4k page. 
Name it _PAGE_SPS instead of _PAGE_HUGE Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/32/pte-8xx.h | 28 ++++++++++---------- arch/powerpc/kernel/head_8xx.S | 6 ++--- arch/powerpc/mm/8xx_mmu.c | 2 +- arch/powerpc/mm/dump_linuxpagetables-8xx.c | 2 +- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h index 2b4669b3badb..1c57efac089d 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h @@ -29,10 +29,10 @@ */ /* Definitions for 8xx embedded chips. */ -#define _PAGE_PRESENT 0x0001 /* Page is valid */ -#define _PAGE_NO_CACHE 0x0002 /* I: cache inhibit */ -#define _PAGE_PRIVILEGED 0x0004 /* No ASID (context) compare */ -#define _PAGE_HUGE 0x0008 /* SPS: Small Page Size (1 if 16k, 512k or 8M)*/ +#define _PAGE_PRESENT 0x0001 /* V: Page is valid */ +#define _PAGE_NO_CACHE 0x0002 /* CI: cache inhibit */ +#define _PAGE_SH 0x0004 /* SH: No ASID (context) compare */ +#define _PAGE_SPS 0x0008 /* SPS: Small Page Size (1 if 16k, 512k or 8M)*/ #define _PAGE_DIRTY 0x0100 /* C: page changed */ /* These 4 software bits must be masked out when the L2 entry is loaded @@ -50,15 +50,15 @@ #define _PAGE_COHERENT 0 #define _PAGE_WRITETHRU 0 -#define _PAGE_KERNEL_RO (_PAGE_PRIVILEGED | _PAGE_RO) -#define _PAGE_KERNEL_ROX (_PAGE_PRIVILEGED | _PAGE_RO | _PAGE_EXEC) -#define _PAGE_KERNEL_RW (_PAGE_PRIVILEGED | _PAGE_DIRTY) -#define _PAGE_KERNEL_RWX (_PAGE_PRIVILEGED | _PAGE_DIRTY | _PAGE_EXEC) +#define _PAGE_KERNEL_RO (_PAGE_SH | _PAGE_RO) +#define _PAGE_KERNEL_ROX (_PAGE_SH | _PAGE_RO | _PAGE_EXEC) +#define _PAGE_KERNEL_RW (_PAGE_SH | _PAGE_DIRTY) +#define _PAGE_KERNEL_RWX (_PAGE_SH | _PAGE_DIRTY | _PAGE_EXEC) /* Mask of bits returned by pte_pgprot() */ #define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_NO_CACHE | \ _PAGE_ACCESSED | _PAGE_RO | _PAGE_NA | \ - _PAGE_PRIVILEGED | _PAGE_DIRTY | _PAGE_EXEC) + _PAGE_SH | _PAGE_DIRTY | _PAGE_EXEC) #define _PMD_PRESENT 0x0001 #define _PMD_PRESENT_MASK _PMD_PRESENT @@ -74,7 +74,7 @@ #define PTE_ATOMIC_UPDATES 1 #ifdef CONFIG_PPC_16K_PAGES -#define _PAGE_PSIZE _PAGE_HUGE +#define _PAGE_PSIZE _PAGE_SPS #else #define _PAGE_PSIZE 0 #endif @@ -115,28 +115,28 @@ static inline pte_t pte_mkwrite(pte_t pte) static inline bool pte_user(pte_t pte) { - return !(pte_val(pte) & _PAGE_PRIVILEGED); + return !(pte_val(pte) & _PAGE_SH); } #define pte_user pte_user static inline pte_t pte_mkprivileged(pte_t pte) { - return __pte(pte_val(pte) | _PAGE_PRIVILEGED); + return __pte(pte_val(pte) | _PAGE_SH); } #define pte_mkprivileged pte_mkprivileged static inline pte_t pte_mkuser(pte_t pte) { - return __pte(pte_val(pte) & ~_PAGE_PRIVILEGED); + return __pte(pte_val(pte) & ~_PAGE_SH); } #define pte_mkuser pte_mkuser static inline pte_t pte_mkhuge(pte_t pte) { - return __pte(pte_val(pte) | _PAGE_HUGE); + return __pte(pte_val(pte) | _PAGE_SPS); } #define pte_mkhuge pte_mkhuge diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 6582f824d620..134a573a9f2d 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -642,7 +642,7 @@ DTLBMissIMMR: mtspr SPRN_MD_TWC, r10 mfspr r10, SPRN_IMMR /* Get current IMMR */ rlwinm r10, r10, 0, 0xfff80000 /* Get 512 kbytes boundary */ - ori r10, r10, 0xf0 | MD_SPS16K | _PAGE_PRIVILEGED | _PAGE_DIRTY | \ + ori r10, r10, 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY | \ _PAGE_PRESENT | 
_PAGE_NO_CACHE mtspr SPRN_MD_RPN, r10 /* Update TLB entry */ @@ -660,7 +660,7 @@ DTLBMissLinear: li r11, MD_PS8MEG | MD_SVALID | M_APG2 mtspr SPRN_MD_TWC, r11 rlwinm r10, r10, 0, 0x0f800000 /* 8xx supports max 256Mb RAM */ - ori r10, r10, 0xf0 | MD_SPS16K | _PAGE_PRIVILEGED | _PAGE_DIRTY | \ + ori r10, r10, 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY | \ _PAGE_PRESENT mtspr SPRN_MD_RPN, r10 /* Update TLB entry */ @@ -679,7 +679,7 @@ ITLBMissLinear: li r11, MI_PS8MEG | MI_SVALID | M_APG2 mtspr SPRN_MI_TWC, r11 rlwinm r10, r10, 0, 0x0f800000 /* 8xx supports max 256Mb RAM */ - ori r10, r10, 0xf0 | MI_SPS16K | _PAGE_PRIVILEGED | _PAGE_DIRTY | \ + ori r10, r10, 0xf0 | MI_SPS16K | _PAGE_SH | _PAGE_DIRTY | \ _PAGE_PRESENT mtspr SPRN_MI_RPN, r10 /* Update TLB entry */ diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c index 9137361d687d..36484a2ef915 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/8xx_mmu.c @@ -67,7 +67,7 @@ void __init MMU_init_hw(void) /* PIN up to the 3 first 8Mb after IMMR in DTLB table */ #ifdef CONFIG_PIN_TLB_DATA unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000; - unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_PRIVILEGED | _PAGE_DIRTY; + unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY; #ifdef CONFIG_PIN_TLB_IMMR int i = 29; #else diff --git a/arch/powerpc/mm/dump_linuxpagetables-8xx.c b/arch/powerpc/mm/dump_linuxpagetables-8xx.c index 33f52a97975b..ab9e3f24db2f 100644 --- a/arch/powerpc/mm/dump_linuxpagetables-8xx.c +++ b/arch/powerpc/mm/dump_linuxpagetables-8xx.c @@ -11,7 +11,7 @@ static const struct flag_info flag_array[] = { { - .mask = _PAGE_PRIVILEGED, + .mask = _PAGE_SH, .val = 0, .set = "user", .clear = " ", From 1b2443a547f91cf7f89eed6bddb07394dafae24e Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 9 Oct 2018 13:52:20 +0000 Subject: [PATCH 153/221] powerpc/book3s64: Avoid multiple endian conversion in pte helpers In the same spirit as already done in pte query helpers, this patch changes pte setting helpers to perform endian conversions on the constants rather than on the pte value. In the meantime, it changes pte_access_permitted() to use pte helpers for the same reason. 
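The win is that cpu_to_be64() applied to a compile-time constant folds away, whereas pte_val()/__pte() on little-endian book3s/64 byte-swap the live PTE value, so the old helpers paid for two run-time swaps per call. A rough standalone illustration follows, using __builtin_bswap64 as a stand-in for the kernel helper and an invented flag value, assuming a little-endian host.

#include <stdint.h>

#define FLAG 0x0000000000000100ULL		/* hypothetical PTE flag, in CPU byte order */

static inline uint64_t local_cpu_to_be64(uint64_t x)
{
	return __builtin_bswap64(x);		/* stand-in for cpu_to_be64(); LE host assumed */
}

/* old shape: swap the whole PTE to CPU order, set the bit, swap it back */
static inline uint64_t set_flag_via_value(uint64_t pte_be)
{
	uint64_t v = local_cpu_to_be64(pte_be);	/* run-time swap #1 */
	v |= FLAG;
	return local_cpu_to_be64(v);		/* run-time swap #2 */
}

/* new shape: swap the constant instead; the compiler folds it to a literal */
static inline uint64_t set_flag_via_constant(uint64_t pte_be)
{
	return pte_be | local_cpu_to_be64(FLAG);	/* no run-time swap left */
}

int main(void)
{
	uint64_t pte = local_cpu_to_be64(0xabcd0000UL);
	return set_flag_via_value(pte) == set_flag_via_constant(pte) ? 0 : 1;
}

Both forms produce the same raw PTE; only the generated code differs, which is why the patch can convert the helpers one by one without changing behaviour.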
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/pgtable.h | 71 +++++++++----------- 1 file changed, 32 insertions(+), 39 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 6fbb45d11bf0..9db2b8eba61d 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -506,12 +506,12 @@ static inline bool pte_soft_dirty(pte_t pte) static inline pte_t pte_mksoft_dirty(pte_t pte) { - return __pte(pte_val(pte) | _PAGE_SOFT_DIRTY); + return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SOFT_DIRTY)); } static inline pte_t pte_clear_soft_dirty(pte_t pte) { - return __pte(pte_val(pte) & ~_PAGE_SOFT_DIRTY); + return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_SOFT_DIRTY)); } #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ @@ -532,7 +532,7 @@ static inline pte_t pte_mk_savedwrite(pte_t pte) */ VM_BUG_ON((pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_RWX | _PAGE_PRIVILEGED)) != cpu_to_be64(_PAGE_PRESENT | _PAGE_PRIVILEGED)); - return __pte(pte_val(pte) & ~_PAGE_PRIVILEGED); + return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_PRIVILEGED)); } #define pte_clear_savedwrite pte_clear_savedwrite @@ -542,14 +542,14 @@ static inline pte_t pte_clear_savedwrite(pte_t pte) * Used by KSM subsystem to make a protnone pte readonly. */ VM_BUG_ON(!pte_protnone(pte)); - return __pte(pte_val(pte) | _PAGE_PRIVILEGED); + return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PRIVILEGED)); } #else #define pte_clear_savedwrite pte_clear_savedwrite static inline pte_t pte_clear_savedwrite(pte_t pte) { VM_WARN_ON(1); - return __pte(pte_val(pte) & ~_PAGE_WRITE); + return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_WRITE)); } #endif /* CONFIG_NUMA_BALANCING */ @@ -578,25 +578,22 @@ static inline bool arch_pte_access_permitted(u64 pte, bool write, bool execute) } #endif /* CONFIG_PPC_MEM_KEYS */ +static inline bool pte_user(pte_t pte) +{ + return !(pte_raw(pte) & cpu_to_be64(_PAGE_PRIVILEGED)); +} + #define pte_access_permitted pte_access_permitted static inline bool pte_access_permitted(pte_t pte, bool write) { - unsigned long pteval = pte_val(pte); - /* Also check for pte_user */ - unsigned long clear_pte_bits = _PAGE_PRIVILEGED; /* * _PAGE_READ is needed for any access and will be * cleared for PROT_NONE */ - unsigned long need_pte_bits = _PAGE_PRESENT | _PAGE_READ; - - if (write) - need_pte_bits |= _PAGE_WRITE; - - if ((pteval & need_pte_bits) != need_pte_bits) + if (!pte_present(pte) || !pte_user(pte) || !pte_read(pte)) return false; - if ((pteval & clear_pte_bits) == clear_pte_bits) + if (write && !pte_write(pte)) return false; return arch_pte_access_permitted(pte_val(pte), write, 0); @@ -625,32 +622,32 @@ static inline pte_t pte_wrprotect(pte_t pte) { if (unlikely(pte_savedwrite(pte))) return pte_clear_savedwrite(pte); - return __pte(pte_val(pte) & ~_PAGE_WRITE); + return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_WRITE)); } static inline pte_t pte_exprotect(pte_t pte) { - return __pte(pte_val(pte) & ~_PAGE_EXEC); + return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_EXEC)); } static inline pte_t pte_mkclean(pte_t pte) { - return __pte(pte_val(pte) & ~_PAGE_DIRTY); + return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_DIRTY)); } static inline pte_t pte_mkold(pte_t pte) { - return __pte(pte_val(pte) & ~_PAGE_ACCESSED); + return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_ACCESSED)); } static inline pte_t pte_mkexec(pte_t pte) { - return __pte(pte_val(pte) | 
_PAGE_EXEC); + return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_EXEC)); } static inline pte_t pte_mkpte(pte_t pte) { - return __pte(pte_val(pte) | _PAGE_PTE); + return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PTE)); } static inline pte_t pte_mkwrite(pte_t pte) @@ -658,22 +655,22 @@ static inline pte_t pte_mkwrite(pte_t pte) /* * write implies read, hence set both */ - return __pte(pte_val(pte) | _PAGE_RW); + return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_RW)); } static inline pte_t pte_mkdirty(pte_t pte) { - return __pte(pte_val(pte) | _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_DIRTY | _PAGE_SOFT_DIRTY)); } static inline pte_t pte_mkyoung(pte_t pte) { - return __pte(pte_val(pte) | _PAGE_ACCESSED); + return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_ACCESSED)); } static inline pte_t pte_mkspecial(pte_t pte) { - return __pte(pte_val(pte) | _PAGE_SPECIAL); + return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SPECIAL)); } static inline pte_t pte_mkhuge(pte_t pte) @@ -683,17 +680,17 @@ static inline pte_t pte_mkhuge(pte_t pte) static inline pte_t pte_mkdevmap(pte_t pte) { - return __pte(pte_val(pte) | _PAGE_SPECIAL|_PAGE_DEVMAP); + return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SPECIAL | _PAGE_DEVMAP)); } static inline pte_t pte_mkprivileged(pte_t pte) { - return __pte(pte_val(pte) | _PAGE_PRIVILEGED); + return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PRIVILEGED)); } static inline pte_t pte_mkuser(pte_t pte) { - return __pte(pte_val(pte) & ~_PAGE_PRIVILEGED); + return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_PRIVILEGED)); } /* @@ -712,12 +709,8 @@ static inline int pte_devmap(pte_t pte) static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { /* FIXME!! check whether this need to be a conditional */ - return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); -} - -static inline bool pte_user(pte_t pte) -{ - return !(pte_raw(pte) & cpu_to_be64(_PAGE_PRIVILEGED)); + return __pte_raw((pte_raw(pte) & cpu_to_be64(_PAGE_CHG_MASK)) | + cpu_to_be64(pgprot_val(newprot))); } /* Encode and de-code a swap entry */ @@ -760,7 +753,7 @@ static inline bool pte_user(pte_t pte) #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { - return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY); + return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SWP_SOFT_DIRTY)); } static inline bool pte_swp_soft_dirty(pte_t pte) @@ -770,7 +763,7 @@ static inline bool pte_swp_soft_dirty(pte_t pte) static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { - return __pte(pte_val(pte) & ~_PAGE_SWP_SOFT_DIRTY); + return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_SWP_SOFT_DIRTY)); } #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ @@ -859,10 +852,10 @@ static inline pgprot_t pgprot_writecombine(pgprot_t prot) */ static inline bool pte_ci(pte_t pte) { - unsigned long pte_v = pte_val(pte); + __be64 pte_v = pte_raw(pte); - if (((pte_v & _PAGE_CACHE_CTL) == _PAGE_TOLERANT) || - ((pte_v & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT)) + if (((pte_v & cpu_to_be64(_PAGE_CACHE_CTL)) == cpu_to_be64(_PAGE_TOLERANT)) || + ((pte_v & cpu_to_be64(_PAGE_CACHE_CTL)) == cpu_to_be64(_PAGE_NON_IDEMPOTENT))) return true; return false; } From 3bad719b495467a7e7023c5b26f0f5a523a4a193 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 31 May 2018 14:33:39 +1000 Subject: [PATCH 154/221] powerpc/prom_init: Make of_workarounds static It's not used anywhere else. 
Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/prom_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 1af453a61991..f5b682094bde 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -87,7 +87,7 @@ #define OF_WORKAROUNDS 0 #else #define OF_WORKAROUNDS of_workarounds -int of_workarounds; +static int of_workarounds; #endif #define OF_WA_CLAIM 1 /* do phys/virt claim separately, then map */ From 30c69ca0486fd73f2e5d40096ee68bfaed469b76 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 31 May 2018 14:33:40 +1000 Subject: [PATCH 155/221] powerpc/prom_init: Make "fake_elf" const It is never modified Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/prom_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index f5b682094bde..0096a53d03da 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -922,7 +922,7 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = { /* Old method - ELF header with PT_NOTE sections only works on BE */ #ifdef __BIG_ENDIAN__ -static struct fake_elf { +static const struct fake_elf { Elf32_Ehdr elfhdr; Elf32_Phdr phdr[2]; struct chrpnote { From 7f995d3ba6f2bfbe274f5a012f5b549f881e78fe Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 31 May 2018 14:33:41 +1000 Subject: [PATCH 156/221] powerpc/prom_init: Make "default_colors" const It's never modified. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/prom_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 0096a53d03da..ad9320347a28 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -2205,7 +2205,7 @@ static void __init prom_check_displays(void) ihandle ih; int i; - static unsigned char default_colors[] = { + static const unsigned char default_colors[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0xaa, 0x00, 0xaa, 0x00, From 002af9391bfbe84f8e491bb10bd9c6001a6becee Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 12 Oct 2018 23:13:17 +1100 Subject: [PATCH 157/221] powerpc: Split user/kernel definitions of struct pt_regs We use a shared definition for struct pt_regs in uapi/asm/ptrace.h. That means the layout of the structure is ABI, ie. we can't change it. That would be fine if it was only used to describe the user-visible register state of a process, but it's also the struct we use in the kernel to describe the registers saved in an interrupt frame. We'd like more flexibility in the content (and possibly layout) of the kernel version of the struct, but currently that's not possible. So split the definition into a user-visible definition which remains unchanged, and a kernel internal one. At the moment they're still identical, and we check that at build time. That's because we have code (in ptrace etc.) that assumes that they are the same. We will fix that code in future patches, and then we can break the strict symmetry between the two structs. 
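The mechanics worth spelling out: the ABI-stable user_pt_regs is embedded in the kernel pt_regs through an anonymous union, so regs->nip and regs->user_regs.nip name the same storage, and pt_regs_check() pins the two layouts together until the ptrace code stops assuming they match. A trimmed, compile-checkable sketch of that shape, with only a few fields shown and _Static_assert standing in for BUILD_BUG_ON():

#include <stddef.h>

struct user_pt_regs {			/* exported ABI layout, must not change */
	unsigned long gpr[32];
	unsigned long nip;
	unsigned long msr;
};

struct pt_regs {			/* kernel-internal view */
	union {
		struct user_pt_regs user_regs;
		struct {		/* historical field names keep working */
			unsigned long gpr[32];
			unsigned long nip;
			unsigned long msr;
		};
	};
	/* kernel-only state can be appended here without touching the ABI */
};

/* the moral equivalent of the BUILD_BUG_ON() checks in pt_regs_check() */
_Static_assert(offsetof(struct pt_regs, nip) ==
	       offsetof(struct user_pt_regs, nip), "nip moved");
_Static_assert(sizeof(struct user_pt_regs) <= sizeof(struct pt_regs),
	       "user view must fit inside the kernel view");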
Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/ptrace.h | 27 ++++++++++++++++++ arch/powerpc/include/uapi/asm/ptrace.h | 7 ++++- arch/powerpc/kernel/ptrace.c | 39 ++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 447cbd1bee99..3dd15024db93 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -26,6 +26,33 @@ #include #include +#ifndef __ASSEMBLY__ +struct pt_regs +{ + union { + struct user_pt_regs user_regs; + struct { + unsigned long gpr[32]; + unsigned long nip; + unsigned long msr; + unsigned long orig_gpr3; + unsigned long ctr; + unsigned long link; + unsigned long xer; + unsigned long ccr; +#ifdef CONFIG_PPC64 + unsigned long softe; +#else + unsigned long mq; +#endif + unsigned long trap; + unsigned long dar; + unsigned long dsisr; + unsigned long result; + }; + }; +}; +#endif #ifdef __powerpc64__ diff --git a/arch/powerpc/include/uapi/asm/ptrace.h b/arch/powerpc/include/uapi/asm/ptrace.h index 55c7a131d2ab..f5f1ccc740fc 100644 --- a/arch/powerpc/include/uapi/asm/ptrace.h +++ b/arch/powerpc/include/uapi/asm/ptrace.h @@ -29,7 +29,12 @@ #ifndef __ASSEMBLY__ -struct pt_regs { +#ifdef __KERNEL__ +struct user_pt_regs +#else +struct pt_regs +#endif +{ unsigned long gpr[32]; unsigned long nip; unsigned long msr; diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 4e372f54088f..939d7f81bbbe 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -3335,3 +3335,42 @@ void do_syscall_trace_leave(struct pt_regs *regs) user_enter(); } + +void __init pt_regs_check(void) +{ + BUILD_BUG_ON(offsetof(struct pt_regs, gpr) != + offsetof(struct user_pt_regs, gpr)); + BUILD_BUG_ON(offsetof(struct pt_regs, nip) != + offsetof(struct user_pt_regs, nip)); + BUILD_BUG_ON(offsetof(struct pt_regs, msr) != + offsetof(struct user_pt_regs, msr)); + BUILD_BUG_ON(offsetof(struct pt_regs, msr) != + offsetof(struct user_pt_regs, msr)); + BUILD_BUG_ON(offsetof(struct pt_regs, orig_gpr3) != + offsetof(struct user_pt_regs, orig_gpr3)); + BUILD_BUG_ON(offsetof(struct pt_regs, ctr) != + offsetof(struct user_pt_regs, ctr)); + BUILD_BUG_ON(offsetof(struct pt_regs, link) != + offsetof(struct user_pt_regs, link)); + BUILD_BUG_ON(offsetof(struct pt_regs, xer) != + offsetof(struct user_pt_regs, xer)); + BUILD_BUG_ON(offsetof(struct pt_regs, ccr) != + offsetof(struct user_pt_regs, ccr)); +#ifdef __powerpc64__ + BUILD_BUG_ON(offsetof(struct pt_regs, softe) != + offsetof(struct user_pt_regs, softe)); +#else + BUILD_BUG_ON(offsetof(struct pt_regs, mq) != + offsetof(struct user_pt_regs, mq)); +#endif + BUILD_BUG_ON(offsetof(struct pt_regs, trap) != + offsetof(struct user_pt_regs, trap)); + BUILD_BUG_ON(offsetof(struct pt_regs, dar) != + offsetof(struct user_pt_regs, dar)); + BUILD_BUG_ON(offsetof(struct pt_regs, dsisr) != + offsetof(struct user_pt_regs, dsisr)); + BUILD_BUG_ON(offsetof(struct pt_regs, result) != + offsetof(struct user_pt_regs, result)); + + BUILD_BUG_ON(sizeof(struct user_pt_regs) > sizeof(struct pt_regs)); +} From 3eeacd9f4ea33546f272fcf131d6a11edbe3b4a6 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Sat, 13 Oct 2018 00:39:31 +1100 Subject: [PATCH 158/221] powerpc/ptrace: Don't use sizeof(struct pt_regs) in ptrace code Now that we've split the user & kernel versions of pt_regs we need to be more careful in the ptrace code. 
For now we've ensured the location of the fields in both structs is the same, so most of the ptrace code doesn't need updating. But there are a few places where we use sizeof(pt_regs), and these will be wrong as soon as we increase the size of the kernel structure. So flip them all to use sizeof(user_pt_regs). Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/ptrace.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 939d7f81bbbe..c7d0d0c1e34d 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -297,7 +297,7 @@ int ptrace_get_reg(struct task_struct *task, int regno, unsigned long *data) } #endif - if (regno < (sizeof(struct pt_regs) / sizeof(unsigned long))) { + if (regno < (sizeof(struct user_pt_regs) / sizeof(unsigned long))) { *data = ((unsigned long *)task->thread.regs)[regno]; return 0; } @@ -360,10 +360,10 @@ static int gpr_get(struct task_struct *target, const struct user_regset *regset, ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &target->thread.regs->orig_gpr3, offsetof(struct pt_regs, orig_gpr3), - sizeof(struct pt_regs)); + sizeof(struct user_pt_regs)); if (!ret) ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf, - sizeof(struct pt_regs), -1); + sizeof(struct user_pt_regs), -1); return ret; } @@ -853,10 +853,10 @@ static int tm_cgpr_get(struct task_struct *target, ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &target->thread.ckpt_regs.orig_gpr3, offsetof(struct pt_regs, orig_gpr3), - sizeof(struct pt_regs)); + sizeof(struct user_pt_regs)); if (!ret) ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf, - sizeof(struct pt_regs), -1); + sizeof(struct user_pt_regs), -1); return ret; } @@ -3131,7 +3131,7 @@ long arch_ptrace(struct task_struct *child, long request, case PTRACE_GETREGS: /* Get all pt_regs from the child. */ return copy_regset_to_user(child, &user_ppc_native_view, REGSET_GPR, - 0, sizeof(struct pt_regs), + 0, sizeof(struct user_pt_regs), datavp); #ifdef CONFIG_PPC64 @@ -3140,7 +3140,7 @@ long arch_ptrace(struct task_struct *child, long request, case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, &user_ppc_native_view, REGSET_GPR, - 0, sizeof(struct pt_regs), + 0, sizeof(struct user_pt_regs), datavp); case PTRACE_GETFPREGS: /* Get the child FPU state (FPR0...31 + FPSCR) */ From 4c2de74cc8696154b283f241d74ec0bb24438e22 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 13 Oct 2018 00:15:16 +1100 Subject: [PATCH 159/221] powerpc/64: Interrupts save PPR on stack rather than thread_struct PPR is the odd register out when it comes to interrupt handling: it is saved in current->thread.ppr while all the others are saved on the stack. The difficulty with this is that accessing thread.ppr can cause an SLB fault, but the conversion of the SLB fault handler to C assumed that the normal exception entry handlers would not themselves cause an SLB fault. Fix this by allocating room in the interrupt stack to save PPR.
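[Editor's note: the practical effect on C code is sketched below, simplified from the diff that follows. PPR now lives in pt_regs, which the exception entry code reaches relative to r1 (the interrupt stack frame, in a bolted segment) instead of via current->thread. ppr_get_sketch is a hypothetical stand-in for the real regset accessor changed in the patch.]

    struct pt_regs {
            /* ... gpr[32], nip, msr, ... */
    #ifdef CONFIG_PPC64
            unsigned long ppr;      /* saved SMT priority, filled on interrupt entry */
    #endif
    };

    /* consumers read the stacked value, e.g. a ptrace-style accessor */
    static int ppr_get_sketch(struct task_struct *target, u64 *out)
    {
            *out = target->thread.regs->ppr;        /* was: target->thread.ppr */
            return 0;
    }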
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/exception-64s.h | 9 ++++----- arch/powerpc/include/asm/processor.h | 6 ++---- arch/powerpc/include/asm/ptrace.h | 4 ++++ arch/powerpc/kernel/asm-offsets.c | 2 +- arch/powerpc/kernel/entry_64.S | 15 +++++---------- arch/powerpc/kernel/process.c | 2 +- arch/powerpc/kernel/ptrace.c | 4 ++-- 7 files changed, 19 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index a86feddddad0..403d73898a9a 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -236,11 +236,10 @@ * PPR save/restore macros used in exceptions_64s.S * Used for P7 or later processors */ -#define SAVE_PPR(area, ra, rb) \ +#define SAVE_PPR(area, ra) \ BEGIN_FTR_SECTION_NESTED(940) \ - ld ra,PACACURRENT(r13); \ - ld rb,area+EX_PPR(r13); /* Read PPR from paca */ \ - std rb,TASKTHREADPPR(ra); \ + ld ra,area+EX_PPR(r13); /* Read PPR from paca */ \ + std ra,_PPR(r1); \ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,940) #define RESTORE_PPR_PACA(area, ra) \ @@ -508,7 +507,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) 3: EXCEPTION_PROLOG_COMMON_1(); \ beq 4f; /* if from kernel mode */ \ ACCOUNT_CPU_USER_ENTRY(r13, r9, r10); \ - SAVE_PPR(area, r9, r10); \ + SAVE_PPR(area, r9); \ 4: EXCEPTION_PROLOG_COMMON_2(area) \ EXCEPTION_PROLOG_COMMON_3(n) \ ACCOUNT_STOLEN_TIME diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 52fadded5c1e..3fefb8a65b17 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -32,9 +32,9 @@ /* Default SMT priority is set to 3. Use 11- 13bits to save priority. */ #define PPR_PRIORITY 3 #ifdef __ASSEMBLY__ -#define INIT_PPR (PPR_PRIORITY << 50) +#define DEFAULT_PPR (PPR_PRIORITY << 50) #else -#define INIT_PPR ((u64)PPR_PRIORITY << 50) +#define DEFAULT_PPR ((u64)PPR_PRIORITY << 50) #endif /* __ASSEMBLY__ */ #endif /* CONFIG_PPC64 */ @@ -341,7 +341,6 @@ struct thread_struct { * onwards. 
*/ int dscr_inherit; - unsigned long ppr; /* used to save/restore SMT priority */ unsigned long tidr; #endif #ifdef CONFIG_PPC_BOOK3S_64 @@ -389,7 +388,6 @@ struct thread_struct { .regs = (struct pt_regs *)INIT_SP - 1, /* XXX bogus, I think */ \ .addr_limit = KERNEL_DS, \ .fpexc_mode = 0, \ - .ppr = INIT_PPR, \ .fscr = FSCR_TAR | FSCR_EBB \ } #endif diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 3dd15024db93..2ba2a1e52291 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -51,6 +51,10 @@ struct pt_regs unsigned long result; }; }; + +#ifdef CONFIG_PPC64 + unsigned long ppr; +#endif }; #endif diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 2eb4923f8468..92156c61d21c 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -89,7 +89,6 @@ int main(void) #ifdef CONFIG_PPC64 DEFINE(SIGSEGV, SIGSEGV); DEFINE(NMI_MASK, NMI_MASK); - OFFSET(TASKTHREADPPR, task_struct, thread.ppr); #else OFFSET(THREAD_INFO, task_struct, stack); DEFINE(THREAD_INFO_GAP, _ALIGN_UP(sizeof(struct thread_info), 16)); @@ -323,6 +322,7 @@ int main(void) STACK_PT_REGS_OFFSET(_ESR, dsisr); #else /* CONFIG_PPC64 */ STACK_PT_REGS_OFFSET(SOFTE, softe); + STACK_PT_REGS_OFFSET(_PPR, ppr); #endif /* CONFIG_PPC64 */ #if defined(CONFIG_PPC32) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 7db00ee6be48..7b1693adff2a 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -386,10 +386,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) 4: /* Anything else left to do? */ BEGIN_FTR_SECTION - lis r3,INIT_PPR@highest /* Set thread.ppr = 3 */ - ld r10,PACACURRENT(r13) + lis r3,DEFAULT_PPR@highest /* Set default PPR */ sldi r3,r3,32 /* bits 11-13 are used for ppr */ - std r3,TASKTHREADPPR(r10) + std r3,_PPR(r1) END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP) @@ -942,12 +941,6 @@ fast_exception_return: andi. r0,r3,MSR_RI beq- .Lunrecov_restore - /* Load PPR from thread struct before we clear MSR:RI */ -BEGIN_FTR_SECTION - ld r2,PACACURRENT(r13) - ld r2,TASKTHREADPPR(r2) -END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) - /* * Clear RI before restoring r13. If we are returning to * userspace and we take an exception after restoring r13, @@ -968,7 +961,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) andi. 
r0,r3,MSR_PR beq 1f BEGIN_FTR_SECTION - mtspr SPRN_PPR,r2 /* Restore PPR */ + /* Restore PPR */ + ld r2,_PPR(r1) + mtspr SPRN_PPR,r2 END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ACCOUNT_CPU_USER_EXIT(r13, r2, r4) REST_GPR(13, r1) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 0ed8d0968515..f9d1cca28cce 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1710,7 +1710,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, p->thread.dscr = mfspr(SPRN_DSCR); } if (cpu_has_feature(CPU_FTR_HAS_PPR)) - p->thread.ppr = INIT_PPR; + childregs->ppr = DEFAULT_PPR; p->thread.tidr = 0; #endif diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index c7d0d0c1e34d..afb819f4ca68 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -1609,7 +1609,7 @@ static int ppr_get(struct task_struct *target, void *kbuf, void __user *ubuf) { return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.ppr, 0, sizeof(u64)); + &target->thread.regs->ppr, 0, sizeof(u64)); } static int ppr_set(struct task_struct *target, @@ -1618,7 +1618,7 @@ static int ppr_set(struct task_struct *target, const void *kbuf, const void __user *ubuf) { return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.ppr, 0, sizeof(u64)); + &target->thread.regs->ppr, 0, sizeof(u64)); } static int dscr_get(struct task_struct *target, From 48e7b76957457f9a6f086ca2bbe49ec1ffd75f84 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 15 Sep 2018 01:30:51 +1000 Subject: [PATCH 160/221] powerpc/64s/hash: Convert SLB miss handlers to C This patch moves SLB miss handlers completely to C, using the standard exception handler macros to set up the stack and branch to C. This can be done because the segment containing the kernel stack is always bolted, so accessing it with relocation on will not cause an SLB exception. Arbitrary kernel memory must not be accessed when handling kernel space SLB misses, so care should be taken there. However user SLB misses can access any kernel memory, which can be used to move some fields out of the paca (in later patches). User SLB misses could quite easily reconcile IRQs and set up a first class kernel environment and exit via ret_from_except, however that doesn't seem to be necessary at the moment, so we only do that if a bad fault is encountered. [ Credit to Aneesh for bug fixes, error checks, and improvements to bad address handling, etc ] Signed-off-by: Nicholas Piggin [mpe: Disallow tracing for all of slb.c for now.] 
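[Editor's note: in rough terms the asm/C split ends up as below. This is a simplified sketch of the do_slb_fault()/do_bad_slb_fault() pair added in the diff that follows; the MSR[RI] check and the -EINVAL/unrecoverable path are omitted here.]

    long do_slb_fault(struct pt_regs *regs, unsigned long ea)
    {
            unsigned long id = REGION_ID(ea);

            if (id >= KERNEL_REGION_ID)
                    return slb_allocate_kernel(ea, id);     /* bolted data only */

            if (!current->mm)
                    return -EFAULT;

            return slb_allocate_user(current->mm, ea);      /* may touch kernel memory */
    }

    /* asm side: a return value of 0 branches to fast_exception_return;
     * otherwise save non-volatile GPRs, reconcile IRQ state and call: */
    void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err)
    {
            if (user_mode(regs))
                    _exception(SIGSEGV, regs, SEGV_BNDERR, ea);
            else
                    bad_page_fault(regs, ea, SIGSEGV);
    }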
Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/asm-prototypes.h | 2 + arch/powerpc/include/asm/exception-64s.h | 8 - arch/powerpc/kernel/exceptions-64s.S | 202 +++---------- arch/powerpc/mm/Makefile | 4 +- arch/powerpc/mm/slb.c | 295 +++++++++++-------- arch/powerpc/mm/slb_low.S | 335 ---------------------- 6 files changed, 217 insertions(+), 629 deletions(-) delete mode 100644 arch/powerpc/mm/slb_low.S diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 9bc98c239305..2741831482f4 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -77,6 +77,8 @@ void kernel_bad_stack(struct pt_regs *regs); void system_reset_exception(struct pt_regs *regs); void machine_check_exception(struct pt_regs *regs); void emulation_assist_interrupt(struct pt_regs *regs); +long do_slb_fault(struct pt_regs *regs, unsigned long ea); +void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err); /* signals, syscalls and interrupts */ long sys_swapcontext(struct ucontext __user *old_ctx, diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 403d73898a9a..3b4767ed3ec5 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -60,14 +60,6 @@ */ #define MAX_MCE_DEPTH 4 -/* - * EX_LR is only used in EXSLB and where it does not overlap with EX_DAR - * EX_CCR similarly with DSISR, but being 4 byte registers there is a hole - * in the save area so it's not necessary to overlap them. Could be used - * for future savings though if another 4 byte register was to be saved. - */ -#define EX_LR EX_DAR - /* * EX_R3 is only used by the bad_stack handler. bad_stack reloads and * saves DAR from SPRN_DAR, and EX_DAR is not used. 
So EX_R3 can overlap diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index b9239dbf6d59..89d32bb79d5e 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -596,28 +596,36 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) - SET_SCRATCH0(r13) - EXCEPTION_PROLOG_0(PACA_EXSLB) - EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380) - mr r12,r3 /* save r3 */ - mfspr r3,SPRN_DAR - mfspr r11,SPRN_SRR1 - crset 4*cr6+eq - BRANCH_TO_COMMON(r10, slb_miss_common) +EXCEPTION_PROLOG(PACA_EXSLB, data_access_slb_common, EXC_STD, KVMTEST_PR, 0x380); EXC_REAL_END(data_access_slb, 0x380, 0x80) EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) - SET_SCRATCH0(r13) - EXCEPTION_PROLOG_0(PACA_EXSLB) - EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x380) - mr r12,r3 /* save r3 */ - mfspr r3,SPRN_DAR - mfspr r11,SPRN_SRR1 - crset 4*cr6+eq - BRANCH_TO_COMMON(r10, slb_miss_common) +EXCEPTION_RELON_PROLOG(PACA_EXSLB, data_access_slb_common, EXC_STD, NOTEST, 0x380); EXC_VIRT_END(data_access_slb, 0x4380, 0x80) + TRAMP_KVM_SKIP(PACA_EXSLB, 0x380) +EXC_COMMON_BEGIN(data_access_slb_common) + mfspr r10,SPRN_DAR + std r10,PACA_EXSLB+EX_DAR(r13) + EXCEPTION_PROLOG_COMMON(0x380, PACA_EXSLB) + ld r4,PACA_EXSLB+EX_DAR(r13) + std r4,_DAR(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + bl do_slb_fault + cmpdi r3,0 + bne- 1f + b fast_exception_return +1: /* Error case */ + std r3,RESULT(r1) + bl save_nvgprs + RECONCILE_IRQ_STATE(r10, r11) + ld r4,_DAR(r1) + ld r5,RESULT(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + bl do_bad_slb_fault + b ret_from_except + EXC_REAL(instruction_access, 0x400, 0x80) EXC_VIRT(instruction_access, 0x4400, 0x80, 0x400) @@ -640,160 +648,34 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80) - SET_SCRATCH0(r13) - EXCEPTION_PROLOG_0(PACA_EXSLB) - EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480) - mr r12,r3 /* save r3 */ - mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ - mfspr r11,SPRN_SRR1 - crclr 4*cr6+eq - BRANCH_TO_COMMON(r10, slb_miss_common) +EXCEPTION_PROLOG(PACA_EXSLB, instruction_access_slb_common, EXC_STD, KVMTEST_PR, 0x480); EXC_REAL_END(instruction_access_slb, 0x480, 0x80) EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) - SET_SCRATCH0(r13) - EXCEPTION_PROLOG_0(PACA_EXSLB) - EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x480) - mr r12,r3 /* save r3 */ - mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ - mfspr r11,SPRN_SRR1 - crclr 4*cr6+eq - BRANCH_TO_COMMON(r10, slb_miss_common) +EXCEPTION_RELON_PROLOG(PACA_EXSLB, instruction_access_slb_common, EXC_STD, NOTEST, 0x480); EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80) + TRAMP_KVM(PACA_EXSLB, 0x480) - -/* - * This handler is used by the 0x380 and 0x480 SLB miss interrupts, as well as - * the virtual mode 0x4380 and 0x4480 interrupts if AIL is enabled. - */ -EXC_COMMON_BEGIN(slb_miss_common) - /* - * r13 points to the PACA, r9 contains the saved CR, - * r12 contains the saved r3, - * r11 contain the saved SRR1, SRR0 is still ready for return - * r3 has the faulting address - * r9 - r13 are saved in paca->exslb. - * cr6.eq is set for a D-SLB miss, clear for a I-SLB miss - * We assume we aren't going to take any exceptions during this - * procedure. - */ - mflr r10 - stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ - std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ - - andi. 
r9,r11,MSR_PR // Check for exception from userspace - cmpdi cr4,r9,MSR_PR // And save the result in CR4 for later - - /* - * Test MSR_RI before calling slb_allocate_realmode, because the - * MSR in r11 gets clobbered. However we still want to allocate - * SLB in case MSR_RI=0, to minimise the risk of getting stuck in - * recursive SLB faults. So use cr5 for this, which is preserved. - */ - andi. r11,r11,MSR_RI /* check for unrecoverable exception */ - cmpdi cr5,r11,MSR_RI - - crset 4*cr0+eq -#ifdef CONFIG_PPC_BOOK3S_64 -BEGIN_MMU_FTR_SECTION - bl slb_allocate -END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) -#endif - - ld r10,PACA_EXSLB+EX_LR(r13) - lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */ - mtlr r10 - - /* - * Large address, check whether we have to allocate new contexts. - */ - beq- 8f - - bne- cr5,2f /* if unrecoverable exception, oops */ - - /* All done -- return from exception. */ - - bne cr4,1f /* returning to kernel */ - - mtcrf 0x80,r9 - mtcrf 0x08,r9 /* MSR[PR] indication is in cr4 */ - mtcrf 0x04,r9 /* MSR[RI] indication is in cr5 */ - mtcrf 0x02,r9 /* I/D indication is in cr6 */ - mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ - - RESTORE_CTR(r9, PACA_EXSLB) - RESTORE_PPR_PACA(PACA_EXSLB, r9) - mr r3,r12 - ld r9,PACA_EXSLB+EX_R9(r13) - ld r10,PACA_EXSLB+EX_R10(r13) - ld r11,PACA_EXSLB+EX_R11(r13) - ld r12,PACA_EXSLB+EX_R12(r13) - ld r13,PACA_EXSLB+EX_R13(r13) - RFI_TO_USER - b . /* prevent speculative execution */ -1: - mtcrf 0x80,r9 - mtcrf 0x08,r9 /* MSR[PR] indication is in cr4 */ - mtcrf 0x04,r9 /* MSR[RI] indication is in cr5 */ - mtcrf 0x02,r9 /* I/D indication is in cr6 */ - mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ - - RESTORE_CTR(r9, PACA_EXSLB) - RESTORE_PPR_PACA(PACA_EXSLB, r9) - mr r3,r12 - ld r9,PACA_EXSLB+EX_R9(r13) - ld r10,PACA_EXSLB+EX_R10(r13) - ld r11,PACA_EXSLB+EX_R11(r13) - ld r12,PACA_EXSLB+EX_R12(r13) - ld r13,PACA_EXSLB+EX_R13(r13) - RFI_TO_KERNEL - b . /* prevent speculative execution */ - - -2: std r3,PACA_EXSLB+EX_DAR(r13) - mr r3,r12 - mfspr r11,SPRN_SRR0 - mfspr r12,SPRN_SRR1 - LOAD_HANDLER(r10,unrecov_slb) - mtspr SPRN_SRR0,r10 - ld r10,PACAKMSR(r13) - mtspr SPRN_SRR1,r10 - RFI_TO_KERNEL - b . - -8: std r3,PACA_EXSLB+EX_DAR(r13) - mr r3,r12 - mfspr r11,SPRN_SRR0 - mfspr r12,SPRN_SRR1 - LOAD_HANDLER(r10, large_addr_slb) - mtspr SPRN_SRR0,r10 - ld r10,PACAKMSR(r13) - mtspr SPRN_SRR1,r10 - RFI_TO_KERNEL - b . 
- -EXC_COMMON_BEGIN(unrecov_slb) - EXCEPTION_PROLOG_COMMON(0x4100, PACA_EXSLB) - RECONCILE_IRQ_STATE(r10, r11) +EXC_COMMON_BEGIN(instruction_access_slb_common) + EXCEPTION_PROLOG_COMMON(0x480, PACA_EXSLB) + ld r4,_NIP(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + bl do_slb_fault + cmpdi r3,0 + bne- 1f + b fast_exception_return +1: /* Error case */ + std r3,RESULT(r1) bl save_nvgprs -1: addi r3,r1,STACK_FRAME_OVERHEAD - bl unrecoverable_exception - b 1b - -EXC_COMMON_BEGIN(large_addr_slb) - EXCEPTION_PROLOG_COMMON(0x380, PACA_EXSLB) RECONCILE_IRQ_STATE(r10, r11) - ld r3, PACA_EXSLB+EX_DAR(r13) - std r3, _DAR(r1) - beq cr6, 2f - li r10, 0x481 /* fix trap number for I-SLB miss */ - std r10, _TRAP(r1) -2: bl save_nvgprs - addi r3, r1, STACK_FRAME_OVERHEAD - bl slb_miss_large_addr + ld r4,_NIP(r1) + ld r5,RESULT(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + bl do_bad_slb_fault b ret_from_except + EXC_REAL_BEGIN(hardware_interrupt, 0x500, 0x100) .globl hardware_interrupt_hv; hardware_interrupt_hv: diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 3c844bdd16c4..ceb352ccbc76 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -7,6 +7,8 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) +CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE) + obj-y := fault.o mem.o pgtable.o mmap.o \ init_$(BITS).o pgtable_$(BITS).o \ init-common.o mmu_context.o drmem.o @@ -15,7 +17,7 @@ obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(BITS)e.o hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o -obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o +obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-hash64.o hash_utils_64.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o tlb-radix.o obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o obj-$(CONFIG_PPC_STD_MMU) += tlb_hash$(BITS).o diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 513c6596140d..76c1a9523049 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -14,6 +14,7 @@ * 2 of the License, or (at your option) any later version. */ +#include #include #include #include @@ -33,7 +34,7 @@ enum slb_index { KSTACK_INDEX = 1, /* Kernel stack map */ }; -extern void slb_allocate(unsigned long ea); +static long slb_allocate_user(struct mm_struct *mm, unsigned long ea); #define slb_esid_mask(ssize) \ (((ssize) == MMU_SEGSIZE_256M)? 
ESID_MASK: ESID_MASK_1T) @@ -44,11 +45,17 @@ static inline unsigned long mk_esid_data(unsigned long ea, int ssize, return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index; } +static inline unsigned long __mk_vsid_data(unsigned long vsid, int ssize, + unsigned long flags) +{ + return (vsid << slb_vsid_shift(ssize)) | flags | + ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT); +} + static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, unsigned long flags) { - return (get_kernel_vsid(ea, ssize) << slb_vsid_shift(ssize)) | flags | - ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT); + return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags); } static inline void slb_shadow_update(unsigned long ea, int ssize, @@ -353,49 +360,19 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) is_kernel_addr(exec_base)) return; - slb_allocate(pc); + slb_allocate_user(mm, pc); if (!esids_match(pc, stack)) - slb_allocate(stack); + slb_allocate_user(mm, stack); if (!esids_match(pc, exec_base) && !esids_match(stack, exec_base)) - slb_allocate(exec_base); + slb_allocate_user(mm, exec_base); } -static inline void patch_slb_encoding(unsigned int *insn_addr, - unsigned int immed) -{ - - /* - * This function patches either an li or a cmpldi instruction with - * a new immediate value. This relies on the fact that both li - * (which is actually addi) and cmpldi both take a 16-bit immediate - * value, and it is situated in the same location in the instruction, - * ie. bits 16-31 (Big endian bit order) or the lower 16 bits. - * The signedness of the immediate operand differs between the two - * instructions however this code is only ever patching a small value, - * much less than 1 << 15, so we can get away with it. - * To patch the value we read the existing instruction, clear the - * immediate value, and or in our new value, then write the instruction - * back. - */ - unsigned int insn = (*insn_addr & 0xffff0000) | immed; - patch_instruction(insn_addr, insn); -} - -extern u32 slb_miss_kernel_load_linear[]; -extern u32 slb_miss_kernel_load_io[]; -extern u32 slb_compare_rr_to_size[]; -extern u32 slb_miss_kernel_load_vmemmap[]; - void slb_set_size(u16 size) { - if (mmu_slb_size == size) - return; - mmu_slb_size = size; - patch_slb_encoding(slb_compare_rr_to_size, mmu_slb_size); } void slb_initialize(void) @@ -417,19 +394,9 @@ void slb_initialize(void) #endif if (!slb_encoding_inited) { slb_encoding_inited = 1; - patch_slb_encoding(slb_miss_kernel_load_linear, - SLB_VSID_KERNEL | linear_llp); - patch_slb_encoding(slb_miss_kernel_load_io, - SLB_VSID_KERNEL | io_llp); - patch_slb_encoding(slb_compare_rr_to_size, - mmu_slb_size); - pr_devel("SLB: linear LLP = %04lx\n", linear_llp); pr_devel("SLB: io LLP = %04lx\n", io_llp); - #ifdef CONFIG_SPARSEMEM_VMEMMAP - patch_slb_encoding(slb_miss_kernel_load_vmemmap, - SLB_VSID_KERNEL | vmemmap_llp); pr_devel("SLB: vmemmap LLP = %04lx\n", vmemmap_llp); #endif } @@ -458,125 +425,203 @@ void slb_initialize(void) asm volatile("isync":::"memory"); } -static void insert_slb_entry(unsigned long vsid, unsigned long ea, - int bpsize, int ssize) +static void slb_cache_update(unsigned long esid_data) { - unsigned long flags, vsid_data, esid_data; - enum slb_index index; int slb_cache_index; if (cpu_has_feature(CPU_FTR_ARCH_300)) return; /* ISAv3.0B and later does not use slb_cache */ - /* - * We are irq disabled, hence should be safe to access PACA. 
- */ - VM_WARN_ON(!irqs_disabled()); - - /* - * We can't take a PMU exception in the following code, so hard - * disable interrupts. - */ - hard_irq_disable(); - - index = get_paca()->stab_rr; - - /* - * simple round-robin replacement of slb starting at SLB_NUM_BOLTED. - */ - if (index < (mmu_slb_size - 1)) - index++; - else - index = SLB_NUM_BOLTED; - - get_paca()->stab_rr = index; - - flags = SLB_VSID_USER | mmu_psize_defs[bpsize].sllp; - vsid_data = (vsid << slb_vsid_shift(ssize)) | flags | - ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT); - esid_data = mk_esid_data(ea, ssize, index); - - /* - * No need for an isync before or after this slbmte. The exception - * we enter with and the rfid we exit with are context synchronizing. - * Also we only handle user segments here. - */ - asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data) - : "memory"); - /* * Now update slb cache entries */ - slb_cache_index = get_paca()->slb_cache_ptr; + slb_cache_index = local_paca->slb_cache_ptr; if (slb_cache_index < SLB_CACHE_ENTRIES) { /* * We have space in slb cache for optimized switch_slb(). * Top 36 bits from esid_data as per ISA */ - get_paca()->slb_cache[slb_cache_index++] = esid_data >> 28; - get_paca()->slb_cache_ptr++; + local_paca->slb_cache[slb_cache_index++] = esid_data >> 28; + local_paca->slb_cache_ptr++; } else { /* * Our cache is full and the current cache content strictly * doesn't indicate the active SLB conents. Bump the ptr * so that switch_slb() will ignore the cache. */ - get_paca()->slb_cache_ptr = SLB_CACHE_ENTRIES + 1; + local_paca->slb_cache_ptr = SLB_CACHE_ENTRIES + 1; } } -static void handle_multi_context_slb_miss(int context_id, unsigned long ea) +static enum slb_index alloc_slb_index(void) { - struct mm_struct *mm = current->mm; - unsigned long vsid; - int bpsize; + enum slb_index index; - /* - * We are always above 1TB, hence use high user segment size. - */ - vsid = get_vsid(context_id, ea, mmu_highuser_ssize); - bpsize = get_slice_psize(mm, ea); - insert_slb_entry(vsid, ea, bpsize, mmu_highuser_ssize); + /* round-robin replacement of slb starting at SLB_NUM_BOLTED. */ + index = get_paca()->stab_rr; + if (index < (mmu_slb_size - 1)) + index++; + else + index = SLB_NUM_BOLTED; + get_paca()->stab_rr = index; + + return index; } -void slb_miss_large_addr(struct pt_regs *regs) +static long slb_insert_entry(unsigned long ea, unsigned long context, + unsigned long flags, int ssize, bool kernel) { - enum ctx_state prev_state = exception_enter(); - unsigned long ea = regs->dar; - int context; + unsigned long vsid; + unsigned long vsid_data, esid_data; + enum slb_index index; - if (REGION_ID(ea) != USER_REGION_ID) - goto slb_bad_addr; + vsid = get_vsid(context, ea, ssize); + if (!vsid) + return -EFAULT; /* - * Are we beyound what the page table layout supports ? + * There must not be a kernel SLB fault in alloc_slb_index or before + * slbmte here or the allocation bitmaps could get out of whack with + * the SLB. + * + * User SLB faults or preloads take this path which might get inlined + * into the caller, so add compiler barriers here to ensure unsafe + * memory accesses do not come between. 
*/ - if ((ea & ~REGION_MASK) >= H_PGTABLE_RANGE) - goto slb_bad_addr; + barrier(); - /* Lower address should have been handled by asm code */ - if (ea < (1UL << MAX_EA_BITS_PER_CONTEXT)) - goto slb_bad_addr; + index = alloc_slb_index(); + + vsid_data = __mk_vsid_data(vsid, ssize, flags); + esid_data = mk_esid_data(ea, ssize, index); + + /* + * No need for an isync before or after this slbmte. The exception + * we enter with and the rfid we exit with are context synchronizing. + * User preloads should add isync afterwards in case the kernel + * accesses user memory before it returns to userspace with rfid. + */ + asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data)); + + barrier(); + + if (!kernel) + slb_cache_update(esid_data); + + return 0; +} + +static long slb_allocate_kernel(unsigned long ea, unsigned long id) +{ + unsigned long context; + unsigned long flags; + int ssize; + + if ((ea & ~REGION_MASK) >= (1ULL << MAX_EA_BITS_PER_CONTEXT)) + return -EFAULT; + + if (id == KERNEL_REGION_ID) { + flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp; +#ifdef CONFIG_SPARSEMEM_VMEMMAP + } else if (id == VMEMMAP_REGION_ID) { + flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmemmap_psize].sllp; +#endif + } else if (id == VMALLOC_REGION_ID) { + if (ea < H_VMALLOC_END) + flags = get_paca()->vmalloc_sllp; + else + flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_io_psize].sllp; + } else { + return -EFAULT; + } + + ssize = MMU_SEGSIZE_1T; + if (!mmu_has_feature(MMU_FTR_1T_SEGMENT)) + ssize = MMU_SEGSIZE_256M; + + context = id - KERNEL_REGION_CONTEXT_OFFSET; + + return slb_insert_entry(ea, context, flags, ssize, true); +} + +static long slb_allocate_user(struct mm_struct *mm, unsigned long ea) +{ + unsigned long context; + unsigned long flags; + int bpsize; + int ssize; /* * consider this as bad access if we take a SLB miss * on an address above addr limit. */ - if (ea >= current->mm->context.slb_addr_limit) - goto slb_bad_addr; + if (ea >= mm->context.slb_addr_limit) + return -EFAULT; - context = get_ea_context(¤t->mm->context, ea); + context = get_ea_context(&mm->context, ea); if (!context) - goto slb_bad_addr; + return -EFAULT; - handle_multi_context_slb_miss(context, ea); - exception_exit(prev_state); - return; + if (unlikely(ea >= H_PGTABLE_RANGE)) { + WARN_ON(1); + return -EFAULT; + } -slb_bad_addr: - if (user_mode(regs)) - _exception(SIGSEGV, regs, SEGV_BNDERR, ea); - else - bad_page_fault(regs, ea, SIGSEGV); - exception_exit(prev_state); + ssize = user_segment_size(ea); + + bpsize = get_slice_psize(mm, ea); + flags = SLB_VSID_USER | mmu_psize_defs[bpsize].sllp; + + return slb_insert_entry(ea, context, flags, ssize, false); +} + +long do_slb_fault(struct pt_regs *regs, unsigned long ea) +{ + unsigned long id = REGION_ID(ea); + + /* IRQs are not reconciled here, so can't check irqs_disabled */ + VM_WARN_ON(mfmsr() & MSR_EE); + + if (unlikely(!(regs->msr & MSR_RI))) + return -EINVAL; + + /* + * SLB kernel faults must be very careful not to touch anything + * that is not bolted. E.g., PACA and global variables are okay, + * mm->context stuff is not. + * + * SLB user faults can access all of kernel memory, but must be + * careful not to touch things like IRQ state because it is not + * "reconciled" here. The difficulty is that we must use + * fast_exception_return to return from kernel SLB faults without + * looking at possible non-bolted memory. 
We could test user vs + * kernel faults in the interrupt handler asm and do a full fault, + * reconcile, ret_from_except for user faults which would make them + * first class kernel code. But for performance it's probably nicer + * if they go via fast_exception_return too. + */ + if (id >= KERNEL_REGION_ID) { + return slb_allocate_kernel(ea, id); + } else { + struct mm_struct *mm = current->mm; + + if (unlikely(!mm)) + return -EFAULT; + + return slb_allocate_user(mm, ea); + } +} + +void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err) +{ + if (err == -EFAULT) { + if (user_mode(regs)) + _exception(SIGSEGV, regs, SEGV_BNDERR, ea); + else + bad_page_fault(regs, ea, SIGSEGV); + } else if (err == -EINVAL) { + unrecoverable_exception(regs); + } else { + BUG(); + } } diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S deleted file mode 100644 index 4d2e921d696e..000000000000 --- a/arch/powerpc/mm/slb_low.S +++ /dev/null @@ -1,335 +0,0 @@ -/* - * Low-level SLB routines - * - * Copyright (C) 2004 David Gibson , IBM - * - * Based on earlier C version: - * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com - * Copyright (c) 2001 Dave Engebretsen - * Copyright (C) 2002 Anton Blanchard , IBM - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * This macro generates asm code to compute the VSID scramble - * function. Used in slb_allocate() and do_stab_bolted. The function - * computed is: (protovsid*VSID_MULTIPLIER) % VSID_MODULUS - * - * rt = register containing the proto-VSID and into which the - * VSID will be stored - * rx = scratch register (clobbered) - * rf = flags - * - * - rt and rx must be different registers - * - The answer will end up in the low VSID_BITS bits of rt. The higher - * bits may contain other garbage, so you may need to mask the - * result. - */ -#define ASM_VSID_SCRAMBLE(rt, rx, rf, size) \ - lis rx,VSID_MULTIPLIER_##size@h; \ - ori rx,rx,VSID_MULTIPLIER_##size@l; \ - mulld rt,rt,rx; /* rt = rt * MULTIPLIER */ \ -/* \ - * powermac get slb fault before feature fixup, so make 65 bit part \ - * the default part of feature fixup \ - */ \ -BEGIN_MMU_FTR_SECTION \ - srdi rx,rt,VSID_BITS_65_##size; \ - clrldi rt,rt,(64-VSID_BITS_65_##size); \ - add rt,rt,rx; \ - addi rx,rt,1; \ - srdi rx,rx,VSID_BITS_65_##size; \ - add rt,rt,rx; \ - rldimi rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_65_##size)); \ -MMU_FTR_SECTION_ELSE \ - srdi rx,rt,VSID_BITS_##size; \ - clrldi rt,rt,(64-VSID_BITS_##size); \ - add rt,rt,rx; /* add high and low bits */ \ - addi rx,rt,1; \ - srdi rx,rx,VSID_BITS_##size; /* extract 2^VSID_BITS bit */ \ - add rt,rt,rx; \ - rldimi rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_##size)); \ -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_68_BIT_VA) - - -/* void slb_allocate(unsigned long ea); - * - * Create an SLB entry for the given EA (user or kernel). - * r3 = faulting address, r13 = PACA - * r9, r10, r11 are clobbered by this function - * r3 is preserved. - * No other registers are examined or changed. - */ -_GLOBAL(slb_allocate) - /* - * Check if the address falls within the range of the first context, or - * if we may need to handle multi context. 
For the first context we - * allocate the slb entry via the fast path below. For large address we - * branch out to C-code and see if additional contexts have been - * allocated. - * The test here is: - * (ea & ~REGION_MASK) >= (1ull << MAX_EA_BITS_PER_CONTEXT) - */ - rldicr. r9,r3,4,(63 - MAX_EA_BITS_PER_CONTEXT - 4) - bne- 8f - - srdi r9,r3,60 /* get region */ - srdi r10,r3,SID_SHIFT /* get esid */ - cmpldi cr7,r9,0xc /* cmp PAGE_OFFSET for later use */ - - /* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */ - blt cr7,0f /* user or kernel? */ - - /* Check if hitting the linear mapping or some other kernel space - */ - bne cr7,1f - - /* Linear mapping encoding bits, the "li" instruction below will - * be patched by the kernel at boot - */ -.globl slb_miss_kernel_load_linear -slb_miss_kernel_load_linear: - li r11,0 - /* - * context = (ea >> 60) - (0xc - 1) - * r9 = region id. - */ - subi r9,r9,KERNEL_REGION_CONTEXT_OFFSET - -BEGIN_FTR_SECTION - b .Lslb_finish_load -END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) - b .Lslb_finish_load_1T - -1: -#ifdef CONFIG_SPARSEMEM_VMEMMAP - cmpldi cr0,r9,0xf - bne 1f -/* Check virtual memmap region. To be patched at kernel boot */ -.globl slb_miss_kernel_load_vmemmap -slb_miss_kernel_load_vmemmap: - li r11,0 - b 6f -1: -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ - - /* - * r10 contains the ESID, which is the original faulting EA shifted - * right by 28 bits. We need to compare that with (H_VMALLOC_END >> 28) - * which is 0xd00038000. That can't be used as an immediate, even if we - * ignored the 0xd, so we have to load it into a register, and we only - * have one register free. So we must load all of (H_VMALLOC_END >> 28) - * into a register and compare ESID against that. - */ - lis r11,(H_VMALLOC_END >> 32)@h // r11 = 0xffffffffd0000000 - ori r11,r11,(H_VMALLOC_END >> 32)@l // r11 = 0xffffffffd0003800 - // Rotate left 4, then mask with 0xffffffff0 - rldic r11,r11,4,28 // r11 = 0xd00038000 - cmpld r10,r11 // if r10 >= r11 - bge 5f // goto io_mapping - - /* - * vmalloc mapping gets the encoding from the PACA as the mapping - * can be demoted from 64K -> 4K dynamically on some machines. - */ - lhz r11,PACAVMALLOCSLLP(r13) - b 6f -5: - /* IO mapping */ -.globl slb_miss_kernel_load_io -slb_miss_kernel_load_io: - li r11,0 -6: - /* - * context = (ea >> 60) - (0xc - 1) - * r9 = region id. - */ - subi r9,r9,KERNEL_REGION_CONTEXT_OFFSET - -BEGIN_FTR_SECTION - b .Lslb_finish_load -END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) - b .Lslb_finish_load_1T - -0: /* - * For userspace addresses, make sure this is region 0. - */ - cmpdi r9, 0 - bne- 8f - /* - * user space make sure we are within the allowed limit - */ - ld r11,PACA_SLB_ADDR_LIMIT(r13) - cmpld r3,r11 - bge- 8f - - /* when using slices, we extract the psize off the slice bitmaps - * and then we need to get the sllp encoding off the mmu_psize_defs - * array. - * - * XXX This is a bit inefficient especially for the normal case, - * so we should try to implement a fast path for the standard page - * size using the old sllp value so we avoid the array. 
We cannot - * really do dynamic patching unfortunately as processes might flip - * between 4k and 64k standard page size - */ -#ifdef CONFIG_PPC_MM_SLICES - /* r10 have esid */ - cmpldi r10,16 - /* below SLICE_LOW_TOP */ - blt 5f - /* - * Handle hpsizes, - * r9 is get_paca()->context.high_slices_psize[index], r11 is mask_index - */ - srdi r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT + 1) /* index */ - addi r9,r11,PACAHIGHSLICEPSIZE - lbzx r9,r13,r9 /* r9 is hpsizes[r11] */ - /* r11 = (r10 >> (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)) & 0x1 */ - rldicl r11,r10,(64 - (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)),63 - b 6f - -5: - /* - * Handle lpsizes - * r9 is get_paca()->context.low_slices_psize[index], r11 is mask_index - */ - srdi r11,r10,1 /* index */ - addi r9,r11,PACALOWSLICESPSIZE - lbzx r9,r13,r9 /* r9 is lpsizes[r11] */ - rldicl r11,r10,0,63 /* r11 = r10 & 0x1 */ -6: - sldi r11,r11,2 /* index * 4 */ - /* Extract the psize and multiply to get an array offset */ - srd r9,r9,r11 - andi. r9,r9,0xf - mulli r9,r9,MMUPSIZEDEFSIZE - - /* Now get to the array and obtain the sllp - */ - ld r11,PACATOC(r13) - ld r11,mmu_psize_defs@got(r11) - add r11,r11,r9 - ld r11,MMUPSIZESLLP(r11) - ori r11,r11,SLB_VSID_USER -#else - /* paca context sllp already contains the SLB_VSID_USER bits */ - lhz r11,PACACONTEXTSLLP(r13) -#endif /* CONFIG_PPC_MM_SLICES */ - - ld r9,PACACONTEXTID(r13) -BEGIN_FTR_SECTION - cmpldi r10,0x1000 - bge .Lslb_finish_load_1T -END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) - b .Lslb_finish_load - -8: /* invalid EA - return an error indication */ - crset 4*cr0+eq /* indicate failure */ - blr - -/* - * Finish loading of an SLB entry and return - * - * r3 = EA, r9 = context, r10 = ESID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET - */ -.Lslb_finish_load: - rldimi r10,r9,ESID_BITS,0 - ASM_VSID_SCRAMBLE(r10,r9,r11,256M) - /* r3 = EA, r11 = VSID data */ - /* - * Find a slot, round robin. Previously we tried to find a - * free slot first but that took too long. Unfortunately we - * dont have any LRU information to help us choose a slot. - */ - - mr r9,r3 - - /* slb_finish_load_1T continues here. r9=EA with non-ESID bits clear */ -7: ld r10,PACASTABRR(r13) - addi r10,r10,1 - /* This gets soft patched on boot. */ -.globl slb_compare_rr_to_size -slb_compare_rr_to_size: - cmpldi r10,0 - - blt+ 4f - li r10,SLB_NUM_BOLTED - -4: - std r10,PACASTABRR(r13) - -3: - rldimi r9,r10,0,36 /* r9 = EA[0:35] | entry */ - oris r10,r9,SLB_ESID_V@h /* r10 = r9 | SLB_ESID_V */ - - /* r9 = ESID data, r11 = VSID data */ - - /* - * No need for an isync before or after this slbmte. The exception - * we enter with and the rfid we exit with are context synchronizing. - */ - slbmte r11,r10 - - /* we're done for kernel addresses */ - crclr 4*cr0+eq /* set result to "success" */ - bgelr cr7 - - /* Update the slb cache */ - lhz r9,PACASLBCACHEPTR(r13) /* offset = paca->slb_cache_ptr */ - cmpldi r9,SLB_CACHE_ENTRIES - bge 1f - - /* still room in the slb cache */ - sldi r11,r9,2 /* r11 = offset * sizeof(u32) */ - srdi r10,r10,28 /* get the 36 bits of the ESID */ - add r11,r11,r13 /* r11 = (u32 *)paca + offset */ - stw r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */ - addi r9,r9,1 /* offset++ */ - b 2f -1: /* offset >= SLB_CACHE_ENTRIES */ - li r9,SLB_CACHE_ENTRIES+1 -2: - sth r9,PACASLBCACHEPTR(r13) /* paca->slb_cache_ptr = offset */ - crclr 4*cr0+eq /* set result to "success" */ - blr - -/* - * Finish loading of a 1T SLB entry (for the kernel linear mapping) and return. 
- * - * r3 = EA, r9 = context, r10 = ESID(256MB), r11 = flags, clobbers r9 - */ -.Lslb_finish_load_1T: - srdi r10,r10,(SID_SHIFT_1T - SID_SHIFT) /* get 1T ESID */ - rldimi r10,r9,ESID_BITS_1T,0 - ASM_VSID_SCRAMBLE(r10,r9,r11,1T) - - li r10,MMU_SEGSIZE_1T - rldimi r11,r10,SLB_VSID_SSIZE_SHIFT,0 /* insert segment size */ - - /* r3 = EA, r11 = VSID data */ - clrrdi r9,r3,SID_SHIFT_1T /* clear out non-ESID bits */ - b 7b - - -_ASM_NOKPROBE_SYMBOL(slb_allocate) -_ASM_NOKPROBE_SYMBOL(slb_miss_kernel_load_linear) -_ASM_NOKPROBE_SYMBOL(slb_miss_kernel_load_io) -_ASM_NOKPROBE_SYMBOL(slb_compare_rr_to_size) -#ifdef CONFIG_SPARSEMEM_VMEMMAP -_ASM_NOKPROBE_SYMBOL(slb_miss_kernel_load_vmemmap) -#endif From 126b11b294d15a90e38eb2dcded2433619b2c794 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 15 Sep 2018 01:30:53 +1000 Subject: [PATCH 161/221] powerpc/64s/hash: Add SLB allocation status bitmaps Add 32-entry bitmaps to track the allocation status of the first 32 SLB entries, and whether they are user or kernel entries. These are used to allocate free SLB entries first, before resorting to the round robin allocator. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/paca.h | 6 ++- arch/powerpc/kernel/asm-offsets.c | 2 +- arch/powerpc/mm/slb.c | 64 ++++++++++++++++++++++++------- arch/powerpc/xmon/xmon.c | 4 +- 4 files changed, 59 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index c6d01f0aa898..97e8a57a4998 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -113,7 +113,10 @@ struct paca_struct { * on the linear mapping */ /* SLB related definitions */ u16 vmalloc_sllp; - u16 slb_cache_ptr; + u8 slb_cache_ptr; + u8 stab_rr; /* stab/slb round-robin counter */ + u32 slb_used_bitmap; /* Bitmaps for first 32 SLB entries. 
*/ + u32 slb_kern_bitmap; u32 slb_cache[SLB_CACHE_ENTRIES]; #endif /* CONFIG_PPC_BOOK3S_64 */ @@ -160,7 +163,6 @@ struct paca_struct { */ struct task_struct *__current; /* Pointer to current */ u64 kstack; /* Saved Kernel stack addr */ - u64 stab_rr; /* stab/slb round-robin counter */ u64 saved_r1; /* r1 save for RTAS calls or PM or EE=0 */ u64 saved_msr; /* MSR saved here by enter_rtas */ u16 trap_save; /* Used when bad stack is encountered */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 92156c61d21c..10ef2e4db2fd 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -178,7 +178,6 @@ int main(void) OFFSET(PACAKSAVE, paca_struct, kstack); OFFSET(PACACURRENT, paca_struct, __current); OFFSET(PACASAVEDMSR, paca_struct, saved_msr); - OFFSET(PACASTABRR, paca_struct, stab_rr); OFFSET(PACAR1, paca_struct, saved_r1); OFFSET(PACATOC, paca_struct, kernel_toc); OFFSET(PACAKBASE, paca_struct, kernelbase); @@ -217,6 +216,7 @@ int main(void) #ifdef CONFIG_PPC_BOOK3S_64 OFFSET(PACASLBCACHE, paca_struct, slb_cache); OFFSET(PACASLBCACHEPTR, paca_struct, slb_cache_ptr); + OFFSET(PACASTABRR, paca_struct, stab_rr); OFFSET(PACAVMALLOCSLLP, paca_struct, vmalloc_sllp); #ifdef CONFIG_PPC_MM_SLICES OFFSET(MMUPSIZESLLP, mmu_psize_def, sllp); diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 76c1a9523049..ed61639fe4f4 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -122,6 +122,9 @@ void slb_restore_bolted_realmode(void) { __slb_restore_bolted_realmode(); get_paca()->slb_cache_ptr = 0; + + get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; + get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; } /* @@ -129,9 +132,6 @@ void slb_restore_bolted_realmode(void) */ void slb_flush_all_realmode(void) { - /* - * This flushes all SLB entries including 0, so it must be realmode. - */ asm volatile("slbmte %0,%0; slbia" : : "r" (0)); } @@ -177,6 +177,9 @@ void slb_flush_and_rebolt(void) : "memory"); get_paca()->slb_cache_ptr = 0; + + get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; + get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; } void slb_save_contents(struct slb_entry *slb_ptr) @@ -209,7 +212,7 @@ void slb_dump_contents(struct slb_entry *slb_ptr) return; pr_err("SLB contents of cpu 0x%x\n", smp_processor_id()); - pr_err("Last SLB entry inserted at slot %lld\n", get_paca()->stab_rr); + pr_err("Last SLB entry inserted at slot %d\n", get_paca()->stab_rr); for (i = 0; i < mmu_slb_size; i++) { e = slb_ptr->esid; @@ -342,10 +345,13 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) "isync" :: "r"(ksp_vsid_data), "r"(ksp_esid_data)); + + get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; } get_paca()->slb_cache_ptr = 0; } + get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; copy_mm_to_paca(mm); @@ -402,6 +408,8 @@ void slb_initialize(void) } get_paca()->stab_rr = SLB_NUM_BOLTED - 1; + get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; + get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; lflags = SLB_VSID_KERNEL | linear_llp; @@ -453,17 +461,47 @@ static void slb_cache_update(unsigned long esid_data) } } -static enum slb_index alloc_slb_index(void) +static enum slb_index alloc_slb_index(bool kernel) { enum slb_index index; - /* round-robin replacement of slb starting at SLB_NUM_BOLTED. 
*/ - index = get_paca()->stab_rr; - if (index < (mmu_slb_size - 1)) - index++; - else - index = SLB_NUM_BOLTED; - get_paca()->stab_rr = index; + /* + * The allocation bitmaps can become out of synch with the SLB + * when the _switch code does slbie when bolting a new stack + * segment and it must not be anywhere else in the SLB. This leaves + * a kernel allocated entry that is unused in the SLB. With very + * large systems or small segment sizes, the bitmaps could slowly + * fill with these entries. They will eventually be cleared out + * by the round robin allocator in that case, so it's probably not + * worth accounting for. + */ + + /* + * SLBs beyond 32 entries are allocated with stab_rr only + * POWER7/8/9 have 32 SLB entries, this could be expanded if a + * future CPU has more. + */ + if (local_paca->slb_used_bitmap != U32_MAX) { + index = ffz(local_paca->slb_used_bitmap); + local_paca->slb_used_bitmap |= 1U << index; + if (kernel) + local_paca->slb_kern_bitmap |= 1U << index; + } else { + /* round-robin replacement of slb starting at SLB_NUM_BOLTED. */ + index = local_paca->stab_rr; + if (index < (mmu_slb_size - 1)) + index++; + else + index = SLB_NUM_BOLTED; + local_paca->stab_rr = index; + if (index < 32) { + if (kernel) + local_paca->slb_kern_bitmap |= 1U << index; + else + local_paca->slb_kern_bitmap &= ~(1U << index); + } + } + BUG_ON(index < SLB_NUM_BOLTED); return index; } @@ -490,7 +528,7 @@ static long slb_insert_entry(unsigned long ea, unsigned long context, */ barrier(); - index = alloc_slb_index(); + index = alloc_slb_index(kernel); vsid_data = __mk_vsid_data(vsid, ssize, flags); esid_data = mk_esid_data(ea, ssize, index); diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 8345defa0e43..58e67b67a97c 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2394,7 +2394,9 @@ static void dump_one_paca(int cpu) } } DUMP(p, vmalloc_sllp, "%#-*x"); - DUMP(p, stab_rr, "%#-*llx"); + DUMP(p, stab_rr, "%#-*x"); + DUMP(p, slb_used_bitmap, "%#-*x"); + DUMP(p, slb_kern_bitmap, "%#-*x"); if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) { DUMP(p, slb_cache_ptr, "%#-*x"); From 425d33146260a4a2e8a1ba64003d6c8ff3bdfcc4 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 15 Sep 2018 01:30:55 +1000 Subject: [PATCH 162/221] powerpc/64s/hash: Provide arch_setup_exec() hooks for hash slice setup This will be used by the SLB code in the next patch, but for now this sets the slb_addr_limit to the correct size for 32-bit tasks. 
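[Editor's note: the hook chain being added is small and worth seeing end to end. The snippet below is condensed from the diff that follows; the slice_dbg() tracing is omitted.]

    /* generic exec path -> arch hook -> hash MMU -> slices */
    void arch_setup_new_exec(void)
    {
            if (radix_enabled())
                    return;
            hash__setup_new_exec();         /* calls slice_setup_new_exec() */
    }

    void slice_setup_new_exec(void)
    {
            /* only 32-bit tasks need their address-space limit shrunk */
            if (!is_32bit_task())
                    return;
            current->mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW;
    }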
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 2 ++ arch/powerpc/include/asm/slice.h | 1 + arch/powerpc/include/asm/thread_info.h | 6 ++++++ arch/powerpc/kernel/process.c | 9 +++++++++ arch/powerpc/mm/mmu_context_book3s64.c | 5 +++++ arch/powerpc/mm/slice.c | 14 ++++++++++++++ 6 files changed, 37 insertions(+) diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index e0e4ce8f77d6..14e552ea5e52 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -487,6 +487,8 @@ int htab_remove_mapping(unsigned long vstart, unsigned long vend, extern void pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages); extern void demote_segment_4k(struct mm_struct *mm, unsigned long addr); +extern void hash__setup_new_exec(void); + #ifdef CONFIG_PPC_PSERIES void hpte_init_pseries(void); #else diff --git a/arch/powerpc/include/asm/slice.h b/arch/powerpc/include/asm/slice.h index e40406cf5628..a595461c9cb0 100644 --- a/arch/powerpc/include/asm/slice.h +++ b/arch/powerpc/include/asm/slice.h @@ -32,6 +32,7 @@ void slice_set_range_psize(struct mm_struct *mm, unsigned long start, unsigned long len, unsigned int psize); void slice_init_new_context_exec(struct mm_struct *mm); +void slice_setup_new_exec(void); #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 3185f8ac1182..916a3d67b592 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -72,6 +72,12 @@ static inline struct thread_info *current_thread_info(void) } extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); + +#ifdef CONFIG_PPC_BOOK3S_64 +void arch_setup_new_exec(void); +#define arch_setup_new_exec arch_setup_new_exec +#endif + #endif /* __ASSEMBLY__ */ /* diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index f9d1cca28cce..96cd9cd1a119 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1490,6 +1490,15 @@ void flush_thread(void) #endif /* CONFIG_HAVE_HW_BREAKPOINT */ } +#ifdef CONFIG_PPC_BOOK3S_64 +void arch_setup_new_exec(void) +{ + if (radix_enabled()) + return; + hash__setup_new_exec(); +} +#endif + int set_thread_uses_vas(void) { #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index dbd8f762140b..f7352c66b6b8 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -84,6 +84,11 @@ static int hash__init_new_context(struct mm_struct *mm) return index; } +void hash__setup_new_exec(void) +{ + slice_setup_new_exec(); +} + static int radix__init_new_context(struct mm_struct *mm) { unsigned long rts_field; diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 205fe557ca10..546dd07c8083 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -757,6 +757,20 @@ void slice_init_new_context_exec(struct mm_struct *mm) bitmap_fill(mask->high_slices, SLICE_NUM_HIGH); } +#ifdef CONFIG_PPC_BOOK3S_64 +void slice_setup_new_exec(void) +{ + struct mm_struct *mm = current->mm; + + slice_dbg("slice_setup_new_exec(mm=%p)\n", mm); + + if (!is_32bit_task()) + return; + + mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW; +} +#endif + void slice_set_range_psize(struct mm_struct *mm, unsigned long start, unsigned long len, unsigned int psize) 
{ From 5434ae74629af58ad0fc27143a9ea435f7734410 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 15 Sep 2018 01:30:56 +1000 Subject: [PATCH 163/221] powerpc/64s/hash: Add a SLB preload cache When switching processes, currently all user SLBEs are cleared, and a few (exec_base, pc, and stack) are preloaded. In trivial testing with small apps, this tends to miss the heap and low 256MB segments, and it will also miss commonly accessed segments on large memory workloads. Add a simple round-robin preload cache that just inserts the last SLB miss into the head of the cache and preloads those at context switch time. Every 256 context switches, the oldest entry is removed from the cache to shrink it, so that fewer slbmte instructions are issued for entries that go unused. Much more could go into this, including into the SLB entry reclaim side to track some LRU information etc, which would require a study of large memory workloads. But this is a simple thing we can do now that is an obvious win for common workloads. With the full series, process switching speed on the context_switch benchmark on POWER9/hash (with kernel speculation security measures disabled) increases from 140K/s to 178K/s (27%). POWER8 does not change much (within 1%); it is unclear why it does not see a big gain like POWER9. Booting to busybox init with 256MB segments has SLB misses go down from 945 to 69, and with 1T segments 900 to 21. These could almost all be eliminated by preloading a bit more carefully with ELF binary loading. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/processor.h | 1 + arch/powerpc/include/asm/thread_info.h | 5 + arch/powerpc/kernel/process.c | 7 + arch/powerpc/mm/mmu_context_book3s64.c | 4 + arch/powerpc/mm/slb.c | 214 +++++++++++++++++++------ 5 files changed, 184 insertions(+), 47 deletions(-) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 3fefb8a65b17..7d04d60a39c9 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -273,6 +273,7 @@ struct thread_struct { #endif /* CONFIG_HAVE_HW_BREAKPOINT */ struct arch_hw_breakpoint hw_brk; /* info on the hardware breakpoint */ unsigned long trap_nr; /* last trap # on this thread */ + u8 load_slb; /* Ages out SLB preload cache entries */ u8 load_fp; #ifdef CONFIG_ALTIVEC u8 load_vec; diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 916a3d67b592..544cac0474cb 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -29,6 +29,7 @@ #include #include +#define SLB_PRELOAD_NR 16U /* * low level task data.
*/ @@ -44,6 +45,10 @@ struct thread_info { #if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC32) struct cpu_accounting_data accounting; #endif + unsigned char slb_preload_nr; + unsigned char slb_preload_tail; + u32 slb_preload_esid[SLB_PRELOAD_NR]; + /* low level flags - has atomic operations done on it */ unsigned long flags ____cacheline_aligned_in_smp; }; diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 96cd9cd1a119..7ad304a3cc7d 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1727,6 +1727,8 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, return 0; } +void preload_new_slb_context(unsigned long start, unsigned long sp); + /* * Set up a thread for executing a new program */ @@ -1734,6 +1736,10 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) { #ifdef CONFIG_PPC64 unsigned long load_addr = regs->gpr[2]; /* saved by ELF_PLAT_INIT */ + +#ifdef CONFIG_PPC_BOOK3S_64 + preload_new_slb_context(start, sp); +#endif #endif /* @@ -1824,6 +1830,7 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) #ifdef CONFIG_VSX current->thread.used_vsr = 0; #endif + current->thread.load_slb = 0; current->thread.load_fp = 0; memset(¤t->thread.fp_state, 0, sizeof(current->thread.fp_state)); current->thread.fp_save_area = NULL; diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index f7352c66b6b8..510f103d7813 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -53,6 +53,8 @@ int hash__alloc_context_id(void) } EXPORT_SYMBOL_GPL(hash__alloc_context_id); +void slb_setup_new_exec(void); + static int hash__init_new_context(struct mm_struct *mm) { int index; @@ -87,6 +89,8 @@ static int hash__init_new_context(struct mm_struct *mm) void hash__setup_new_exec(void) { slice_setup_new_exec(); + + slb_setup_new_exec(); } static int radix__init_new_context(struct mm_struct *mm) diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index ed61639fe4f4..3b7d8af09724 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -257,41 +257,148 @@ void slb_vmalloc_update(void) slb_flush_and_rebolt(); } -/* Helper function to compare esids. There are four cases to handle. - * 1. The system is not 1T segment size capable. Use the GET_ESID compare. - * 2. The system is 1T capable, both addresses are < 1T, use the GET_ESID compare. - * 3. The system is 1T capable, only one of the two addresses is > 1T. This is not a match. - * 4. The system is 1T capable, both addresses are > 1T, use the GET_ESID_1T macro to compare. - */ -static inline int esids_match(unsigned long addr1, unsigned long addr2) +static bool preload_hit(struct thread_info *ti, unsigned long esid) { - int esid_1t_count; + unsigned char i; - /* System is not 1T segment size capable. */ - if (!mmu_has_feature(MMU_FTR_1T_SEGMENT)) - return (GET_ESID(addr1) == GET_ESID(addr2)); + for (i = 0; i < ti->slb_preload_nr; i++) { + unsigned char idx; - esid_1t_count = (((addr1 >> SID_SHIFT_1T) != 0) + - ((addr2 >> SID_SHIFT_1T) != 0)); - - /* both addresses are < 1T */ - if (esid_1t_count == 0) - return (GET_ESID(addr1) == GET_ESID(addr2)); - - /* One address < 1T, the other > 1T. Not a match */ - if (esid_1t_count == 1) - return 0; - - /* Both addresses are > 1T. 
*/ - return (GET_ESID_1T(addr1) == GET_ESID_1T(addr2)); + idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR; + if (esid == ti->slb_preload_esid[idx]) + return true; + } + return false; } +static bool preload_add(struct thread_info *ti, unsigned long ea) +{ + unsigned char idx; + unsigned long esid; + + if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) { + /* EAs are stored >> 28 so 256MB segments don't need clearing */ + if (ea & ESID_MASK_1T) + ea &= ESID_MASK_1T; + } + + esid = ea >> SID_SHIFT; + + if (preload_hit(ti, esid)) + return false; + + idx = (ti->slb_preload_tail + ti->slb_preload_nr) % SLB_PRELOAD_NR; + ti->slb_preload_esid[idx] = esid; + if (ti->slb_preload_nr == SLB_PRELOAD_NR) + ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR; + else + ti->slb_preload_nr++; + + return true; +} + +static void preload_age(struct thread_info *ti) +{ + if (!ti->slb_preload_nr) + return; + ti->slb_preload_nr--; + ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR; +} + +void slb_setup_new_exec(void) +{ + struct thread_info *ti = current_thread_info(); + struct mm_struct *mm = current->mm; + unsigned long exec = 0x10000000; + + WARN_ON(irqs_disabled()); + + /* + * preload cache can only be used to determine whether a SLB + * entry exists if it does not start to overflow. + */ + if (ti->slb_preload_nr + 2 > SLB_PRELOAD_NR) + return; + + hard_irq_disable(); + + /* + * We have no good place to clear the slb preload cache on exec, + * flush_thread is about the earliest arch hook but that happens + * after we switch to the mm and have aleady preloaded the SLBEs. + * + * For the most part that's probably okay to use entries from the + * previous exec, they will age out if unused. It may turn out to + * be an advantage to clear the cache before switching to it, + * however. + */ + + /* + * preload some userspace segments into the SLB. + * Almost all 32 and 64bit PowerPC executables are linked at + * 0x10000000 so it makes sense to preload this segment. + */ + if (!is_kernel_addr(exec)) { + if (preload_add(ti, exec)) + slb_allocate_user(mm, exec); + } + + /* Libraries and mmaps. */ + if (!is_kernel_addr(mm->mmap_base)) { + if (preload_add(ti, mm->mmap_base)) + slb_allocate_user(mm, mm->mmap_base); + } + + /* see switch_slb */ + asm volatile("isync" : : : "memory"); + + local_irq_enable(); +} + +void preload_new_slb_context(unsigned long start, unsigned long sp) +{ + struct thread_info *ti = current_thread_info(); + struct mm_struct *mm = current->mm; + unsigned long heap = mm->start_brk; + + WARN_ON(irqs_disabled()); + + /* see above */ + if (ti->slb_preload_nr + 3 > SLB_PRELOAD_NR) + return; + + hard_irq_disable(); + + /* Userspace entry address. */ + if (!is_kernel_addr(start)) { + if (preload_add(ti, start)) + slb_allocate_user(mm, start); + } + + /* Top of stack, grows down. */ + if (!is_kernel_addr(sp)) { + if (preload_add(ti, sp)) + slb_allocate_user(mm, sp); + } + + /* Bottom of heap, grows up. */ + if (heap && !is_kernel_addr(heap)) { + if (preload_add(ti, heap)) + slb_allocate_user(mm, heap); + } + + /* see switch_slb */ + asm volatile("isync" : : : "memory"); + + local_irq_enable(); +} + + /* Flush all user entries from the segment table of the current processor. 
*/ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) { - unsigned long pc = KSTK_EIP(tsk); - unsigned long stack = KSTK_ESP(tsk); - unsigned long exec_base; + struct thread_info *ti = task_thread_info(tsk); + unsigned char i; /* * We need interrupts hard-disabled here, not just soft-disabled, @@ -300,6 +407,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) * which would update the slb_cache/slb_cache_ptr fields in the PACA. */ hard_irq_disable(); + asm volatile("isync" : : : "memory"); if (cpu_has_feature(CPU_FTR_ARCH_300)) { /* * SLBIA IH=3 invalidates all Class=1 SLBEs and their @@ -307,16 +415,14 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) * switch_slb wants. So ARCH_300 does not use the slb * cache. */ - asm volatile("isync ; " PPC_SLBIA(3)" ; isync"); + asm volatile(PPC_SLBIA(3)); } else { unsigned long offset = get_paca()->slb_cache_ptr; if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) && offset <= SLB_CACHE_ENTRIES) { unsigned long slbie_data = 0; - int i; - asm volatile("isync" : : : "memory"); for (i = 0; i < offset; i++) { /* EA */ slbie_data = (unsigned long) @@ -331,7 +437,6 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) if (!cpu_has_feature(CPU_FTR_ARCH_207S) && offset == 1) asm volatile("slbie %0" : : "r" (slbie_data)); - asm volatile("isync" : : : "memory"); } else { struct slb_shadow *p = get_slb_shadow(); unsigned long ksp_esid_data = @@ -339,8 +444,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) unsigned long ksp_vsid_data = be64_to_cpu(p->save_area[KSTACK_INDEX].vsid); - asm volatile("isync\n" - PPC_SLBIA(1) "\n" + asm volatile(PPC_SLBIA(1) "\n" "slbmte %0,%1\n" "isync" :: "r"(ksp_vsid_data), @@ -356,24 +460,35 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) copy_mm_to_paca(mm); /* - * preload some userspace segments into the SLB. - * Almost all 32 and 64bit PowerPC executables are linked at - * 0x10000000 so it makes sense to preload this segment. + * We gradually age out SLBs after a number of context switches to + * reduce reload overhead of unused entries (like we do with FP/VEC + * reload). Each time we wrap 256 switches, take an entry out of the + * SLB preload cache. */ - exec_base = 0x10000000; + tsk->thread.load_slb++; + if (!tsk->thread.load_slb) { + unsigned long pc = KSTK_EIP(tsk); - if (is_kernel_addr(pc) || is_kernel_addr(stack) || - is_kernel_addr(exec_base)) - return; + preload_age(ti); + preload_add(ti, pc); + } - slb_allocate_user(mm, pc); + for (i = 0; i < ti->slb_preload_nr; i++) { + unsigned char idx; + unsigned long ea; - if (!esids_match(pc, stack)) - slb_allocate_user(mm, stack); + idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR; + ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT; - if (!esids_match(pc, exec_base) && - !esids_match(stack, exec_base)) - slb_allocate_user(mm, exec_base); + slb_allocate_user(mm, ea); + } + + /* + * Synchronize slbmte preloads with possible subsequent user memory + * address accesses by the kernel (user mode won't happen until + * rfid, which is safe). 
+ */ + asm volatile("isync" : : : "memory"); } void slb_set_size(u16 size) @@ -642,11 +757,16 @@ long do_slb_fault(struct pt_regs *regs, unsigned long ea) return slb_allocate_kernel(ea, id); } else { struct mm_struct *mm = current->mm; + long err; if (unlikely(!mm)) return -EFAULT; - return slb_allocate_user(mm, ea); + err = slb_allocate_user(mm, ea); + if (!err) + preload_add(current_thread_info(), ea); + + return err; } } From 94ee42727ce06522787a28476465becace1c238b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 3 Oct 2018 00:27:58 +1000 Subject: [PATCH 164/221] powerpc/64s/hash: Simplify slb_flush_and_rebolt() slb_flush_and_rebolt() is misleading, it is called in virtual mode, so it can not possibly change the stack, so it should not be touching the shadow area. And since vmalloc is no longer bolted, it should not change any bolted mappings at all. Change the name to slb_flush_and_restore_bolted(), and have it just load the kernel stack from what's currently in the shadow SLB area. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 2 +- arch/powerpc/kernel/swsusp_asm64.S | 2 +- arch/powerpc/mm/hash_utils_64.c | 4 +- arch/powerpc/mm/slb.c | 46 +++++++------------ arch/powerpc/mm/slice.c | 2 +- 5 files changed, 21 insertions(+), 35 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 14e552ea5e52..60cda8fb0677 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -503,7 +503,7 @@ struct slb_entry { }; extern void slb_initialize(void); -extern void slb_flush_and_rebolt(void); +void slb_flush_and_restore_bolted(void); void slb_flush_all_realmode(void); void __slb_restore_bolted_realmode(void); void slb_restore_bolted_realmode(void); diff --git a/arch/powerpc/kernel/swsusp_asm64.S b/arch/powerpc/kernel/swsusp_asm64.S index f83bf6f72cb0..185216becb8b 100644 --- a/arch/powerpc/kernel/swsusp_asm64.S +++ b/arch/powerpc/kernel/swsusp_asm64.S @@ -262,7 +262,7 @@ END_FW_FTR_SECTION_IFCLR(FW_FEATURE_LPAR) addi r1,r1,-128 #ifdef CONFIG_PPC_BOOK3S_64 - bl slb_flush_and_rebolt + bl slb_flush_and_restore_bolted #endif bl do_after_copyback addi r1,r1,128 diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 854edc3722e0..0cc7fbc3bd1c 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1125,7 +1125,7 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr) if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) { copy_mm_to_paca(mm); - slb_flush_and_rebolt(); + slb_flush_and_restore_bolted(); } } #endif /* CONFIG_PPC_64K_PAGES */ @@ -1197,7 +1197,7 @@ static void check_paca_psize(unsigned long ea, struct mm_struct *mm, if (user_region) { if (psize != get_paca_psize(ea)) { copy_mm_to_paca(mm); - slb_flush_and_rebolt(); + slb_flush_and_restore_bolted(); } } else if (get_paca()->vmalloc_sllp != mmu_psize_defs[mmu_vmalloc_psize].sllp) { diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 3b7d8af09724..d8d9c9bd15d3 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -115,8 +115,6 @@ void __slb_restore_bolted_realmode(void) /* * Insert the bolted entries into an empty SLB. - * This is not the same as rebolt because the bolted segments are not - * changed, just loaded from the shadow area. 
*/ void slb_restore_bolted_realmode(void) { @@ -135,12 +133,15 @@ void slb_flush_all_realmode(void) asm volatile("slbmte %0,%0; slbia" : : "r" (0)); } -void slb_flush_and_rebolt(void) +/* + * This flushes non-bolted entries, it can be run in virtual mode. Must + * be called with interrupts disabled. + */ +void slb_flush_and_restore_bolted(void) { - /* If you change this make sure you change SLB_NUM_BOLTED - * and PR KVM appropriately too. */ - unsigned long linear_llp, lflags; - unsigned long ksp_esid_data, ksp_vsid_data; + struct slb_shadow *p = get_slb_shadow(); + + BUILD_BUG_ON(SLB_NUM_BOLTED != 2); WARN_ON(!irqs_disabled()); @@ -150,30 +151,12 @@ void slb_flush_and_rebolt(void) */ hard_irq_disable(); - linear_llp = mmu_psize_defs[mmu_linear_psize].sllp; - lflags = SLB_VSID_KERNEL | linear_llp; - - ksp_esid_data = mk_esid_data(get_paca()->kstack, mmu_kernel_ssize, KSTACK_INDEX); - if ((ksp_esid_data & ~0xfffffffUL) <= PAGE_OFFSET) { - ksp_esid_data &= ~SLB_ESID_V; - ksp_vsid_data = 0; - slb_shadow_clear(KSTACK_INDEX); - } else { - /* Update stack entry; others don't change */ - slb_shadow_update(get_paca()->kstack, mmu_kernel_ssize, lflags, KSTACK_INDEX); - ksp_vsid_data = - be64_to_cpu(get_slb_shadow()->save_area[KSTACK_INDEX].vsid); - } - - /* We need to do this all in asm, so we're sure we don't touch - * the stack between the slbia and rebolting it. */ asm volatile("isync\n" "slbia\n" - /* Slot 1 - kernel stack */ - "slbmte %0,%1\n" - "isync" - :: "r"(ksp_vsid_data), - "r"(ksp_esid_data) + "slbmte %0, %1\n" + "isync\n" + :: "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].vsid)), + "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].esid)) : "memory"); get_paca()->slb_cache_ptr = 0; @@ -254,7 +237,10 @@ void slb_dump_contents(struct slb_entry *slb_ptr) void slb_vmalloc_update(void) { - slb_flush_and_rebolt(); + /* + * vmalloc is not bolted, so just have to flush non-bolted. + */ + slb_flush_and_restore_bolted(); } static bool preload_hit(struct thread_info *ti, unsigned long esid) diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 546dd07c8083..50ba3d0456a5 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -219,7 +219,7 @@ static void slice_flush_segments(void *parm) copy_mm_to_paca(current->active_mm); local_irq_save(flags); - slb_flush_and_rebolt(); + slb_flush_and_restore_bolted(); local_irq_restore(flags); #endif } From e15a4fea4dee2771c6989862527546b2b3326799 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 3 Oct 2018 00:27:59 +1000 Subject: [PATCH 165/221] powerpc/64s/hash: Add some SLB debugging tests This adds CONFIG_DEBUG_VM checks to ensure: - The kernel stack is in the SLB after it's flushed and bolted. - We don't insert an SLB for an address that is aleady in the SLB. - The kernel SLB miss handler does not take an SLB miss. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/paca.h | 3 ++ arch/powerpc/mm/slb.c | 53 +++++++++++++++++++++++++++++++-- 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 97e8a57a4998..e843bc5d1a0f 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -115,6 +115,9 @@ struct paca_struct { u16 vmalloc_sllp; u8 slb_cache_ptr; u8 stab_rr; /* stab/slb round-robin counter */ +#ifdef CONFIG_DEBUG_VM + u8 in_kernel_slb_handler; +#endif u32 slb_used_bitmap; /* Bitmaps for first 32 SLB entries. 
*/ u32 slb_kern_bitmap; u32 slb_cache[SLB_CACHE_ENTRIES]; diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index d8d9c9bd15d3..4b6e7a21a7c5 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -58,6 +58,30 @@ static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags); } +static void assert_slb_exists(unsigned long ea) +{ +#ifdef CONFIG_DEBUG_VM + unsigned long tmp; + + WARN_ON_ONCE(mfmsr() & MSR_EE); + + asm volatile("slbfee. %0, %1" : "=r"(tmp) : "r"(ea) : "cr0"); + WARN_ON(tmp == 0); +#endif +} + +static void assert_slb_notexists(unsigned long ea) +{ +#ifdef CONFIG_DEBUG_VM + unsigned long tmp; + + WARN_ON_ONCE(mfmsr() & MSR_EE); + + asm volatile("slbfee. %0, %1" : "=r"(tmp) : "r"(ea) : "cr0"); + WARN_ON(tmp != 0); +#endif +} + static inline void slb_shadow_update(unsigned long ea, int ssize, unsigned long flags, enum slb_index index) @@ -90,6 +114,7 @@ static inline void create_shadowed_slbe(unsigned long ea, int ssize, */ slb_shadow_update(ea, ssize, flags, index); + assert_slb_notexists(ea); asm volatile("slbmte %0,%1" : : "r" (mk_vsid_data(ea, ssize, flags)), "r" (mk_esid_data(ea, ssize, index)) @@ -111,6 +136,8 @@ void __slb_restore_bolted_realmode(void) : "r" (be64_to_cpu(p->save_area[index].vsid)), "r" (be64_to_cpu(p->save_area[index].esid))); } + + assert_slb_exists(local_paca->kstack); } /* @@ -158,6 +185,7 @@ void slb_flush_and_restore_bolted(void) :: "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].vsid)), "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].esid)) : "memory"); + assert_slb_exists(get_paca()->kstack); get_paca()->slb_cache_ptr = 0; @@ -410,9 +438,17 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) unsigned long slbie_data = 0; for (i = 0; i < offset; i++) { - /* EA */ - slbie_data = (unsigned long) + unsigned long ea; + + ea = (unsigned long) get_paca()->slb_cache[i] << SID_SHIFT; + /* + * Could assert_slb_exists here, but hypervisor + * or machine check could have come in and + * removed the entry at this point. + */ + + slbie_data = ea; slbie_data |= user_segment_size(slbie_data) << SLBIE_SSIZE_SHIFT; slbie_data |= SLBIE_C; /* user slbs have C=1 */ @@ -640,6 +676,7 @@ static long slb_insert_entry(unsigned long ea, unsigned long context, * User preloads should add isync afterwards in case the kernel * accesses user memory before it returns to userspace with rfid. */ + assert_slb_notexists(ea); asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data)); barrier(); @@ -740,7 +777,17 @@ long do_slb_fault(struct pt_regs *regs, unsigned long ea) * if they go via fast_exception_return too. */ if (id >= KERNEL_REGION_ID) { - return slb_allocate_kernel(ea, id); + long err; +#ifdef CONFIG_DEBUG_VM + /* Catch recursive kernel SLB faults. */ + BUG_ON(local_paca->in_kernel_slb_handler); + local_paca->in_kernel_slb_handler = 1; +#endif + err = slb_allocate_kernel(ea, id); +#ifdef CONFIG_DEBUG_VM + local_paca->in_kernel_slb_handler = 0; +#endif + return err; } else { struct mm_struct *mm = current->mm; long err; From c9f80734cd552ddba50567bc43b0ff250a4b2c17 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Sep 2018 14:03:57 +0530 Subject: [PATCH 166/221] powerpc/mm/hash: Rename get_ea_context to get_user_context We will be adding get_kernel_context later. Update function name to indicate this handle context allocation user space address. 
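As context for the rename, here is a minimal sketch of what the user-side helper does: it picks a context id for a user effective address by indexing a per-mm table with the address chunk, where each chunk spans MAX_EA_BITS_PER_CONTEXT bits. This is an illustration only, not part of the patch: the index computation is taken from the hunk below, while the extended_id[] field is an assumed name used purely for the example. The need_extra_context() caller shown in the mmu_context.h hunk treats a zero return as "no context allocated yet for this chunk".

/*
 * Illustrative sketch only -- not part of the patch. The index
 * computation matches the hunk below; the extended_id[] table is an
 * assumed per-mm field used here for illustration.
 */
static inline int get_user_context(mm_context_t *ctx, unsigned long ea)
{
	int index = ea >> MAX_EA_BITS_PER_CONTEXT;

	if (index >= ARRAY_SIZE(ctx->extended_id))	/* beyond supported user VA */
		return 0;

	return ctx->extended_id[index];	/* 0 means not yet allocated */
}

The kernel-side counterpart, get_kernel_context(), is introduced by the following patch and derives the context from the kernel region instead of a per-mm table.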
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu.h | 4 ++-- arch/powerpc/include/asm/mmu_context.h | 2 +- arch/powerpc/mm/slb.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 9c8c669a6b6a..6328857f259f 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -208,7 +208,7 @@ extern void radix_init_pseries(void); static inline void radix_init_pseries(void) { }; #endif -static inline int get_ea_context(mm_context_t *ctx, unsigned long ea) +static inline int get_user_context(mm_context_t *ctx, unsigned long ea) { int index = ea >> MAX_EA_BITS_PER_CONTEXT; @@ -223,7 +223,7 @@ static inline int get_ea_context(mm_context_t *ctx, unsigned long ea) static inline unsigned long get_user_vsid(mm_context_t *ctx, unsigned long ea, int ssize) { - unsigned long context = get_ea_context(ctx, ea); + unsigned long context = get_user_context(ctx, ea); return get_vsid(context, ea, ssize); } diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index b2f89b621b15..dbbab7ba449b 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -81,7 +81,7 @@ static inline bool need_extra_context(struct mm_struct *mm, unsigned long ea) { int context_id; - context_id = get_ea_context(&mm->context, ea); + context_id = get_user_context(&mm->context, ea); if (!context_id) return true; return false; diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 4b6e7a21a7c5..4fe5cb5052b6 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -734,7 +734,7 @@ static long slb_allocate_user(struct mm_struct *mm, unsigned long ea) if (ea >= mm->context.slb_addr_limit) return -EFAULT; - context = get_ea_context(&mm->context, ea); + context = get_user_context(&mm->context, ea); if (!context) return -EFAULT; From 4ffe713b7587b14695c9bec26a000fc88ef54895 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Sep 2018 14:03:58 +0530 Subject: [PATCH 167/221] powerpc/mm: Increase the max addressable memory to 2PB Currently we limit the max addressable memory to 128TB. This patch increase the limit to 2PB. We can have devices like nvdimm which adds memory above 512TB limit. We still don't support regular system ram above 512TB. One of the challenge with that is the percpu allocator, that allocates per node memory and use the max distance between them as the percpu offsets. This means with large gap in address space ( system ram above 1PB) we will run out of vmalloc space to map the percpu allocation. In order to support addressable memory above 512TB, kernel should be able to linear map this range. To do that with hash translation we now add 4 context to kernel linear map region. Our per context addressable range is 512TB. We still keep VMALLOC and VMEMMAP region to old size. SLB miss handlers is updated to validate these limit. 
We also limit this update to SPARSEMEM_VMEMMAP and SPARSEMEM_EXTREME Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 82 +++++++++++++------ arch/powerpc/include/asm/mmu.h | 15 ++++ arch/powerpc/include/asm/sparsemem.h | 11 --- arch/powerpc/mm/slb.c | 20 +++-- 4 files changed, 87 insertions(+), 41 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 60cda8fb0677..fc7f056e9d97 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -521,13 +521,9 @@ extern void slb_set_size(u16 size); * from mmu context id and effective segment id of the address. * * For user processes max context id is limited to MAX_USER_CONTEXT. - - * For kernel space, we use context ids 1-4 to map addresses as below: - * NOTE: each context only support 64TB now. - * 0x00001 - [ 0xc000000000000000 - 0xc0003fffffffffff ] - * 0x00002 - [ 0xd000000000000000 - 0xd0003fffffffffff ] - * 0x00003 - [ 0xe000000000000000 - 0xe0003fffffffffff ] - * 0x00004 - [ 0xf000000000000000 - 0xf0003fffffffffff ] + * more details in get_user_context + * + * For kernel space get_kernel_context * * The proto-VSIDs are then scrambled into real VSIDs with the * multiplicative hash: @@ -567,6 +563,21 @@ extern void slb_set_size(u16 size); #define ESID_BITS_MASK ((1 << ESID_BITS) - 1) #define ESID_BITS_1T_MASK ((1 << ESID_BITS_1T) - 1) +/* + * Now certain config support MAX_PHYSMEM more than 512TB. Hence we will need + * to use more than one context for linear mapping the kernel. + * For vmalloc and memmap, we use just one context with 512TB. With 64 byte + * struct page size, we need ony 32 TB in memmap for 2PB (51 bits (MAX_PHYSMEM_BITS)). + */ +#if (MAX_PHYSMEM_BITS > MAX_EA_BITS_PER_CONTEXT) +#define MAX_KERNEL_CTX_CNT (1UL << (MAX_PHYSMEM_BITS - MAX_EA_BITS_PER_CONTEXT)) +#else +#define MAX_KERNEL_CTX_CNT 1 +#endif + +#define MAX_VMALLOC_CTX_CNT 1 +#define MAX_MEMMAP_CTX_CNT 1 + /* * 256MB segment * The proto-VSID space has 2^(CONTEX_BITS + ESID_BITS) - 1 segments @@ -577,12 +588,13 @@ extern void slb_set_size(u16 size); * We also need to avoid the last segment of the last context, because that * would give a protovsid of 0x1fffffffff. That will result in a VSID 0 * because of the modulo operation in vsid scramble. + * + * We add one extra context to MIN_USER_CONTEXT so that we can map kernel + * context easily. The +1 is to map the unused 0xe region mapping. */ #define MAX_USER_CONTEXT ((ASM_CONST(1) << CONTEXT_BITS) - 2) -#define MIN_USER_CONTEXT (5) - -/* Would be nice to use KERNEL_REGION_ID here */ -#define KERNEL_REGION_CONTEXT_OFFSET (0xc - 1) +#define MIN_USER_CONTEXT (MAX_KERNEL_CTX_CNT + MAX_VMALLOC_CTX_CNT + \ + MAX_MEMMAP_CTX_CNT + 2) /* * For platforms that support on 65bit VA we limit the context bits @@ -742,6 +754,39 @@ static inline unsigned long get_vsid(unsigned long context, unsigned long ea, return vsid_scramble(protovsid, VSID_MULTIPLIER_1T, vsid_bits); } +/* + * For kernel space, we use context ids as below + * below. Range is 512TB per context. + * + * 0x00001 - [ 0xc000000000000000 - 0xc001ffffffffffff] + * 0x00002 - [ 0xc002000000000000 - 0xc003ffffffffffff] + * 0x00003 - [ 0xc004000000000000 - 0xc005ffffffffffff] + * 0x00004 - [ 0xc006000000000000 - 0xc007ffffffffffff] + + * 0x00005 - [ 0xd000000000000000 - 0xd001ffffffffffff ] + * 0x00006 - Not used - Can map 0xe000000000000000 range. 
+ * 0x00007 - [ 0xf000000000000000 - 0xf001ffffffffffff ] + * + * So we can compute the context from the region (top nibble) by + * subtracting 11, or 0xc - 1. + */ +static inline unsigned long get_kernel_context(unsigned long ea) +{ + unsigned long region_id = REGION_ID(ea); + unsigned long ctx; + /* + * For linear mapping we do support multiple context + */ + if (region_id == KERNEL_REGION_ID) { + /* + * We already verified ea to be not beyond the addr limit. + */ + ctx = 1 + ((ea & ~REGION_MASK) >> MAX_EA_BITS_PER_CONTEXT); + } else + ctx = (region_id - 0xc) + MAX_KERNEL_CTX_CNT; + return ctx; +} + /* * This is only valid for addresses >= PAGE_OFFSET */ @@ -752,20 +797,7 @@ static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize) if (!is_kernel_addr(ea)) return 0; - /* - * For kernel space, we use context ids 1-4 to map the address space as - * below: - * - * 0x00001 - [ 0xc000000000000000 - 0xc0003fffffffffff ] - * 0x00002 - [ 0xd000000000000000 - 0xd0003fffffffffff ] - * 0x00003 - [ 0xe000000000000000 - 0xe0003fffffffffff ] - * 0x00004 - [ 0xf000000000000000 - 0xf0003fffffffffff ] - * - * So we can compute the context from the region (top nibble) by - * subtracting 11, or 0xc - 1. - */ - context = (ea >> 60) - KERNEL_REGION_CONTEXT_OFFSET; - + context = get_kernel_context(ea); return get_vsid(context, ea, ssize); } diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 13ea441ac531..eb20eb3b8fb0 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -309,6 +309,21 @@ static inline u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address) */ #define MMU_PAGE_COUNT 16 +/* + * If we store section details in page->flags we can't increase the MAX_PHYSMEM_BITS + * if we increase SECTIONS_WIDTH we will not store node details in page->flags and + * page_to_nid does a page->section->node lookup + * Hence only increase for VMEMMAP. Further depending on SPARSEMEM_EXTREME reduce + * memory requirements with large number of sections. + * 51 bits is the max physical real address on POWER9 + */ +#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_SPARSEMEM_EXTREME) && \ + defined (CONFIG_PPC_64K_PAGES) +#define MAX_PHYSMEM_BITS 51 +#else +#define MAX_PHYSMEM_BITS 46 +#endif + #ifdef CONFIG_PPC_BOOK3S_64 #include #else /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h index 28f5dae25db6..68da49320592 100644 --- a/arch/powerpc/include/asm/sparsemem.h +++ b/arch/powerpc/include/asm/sparsemem.h @@ -9,17 +9,6 @@ * MAX_PHYSMEM_BITS 2^N: how much memory we can have in that space */ #define SECTION_SIZE_BITS 24 -/* - * If we store section details in page->flags we can't increase the MAX_PHYSMEM_BITS - * if we increase SECTIONS_WIDTH we will not store node details in page->flags and - * page_to_nid does a page->section->node lookup - * Hence only increase for VMEMMAP. 
- */ -#ifdef CONFIG_SPARSEMEM_VMEMMAP -#define MAX_PHYSMEM_BITS 47 -#else -#define MAX_PHYSMEM_BITS 46 -#endif #endif /* CONFIG_SPARSEMEM */ diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 4fe5cb5052b6..c3fdf2969d9f 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -693,16 +693,27 @@ static long slb_allocate_kernel(unsigned long ea, unsigned long id) unsigned long flags; int ssize; - if ((ea & ~REGION_MASK) >= (1ULL << MAX_EA_BITS_PER_CONTEXT)) - return -EFAULT; - if (id == KERNEL_REGION_ID) { + + /* We only support upto MAX_PHYSMEM_BITS */ + if ((ea & ~REGION_MASK) > (1UL << MAX_PHYSMEM_BITS)) + return -EFAULT; + flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp; + #ifdef CONFIG_SPARSEMEM_VMEMMAP } else if (id == VMEMMAP_REGION_ID) { + + if ((ea & ~REGION_MASK) >= (1ULL << MAX_EA_BITS_PER_CONTEXT)) + return -EFAULT; + flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmemmap_psize].sllp; #endif } else if (id == VMALLOC_REGION_ID) { + + if ((ea & ~REGION_MASK) >= (1ULL << MAX_EA_BITS_PER_CONTEXT)) + return -EFAULT; + if (ea < H_VMALLOC_END) flags = get_paca()->vmalloc_sllp; else @@ -715,8 +726,7 @@ static long slb_allocate_kernel(unsigned long ea, unsigned long id) if (!mmu_has_feature(MMU_FTR_1T_SEGMENT)) ssize = MMU_SEGSIZE_256M; - context = id - KERNEL_REGION_CONTEXT_OFFSET; - + context = get_kernel_context(ea); return slb_insert_entry(ea, context, flags, ssize, true); } From b9fb4480a3af85552d88561a2fea9c4d5f54c917 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Oct 2018 15:09:21 +0530 Subject: [PATCH 168/221] powerpc/mm: Make pte_pgprot return all pte bits Other archs do the same and instead of adding required pte bits (which got masked out) in __ioremap_at(), make sure we filter only pfn bits out. 
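Concretely, the generic helper added in pgtable.h at the end of this patch reduces to masking the physical page number (RPN) bits out of the raw PTE value. A condensed restatement of that hunk, for quick reference (the actual change below uses an intermediate pte_flags variable):

/* Condensed restatement of the pte_pgprot() helper added below. */
static inline pgprot_t pte_pgprot(pte_t pte)
{
	/* Keep every flag bit, drop only the pfn (RPN) bits. */
	return __pgprot(pte_val(pte) & ~PTE_RPN_MASK);
}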
Fixes: 26973fa5ac0e ("powerpc/mm: use pte helpers in generic code") Reviewed-by: Christophe Leroy Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/pgtable.h | 6 ------ arch/powerpc/include/asm/book3s/64/pgtable.h | 8 -------- arch/powerpc/include/asm/nohash/32/pte-40x.h | 5 ----- arch/powerpc/include/asm/nohash/32/pte-44x.h | 5 ----- arch/powerpc/include/asm/nohash/32/pte-8xx.h | 5 ----- arch/powerpc/include/asm/nohash/32/pte-fsl-booke.h | 5 ----- arch/powerpc/include/asm/nohash/pgtable.h | 1 - arch/powerpc/include/asm/nohash/pte-book3e.h | 5 ----- arch/powerpc/include/asm/pgtable.h | 10 ++++++++++ 9 files changed, 10 insertions(+), 40 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 0fbd4c642b51..e61dd3ae5bc0 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -48,11 +48,6 @@ static inline bool pte_user(pte_t pte) #define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HASHPTE | _PAGE_DIRTY | \ _PAGE_ACCESSED | _PAGE_SPECIAL) -/* Mask of bits returned by pte_pgprot() */ -#define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \ - _PAGE_WRITETHRU | _PAGE_USER | _PAGE_ACCESSED | \ - _PAGE_RW | _PAGE_DIRTY) - /* * We define 2 sets of base prot bits, one for basic pages (ie, * cacheable kernel and user pages) and one for non cacheable @@ -396,7 +391,6 @@ static inline int pte_young(pte_t pte) { return !!(pte_val(pte) & _PAGE_ACCESSE static inline int pte_special(pte_t pte) { return !!(pte_val(pte) & _PAGE_SPECIAL); } static inline int pte_none(pte_t pte) { return (pte_val(pte) & ~_PTE_NONE_MASK) == 0; } static inline bool pte_exec(pte_t pte) { return true; } -static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); } static inline int pte_present(pte_t pte) { diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 9db2b8eba61d..d33421648d39 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -128,13 +128,6 @@ #define H_PTE_PKEY (H_PTE_PKEY_BIT0 | H_PTE_PKEY_BIT1 | H_PTE_PKEY_BIT2 | \ H_PTE_PKEY_BIT3 | H_PTE_PKEY_BIT4) -/* - * Mask of bits returned by pte_pgprot() - */ -#define PAGE_PROT_BITS (_PAGE_SAO | _PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT | \ - H_PAGE_4K_PFN | _PAGE_PRIVILEGED | _PAGE_ACCESSED | \ - _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY | _PAGE_EXEC | \ - _PAGE_SOFT_DIRTY | H_PTE_PKEY) /* * We define 2 sets of base prot bits, one for basic pages (ie, * cacheable kernel and user pages) and one for non cacheable @@ -496,7 +489,6 @@ static inline bool pte_exec(pte_t pte) return !!(pte_raw(pte) & cpu_to_be64(_PAGE_EXEC)); } -static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); } #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline bool pte_soft_dirty(pte_t pte) diff --git a/arch/powerpc/include/asm/nohash/32/pte-40x.h b/arch/powerpc/include/asm/nohash/32/pte-40x.h index 7a8b3c94592f..661f4599f2fc 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-40x.h +++ b/arch/powerpc/include/asm/nohash/32/pte-40x.h @@ -73,11 +73,6 @@ /* Until my rework is finished, 40x still needs atomic PTE updates */ #define PTE_ATOMIC_UPDATES 1 -/* Mask of bits returned by pte_pgprot() */ -#define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_NO_CACHE | \ - _PAGE_WRITETHRU | _PAGE_USER | _PAGE_ACCESSED | \ - _PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY 
| _PAGE_EXEC) - #define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED) #define _PAGE_BASE (_PAGE_BASE_NC) diff --git a/arch/powerpc/include/asm/nohash/32/pte-44x.h b/arch/powerpc/include/asm/nohash/32/pte-44x.h index 8d6b268a986f..78bc304f750e 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-44x.h +++ b/arch/powerpc/include/asm/nohash/32/pte-44x.h @@ -93,11 +93,6 @@ #define _PAGE_KERNEL_RW (_PAGE_DIRTY | _PAGE_RW) #define _PAGE_KERNEL_RWX (_PAGE_DIRTY | _PAGE_RW | _PAGE_EXEC) -/* Mask of bits returned by pte_pgprot() */ -#define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \ - _PAGE_WRITETHRU | _PAGE_USER | _PAGE_ACCESSED | \ - _PAGE_RW | _PAGE_DIRTY | _PAGE_EXEC) - /* TODO: Add large page lowmem mapping support */ #define _PMD_PRESENT 0 #define _PMD_PRESENT_MASK (PAGE_MASK) diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h index 1c57efac089d..6bfe041ef59d 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h @@ -55,11 +55,6 @@ #define _PAGE_KERNEL_RW (_PAGE_SH | _PAGE_DIRTY) #define _PAGE_KERNEL_RWX (_PAGE_SH | _PAGE_DIRTY | _PAGE_EXEC) -/* Mask of bits returned by pte_pgprot() */ -#define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_NO_CACHE | \ - _PAGE_ACCESSED | _PAGE_RO | _PAGE_NA | \ - _PAGE_SH | _PAGE_DIRTY | _PAGE_EXEC) - #define _PMD_PRESENT 0x0001 #define _PMD_PRESENT_MASK _PMD_PRESENT #define _PMD_BAD 0x0fd0 diff --git a/arch/powerpc/include/asm/nohash/32/pte-fsl-booke.h b/arch/powerpc/include/asm/nohash/32/pte-fsl-booke.h index 1ecf60fe0909..0fc1bd42bb3e 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-fsl-booke.h +++ b/arch/powerpc/include/asm/nohash/32/pte-fsl-booke.h @@ -39,11 +39,6 @@ /* No page size encoding in the linux PTE */ #define _PAGE_PSIZE 0 -/* Mask of bits returned by pte_pgprot() */ -#define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \ - _PAGE_WRITETHRU | _PAGE_USER | _PAGE_ACCESSED | \ - _PAGE_RW | _PAGE_DIRTY | _PAGE_EXEC) - #define _PMD_PRESENT 0 #define _PMD_PRESENT_MASK (PAGE_MASK) #define _PMD_BAD (~PAGE_MASK) diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 04e9f0922ad4..70ff23974b59 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -52,7 +52,6 @@ static inline int pte_none(pte_t pte) { return (pte_val(pte) & ~_PTE_NONE_MASK) static inline bool pte_hashpte(pte_t pte) { return false; } static inline bool pte_ci(pte_t pte) { return pte_val(pte) & _PAGE_NO_CACHE; } static inline bool pte_exec(pte_t pte) { return pte_val(pte) & _PAGE_EXEC; } -static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); } #ifdef CONFIG_NUMA_BALANCING /* diff --git a/arch/powerpc/include/asm/nohash/pte-book3e.h b/arch/powerpc/include/asm/nohash/pte-book3e.h index 58eef8cb569d..f95ab6eaf441 100644 --- a/arch/powerpc/include/asm/nohash/pte-book3e.h +++ b/arch/powerpc/include/asm/nohash/pte-book3e.h @@ -82,11 +82,6 @@ #define _PTE_NONE_MASK 0 #endif -/* Mask of bits returned by pte_pgprot() */ -#define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \ - _PAGE_WRITETHRU | _PAGE_USER | _PAGE_ACCESSED | \ - _PAGE_PRIVILEGED | _PAGE_RW | _PAGE_DIRTY | _PAGE_EXEC) - /* * We define 2 sets of base prot bits, one for basic pages (ie, * cacheable kernel and user pages) and one for non cacheable diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h 
index fb4b85bba110..9679b7519a35 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -46,6 +46,16 @@ struct mm_struct; /* Keep these as a macros to avoid include dependency mess */ #define pte_page(x) pfn_to_page(pte_pfn(x)) #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) +/* + * Select all bits except the pfn + */ +static inline pgprot_t pte_pgprot(pte_t pte) +{ + unsigned long pte_flags; + + pte_flags = pte_val(pte) & ~PTE_RPN_MASK; + return __pgprot(pte_flags); +} /* * ZERO_PAGE is a global shared page that is always zero: used From bde1a1335c5031758d3917d83dd5b85b761bbebd Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 17 Oct 2018 13:03:22 +0000 Subject: [PATCH 169/221] powerpc/book3e: redefine pte_mkprivileged() and pte_mkuser() Book3e defines both _PAGE_USER and _PAGE_PRIVILEGED, so the nohash default pte_mkprivileged() and pte_mkuser() are not usable. This patch redefines them for book3e. In theorie, only pte_mkprivileged() needs to be redefined because _PAGE_USER includes _PAGE_PRIVILEGED, but it is less confusing to redefine both. Fixes: a0da4bc166f2 ("powerpc/mm: Allow platforms to redefine some helpers") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/pte-book3e.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/powerpc/include/asm/nohash/pte-book3e.h b/arch/powerpc/include/asm/nohash/pte-book3e.h index f95ab6eaf441..dd40d200f274 100644 --- a/arch/powerpc/include/asm/nohash/pte-book3e.h +++ b/arch/powerpc/include/asm/nohash/pte-book3e.h @@ -104,5 +104,21 @@ #define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER) #define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) +#ifndef __ASSEMBLY__ +static inline pte_t pte_mkprivileged(pte_t pte) +{ + return __pte((pte_val(pte) & ~_PAGE_USER) | _PAGE_PRIVILEGED); +} + +#define pte_mkprivileged pte_mkprivileged + +static inline pte_t pte_mkuser(pte_t pte) +{ + return __pte((pte_val(pte) & ~_PAGE_PRIVILEGED) | _PAGE_USER); +} + +#define pte_mkuser pte_mkuser +#endif /* __ASSEMBLY__ */ + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_NOHASH_PTE_BOOK3E_H */ From 422123ccb9a13dcea2e008194ae6c262fbb64604 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 15 Oct 2018 07:20:45 +0000 Subject: [PATCH 170/221] powerpc/traps: fix machine check handlers to use pr_cont() When printing the machine check cause, the cause appears on the following line due to bad use of printk without \n: [ 33.663993] Machine check in kernel mode. [ 33.664011] Caused by (from SRR1=9032): [ 33.664036] Data access error at address c90c8000 This patch fixes it by using pr_cont() for the second part: [ 133.258131] Machine check in kernel mode. 
[ 133.258146] Caused by (from SRR1=9032): Data access error at address c90c8000 Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/traps.c | 78 +++++++++++----------- arch/powerpc/platforms/8xx/machine_check.c | 4 +- 2 files changed, 41 insertions(+), 41 deletions(-) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index fd58749b4d6b..c8559b2c2c81 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -535,10 +535,10 @@ int machine_check_e500mc(struct pt_regs *regs) printk("Caused by (from MCSR=%lx): ", reason); if (reason & MCSR_MCP) - printk("Machine Check Signal\n"); + pr_cont("Machine Check Signal\n"); if (reason & MCSR_ICPERR) { - printk("Instruction Cache Parity Error\n"); + pr_cont("Instruction Cache Parity Error\n"); /* * This is recoverable by invalidating the i-cache. @@ -556,7 +556,7 @@ int machine_check_e500mc(struct pt_regs *regs) } if (reason & MCSR_DCPERR_MC) { - printk("Data Cache Parity Error\n"); + pr_cont("Data Cache Parity Error\n"); /* * In write shadow mode we auto-recover from the error, but it @@ -575,38 +575,38 @@ int machine_check_e500mc(struct pt_regs *regs) } if (reason & MCSR_L2MMU_MHIT) { - printk("Hit on multiple TLB entries\n"); + pr_cont("Hit on multiple TLB entries\n"); recoverable = 0; } if (reason & MCSR_NMI) - printk("Non-maskable interrupt\n"); + pr_cont("Non-maskable interrupt\n"); if (reason & MCSR_IF) { - printk("Instruction Fetch Error Report\n"); + pr_cont("Instruction Fetch Error Report\n"); recoverable = 0; } if (reason & MCSR_LD) { - printk("Load Error Report\n"); + pr_cont("Load Error Report\n"); recoverable = 0; } if (reason & MCSR_ST) { - printk("Store Error Report\n"); + pr_cont("Store Error Report\n"); recoverable = 0; } if (reason & MCSR_LDG) { - printk("Guarded Load Error Report\n"); + pr_cont("Guarded Load Error Report\n"); recoverable = 0; } if (reason & MCSR_TLBSYNC) - printk("Simultaneous tlbsync operations\n"); + pr_cont("Simultaneous tlbsync operations\n"); if (reason & MCSR_BSL2_ERR) { - printk("Level 2 Cache Error\n"); + pr_cont("Level 2 Cache Error\n"); recoverable = 0; } @@ -616,7 +616,7 @@ int machine_check_e500mc(struct pt_regs *regs) addr = mfspr(SPRN_MCAR); addr |= (u64)mfspr(SPRN_MCARU) << 32; - printk("Machine Check %s Address: %#llx\n", + pr_cont("Machine Check %s Address: %#llx\n", reason & MCSR_MEA ? 
"Effective" : "Physical", addr); } @@ -640,29 +640,29 @@ int machine_check_e500(struct pt_regs *regs) printk("Caused by (from MCSR=%lx): ", reason); if (reason & MCSR_MCP) - printk("Machine Check Signal\n"); + pr_cont("Machine Check Signal\n"); if (reason & MCSR_ICPERR) - printk("Instruction Cache Parity Error\n"); + pr_cont("Instruction Cache Parity Error\n"); if (reason & MCSR_DCP_PERR) - printk("Data Cache Push Parity Error\n"); + pr_cont("Data Cache Push Parity Error\n"); if (reason & MCSR_DCPERR) - printk("Data Cache Parity Error\n"); + pr_cont("Data Cache Parity Error\n"); if (reason & MCSR_BUS_IAERR) - printk("Bus - Instruction Address Error\n"); + pr_cont("Bus - Instruction Address Error\n"); if (reason & MCSR_BUS_RAERR) - printk("Bus - Read Address Error\n"); + pr_cont("Bus - Read Address Error\n"); if (reason & MCSR_BUS_WAERR) - printk("Bus - Write Address Error\n"); + pr_cont("Bus - Write Address Error\n"); if (reason & MCSR_BUS_IBERR) - printk("Bus - Instruction Data Error\n"); + pr_cont("Bus - Instruction Data Error\n"); if (reason & MCSR_BUS_RBERR) - printk("Bus - Read Data Bus Error\n"); + pr_cont("Bus - Read Data Bus Error\n"); if (reason & MCSR_BUS_WBERR) - printk("Bus - Write Data Bus Error\n"); + pr_cont("Bus - Write Data Bus Error\n"); if (reason & MCSR_BUS_IPERR) - printk("Bus - Instruction Parity Error\n"); + pr_cont("Bus - Instruction Parity Error\n"); if (reason & MCSR_BUS_RPERR) - printk("Bus - Read Parity Error\n"); + pr_cont("Bus - Read Parity Error\n"); return 0; } @@ -680,19 +680,19 @@ int machine_check_e200(struct pt_regs *regs) printk("Caused by (from MCSR=%lx): ", reason); if (reason & MCSR_MCP) - printk("Machine Check Signal\n"); + pr_cont("Machine Check Signal\n"); if (reason & MCSR_CP_PERR) - printk("Cache Push Parity Error\n"); + pr_cont("Cache Push Parity Error\n"); if (reason & MCSR_CPERR) - printk("Cache Parity Error\n"); + pr_cont("Cache Parity Error\n"); if (reason & MCSR_EXCP_ERR) - printk("ISI, ITLB, or Bus Error on first instruction fetch for an exception handler\n"); + pr_cont("ISI, ITLB, or Bus Error on first instruction fetch for an exception handler\n"); if (reason & MCSR_BUS_IRERR) - printk("Bus - Read Bus Error on instruction fetch\n"); + pr_cont("Bus - Read Bus Error on instruction fetch\n"); if (reason & MCSR_BUS_DRERR) - printk("Bus - Read Bus Error on data load\n"); + pr_cont("Bus - Read Bus Error on data load\n"); if (reason & MCSR_BUS_WRERR) - printk("Bus - Write Bus Error on buffered store or cache line push\n"); + pr_cont("Bus - Write Bus Error on buffered store or cache line push\n"); return 0; } @@ -705,30 +705,30 @@ int machine_check_generic(struct pt_regs *regs) printk("Caused by (from SRR1=%lx): ", reason); switch (reason & 0x601F0000) { case 0x80000: - printk("Machine check signal\n"); + pr_cont("Machine check signal\n"); break; case 0: /* for 601 */ case 0x40000: case 0x140000: /* 7450 MSS error and TEA */ - printk("Transfer error ack signal\n"); + pr_cont("Transfer error ack signal\n"); break; case 0x20000: - printk("Data parity error signal\n"); + pr_cont("Data parity error signal\n"); break; case 0x10000: - printk("Address parity error signal\n"); + pr_cont("Address parity error signal\n"); break; case 0x20000000: - printk("L1 Data Cache error\n"); + pr_cont("L1 Data Cache error\n"); break; case 0x40000000: - printk("L1 Instruction Cache error\n"); + pr_cont("L1 Instruction Cache error\n"); break; case 0x00100000: - printk("L2 data cache parity error\n"); + pr_cont("L2 data cache parity error\n"); break; default: - 
printk("Unknown values in msr\n"); + pr_cont("Unknown values in msr\n"); } return 0; } diff --git a/arch/powerpc/platforms/8xx/machine_check.c b/arch/powerpc/platforms/8xx/machine_check.c index 402016705a39..9944fc303df0 100644 --- a/arch/powerpc/platforms/8xx/machine_check.c +++ b/arch/powerpc/platforms/8xx/machine_check.c @@ -18,9 +18,9 @@ int machine_check_8xx(struct pt_regs *regs) pr_err("Machine check in kernel mode.\n"); pr_err("Caused by (from SRR1=%lx): ", reason); if (reason & 0x40000000) - pr_err("Fetch error at address %lx\n", regs->nip); + pr_cont("Fetch error at address %lx\n", regs->nip); else - pr_err("Data access error at address %lx\n", regs->dar); + pr_cont("Data access error at address %lx\n", regs->dar); #ifdef CONFIG_PCI /* the qspan pci read routines can cause machine checks -- Cort From 4c5d87db497832c493ed296157bd1749dddc69f1 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Mon, 15 Oct 2018 10:18:27 +1100 Subject: [PATCH 171/221] powerpc/pseries: PAPR persistent memory support This patch implements support for discovering storage class memory devices at boot and for handling hotplug of new regions via RTAS hotplug events. Signed-off-by: Oliver O'Halloran [mpe: Fix CONFIG_MEMORY_HOTPLUG=n build] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/firmware.h | 4 +- arch/powerpc/include/asm/hvcall.h | 10 +- arch/powerpc/include/asm/rtas.h | 2 + arch/powerpc/kernel/rtasd.c | 2 + arch/powerpc/platforms/pseries/Makefile | 2 +- arch/powerpc/platforms/pseries/dlpar.c | 4 + arch/powerpc/platforms/pseries/firmware.c | 1 + arch/powerpc/platforms/pseries/pmem.c | 164 ++++++++++++++++++++++ arch/powerpc/platforms/pseries/pseries.h | 5 + arch/powerpc/platforms/pseries/ras.c | 3 +- 10 files changed, 193 insertions(+), 4 deletions(-) create mode 100644 arch/powerpc/platforms/pseries/pmem.c diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h index 2aca2655fe30..00bc42d95679 100644 --- a/arch/powerpc/include/asm/firmware.h +++ b/arch/powerpc/include/asm/firmware.h @@ -53,6 +53,7 @@ #define FW_FEATURE_DRMEM_V2 ASM_CONST(0x0000000400000000) #define FW_FEATURE_DRC_INFO ASM_CONST(0x0000000800000000) #define FW_FEATURE_BLOCK_REMOVE ASM_CONST(0x0000001000000000) +#define FW_FEATURE_PAPR_SCM ASM_CONST(0x0000002000000000) #ifndef __ASSEMBLY__ @@ -70,7 +71,8 @@ enum { FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY | FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN | FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 | - FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE, + FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE | + FW_FEATURE_PAPR_SCM, FW_FEATURE_PSERIES_ALWAYS = 0, FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL, FW_FEATURE_POWERNV_ALWAYS = 0, diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index c349d3960d63..5a2c5ea57b73 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -296,7 +296,15 @@ #define H_INT_ESB 0x3C8 #define H_INT_SYNC 0x3CC #define H_INT_RESET 0x3D0 -#define MAX_HCALL_OPCODE H_INT_RESET +#define H_SCM_READ_METADATA 0x3E4 +#define H_SCM_WRITE_METADATA 0x3E8 +#define H_SCM_BIND_MEM 0x3EC +#define H_SCM_UNBIND_MEM 0x3F0 +#define H_SCM_QUERY_BLOCK_MEM_BINDING 0x3F4 +#define H_SCM_QUERY_LOGICAL_MEM_BINDING 0x3F8 +#define H_SCM_MEM_QUERY 0x3FC +#define H_SCM_BLOCK_CLEAR 0x400 +#define MAX_HCALL_OPCODE H_SCM_BLOCK_CLEAR /* H_VIOCTL functions */ #define H_GET_VIOA_DUMP_SIZE 0x01 diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 
0183e9595acc..bb38dd67d47d 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -125,6 +125,7 @@ struct rtas_suspend_me_data { #define RTAS_TYPE_INFO 0xE2 #define RTAS_TYPE_DEALLOC 0xE3 #define RTAS_TYPE_DUMP 0xE4 +#define RTAS_TYPE_HOTPLUG 0xE5 /* I don't add PowerMGM events right now, this is a different topic */ #define RTAS_TYPE_PMGM_POWER_SW_ON 0x60 #define RTAS_TYPE_PMGM_POWER_SW_OFF 0x61 @@ -329,6 +330,7 @@ struct pseries_hp_errorlog { #define PSERIES_HP_ELOG_RESOURCE_MEM 2 #define PSERIES_HP_ELOG_RESOURCE_SLOT 3 #define PSERIES_HP_ELOG_RESOURCE_PHB 4 +#define PSERIES_HP_ELOG_RESOURCE_PMEM 6 #define PSERIES_HP_ELOG_ACTION_ADD 1 #define PSERIES_HP_ELOG_ACTION_REMOVE 2 diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index c1378661b12f..38cadae4ca4f 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -91,6 +91,8 @@ static char *rtas_event_type(int type) return "Dump Notification Event"; case RTAS_TYPE_PRRN: return "Platform Resource Reassignment Event"; + case RTAS_TYPE_HOTPLUG: + return "Hotplug Event"; } return rtas_type[0]; diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index 7e89d5c47068..892b27ced973 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_KEXEC_CORE) += kexec.o obj-$(CONFIG_PSERIES_ENERGY) += pseries_energy.o obj-$(CONFIG_HOTPLUG_CPU) += hotplug-cpu.o -obj-$(CONFIG_MEMORY_HOTPLUG) += hotplug-memory.o +obj-$(CONFIG_MEMORY_HOTPLUG) += hotplug-memory.o pmem.o obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o obj-$(CONFIG_HVCS) += hvcserver.o diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index 052c4f2ba0a0..7625546caefd 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -355,6 +355,10 @@ int handle_dlpar_errorlog(struct pseries_hp_errorlog *hp_elog) case PSERIES_HP_ELOG_RESOURCE_CPU: rc = dlpar_cpu(hp_elog); break; + case PSERIES_HP_ELOG_RESOURCE_PMEM: + rc = dlpar_hp_pmem(hp_elog); + break; + default: pr_warn_ratelimited("Invalid resource (%d) specified\n", hp_elog->resource); diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c index 1624501386f4..608ecad0178f 100644 --- a/arch/powerpc/platforms/pseries/firmware.c +++ b/arch/powerpc/platforms/pseries/firmware.c @@ -66,6 +66,7 @@ hypertas_fw_features_table[] = { {FW_FEATURE_BEST_ENERGY, "hcall-best-energy-1*"}, {FW_FEATURE_HPT_RESIZE, "hcall-hpt-resize"}, {FW_FEATURE_BLOCK_REMOVE, "hcall-block-remove"}, + {FW_FEATURE_PAPR_SCM, "hcall-scm"}, }; /* Build up the firmware features bitmask using the contents of diff --git a/arch/powerpc/platforms/pseries/pmem.c b/arch/powerpc/platforms/pseries/pmem.c new file mode 100644 index 000000000000..a27f40eb57b1 --- /dev/null +++ b/arch/powerpc/platforms/pseries/pmem.c @@ -0,0 +1,164 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Handles hot and cold plug of persistent memory regions on pseries. 
+ */ + +#define pr_fmt(fmt) "pseries-pmem: " fmt + +#include +#include +#include +#include /* for idle_task_exit */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pseries.h" +#include "offline_states.h" + +static struct device_node *pmem_node; + +static ssize_t pmem_drc_add_node(u32 drc_index) +{ + struct device_node *dn; + int rc; + + pr_debug("Attempting to add pmem node, drc index: %x\n", drc_index); + + rc = dlpar_acquire_drc(drc_index); + if (rc) { + pr_err("Failed to acquire DRC, rc: %d, drc index: %x\n", + rc, drc_index); + return -EINVAL; + } + + dn = dlpar_configure_connector(cpu_to_be32(drc_index), pmem_node); + if (!dn) { + pr_err("configure-connector failed for drc %x\n", drc_index); + dlpar_release_drc(drc_index); + return -EINVAL; + } + + /* NB: The of reconfig notifier creates platform device from the node */ + rc = dlpar_attach_node(dn, pmem_node); + if (rc) { + pr_err("Failed to attach node %s, rc: %d, drc index: %x\n", + dn->name, rc, drc_index); + + if (dlpar_release_drc(drc_index)) + dlpar_free_cc_nodes(dn); + + return rc; + } + + pr_info("Successfully added %pOF, drc index: %x\n", dn, drc_index); + + return 0; +} + +static ssize_t pmem_drc_remove_node(u32 drc_index) +{ + struct device_node *dn; + uint32_t index; + int rc; + + for_each_child_of_node(pmem_node, dn) { + if (of_property_read_u32(dn, "ibm,my-drc-index", &index)) + continue; + if (index == drc_index) + break; + } + + if (!dn) { + pr_err("Attempting to remove unused DRC index %x\n", drc_index); + return -ENODEV; + } + + pr_debug("Attempting to remove %pOF, drc index: %x\n", dn, drc_index); + + /* * NB: tears down the ibm,pmemory device as a side-effect */ + rc = dlpar_detach_node(dn); + if (rc) + return rc; + + rc = dlpar_release_drc(drc_index); + if (rc) { + pr_err("Failed to release drc (%x) for CPU %s, rc: %d\n", + drc_index, dn->name, rc); + dlpar_attach_node(dn, pmem_node); + return rc; + } + + pr_info("Successfully removed PMEM with drc index: %x\n", drc_index); + + return 0; +} + +int dlpar_hp_pmem(struct pseries_hp_errorlog *hp_elog) +{ + u32 count, drc_index; + int rc; + + /* slim chance, but we might get a hotplug event while booting */ + if (!pmem_node) + pmem_node = of_find_node_by_type(NULL, "ibm,persistent-memory"); + if (!pmem_node) { + pr_err("Hotplug event for a pmem device, but none exists\n"); + return -ENODEV; + } + + if (hp_elog->id_type != PSERIES_HP_ELOG_ID_DRC_INDEX) { + pr_err("Unsupported hotplug event type %d\n", + hp_elog->id_type); + return -EINVAL; + } + + count = hp_elog->_drc_u.drc_count; + drc_index = hp_elog->_drc_u.drc_index; + + lock_device_hotplug(); + + if (hp_elog->action == PSERIES_HP_ELOG_ACTION_ADD) { + rc = pmem_drc_add_node(drc_index); + } else if (hp_elog->action == PSERIES_HP_ELOG_ACTION_REMOVE) { + rc = pmem_drc_remove_node(drc_index); + } else { + pr_err("Unsupported hotplug action (%d)\n", hp_elog->action); + rc = -EINVAL; + } + + unlock_device_hotplug(); + return rc; +} + +const struct of_device_id drc_pmem_match[] = { + { .type = "ibm,persistent-memory", }, + {} +}; + +static int pseries_pmem_init(void) +{ + pmem_node = of_find_node_by_type(NULL, "ibm,persistent-memory"); + if (!pmem_node) + return 0; + + /* + * The generic OF bus probe/populate handles creating platform devices + * from the child (ibm,pmemory) nodes. The generic code registers an of + * reconfig notifier to handle the hot-add/remove cases too. 
+ */ + of_platform_bus_probe(pmem_node, drc_pmem_match, NULL); + + return 0; +} +machine_arch_initcall(pseries, pseries_pmem_init); diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index 72c0b8986536..7dee8c5d3363 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -65,11 +65,16 @@ int handle_dlpar_errorlog(struct pseries_hp_errorlog *hp_errlog); #ifdef CONFIG_MEMORY_HOTPLUG int dlpar_memory(struct pseries_hp_errorlog *hp_elog); +int dlpar_hp_pmem(struct pseries_hp_errorlog *hp_elog); #else static inline int dlpar_memory(struct pseries_hp_errorlog *hp_elog) { return -EOPNOTSUPP; } +static inline int dlpar_hp_pmem(struct pseries_hp_errorlog *hp_elog) +{ + return -EOPNOTSUPP; +} #endif #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 2a9c28e4d4f9..d97d52772789 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -333,7 +333,8 @@ static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id) * hotplug events on the ras_log_buf to be handled by rtas_errd. */ if (hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_MEM || - hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_CPU) + hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_CPU || + hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_PMEM) queue_hotplug_event(hp_elog); else log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); From b5beae5e224f1c72c4482b0ab36fc3d89481a6b2 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Mon, 15 Oct 2018 10:18:28 +1100 Subject: [PATCH 172/221] powerpc/pseries: Add driver for PAPR SCM regions Adds a driver that implements support for enabling and accessing PAPR SCM regions. Unfortunately due to how the PAPR interface works we can't use the existing of_pmem driver (yet) because: a) The guest is required to use the H_SCM_BIND_MEM h-call to add add the SCM region to it's physical address space, and b) There is currently no mechanism for relating a bare of_pmem region to the backing DIMM (or not-a-DIMM for our case). Both of these are easily handled by rolling the functionality into a seperate driver so here we are... Acked-by: Dan Williams Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/Kconfig | 7 + arch/powerpc/platforms/pseries/Makefile | 1 + arch/powerpc/platforms/pseries/papr_scm.c | 345 ++++++++++++++++++++++ 3 files changed, 353 insertions(+) create mode 100644 arch/powerpc/platforms/pseries/papr_scm.c diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index 39032d9b316c..2e4bd32154b5 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -138,3 +138,10 @@ config IBMEBUS bool "Support for GX bus based adapters" help Bus device driver for GX bus based adapters. + +config PAPR_SCM + depends on PPC_PSERIES && MEMORY_HOTPLUG + select LIBNVDIMM + tristate "Support for the PAPR Storage Class Memory interface" + help + Enable access to hypervisor provided storage class memory. 
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index 892b27ced973..a43ec843c8e2 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_IO_EVENT_IRQ) += io_event_irq.o obj-$(CONFIG_LPARCFG) += lparcfg.o obj-$(CONFIG_IBMVIO) += vio.o obj-$(CONFIG_IBMEBUS) += ibmebus.o +obj-$(CONFIG_PAPR_SCM) += papr_scm.o ifdef CONFIG_PPC_PSERIES obj-$(CONFIG_SUSPEND) += suspend.o diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c new file mode 100644 index 000000000000..ee9372b65ca5 --- /dev/null +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -0,0 +1,345 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define pr_fmt(fmt) "papr-scm: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define BIND_ANY_ADDR (~0ul) + +#define PAPR_SCM_DIMM_CMD_MASK \ + ((1ul << ND_CMD_GET_CONFIG_SIZE) | \ + (1ul << ND_CMD_GET_CONFIG_DATA) | \ + (1ul << ND_CMD_SET_CONFIG_DATA)) + +struct papr_scm_priv { + struct platform_device *pdev; + struct device_node *dn; + uint32_t drc_index; + uint64_t blocks; + uint64_t block_size; + int metadata_size; + + uint64_t bound_addr; + + struct nvdimm_bus_descriptor bus_desc; + struct nvdimm_bus *bus; + struct nvdimm *nvdimm; + struct resource res; + struct nd_region *region; + struct nd_interleave_set nd_set; +}; + +static int drc_pmem_bind(struct papr_scm_priv *p) +{ + unsigned long ret[PLPAR_HCALL_BUFSIZE]; + uint64_t rc, token; + + /* + * When the hypervisor cannot map all the requested memory in a single + * hcall it returns H_BUSY and we call again with the token until + * we get H_SUCCESS. Aborting the retry loop before getting H_SUCCESS + * leave the system in an undefined state, so we wait. 
+ */ + token = 0; + + do { + rc = plpar_hcall(H_SCM_BIND_MEM, ret, p->drc_index, 0, + p->blocks, BIND_ANY_ADDR, token); + token = be64_to_cpu(ret[0]); + cond_resched(); + } while (rc == H_BUSY); + + if (rc) { + dev_err(&p->pdev->dev, "bind err: %lld\n", rc); + return -ENXIO; + } + + p->bound_addr = be64_to_cpu(ret[1]); + + dev_dbg(&p->pdev->dev, "bound drc %x to %pR\n", p->drc_index, &p->res); + + return 0; +} + +static int drc_pmem_unbind(struct papr_scm_priv *p) +{ + unsigned long ret[PLPAR_HCALL_BUFSIZE]; + uint64_t rc, token; + + token = 0; + + /* NB: unbind has the same retry requirements mentioned above */ + do { + rc = plpar_hcall(H_SCM_UNBIND_MEM, ret, p->drc_index, + p->bound_addr, p->blocks, token); + token = be64_to_cpu(ret); + cond_resched(); + } while (rc == H_BUSY); + + if (rc) + dev_err(&p->pdev->dev, "unbind error: %lld\n", rc); + + return !!rc; +} + +static int papr_scm_meta_get(struct papr_scm_priv *p, + struct nd_cmd_get_config_data_hdr *hdr) +{ + unsigned long data[PLPAR_HCALL_BUFSIZE]; + int64_t ret; + + if (hdr->in_offset >= p->metadata_size || hdr->in_length != 1) + return -EINVAL; + + ret = plpar_hcall(H_SCM_READ_METADATA, data, p->drc_index, + hdr->in_offset, 1); + + if (ret == H_PARAMETER) /* bad DRC index */ + return -ENODEV; + if (ret) + return -EINVAL; /* other invalid parameter */ + + hdr->out_buf[0] = data[0] & 0xff; + + return 0; +} + +static int papr_scm_meta_set(struct papr_scm_priv *p, + struct nd_cmd_set_config_hdr *hdr) +{ + int64_t ret; + + if (hdr->in_offset >= p->metadata_size || hdr->in_length != 1) + return -EINVAL; + + ret = plpar_hcall_norets(H_SCM_WRITE_METADATA, + p->drc_index, hdr->in_offset, hdr->in_buf[0], 1); + + if (ret == H_PARAMETER) /* bad DRC index */ + return -ENODEV; + if (ret) + return -EINVAL; /* other invalid parameter */ + + return 0; +} + +int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, + unsigned int cmd, void *buf, unsigned int buf_len, int *cmd_rc) +{ + struct nd_cmd_get_config_size *get_size_hdr; + struct papr_scm_priv *p; + + /* Only dimm-specific calls are supported atm */ + if (!nvdimm) + return -EINVAL; + + p = nvdimm_provider_data(nvdimm); + + switch (cmd) { + case ND_CMD_GET_CONFIG_SIZE: + get_size_hdr = buf; + + get_size_hdr->status = 0; + get_size_hdr->max_xfer = 1; + get_size_hdr->config_size = p->metadata_size; + *cmd_rc = 0; + break; + + case ND_CMD_GET_CONFIG_DATA: + *cmd_rc = papr_scm_meta_get(p, buf); + break; + + case ND_CMD_SET_CONFIG_DATA: + *cmd_rc = papr_scm_meta_set(p, buf); + break; + + default: + return -EINVAL; + } + + dev_dbg(&p->pdev->dev, "returned with cmd_rc = %d\n", *cmd_rc); + + return 0; +} + +static const struct attribute_group *region_attr_groups[] = { + &nd_region_attribute_group, + &nd_device_attribute_group, + &nd_mapping_attribute_group, + &nd_numa_attribute_group, + NULL, +}; + +static const struct attribute_group *bus_attr_groups[] = { + &nvdimm_bus_attribute_group, + NULL, +}; + +static const struct attribute_group *papr_scm_dimm_groups[] = { + &nvdimm_attribute_group, + &nd_device_attribute_group, + NULL, +}; + +static int papr_scm_nvdimm_init(struct papr_scm_priv *p) +{ + struct device *dev = &p->pdev->dev; + struct nd_mapping_desc mapping; + struct nd_region_desc ndr_desc; + unsigned long dimm_flags; + + p->bus_desc.ndctl = papr_scm_ndctl; + p->bus_desc.module = THIS_MODULE; + p->bus_desc.of_node = p->pdev->dev.of_node; + p->bus_desc.attr_groups = bus_attr_groups; + p->bus_desc.provider_name = kstrdup(p->pdev->name, GFP_KERNEL); + + if 
(!p->bus_desc.provider_name) + return -ENOMEM; + + p->bus = nvdimm_bus_register(NULL, &p->bus_desc); + if (!p->bus) { + dev_err(dev, "Error creating nvdimm bus %pOF\n", p->dn); + return -ENXIO; + } + + dimm_flags = 0; + set_bit(NDD_ALIASING, &dimm_flags); + + p->nvdimm = nvdimm_create(p->bus, p, papr_scm_dimm_groups, + dimm_flags, PAPR_SCM_DIMM_CMD_MASK, 0, NULL); + if (!p->nvdimm) { + dev_err(dev, "Error creating DIMM object for %pOF\n", p->dn); + goto err; + } + + /* now add the region */ + + memset(&mapping, 0, sizeof(mapping)); + mapping.nvdimm = p->nvdimm; + mapping.start = 0; + mapping.size = p->blocks * p->block_size; // XXX: potential overflow? + + memset(&ndr_desc, 0, sizeof(ndr_desc)); + ndr_desc.attr_groups = region_attr_groups; + ndr_desc.numa_node = dev_to_node(&p->pdev->dev); + ndr_desc.res = &p->res; + ndr_desc.of_node = p->dn; + ndr_desc.provider_data = p; + ndr_desc.mapping = &mapping; + ndr_desc.num_mappings = 1; + ndr_desc.nd_set = &p->nd_set; + set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); + + p->region = nvdimm_pmem_region_create(p->bus, &ndr_desc); + if (!p->region) { + dev_err(dev, "Error registering region %pR from %pOF\n", + ndr_desc.res, p->dn); + goto err; + } + + return 0; + +err: nvdimm_bus_unregister(p->bus); + kfree(p->bus_desc.provider_name); + return -ENXIO; +} + +static int papr_scm_probe(struct platform_device *pdev) +{ + uint32_t drc_index, metadata_size, unit_cap[2]; + struct device_node *dn = pdev->dev.of_node; + struct papr_scm_priv *p; + int rc; + + /* check we have all the required DT properties */ + if (of_property_read_u32(dn, "ibm,my-drc-index", &drc_index)) { + dev_err(&pdev->dev, "%pOF: missing drc-index!\n", dn); + return -ENODEV; + } + + if (of_property_read_u32_array(dn, "ibm,unit-capacity", unit_cap, 2)) { + dev_err(&pdev->dev, "%pOF: missing unit-capacity!\n", dn); + return -ENODEV; + } + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + + /* optional DT properties */ + of_property_read_u32(dn, "ibm,metadata-size", &metadata_size); + + p->dn = dn; + p->drc_index = drc_index; + p->block_size = unit_cap[0]; + p->blocks = unit_cap[1]; + + /* might be zero */ + p->metadata_size = metadata_size; + p->pdev = pdev; + + /* request the hypervisor to bind this region to somewhere in memory */ + rc = drc_pmem_bind(p); + if (rc) + goto err; + + /* setup the resource for the newly bound range */ + p->res.start = p->bound_addr; + p->res.end = p->bound_addr + p->blocks * p->block_size; + p->res.name = pdev->name; + p->res.flags = IORESOURCE_MEM; + + rc = papr_scm_nvdimm_init(p); + if (rc) + goto err2; + + platform_set_drvdata(pdev, p); + + return 0; + +err2: drc_pmem_unbind(p); +err: kfree(p); + return rc; +} + +static int papr_scm_remove(struct platform_device *pdev) +{ + struct papr_scm_priv *p = platform_get_drvdata(pdev); + + nvdimm_bus_unregister(p->bus); + drc_pmem_unbind(p); + kfree(p); + + return 0; +} + +static const struct of_device_id papr_scm_match[] = { + { .compatible = "ibm,pmemory" }, + { }, +}; + +static struct platform_driver papr_scm_driver = { + .probe = papr_scm_probe, + .remove = papr_scm_remove, + .driver = { + .name = "papr_scm", + .owner = THIS_MODULE, + .of_match_table = papr_scm_match, + }, +}; + +module_platform_driver(papr_scm_driver); +MODULE_DEVICE_TABLE(of, papr_scm_match); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("IBM Corporation"); From e63334e556d9286fc30bec1050360824bcd2d990 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 15 Oct 2018 13:49:52 +1100 Subject: [PATCH 173/221] 
powerpc/prom_init: Replace __initdata with __prombss when applicable This replaces all occurrences of __initdata for uninitialized data with a new __prombss Currently __promdata is defined to be __initdata but we'll eventually change that. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/prom_init.c | 55 +++++++++++++++++---------------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index ad9320347a28..ebda53877842 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -48,6 +48,9 @@ #include +/* All of prom_init bss lives here */ +#define __prombss __initdata + /* * Eventually bump that one up */ @@ -87,7 +90,7 @@ #define OF_WORKAROUNDS 0 #else #define OF_WORKAROUNDS of_workarounds -static int of_workarounds; +static int of_workarounds __prombss; #endif #define OF_WA_CLAIM 1 /* do phys/virt claim separately, then map */ @@ -148,26 +151,26 @@ extern void copy_and_flush(unsigned long dest, unsigned long src, unsigned long size, unsigned long offset); /* prom structure */ -static struct prom_t __initdata prom; +static struct prom_t __prombss prom; -static unsigned long prom_entry __initdata; +static unsigned long __prombss prom_entry; #define PROM_SCRATCH_SIZE 256 -static char __initdata of_stdout_device[256]; -static char __initdata prom_scratch[PROM_SCRATCH_SIZE]; +static char __prombss of_stdout_device[256]; +static char __prombss prom_scratch[PROM_SCRATCH_SIZE]; -static unsigned long __initdata dt_header_start; -static unsigned long __initdata dt_struct_start, dt_struct_end; -static unsigned long __initdata dt_string_start, dt_string_end; +static unsigned long __prombss dt_header_start; +static unsigned long __prombss dt_struct_start, dt_struct_end; +static unsigned long __prombss dt_string_start, dt_string_end; -static unsigned long __initdata prom_initrd_start, prom_initrd_end; +static unsigned long __prombss prom_initrd_start, prom_initrd_end; #ifdef CONFIG_PPC64 -static int __initdata prom_iommu_force_on; -static int __initdata prom_iommu_off; -static unsigned long __initdata prom_tce_alloc_start; -static unsigned long __initdata prom_tce_alloc_end; +static int __prombss prom_iommu_force_on; +static int __prombss prom_iommu_off; +static unsigned long __prombss prom_tce_alloc_start; +static unsigned long __prombss prom_tce_alloc_end; #endif static bool prom_radix_disable __initdata = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT); @@ -190,22 +193,22 @@ struct platform_support { #define PLATFORM_GENERIC 0x0500 #define PLATFORM_OPAL 0x0600 -static int __initdata of_platform; +static int __prombss of_platform; -static char __initdata prom_cmd_line[COMMAND_LINE_SIZE]; +static char __prombss prom_cmd_line[COMMAND_LINE_SIZE]; -static unsigned long __initdata prom_memory_limit; +static unsigned long __prombss prom_memory_limit; -static unsigned long __initdata alloc_top; -static unsigned long __initdata alloc_top_high; -static unsigned long __initdata alloc_bottom; -static unsigned long __initdata rmo_top; -static unsigned long __initdata ram_top; +static unsigned long __prombss alloc_top; +static unsigned long __prombss alloc_top_high; +static unsigned long __prombss alloc_bottom; +static unsigned long __prombss rmo_top; +static unsigned long __prombss ram_top; -static struct mem_map_entry __initdata mem_reserve_map[MEM_RESERVE_MAP_SIZE]; -static int __initdata mem_reserve_cnt; +static struct mem_map_entry __prombss 
mem_reserve_map[MEM_RESERVE_MAP_SIZE]; +static int __prombss mem_reserve_cnt; -static cell_t __initdata regbuf[1024]; +static cell_t __prombss regbuf[1024]; static bool rtas_has_query_cpu_stopped; @@ -1568,8 +1571,8 @@ static void __init prom_close_stdin(void) #ifdef CONFIG_PPC_POWERNV #ifdef CONFIG_PPC_EARLY_DEBUG_OPAL -static u64 __initdata prom_opal_base; -static u64 __initdata prom_opal_entry; +static u64 __prombss prom_opal_base; +static u64 __prombss prom_opal_entry; #endif /* From 11fdb309341ca1ba2e3d03fd1c9c0c6aedaea0b6 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 15 Oct 2018 13:49:53 +1100 Subject: [PATCH 174/221] powerpc/prom_init: Remove support for OPAL v2 We removed support for running under any OPAL version earlier than v3 in 2015 (they never saw the light of day anyway), but we kept some leftovers of this support in prom_init.c, so let's take it out. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/prom_init.c | 125 +++----------------------------- 1 file changed, 10 insertions(+), 115 deletions(-) diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index ebda53877842..84f763e0fe1d 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include @@ -191,7 +190,6 @@ struct platform_support { #define PLATFORM_LPAR 0x0001 #define PLATFORM_POWERMAC 0x0400 #define PLATFORM_GENERIC 0x0500 -#define PLATFORM_OPAL 0x0600 static int __prombss of_platform; @@ -684,7 +682,7 @@ static void __init early_cmdline_parse(void) prom_debug("Radix disabled from cmdline\n"); } -#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) +#ifdef CONFIG_PPC_PSERIES /* * The architecture vector has an array of PVR mask/value pairs, * followed by # option vectors - 1, followed by the option vectors. @@ -1231,7 +1229,7 @@ static void __init prom_send_capabilities(void) } #endif /* __BIG_ENDIAN__ */ } -#endif /* #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */ +#endif /* CONFIG_PPC_PSERIES */ /* * Memory allocation strategy... 
our layout is normally: @@ -1568,88 +1566,6 @@ static void __init prom_close_stdin(void) } } -#ifdef CONFIG_PPC_POWERNV - -#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL -static u64 __prombss prom_opal_base; -static u64 __prombss prom_opal_entry; -#endif - -/* - * Allocate room for and instantiate OPAL - */ -static void __init prom_instantiate_opal(void) -{ - phandle opal_node; - ihandle opal_inst; - u64 base, entry; - u64 size = 0, align = 0x10000; - __be64 val64; - u32 rets[2]; - - prom_debug("prom_instantiate_opal: start...\n"); - - opal_node = call_prom("finddevice", 1, 1, ADDR("/ibm,opal")); - prom_debug("opal_node: %x\n", opal_node); - if (!PHANDLE_VALID(opal_node)) - return; - - val64 = 0; - prom_getprop(opal_node, "opal-runtime-size", &val64, sizeof(val64)); - size = be64_to_cpu(val64); - if (size == 0) - return; - val64 = 0; - prom_getprop(opal_node, "opal-runtime-alignment", &val64,sizeof(val64)); - align = be64_to_cpu(val64); - - base = alloc_down(size, align, 0); - if (base == 0) { - prom_printf("OPAL allocation failed !\n"); - return; - } - - opal_inst = call_prom("open", 1, 1, ADDR("/ibm,opal")); - if (!IHANDLE_VALID(opal_inst)) { - prom_printf("opening opal package failed (%x)\n", opal_inst); - return; - } - - prom_printf("instantiating opal at 0x%llx...", base); - - if (call_prom_ret("call-method", 4, 3, rets, - ADDR("load-opal-runtime"), - opal_inst, - base >> 32, base & 0xffffffff) != 0 - || (rets[0] == 0 && rets[1] == 0)) { - prom_printf(" failed\n"); - return; - } - entry = (((u64)rets[0]) << 32) | rets[1]; - - prom_printf(" done\n"); - - reserve_mem(base, size); - - prom_debug("opal base = 0x%llx\n", base); - prom_debug("opal align = 0x%llx\n", align); - prom_debug("opal entry = 0x%llx\n", entry); - prom_debug("opal size = 0x%llx\n", size); - - prom_setprop(opal_node, "/ibm,opal", "opal-base-address", - &base, sizeof(base)); - prom_setprop(opal_node, "/ibm,opal", "opal-entry-address", - &entry, sizeof(entry)); - -#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL - prom_opal_base = base; - prom_opal_entry = entry; -#endif - prom_debug("prom_instantiate_opal: end...\n"); -} - -#endif /* CONFIG_PPC_POWERNV */ - /* * Allocate room for and instantiate RTAS */ @@ -2156,10 +2072,6 @@ static int __init prom_find_machine_type(void) } } #ifdef CONFIG_PPC64 - /* Try to detect OPAL */ - if (PHANDLE_VALID(call_prom("finddevice", 1, 1, ADDR("/ibm,opal")))) - return PLATFORM_OPAL; - /* Try to figure out if it's an IBM pSeries or any other * PAPR compliant platform. We assume it is if : * - /device_type is "chrp" (please, do NOT use that for future @@ -2488,7 +2400,7 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, } /* Add a "linux,phandle" property if no "phandle" property already - * existed (can happen with OPAL) + * existed. */ if (!has_phandle) { soff = dt_find_string("linux,phandle"); @@ -3178,7 +3090,7 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, */ early_cmdline_parse(); -#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) +#ifdef CONFIG_PPC_PSERIES /* * On pSeries, inform the firmware about our capabilities */ @@ -3222,15 +3134,9 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, * On non-powermacs, try to instantiate RTAS. PowerMacs don't * have a usable RTAS implementation. 
*/ - if (of_platform != PLATFORM_POWERMAC && - of_platform != PLATFORM_OPAL) + if (of_platform != PLATFORM_POWERMAC) prom_instantiate_rtas(); -#ifdef CONFIG_PPC_POWERNV - if (of_platform == PLATFORM_OPAL) - prom_instantiate_opal(); -#endif /* CONFIG_PPC_POWERNV */ - #ifdef CONFIG_PPC64 /* instantiate sml */ prom_instantiate_sml(); @@ -3243,8 +3149,7 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, * * (This must be done after instanciating RTAS) */ - if (of_platform != PLATFORM_POWERMAC && - of_platform != PLATFORM_OPAL) + if (of_platform != PLATFORM_POWERMAC) prom_hold_cpus(); /* @@ -3288,11 +3193,9 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, /* * in case stdin is USB and still active on IBM machines... * Unfortunately quiesce crashes on some powermacs if we have - * closed stdin already (in particular the powerbook 101). It - * appears that the OPAL version of OFW doesn't like it either. + * closed stdin already (in particular the powerbook 101). */ - if (of_platform != PLATFORM_POWERMAC && - of_platform != PLATFORM_OPAL) + if (of_platform != PLATFORM_POWERMAC) prom_close_stdin(); /* @@ -3310,10 +3213,8 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, hdr = dt_header_start; /* Don't print anything after quiesce under OPAL, it crashes OFW */ - if (of_platform != PLATFORM_OPAL) { - prom_printf("Booting Linux via __start() @ 0x%lx ...\n", kbase); - prom_debug("->dt_header_start=0x%lx\n", hdr); - } + prom_printf("Booting Linux via __start() @ 0x%lx ...\n", kbase); + prom_debug("->dt_header_start=0x%lx\n", hdr); #ifdef CONFIG_PPC32 reloc_got2(-offset); @@ -3321,13 +3222,7 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, unreloc_toc(); #endif -#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL - /* OPAL early debug gets the OPAL base & entry in r8 and r9 */ - __start(hdr, kbase, 0, 0, 0, - prom_opal_base, prom_opal_entry); -#else __start(hdr, kbase, 0, 0, 0, 0, 0); -#endif return 0; } From c886087caee759790db47f345f8382d653015de3 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 15 Oct 2018 13:49:54 +1100 Subject: [PATCH 175/221] powerpc/prom_init: Move prom_radix_disable to __prombss Initialize it dynamically instead of statically Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/prom_init.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 84f763e0fe1d..7a5eb0192562 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -172,7 +172,9 @@ static unsigned long __prombss prom_tce_alloc_start; static unsigned long __prombss prom_tce_alloc_end; #endif -static bool prom_radix_disable __initdata = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT); +#ifdef CONFIG_PPC_PSERIES +static bool prom_radix_disable __prombss; +#endif struct platform_support { bool hash_mmu; @@ -665,6 +667,8 @@ static void __init early_cmdline_parse(void) #endif } +#ifdef CONFIG_PPC_PSERIES + prom_radix_disable = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT); opt = strstr(prom_cmd_line, "disable_radix"); if (opt) { opt += 13; @@ -680,6 +684,7 @@ static void __init early_cmdline_parse(void) } if (prom_radix_disable) prom_debug("Radix disabled from cmdline\n"); +#endif /* CONFIG_PPC_PSERIES */ } #ifdef CONFIG_PPC_PSERIES From a614f52e75bd69b513707b4adc672149c4903995 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 15 Oct 2018 13:49:55 +1100 Subject: [PATCH 176/221] 
powerpc/prom_init: Move ibm_arch_vec to __prombss Make the existing initialized definition constant and copy it to a __prombss copy Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/prom_init.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 7a5eb0192562..dec8f7f689a2 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -788,7 +788,7 @@ struct ibm_arch_vec { struct option_vector6 vec6; } __packed; -struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = { +static const struct ibm_arch_vec ibm_architecture_vec_template = { .pvrs = { { .mask = cpu_to_be32(0xfffe0000), /* POWER5/POWER5+ */ @@ -926,6 +926,8 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = { }, }; +static struct ibm_arch_vec __prombss ibm_architecture_vec ____cacheline_aligned; + /* Old method - ELF header with PT_NOTE sections only works on BE */ #ifdef __BIG_ENDIAN__ static const struct fake_elf { @@ -1135,6 +1137,10 @@ static void __init prom_check_platform_support(void) }; int prop_len = prom_getproplen(prom.chosen, "ibm,arch-vec-5-platform-support"); + + /* First copy the architecture vec template */ + ibm_architecture_vec = ibm_architecture_vec_template; + if (prop_len > 1) { int i; u8 vec[8]; From d00e34b92cd7d8f1c10c2f0a8c10368bfca1a5dc Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 15 Oct 2018 13:49:56 +1100 Subject: [PATCH 177/221] powerpc/prom_init: Move const structures to __initconst As they are no longer used past the end of prom_init Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/prom_init.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index dec8f7f689a2..300631d9b844 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -788,7 +788,7 @@ struct ibm_arch_vec { struct option_vector6 vec6; } __packed; -static const struct ibm_arch_vec ibm_architecture_vec_template = { +static const struct ibm_arch_vec ibm_architecture_vec_template __initconst = { .pvrs = { { .mask = cpu_to_be32(0xfffe0000), /* POWER5/POWER5+ */ @@ -963,7 +963,7 @@ static const struct fake_elf { u32 ignore_me; } rpadesc; } rpanote; -} fake_elf = { +} fake_elf __initconst = { .elfhdr = { .e_ident = { 0x7f, 'E', 'L', 'F', ELFCLASS32, ELFDATA2MSB, EV_CURRENT }, @@ -2131,7 +2131,7 @@ static void __init prom_check_displays(void) ihandle ih; int i; - static const unsigned char default_colors[] = { + static const unsigned char default_colors[] __initconst = { 0x00, 0x00, 0x00, 0x00, 0x00, 0xaa, 0x00, 0xaa, 0x00, From 8ca2d5151e7f5cbef42eda780eac56acc0eab47a Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 15 Oct 2018 13:49:57 +1100 Subject: [PATCH 178/221] powerpc/prom_init: Move a few remaining statics to appropriate sections Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/prom_init.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 300631d9b844..f45ed445c9f4 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -173,7 +173,7 @@ static unsigned long __prombss prom_tce_alloc_end; #endif #ifdef CONFIG_PPC_PSERIES -static bool prom_radix_disable __prombss; +static bool __prombss 
prom_radix_disable; #endif struct platform_support { @@ -210,7 +210,7 @@ static int __prombss mem_reserve_cnt; static cell_t __prombss regbuf[1024]; -static bool rtas_has_query_cpu_stopped; +static bool __prombss rtas_has_query_cpu_stopped; /* @@ -525,8 +525,8 @@ static void add_string(char **str, const char *q) static char *tohex(unsigned int x) { - static char digits[] = "0123456789abcdef"; - static char result[9]; + static const char digits[] __initconst = "0123456789abcdef"; + static char result[9] __prombss; int i; result[8] = 0; @@ -2327,7 +2327,7 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, char *namep, *prev_name, *sstart, *p, *ep, *lp, *path; unsigned long soff; unsigned char *valp; - static char pname[MAX_PROPERTY_NAME]; + static char pname[MAX_PROPERTY_NAME] __prombss; int l, room, has_phandle = 0; dt_push_token(OF_DT_BEGIN_NODE, mem_start, mem_end); From 5f69e38885c3483a1838dd946aaf0166b727ecbd Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 15 Oct 2018 13:49:58 +1100 Subject: [PATCH 179/221] powerpc/prom_init: Move __prombss to it's own section and store it in .bss This makes __prombss its own section, and for now store it in .bss. This will give us the ability later to store it elsewhere and/or free it after boot (it's about 8KB). Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/prom_init.c | 2 +- arch/powerpc/kernel/vmlinux.lds.S | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index f45ed445c9f4..88d621a7bf67 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -48,7 +48,7 @@ #include /* All of prom_init bss lives here */ -#define __prombss __initdata +#define __prombss __section(.bss.prominit) /* * Eventually bump that one up diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 07ae018e550e..ac0ceb31b336 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -4,6 +4,9 @@ #else #define PROVIDE32(x) PROVIDE(x) #endif + +#define BSS_FIRST_SECTIONS *(.bss.prominit) + #include #include #include From 2c51d97ee88da897db8405f659d1735ffe86ad7c Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 15 Oct 2018 13:49:59 +1100 Subject: [PATCH 180/221] powerpc: Check prom_init for disallowed sections prom_init.c must not modify the kernel image outside of the .bss.prominit section. 
Thus make sure that prom_init.o doesn't have anything in any of these:

  .data
  .bss
  .init.data

Signed-off-by: Benjamin Herrenschmidt
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/prom_init_check.sh | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/arch/powerpc/kernel/prom_init_check.sh b/arch/powerpc/kernel/prom_init_check.sh
index acb6b9226352..667df97d2595 100644
--- a/arch/powerpc/kernel/prom_init_check.sh
+++ b/arch/powerpc/kernel/prom_init_check.sh
@@ -28,6 +28,18 @@ OBJ="$2"
 
 ERROR=0
 
+function check_section()
+{
+	file=$1
+	section=$2
+	size=$(objdump -h -j $section $file 2>/dev/null | awk "\$2 == \"$section\" {print \$3}")
+	size=${size:-0}
+	if [ $size -ne 0 ]; then
+		ERROR=1
+		echo "Error: Section $section not empty in prom_init.c" >&2
+	fi
+}
+
 for UNDEF in $($NM -u $OBJ | awk '{print $2}')
 do
 	# On 64-bit nm gives us the function descriptors, which have
@@ -66,4 +78,8 @@ do
 	fi
 done
 
+check_section $OBJ .data
+check_section $OBJ .bss
+check_section $OBJ .init.data
+
 exit $ERROR

From f1f208e54d08ccf00121c700a9bb1fe3e55b3a51 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt
Date: Mon, 15 Oct 2018 13:50:00 +1100
Subject: [PATCH 181/221] powerpc/prom_init: Generate "phandle" instead of "linux,phandle"

When creating the boot-time FDT from an actual Open Firmware live tree,
let's generate "phandle" properties for the phandles instead of the old
deprecated "linux,phandle".

Signed-off-by: Benjamin Herrenschmidt
[mpe: Unsplit warning printf()]
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/prom_init.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 88d621a7bf67..f33ff4163a51 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -2410,14 +2410,11 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start,
 			has_phandle = 1;
 	}
 
-	/* Add a "linux,phandle" property if no "phandle" property already
-	 * existed.
-	 */
+	/* Add a "phandle" property if none already exist */
 	if (!has_phandle) {
-		soff = dt_find_string("linux,phandle");
+		soff = dt_find_string("phandle");
 		if (soff == 0)
-			prom_printf("WARNING: Can't find string index for"
-				    " node %s\n", path);
+			prom_printf("WARNING: Can't find string index for node %s\n", path);
 		else {
 			dt_push_token(OF_DT_PROP, mem_start, mem_end);
 			dt_push_token(4, mem_start, mem_end);
@@ -2477,9 +2474,9 @@ static void __init flatten_device_tree(void)
 	dt_string_start = mem_start;
 	mem_start += 4; /* hole */
 
-	/* Add "linux,phandle" in there, we'll need it */
+	/* Add "phandle" in there, we'll need it */
 	namep = make_room(&mem_start, &mem_end, 16, 1);
-	strcpy(namep, "linux,phandle");
+	strcpy(namep, "phandle");
 	mem_start = (unsigned long)namep + strlen(namep) + 1;
 
 	/* Build string array */

From bd03fd84a53ac9ddaeb0a0fc4c4c9836e12f3ab9 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Mon, 15 Oct 2018 07:38:10 +0000
Subject: [PATCH 182/221] powerpc/traps: remove redundant in_interrupt panic in die()

do_exit() already includes a test to panic() if in_interrupt().
This patch removes the powerpc one, which is redundant.
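For reference, the check in the generic code that makes the powerpc one
redundant sits near the top of do_exit() in kernel/exit.c; roughly
(quoted for illustration, not part of this patch):

	/* generic exit path already refuses to run from interrupt context */
	if (unlikely(in_interrupt()))
		panic("Aiee, killing interrupt handler!");

So an oops taken in interrupt context still ends in panic(), just via
the common code rather than the powerpc copy in oops_end().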
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/traps.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index c8559b2c2c81..f1629a8acc4b 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -247,8 +247,6 @@ static void oops_end(unsigned long flags, struct pt_regs *regs, mdelay(MSEC_PER_SEC); } - if (in_interrupt()) - panic("Fatal exception in interrupt"); if (panic_on_oops) panic("Fatal exception"); do_exit(signr); From fc0c8b36d379a046525eacb9c3323ca635283757 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 15 Oct 2018 11:18:49 +1100 Subject: [PATCH 183/221] macintosh/windfarm_smu_sat: Fix debug output There's some antiquated debug output that's trying to do a hand-made hexdump and turning into horrible 1-byte-per-line output these days. Use print_hex_dump() instead Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- drivers/macintosh/windfarm_smu_sat.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/drivers/macintosh/windfarm_smu_sat.c b/drivers/macintosh/windfarm_smu_sat.c index da7f4fc1a51d..a0f61eb853c5 100644 --- a/drivers/macintosh/windfarm_smu_sat.c +++ b/drivers/macintosh/windfarm_smu_sat.c @@ -22,14 +22,6 @@ #define VERSION "1.0" -#define DEBUG - -#ifdef DEBUG -#define DBG(args...) printk(args) -#else -#define DBG(args...) do { } while(0) -#endif - /* If the cache is older than 800ms we'll refetch it */ #define MAX_AGE msecs_to_jiffies(800) @@ -106,13 +98,10 @@ struct smu_sdbp_header *smu_sat_get_sdb_partition(unsigned int sat_id, int id, buf[i+2] = data[3]; buf[i+3] = data[2]; } -#ifdef DEBUG - DBG(KERN_DEBUG "sat %d partition %x:", sat_id, id); - for (i = 0; i < len; ++i) - DBG(" %x", buf[i]); - DBG("\n"); -#endif + printk(KERN_DEBUG "sat %d partition %x:", sat_id, id); + print_hex_dump(KERN_DEBUG, " ", DUMP_PREFIX_OFFSET, + 16, 1, buf, len, false); if (size) *size = len; return (struct smu_sdbp_header *) buf; @@ -132,13 +121,13 @@ static int wf_sat_read_cache(struct wf_sat *sat) if (err < 0) return err; sat->last_read = jiffies; + #ifdef LOTSA_DEBUG { int i; - DBG(KERN_DEBUG "wf_sat_get: data is"); - for (i = 0; i < 16; ++i) - DBG(" %.2x", sat->cache[i]); - DBG("\n"); + printk(KERN_DEBUG "wf_sat_get: data is"); + print_hex_dump(KERN_DEBUG, " ", DUMP_PREFIX_OFFSET, + 16, 1, sat->cache, 16, false); } #endif return 0; From c47ca98d32a22a412ddbc69916cf62bdcdfa1a4e Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 10 Oct 2018 16:13:05 +1100 Subject: [PATCH 184/221] powerpc: Move core kernel logic into arch/powerpc/Kbuild This is a nice cleanup, arch/powerpc/Makefile is long and messy so moving this out helps a little. It also allows us to do: $ make arch/powerpc Which can be helpful if you just want to compile test some changes to arch code and not link everything. Finally it also gives us a single place to do subdir-cc-flags assignments which affect the whole of arch/powerpc, which we will do in a future patch. 
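To illustrate the subdir-cc-flags point, a single assignment of the form
below (this is what a later patch in this series adds, it is not part of
this one):

	subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror

placed in arch/powerpc/Kbuild applies to every sub-directory listed
there, instead of having to be repeated in each Makefile.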
Signed-off-by: Michael Ellerman --- arch/powerpc/Kbuild | 14 ++++++++++++++ arch/powerpc/Makefile | 14 ++------------ 2 files changed, 16 insertions(+), 12 deletions(-) create mode 100644 arch/powerpc/Kbuild diff --git a/arch/powerpc/Kbuild b/arch/powerpc/Kbuild new file mode 100644 index 000000000000..690a498da050 --- /dev/null +++ b/arch/powerpc/Kbuild @@ -0,0 +1,14 @@ +obj-y += kernel/ +obj-y += mm/ +obj-y += lib/ +obj-y += sysdev/ +obj-y += platforms/ +obj-y += math-emu/ +obj-y += crypto/ +obj-y += net/ + +obj-$(CONFIG_XMON) += xmon/ +obj-$(CONFIG_KVM) += kvm/ + +obj-$(CONFIG_PERF_EVENTS) += perf/ +obj-$(CONFIG_KEXEC_FILE) += purgatory/ diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 6c4f8a099bbb..a1cae0377d9e 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -273,18 +273,8 @@ head-$(CONFIG_PPC_FPU) += arch/powerpc/kernel/fpu.o head-$(CONFIG_ALTIVEC) += arch/powerpc/kernel/vector.o head-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE) += arch/powerpc/kernel/prom_init.o -core-y += arch/powerpc/kernel/ \ - arch/powerpc/mm/ \ - arch/powerpc/lib/ \ - arch/powerpc/sysdev/ \ - arch/powerpc/platforms/ \ - arch/powerpc/math-emu/ \ - arch/powerpc/crypto/ \ - arch/powerpc/net/ -core-$(CONFIG_XMON) += arch/powerpc/xmon/ -core-$(CONFIG_KVM) += arch/powerpc/kvm/ -core-$(CONFIG_PERF_EVENTS) += arch/powerpc/perf/ -core-$(CONFIG_KEXEC_FILE) += arch/powerpc/purgatory/ +# See arch/powerpc/Kbuild for content of core part of the kernel +core-y += arch/powerpc/ drivers-$(CONFIG_OPROFILE) += arch/powerpc/oprofile/ From 23ad1a2700725d46ee7760920974c68be81ab82d Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 10 Oct 2018 16:13:06 +1100 Subject: [PATCH 185/221] powerpc: Add -Werror at arch/powerpc level Back when I added -Werror in commit ba55bd74360e ("powerpc: Add configurable -Werror for arch/powerpc") I did it by adding it to most of the arch Makefiles. At the time we excluded math-emu, because apparently it didn't build cleanly. But that seems to have been fixed somewhere in the interim. So move the -Werror addition to the top-level of the arch, this saves us from repeating it in every Makefile and means we won't forget to add it to any new sub-dirs. 
Signed-off-by: Michael Ellerman --- arch/powerpc/Kbuild | 2 ++ arch/powerpc/kernel/Makefile | 2 -- arch/powerpc/kernel/trace/Makefile | 2 -- arch/powerpc/kvm/Makefile | 2 -- arch/powerpc/lib/Makefile | 2 -- arch/powerpc/mm/Makefile | 2 -- arch/powerpc/oprofile/Makefile | 1 - arch/powerpc/perf/Makefile | 1 - arch/powerpc/platforms/Makefile | 2 -- arch/powerpc/sysdev/Makefile | 3 --- arch/powerpc/sysdev/xics/Makefile | 1 - arch/powerpc/sysdev/xive/Makefile | 1 - arch/powerpc/xmon/Makefile | 2 -- 13 files changed, 2 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/Kbuild b/arch/powerpc/Kbuild index 690a498da050..1625a06802ca 100644 --- a/arch/powerpc/Kbuild +++ b/arch/powerpc/Kbuild @@ -1,3 +1,5 @@ +subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror + obj-y += kernel/ obj-y += mm/ obj-y += lib/ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index fb70e9b6fa67..53d4b8d5b54d 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -8,8 +8,6 @@ CFLAGS_ptrace.o += -DUTS_MACHINE='"$(UTS_MACHINE)"' # Disable clang warning for using setjmp without setjmp.h header CFLAGS_crash.o += $(call cc-disable-warning, builtin-requires-header) -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror - ifdef CONFIG_PPC64 CFLAGS_prom_init.o += $(NO_MINIMAL_TOC) endif diff --git a/arch/powerpc/kernel/trace/Makefile b/arch/powerpc/kernel/trace/Makefile index d868ba42032f..b1725ad3e13d 100644 --- a/arch/powerpc/kernel/trace/Makefile +++ b/arch/powerpc/kernel/trace/Makefile @@ -3,8 +3,6 @@ # Makefile for the powerpc trace subsystem # -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror - ifdef CONFIG_FUNCTION_TRACER # do not trace tracer code CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index f872c04bb5b1..e9d579921a96 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -3,8 +3,6 @@ # Makefile for Kernel-based Virtual Machine module # -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror - ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm KVM := ../../../virt/kvm diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 670286808928..703afa1808ed 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -3,8 +3,6 @@ # Makefile for ppc-specific library files.. # -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror - ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) CFLAGS_REMOVE_code-patching.o = $(CC_FLAGS_FTRACE) diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index ceb352ccbc76..ca96e7be4d0e 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -3,8 +3,6 @@ # Makefile for the linux ppc-specific parts of the memory manager. 
# -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror - ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE) diff --git a/arch/powerpc/oprofile/Makefile b/arch/powerpc/oprofile/Makefile index 7a7834c39f64..8d26d7416481 100644 --- a/arch/powerpc/oprofile/Makefile +++ b/arch/powerpc/oprofile/Makefile @@ -1,5 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile index 82986d2acd9b..ab26df5bacb9 100644 --- a/arch/powerpc/perf/Makefile +++ b/arch/powerpc/perf/Makefile @@ -1,5 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror obj-$(CONFIG_PERF_EVENTS) += callchain.o perf_regs.o diff --git a/arch/powerpc/platforms/Makefile b/arch/powerpc/platforms/Makefile index e46bb7ea710f..143d4417f6cc 100644 --- a/arch/powerpc/platforms/Makefile +++ b/arch/powerpc/platforms/Makefile @@ -1,7 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror - obj-$(CONFIG_FSL_ULI1575) += fsl_uli1575.o obj-$(CONFIG_PPC_PMAC) += powermac/ diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile index f730539074c4..2caa4defdfb6 100644 --- a/arch/powerpc/sysdev/Makefile +++ b/arch/powerpc/sysdev/Makefile @@ -1,5 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) @@ -56,8 +55,6 @@ obj-$(CONFIG_PPC_SCOM) += scom.o obj-$(CONFIG_PPC_EARLY_DEBUG_MEMCONS) += udbg_memcons.o -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror - obj-$(CONFIG_PPC_XICS) += xics/ obj-$(CONFIG_PPC_XIVE) += xive/ diff --git a/arch/powerpc/sysdev/xics/Makefile b/arch/powerpc/sysdev/xics/Makefile index 5d438d92472b..ba1e3117b1c0 100644 --- a/arch/powerpc/sysdev/xics/Makefile +++ b/arch/powerpc/sysdev/xics/Makefile @@ -1,5 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror obj-y += xics-common.o obj-$(CONFIG_PPC_ICP_NATIVE) += icp-native.o diff --git a/arch/powerpc/sysdev/xive/Makefile b/arch/powerpc/sysdev/xive/Makefile index 536d6e5706e3..dea2abc23f4d 100644 --- a/arch/powerpc/sysdev/xive/Makefile +++ b/arch/powerpc/sysdev/xive/Makefile @@ -1,4 +1,3 @@ -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror obj-y += common.o obj-$(CONFIG_PPC_XIVE_NATIVE) += native.o diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile index 9d7d8e6d705c..69e7fb47bcaa 100644 --- a/arch/powerpc/xmon/Makefile +++ b/arch/powerpc/xmon/Makefile @@ -4,8 +4,6 @@ # Disable clang warning for using setjmp without setjmp.h header subdir-ccflags-y := $(call cc-disable-warning, builtin-requires-header) -subdir-ccflags-$(CONFIG_PPC_WERROR) += -Werror - GCOV_PROFILE := n UBSAN_SANITIZE := n From a0e102914aa3f619a5bc68a0d33e17d1788cdf4c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 16 Oct 2018 12:33:40 +0000 Subject: [PATCH 186/221] powerpc/io: remove old GCC version implementation GCC 4.6 is the minimum supported now. 
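For context, the surviving variants rely on the "Z" memory constraint
together with the "%y" operand modifier, which makes GCC emit the
indexed-form address these accessors need. A rough sketch of what
DEF_MMIO_IN_X() expands to for a byte-reversed load on a big-endian
kernel (illustrative only, simplified from the real macro):

	static inline u32 in_le32(const volatile u32 __iomem *addr)
	{
		u32 ret;

		/* "Z" + "%y" let gcc form the indexed address lwbrx needs */
		__asm__ __volatile__("sync;lwbrx %0,%y1;twi 0,%0,0;isync"
			: "=r" (ret) : "Z" (*addr) : "memory");
		return ret;
	}

The removed fallback had to pass the address in a plain "r" register and
name the memory operand separately, because gcc 4.0 and older lacked the
"Z" constraint.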
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/io.h | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index 0a034519957d..3ef40b703c4a 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -111,25 +111,6 @@ extern bool isa_io_special; #define IO_SET_SYNC_FLAG() #endif -/* gcc 4.0 and older doesn't have 'Z' constraint */ -#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ == 0) -#define DEF_MMIO_IN_X(name, size, insn) \ -static inline u##size name(const volatile u##size __iomem *addr) \ -{ \ - u##size ret; \ - __asm__ __volatile__("sync;"#insn" %0,0,%1;twi 0,%0,0;isync" \ - : "=r" (ret) : "r" (addr), "m" (*addr) : "memory"); \ - return ret; \ -} - -#define DEF_MMIO_OUT_X(name, size, insn) \ -static inline void name(volatile u##size __iomem *addr, u##size val) \ -{ \ - __asm__ __volatile__("sync;"#insn" %1,0,%2" \ - : "=m" (*addr) : "r" (val), "r" (addr) : "memory"); \ - IO_SET_SYNC_FLAG(); \ -} -#else /* newer gcc */ #define DEF_MMIO_IN_X(name, size, insn) \ static inline u##size name(const volatile u##size __iomem *addr) \ { \ @@ -146,7 +127,6 @@ static inline void name(volatile u##size __iomem *addr, u##size val) \ : "=Z" (*addr) : "r" (val) : "memory"); \ IO_SET_SYNC_FLAG(); \ } -#endif #define DEF_MMIO_IN_D(name, size, insn) \ static inline u##size name(const volatile u##size __iomem *addr) \ From 22a3d03d69dfd3bfc606d1996969820d619b7c27 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 15 Oct 2018 23:01:42 +1100 Subject: [PATCH 187/221] powerpc/uapi: Fix sigcontext definition to use user_pt_regs My recent patch to split pt_regs between user and kernel missed the usage in struct sigcontext. Because this is a user visible struct it should be using the user visible definition, which when we're building for the kernel is called struct user_pt_regs. As far as I can see this hasn't actually caused a bug (yet), because we don't use the sizeof() the sigcontext->regs anywhere. But we should still fix it to avoid confusion and future bugs. Fixes: 002af9391bfb ("powerpc: Split user/kernel definitions of struct pt_regs") Reported-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman --- arch/powerpc/include/uapi/asm/sigcontext.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/uapi/asm/sigcontext.h b/arch/powerpc/include/uapi/asm/sigcontext.h index 2fbe485acdb4..630aeda56d59 100644 --- a/arch/powerpc/include/uapi/asm/sigcontext.h +++ b/arch/powerpc/include/uapi/asm/sigcontext.h @@ -22,7 +22,11 @@ struct sigcontext { #endif unsigned long handler; unsigned long oldmask; - struct pt_regs __user *regs; +#ifdef __KERNEL__ + struct user_pt_regs __user *regs; +#else + struct pt_regs *regs; +#endif #ifdef __powerpc64__ elf_gregset_t gp_regs; elf_fpregset_t fp_regs; From 6ce7bff045f6a0eaa94c63617d73ad0bc66ce40c Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 15 Oct 2018 23:01:43 +1100 Subject: [PATCH 188/221] powerpc/aout: Fix struct user definition to use user_pt_regs I'm pretty sure this is dead code, it's only used by the a.out core dump code, and we don't support a.out. We should remove it. But while it's in the tree it should be using the ABI version of pt_regs which is called user_pt_regs in the kernel, because the whole struct is written to the core dump and so its size shouldn't change. Note this isn't a uapi header so we don't need an ifdef. 
Fixes: 002af9391bfb ("powerpc: Split user/kernel definitions of struct pt_regs") Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/user.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/user.h b/arch/powerpc/include/asm/user.h index 5c0e082eae7b..99443b8594e7 100644 --- a/arch/powerpc/include/asm/user.h +++ b/arch/powerpc/include/asm/user.h @@ -31,7 +31,7 @@ * to write an integer number of pages. */ struct user { - struct pt_regs regs; /* entire machine state */ + struct user_pt_regs regs; /* entire machine state */ size_t u_tsize; /* text size (pages) */ size_t u_dsize; /* data size (pages) */ size_t u_ssize; /* stack size (pages) */ From b4d16ab58c41ff0125822464bdff074cebd0fe47 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 17 Oct 2018 23:39:41 +1100 Subject: [PATCH 189/221] powerpc/time: Fix clockevent_decrementer initalisation for PR KVM In the recent commit 8b78fdb045de ("powerpc/time: Use clockevents_register_device(), fixing an issue with large decrementer") we changed the way we initialise the decrementer clockevent(s). We no longer initialise the mult & shift values of decrementer_clockevent itself. This has the effect of breaking PR KVM, because it uses those values in kvmppc_emulate_dec(). The symptom is guest kernels spin forever mid-way through boot. For now fix it by assigning back to decrementer_clockevent the mult and shift values. Fixes: 8b78fdb045de ("powerpc/time: Use clockevents_register_device(), fixing an issue with large decrementer") Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/time.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 40868f3ee113..68e8f963d108 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -989,6 +989,10 @@ static void register_decrementer_clockevent(int cpu) printk_once(KERN_DEBUG "clockevent: %s mult[%x] shift[%d] cpu[%d]\n", dec->name, dec->mult, dec->shift, cpu); + + /* Set values for KVM, see kvm_emulate_dec() */ + decrementer_clockevent.mult = dec->mult; + decrementer_clockevent.shift = dec->shift; } static void enable_large_decrementer(void) From 7cd01b08d35f1b7d55686ed8cd57c94d3406ec8f Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 7 Jun 2018 15:22:02 +0530 Subject: [PATCH 190/221] powerpc: Add support for function error injection We implement regs_set_return_value() and override_function_with_return() for this purpose. On powerpc, a return from a function (blr) just branches to the location contained in the link register. So, we can just update pt_regs rather than redirecting execution to a dummy function that returns. Signed-off-by: Naveen N. 
Rao Reviewed-by: Samuel Mendoza-Jonas Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/error-injection.h | 13 +++++++++++++ arch/powerpc/include/asm/ptrace.h | 5 +++++ arch/powerpc/lib/Makefile | 2 ++ arch/powerpc/lib/error-inject.c | 16 ++++++++++++++++ 5 files changed, 37 insertions(+) create mode 100644 arch/powerpc/include/asm/error-injection.h create mode 100644 arch/powerpc/lib/error-inject.c diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 3d008115fe18..076e05ae9b04 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -190,6 +190,7 @@ config PPC select HAVE_EBPF_JIT if PPC64 select HAVE_EFFICIENT_UNALIGNED_ACCESS if !(CPU_LITTLE_ENDIAN && POWER7_CPU) select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_FUNCTION_ERROR_INJECTION select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER select HAVE_GCC_PLUGINS if GCC_VERSION >= 50200 # plugin support on gcc <= 5.1 is buggy on PPC diff --git a/arch/powerpc/include/asm/error-injection.h b/arch/powerpc/include/asm/error-injection.h new file mode 100644 index 000000000000..62fd24739852 --- /dev/null +++ b/arch/powerpc/include/asm/error-injection.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#ifndef _ASM_ERROR_INJECTION_H +#define _ASM_ERROR_INJECTION_H + +#include +#include +#include +#include + +void override_function_with_return(struct pt_regs *regs); + +#endif /* _ASM_ERROR_INJECTION_H */ diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 2ba2a1e52291..33196b311964 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -133,6 +133,11 @@ static inline long regs_return_value(struct pt_regs *regs) return -regs->gpr[3]; } +static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) +{ + regs->gpr[3] = rc; +} + #ifdef __powerpc64__ #define user_mode(regs) ((((regs)->msr) >> MSR_PR_LG) & 0x1) #else diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 703afa1808ed..3bf9fc6fd36c 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -12,6 +12,8 @@ obj-y += string.o alloc.o code-patching.o feature-fixups.o obj-$(CONFIG_PPC32) += div64.o copy_32.o crtsavres.o strlen_32.o +obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o + # See corresponding test in arch/powerpc/Makefile # 64-bit linker creates .sfpr on demand for final link (vmlinux), # so it is only needed for modules, and only for older linkers which diff --git a/arch/powerpc/lib/error-inject.c b/arch/powerpc/lib/error-inject.c new file mode 100644 index 000000000000..407b992fb02f --- /dev/null +++ b/arch/powerpc/lib/error-inject.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include +#include +#include + +void override_function_with_return(struct pt_regs *regs) +{ + /* + * Emulate 'blr'. 'regs' represents the state on entry of a predefined + * function in the kernel/module, captured on a kprobe. We don't need + * to worry about 32-bit userspace on a 64-bit kernel. + */ + regs->nip = regs->link; +} +NOKPROBE_SYMBOL(override_function_with_return); From 59fe7eaf3598a89cbcd72e645b1d08afd76f7b29 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Tue, 29 May 2018 12:21:00 +0530 Subject: [PATCH 191/221] powerpc64/module elfv1: Set opd addresses after module relocation module_frob_arch_sections() is called before the module is moved to its final location. The function descriptor section addresses we are setting here are thus invalid. 
Fix this by processing opd section during module_finalize() Fixes: 5633e85b2c313 ("powerpc64: Add .opd based function descriptor dereference") Cc: stable@vger.kernel.org # v4.16 Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/module.c | 8 ++++++++ arch/powerpc/kernel/module_64.c | 5 ----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c index 77371c9ef3d8..2d861a36662e 100644 --- a/arch/powerpc/kernel/module.c +++ b/arch/powerpc/kernel/module.c @@ -74,6 +74,14 @@ int module_finalize(const Elf_Ehdr *hdr, (void *)sect->sh_addr + sect->sh_size); #endif /* CONFIG_PPC64 */ +#ifdef PPC64_ELF_ABI_v1 + sect = find_section(hdr, sechdrs, ".opd"); + if (sect != NULL) { + me->arch.start_opd = sect->sh_addr; + me->arch.end_opd = sect->sh_addr + sect->sh_size; + } +#endif /* PPC64_ELF_ABI_v1 */ + #ifdef CONFIG_PPC_BARRIER_NOSPEC sect = find_section(hdr, sechdrs, "__spec_barrier_fixup"); if (sect != NULL) diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index b8d61e019d06..2c53de9f3b6a 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -360,11 +360,6 @@ int module_frob_arch_sections(Elf64_Ehdr *hdr, else if (strcmp(secstrings+sechdrs[i].sh_name,"__versions")==0) dedotify_versions((void *)hdr + sechdrs[i].sh_offset, sechdrs[i].sh_size); - else if (!strcmp(secstrings + sechdrs[i].sh_name, ".opd")) { - me->arch.start_opd = sechdrs[i].sh_addr; - me->arch.end_opd = sechdrs[i].sh_addr + - sechdrs[i].sh_size; - } /* We don't handle .init for the moment: rename to _init */ while ((p = strstr(secstrings + sechdrs[i].sh_name, ".init"))) From db3848515aff369404c31ef5a63699799913518f Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Mon, 21 May 2018 20:43:56 +0530 Subject: [PATCH 192/221] selftests/powerpc: Move UCONTEXT_NIA() into utils.h ... so that it can be used by others. Signed-off-by: Naveen N. 
Rao Signed-off-by: Michael Ellerman --- tools/testing/selftests/powerpc/include/utils.h | 8 ++++++++ .../selftests/powerpc/primitives/load_unaligned_zeropad.c | 8 -------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/powerpc/include/utils.h b/tools/testing/selftests/powerpc/include/utils.h index c58c370828b4..da1b963cdb32 100644 --- a/tools/testing/selftests/powerpc/include/utils.h +++ b/tools/testing/selftests/powerpc/include/utils.h @@ -80,4 +80,12 @@ do { \ #define PPC_FEATURE2_ARCH_3_00 0x00800000 #endif +#if defined(__powerpc64__) +#define UCONTEXT_NIA(UC) (UC)->uc_mcontext.gp_regs[PT_NIP] +#elif defined(__powerpc__) +#define UCONTEXT_NIA(UC) (UC)->uc_mcontext.uc_regs->gregs[PT_NIP] +#else +#error implement UCONTEXT_NIA +#endif + #endif /* _SELFTESTS_POWERPC_UTILS_H */ diff --git a/tools/testing/selftests/powerpc/primitives/load_unaligned_zeropad.c b/tools/testing/selftests/powerpc/primitives/load_unaligned_zeropad.c index ed3239bbfae2..ee1e9ca22f0d 100644 --- a/tools/testing/selftests/powerpc/primitives/load_unaligned_zeropad.c +++ b/tools/testing/selftests/powerpc/primitives/load_unaligned_zeropad.c @@ -65,14 +65,6 @@ static int unprotect_region(void) extern char __start___ex_table[]; extern char __stop___ex_table[]; -#if defined(__powerpc64__) -#define UCONTEXT_NIA(UC) (UC)->uc_mcontext.gp_regs[PT_NIP] -#elif defined(__powerpc__) -#define UCONTEXT_NIA(UC) (UC)->uc_mcontext.uc_regs->gregs[PT_NIP] -#else -#error implement UCONTEXT_NIA -#endif - struct extbl_entry { int insn; int fixup; From d2bf793237b3aa9c4275a466eef3893eef593691 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Mon, 21 May 2018 20:43:57 +0530 Subject: [PATCH 193/221] selftests/powerpc: Add test to verify rfi flush across a system call This adds a test to verify proper functioning of the rfi flush capability implemented to mitigate meltdown. The test works by measuring the number of L1d cache misses encountered while loading data from memory. Across a system call, since the L1d cache is flushed when rfi_flush is enabled, the number of cache misses is expected to be relative to the number of cachelines corresponding to the data being loaded. The current system setting is reflected via powerpc/rfi_flush under debugfs (assumed to be /sys/kernel/debug/). This test verifies the expected result with rfi_flush enabled as well as when it is disabled. Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman Signed-off-by: Naveen N. Rao [mpe: Add SPDX tags, clang format, skip if the debugfs is missing, use __u64 and SANE_USERSPACE_TYPES to avoid printf() build errors.] 
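A typical run looks something like the following (illustrative; built via
the normal selftests flow, exact invocation and output may vary):

	$ make -C tools/testing/selftests TARGETS=powerpc
	# ./tools/testing/selftests/powerpc/security/rfi_flush
	test: rfi_flush_test
	...
	success: rfi_flush_test

The test skips unless run as root, since it reads and toggles
powerpc/rfi_flush under /sys/kernel/debug and restores the original
setting on exit.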
Signed-off-by: Michael Ellerman --- tools/testing/selftests/powerpc/Makefile | 3 +- .../testing/selftests/powerpc/include/utils.h | 10 ++ .../selftests/powerpc/security/Makefile | 9 ++ .../selftests/powerpc/security/rfi_flush.c | 132 +++++++++++++++ tools/testing/selftests/powerpc/utils.c | 152 ++++++++++++++++++ 5 files changed, 305 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/security/Makefile create mode 100644 tools/testing/selftests/powerpc/security/rfi_flush.c diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile index 201b598558b9..b3ad909aefbc 100644 --- a/tools/testing/selftests/powerpc/Makefile +++ b/tools/testing/selftests/powerpc/Makefile @@ -28,7 +28,8 @@ SUB_DIRS = alignment \ tm \ vphn \ math \ - ptrace + ptrace \ + security endif diff --git a/tools/testing/selftests/powerpc/include/utils.h b/tools/testing/selftests/powerpc/include/utils.h index da1b963cdb32..49621822d7c3 100644 --- a/tools/testing/selftests/powerpc/include/utils.h +++ b/tools/testing/selftests/powerpc/include/utils.h @@ -11,6 +11,7 @@ #include #include #include +#include #include "reg.h" /* Avoid headaches with PRI?64 - just use %ll? always */ @@ -31,6 +32,15 @@ void *get_auxv_entry(int type); int pick_online_cpu(void); +int read_debugfs_file(char *debugfs_file, int *result); +int write_debugfs_file(char *debugfs_file, int result); +void set_dscr(unsigned long val); +int perf_event_open_counter(unsigned int type, + unsigned long config, int group_fd); +int perf_event_enable(int fd); +int perf_event_disable(int fd); +int perf_event_reset(int fd); + static inline bool have_hwcap(unsigned long ftr) { return ((unsigned long)get_auxv_entry(AT_HWCAP) & ftr) == ftr; diff --git a/tools/testing/selftests/powerpc/security/Makefile b/tools/testing/selftests/powerpc/security/Makefile new file mode 100644 index 000000000000..44690f1bb26a --- /dev/null +++ b/tools/testing/selftests/powerpc/security/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0+ + +TEST_GEN_PROGS := rfi_flush + +CFLAGS += -I../../../../../usr/include + +include ../../lib.mk + +$(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/security/rfi_flush.c b/tools/testing/selftests/powerpc/security/rfi_flush.c new file mode 100644 index 000000000000..564ed45bbf73 --- /dev/null +++ b/tools/testing/selftests/powerpc/security/rfi_flush.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0+ + +/* + * Copyright 2018 IBM Corporation. 
+ */ + +#define __SANE_USERSPACE_TYPES__ + +#include +#include +#include +#include +#include +#include +#include +#include "utils.h" + +#define CACHELINE_SIZE 128 + +struct perf_event_read { + __u64 nr; + __u64 l1d_misses; +}; + +static inline __u64 load(void *addr) +{ + __u64 tmp; + + asm volatile("ld %0,0(%1)" : "=r"(tmp) : "b"(addr)); + + return tmp; +} + +static void syscall_loop(char *p, unsigned long iterations, + unsigned long zero_size) +{ + for (unsigned long i = 0; i < iterations; i++) { + for (unsigned long j = 0; j < zero_size; j += CACHELINE_SIZE) + load(p + j); + getppid(); + } +} + +int rfi_flush_test(void) +{ + char *p; + int repetitions = 10; + int fd, passes = 0, iter, rc = 0; + struct perf_event_read v; + __u64 l1d_misses_total = 0; + unsigned long iterations = 100000, zero_size = 24 * 1024; + int rfi_flush_org, rfi_flush; + + SKIP_IF(geteuid() != 0); + + if (read_debugfs_file("powerpc/rfi_flush", &rfi_flush_org)) { + perror("Unable to read powerpc/rfi_flush debugfs file"); + SKIP_IF(1); + } + + rfi_flush = rfi_flush_org; + + fd = perf_event_open_counter(PERF_TYPE_RAW, /* L1d miss */ 0x400f0, -1); + FAIL_IF(fd < 0); + + p = (char *)memalign(zero_size, CACHELINE_SIZE); + + FAIL_IF(perf_event_enable(fd)); + + set_dscr(1); + + iter = repetitions; + +again: + FAIL_IF(perf_event_reset(fd)); + + syscall_loop(p, iterations, zero_size); + + FAIL_IF(read(fd, &v, sizeof(v)) != sizeof(v)); + + /* Expect at least zero_size/CACHELINE_SIZE misses per iteration */ + if (v.l1d_misses >= (iterations * zero_size / CACHELINE_SIZE) && rfi_flush) + passes++; + else if (v.l1d_misses < iterations && !rfi_flush) + passes++; + + l1d_misses_total += v.l1d_misses; + + while (--iter) + goto again; + + if (passes < repetitions) { + printf("FAIL (L1D misses with rfi_flush=%d: %llu %c %lu) [%d/%d failures]\n", + rfi_flush, l1d_misses_total, rfi_flush ? '<' : '>', + rfi_flush ? (repetitions * iterations * zero_size / CACHELINE_SIZE) : iterations, + repetitions - passes, repetitions); + rc = 1; + } else + printf("PASS (L1D misses with rfi_flush=%d: %llu %c %lu) [%d/%d pass]\n", + rfi_flush, l1d_misses_total, rfi_flush ? '>' : '<', + rfi_flush ? 
(repetitions * iterations * zero_size / CACHELINE_SIZE) : iterations, + passes, repetitions); + + if (rfi_flush == rfi_flush_org) { + rfi_flush = !rfi_flush_org; + if (write_debugfs_file("powerpc/rfi_flush", rfi_flush) < 0) { + perror("error writing to powerpc/rfi_flush debugfs file"); + return 1; + } + iter = repetitions; + l1d_misses_total = 0; + passes = 0; + goto again; + } + + perf_event_disable(fd); + close(fd); + + set_dscr(0); + + if (write_debugfs_file("powerpc/rfi_flush", rfi_flush_org) < 0) { + perror("unable to restore original value of powerpc/rfi_flush debugfs file"); + return 1; + } + + return rc; +} + +int main(int argc, char *argv[]) +{ + return test_harness(rfi_flush_test, "rfi_flush_test"); +} diff --git a/tools/testing/selftests/powerpc/utils.c b/tools/testing/selftests/powerpc/utils.c index aa8fc1e6365b..43c342845be0 100644 --- a/tools/testing/selftests/powerpc/utils.c +++ b/tools/testing/selftests/powerpc/utils.c @@ -10,16 +10,22 @@ #include #include #include +#include #include +#include #include +#include #include #include #include #include +#include +#include #include "utils.h" static char auxv[4096]; +extern unsigned int dscr_insn[]; int read_auxv(char *buf, ssize_t buf_size) { @@ -121,3 +127,149 @@ bool is_ppc64le(void) return strcmp(uts.machine, "ppc64le") == 0; } + +int read_debugfs_file(char *debugfs_file, int *result) +{ + int rc = -1, fd; + char path[PATH_MAX]; + char value[16]; + + strcpy(path, "/sys/kernel/debug/"); + strncat(path, debugfs_file, PATH_MAX - strlen(path) - 1); + + if ((fd = open(path, O_RDONLY)) < 0) + return rc; + + if ((rc = read(fd, value, sizeof(value))) < 0) + return rc; + + value[15] = 0; + *result = atoi(value); + close(fd); + + return 0; +} + +int write_debugfs_file(char *debugfs_file, int result) +{ + int rc = -1, fd; + char path[PATH_MAX]; + char value[16]; + + strcpy(path, "/sys/kernel/debug/"); + strncat(path, debugfs_file, PATH_MAX - strlen(path) - 1); + + if ((fd = open(path, O_WRONLY)) < 0) + return rc; + + snprintf(value, 16, "%d", result); + + if ((rc = write(fd, value, strlen(value))) < 0) + return rc; + + close(fd); + + return 0; +} + +static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, + int cpu, int group_fd, unsigned long flags) +{ + return syscall(__NR_perf_event_open, hw_event, pid, cpu, + group_fd, flags); +} + +static void perf_event_attr_init(struct perf_event_attr *event_attr, + unsigned int type, + unsigned long config) +{ + memset(event_attr, 0, sizeof(*event_attr)); + + event_attr->type = type; + event_attr->size = sizeof(struct perf_event_attr); + event_attr->config = config; + event_attr->read_format = PERF_FORMAT_GROUP; + event_attr->disabled = 1; + event_attr->exclude_kernel = 1; + event_attr->exclude_hv = 1; + event_attr->exclude_guest = 1; +} + +int perf_event_open_counter(unsigned int type, + unsigned long config, int group_fd) +{ + int fd; + struct perf_event_attr event_attr; + + perf_event_attr_init(&event_attr, type, config); + + fd = perf_event_open(&event_attr, 0, -1, group_fd, 0); + + if (fd < 0) + perror("perf_event_open() failed"); + + return fd; +} + +int perf_event_enable(int fd) +{ + if (ioctl(fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) { + perror("error while enabling perf events"); + return -1; + } + + return 0; +} + +int perf_event_disable(int fd) +{ + if (ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) == -1) { + perror("error disabling perf events"); + return -1; + } + + return 0; +} + +int perf_event_reset(int fd) +{ + if (ioctl(fd, 
PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) { + perror("error resetting perf events"); + return -1; + } + + return 0; +} + +static void sigill_handler(int signr, siginfo_t *info, void *unused) +{ + static int warned = 0; + ucontext_t *ctx = (ucontext_t *)unused; + unsigned long *pc = &UCONTEXT_NIA(ctx); + + if (*pc == (unsigned long)&dscr_insn) { + if (!warned++) + printf("WARNING: Skipping over dscr setup. Consider running 'ppc64_cpu --dscr=1' manually.\n"); + *pc += 4; + } else { + printf("SIGILL at %p\n", pc); + abort(); + } +} + +void set_dscr(unsigned long val) +{ + static int init = 0; + struct sigaction sa; + + if (!init) { + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = sigill_handler; + sa.sa_flags = SA_SIGINFO; + if (sigaction(SIGILL, &sa, NULL)) + perror("sigill_handler"); + init = 1; + } + + asm volatile("dscr_insn: mtspr %1,%0" : : "r" (val), "i" (SPRN_DSCR)); +} From c6c26fb55e8e4b3fc376be5611685990a17de27a Mon Sep 17 00:00:00 2001 From: Aravinda Prasad Date: Tue, 16 Oct 2018 17:20:05 +0530 Subject: [PATCH 194/221] powerpc/pseries: Export raw per-CPU VPA data via debugfs This patch exports the raw per-CPU VPA data via debugfs. A per-CPU file is created which exports the VPA data of that CPU to help debug some of the VPA related issues or to analyze the per-CPU VPA related statistics. v3: Removed offline CPU check. v2: Included offline CPU check and other review comments. Signed-off-by: Aravinda Prasad Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/lpar.c | 54 +++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 0b5081085a44..32d4452973e7 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -48,6 +48,7 @@ #include #include #include +#include #include "pseries.h" @@ -1239,3 +1240,56 @@ static int __init reserve_vrma_context_id(void) return 0; } machine_device_initcall(pseries, reserve_vrma_context_id); + +#ifdef CONFIG_DEBUG_FS +/* debugfs file interface for vpa data */ +static ssize_t vpa_file_read(struct file *filp, char __user *buf, size_t len, + loff_t *pos) +{ + int cpu = (long)filp->private_data; + struct lppaca *lppaca = &lppaca_of(cpu); + + return simple_read_from_buffer(buf, len, pos, lppaca, + sizeof(struct lppaca)); +} + +static const struct file_operations vpa_fops = { + .open = simple_open, + .read = vpa_file_read, + .llseek = default_llseek, +}; + +static int __init vpa_debugfs_init(void) +{ + char name[16]; + long i; + static struct dentry *vpa_dir; + + if (!firmware_has_feature(FW_FEATURE_SPLPAR)) + return 0; + + vpa_dir = debugfs_create_dir("vpa", powerpc_debugfs_root); + if (!vpa_dir) { + pr_warn("%s: can't create vpa root dir\n", __func__); + return -ENOMEM; + } + + /* set up the per-cpu vpa file*/ + for_each_possible_cpu(i) { + struct dentry *d; + + sprintf(name, "cpu-%ld", i); + + d = debugfs_create_file(name, 0400, vpa_dir, (void *)i, + &vpa_fops); + if (!d) { + pr_warn("%s: can't create per-cpu vpa file\n", + __func__); + return -ENOMEM; + } + } + + return 0; +} +machine_arch_initcall(pseries, vpa_debugfs_init); +#endif /* CONFIG_DEBUG_FS */ From 3ff38e1874863827374b02b4f31c73faa3744e1c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 15 Oct 2018 06:37:41 +0000 Subject: [PATCH 195/221] powerpc/book3s64: fix dump_linuxpagetables "present" flag Since commit bd0dbb73e013 ("powerpc/mm/books3s: Add new pte bit to mark pte temporarily invalid."), _PAGE_PRESENT doesn't mean 
exactly that a page is present. A page is also considered preset when _PAGE_INVALID is set. This patch changes the meaning of "present" and adds a status "valid" associated to the _PAGE_PRESENT flag. Fixes: bd0dbb73e013 ("powerpc/mm/books3s: Add new pte bit to mark pte temporarily invalid.") Signed-off-by: Christophe Leroy Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/dump_linuxpagetables-book3s64.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/dump_linuxpagetables-book3s64.c b/arch/powerpc/mm/dump_linuxpagetables-book3s64.c index a637e612b205..ed6fcf78256e 100644 --- a/arch/powerpc/mm/dump_linuxpagetables-book3s64.c +++ b/arch/powerpc/mm/dump_linuxpagetables-book3s64.c @@ -38,8 +38,13 @@ static const struct flag_info flag_array[] = { }, { .mask = _PAGE_PRESENT, .val = _PAGE_PRESENT, - .set = "present", - .clear = " ", + .set = "valid", + .clear = " ", + }, { + .mask = _PAGE_PRESENT | _PAGE_INVALID, + .val = 0, + .set = " ", + .clear = "present", }, { .mask = H_PAGE_HASHPTE, .val = H_PAGE_HASHPTE, From cf4a6085151ae3f4e78dd91981833e65aaae8bc6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 21 Mar 2018 15:16:58 +0100 Subject: [PATCH 196/221] powerpc/mm: Add missing tracepoint for tlbie commit 0428491cba927 ("powerpc/mm: Trace tlbie(l) instructions") added tracepoints for tlbie calls, but _tlbil_va() was forgotten Fixes: 0428491cba927 ("powerpc/mm: Trace tlbie(l) instructions") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/mmu_decl.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index dd7f9b951d25..1db2027a0110 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -22,6 +22,7 @@ #include #ifdef CONFIG_PPC_MMU_NOHASH +#include /* * On 40x and 8xx, we directly inline tlbia and tlbivax @@ -55,6 +56,7 @@ static inline void _tlbil_va(unsigned long address, unsigned int pid, unsigned int tsize, unsigned int ind) { asm volatile ("tlbie %0; sync" : : "r" (address) : "memory"); + trace_tlbie(0, 0, address, pid, 0, 0, 0); } #elif defined(CONFIG_PPC_BOOK3E) extern void _tlbil_va(unsigned long address, unsigned int pid, From 8114c36ea6486aba2269d0590c5d553108ee9558 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 21 Mar 2018 15:17:00 +0100 Subject: [PATCH 197/221] powerpc/mm: Trace tlbia instruction Add a trace point for tlbia (Translation Lookaside Buffer Invalidate All) instruction. 
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/trace.h | 15 +++++++++++++++ arch/powerpc/mm/mmu_decl.h | 2 ++ 2 files changed, 17 insertions(+) diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h index d018e8602694..58ef8c43a89d 100644 --- a/arch/powerpc/include/asm/trace.h +++ b/arch/powerpc/include/asm/trace.h @@ -201,6 +201,21 @@ TRACE_EVENT(tlbie, __entry->r) ); +TRACE_EVENT(tlbia, + + TP_PROTO(unsigned long id), + TP_ARGS(id), + TP_STRUCT__entry( + __field(unsigned long, id) + ), + + TP_fast_assign( + __entry->id = id; + ), + + TP_printk("ctx.id=0x%lx", __entry->id) +); + #endif /* _TRACE_POWERPC_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 1db2027a0110..8574fbbc45e0 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -31,10 +31,12 @@ static inline void _tlbil_all(void) { asm volatile ("sync; tlbia; isync" : : : "memory"); + trace_tlbia(MMU_NO_CONTEXT); } static inline void _tlbil_pid(unsigned int pid) { asm volatile ("sync; tlbia; isync" : : : "memory"); + trace_tlbia(pid); } #define _tlbil_pid_noind(pid) _tlbil_pid(pid) From aa5456abdc20568f5da348209148a9c75a32468a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 22 Jun 2018 13:49:48 +0000 Subject: [PATCH 198/221] powerpc/mm: fix missing prototypes in slice.c This patch fixes the following warnings (obtained with make W=1). arch/powerpc/mm/slice.c: At top level: arch/powerpc/mm/slice.c:682:15: error: no previous prototype for 'arch_get_unmapped_area' [-Werror=missing-prototypes] unsigned long arch_get_unmapped_area(struct file *filp, ^ arch/powerpc/mm/slice.c:692:15: error: no previous prototype for 'arch_get_unmapped_area_topdown' [-Werror=missing-prototypes] unsigned long arch_get_unmapped_area_topdown(struct file *filp, ^ Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/slice.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 50ba3d0456a5..8dafb426554d 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include From 37e9c674e7e6f445e12cb1151017bd4bacdd1e2d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 2 Aug 2018 09:25:55 +0000 Subject: [PATCH 199/221] powerpc/mm: fix always true/false warning in slice.c This patch fixes the following warnings (obtained with make W=1). 
arch/powerpc/mm/slice.c: In function 'slice_range_to_mask': arch/powerpc/mm/slice.c:73:12: error: comparison is always true due to limited range of data type [-Werror=type-limits] if (start < SLICE_LOW_TOP) { ^ arch/powerpc/mm/slice.c:81:20: error: comparison is always false due to limited range of data type [-Werror=type-limits] if ((start + len) > SLICE_LOW_TOP) { ^ arch/powerpc/mm/slice.c: In function 'slice_mask_for_free': arch/powerpc/mm/slice.c:136:17: error: comparison is always true due to limited range of data type [-Werror=type-limits] if (high_limit <= SLICE_LOW_TOP) ^ arch/powerpc/mm/slice.c: In function 'slice_check_range_fits': arch/powerpc/mm/slice.c:185:12: error: comparison is always true due to limited range of data type [-Werror=type-limits] if (start < SLICE_LOW_TOP) { ^ arch/powerpc/mm/slice.c:195:39: error: comparison is always false due to limited range of data type [-Werror=type-limits] if (SLICE_NUM_HIGH && ((start + len) > SLICE_LOW_TOP)) { ^ arch/powerpc/mm/slice.c: In function 'slice_scan_available': arch/powerpc/mm/slice.c:306:11: error: comparison is always true due to limited range of data type [-Werror=type-limits] if (addr < SLICE_LOW_TOP) { ^ arch/powerpc/mm/slice.c: In function 'get_slice_psize': arch/powerpc/mm/slice.c:709:11: error: comparison is always true due to limited range of data type [-Werror=type-limits] if (addr < SLICE_LOW_TOP) { ^ Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/slice.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 8dafb426554d..06898c13901d 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -62,6 +62,13 @@ static void slice_print_mask(const char *label, const struct slice_mask *mask) { #endif +static inline bool slice_addr_is_low(unsigned long addr) +{ + u64 tmp = (u64)addr; + + return tmp < SLICE_LOW_TOP; +} + static void slice_range_to_mask(unsigned long start, unsigned long len, struct slice_mask *ret) { @@ -71,7 +78,7 @@ static void slice_range_to_mask(unsigned long start, unsigned long len, if (SLICE_NUM_HIGH) bitmap_zero(ret->high_slices, SLICE_NUM_HIGH); - if (start < SLICE_LOW_TOP) { + if (slice_addr_is_low(start)) { unsigned long mend = min(end, (unsigned long)(SLICE_LOW_TOP - 1)); @@ -79,7 +86,7 @@ static void slice_range_to_mask(unsigned long start, unsigned long len, - (1u << GET_LOW_SLICE_INDEX(start)); } - if ((start + len) > SLICE_LOW_TOP) { + if (SLICE_NUM_HIGH && !slice_addr_is_low(end)) { unsigned long start_index = GET_HIGH_SLICE_INDEX(start); unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT)); unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index; @@ -134,7 +141,7 @@ static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret, if (!slice_low_has_vma(mm, i)) ret->low_slices |= 1u << i; - if (high_limit <= SLICE_LOW_TOP) + if (slice_addr_is_low(high_limit - 1)) return; for (i = 0; i < GET_HIGH_SLICE_INDEX(high_limit); i++) @@ -183,7 +190,7 @@ static bool slice_check_range_fits(struct mm_struct *mm, unsigned long end = start + len - 1; u64 low_slices = 0; - if (start < SLICE_LOW_TOP) { + if (slice_addr_is_low(start)) { unsigned long mend = min(end, (unsigned long)(SLICE_LOW_TOP - 1)); @@ -193,7 +200,7 @@ static bool slice_check_range_fits(struct mm_struct *mm, if ((low_slices & available->low_slices) != low_slices) return false; - if (SLICE_NUM_HIGH && ((start + len) > SLICE_LOW_TOP)) { + if (SLICE_NUM_HIGH && 
!slice_addr_is_low(end)) { unsigned long start_index = GET_HIGH_SLICE_INDEX(start); unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT)); unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index; @@ -304,7 +311,7 @@ static bool slice_scan_available(unsigned long addr, int end, unsigned long *boundary_addr) { unsigned long slice; - if (addr < SLICE_LOW_TOP) { + if (slice_addr_is_low(addr)) { slice = GET_LOW_SLICE_INDEX(addr); *boundary_addr = (slice + end) << SLICE_LOW_SHIFT; return !!(available->low_slices & (1u << slice)); @@ -707,7 +714,7 @@ unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr) VM_BUG_ON(radix_enabled()); - if (addr < SLICE_LOW_TOP) { + if (slice_addr_is_low(addr)) { psizes = mm->context.low_slices_psize; index = GET_LOW_SLICE_INDEX(addr); } else { From 6beb3381b12793726783a5e9428250743659c6cf Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 27 Aug 2018 08:27:27 +0000 Subject: [PATCH 200/221] powerpc/sysdev/ipic: check primary_ipic NULL pointer before using it ipic_get_mcp_status() is used by targets implementing NMI watchdog in target specific machine check handler in order to known whether a machine check results from a watchdog NMI reset. In case of very early machine check, primary_ipic pointer might not have been set yet, so ipic_get_mcp_status() needs to check it for nullity before using it. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/sysdev/ipic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c index 535cf1f6941c..6300123ce965 100644 --- a/arch/powerpc/sysdev/ipic.c +++ b/arch/powerpc/sysdev/ipic.c @@ -846,7 +846,7 @@ void ipic_disable_mcp(enum ipic_mcp_irq mcp_irq) u32 ipic_get_mcp_status(void) { - return ipic_read(primary_ipic->regs, IPIC_SERSR); + return primary_ipic ? 
ipic_read(primary_ipic->regs, IPIC_SERSR) : 0; } void ipic_clear_mcp_status(u32 mask) From fb978ca207743badfe7efd9eebe68bcbb4969f79 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 18 Sep 2018 09:26:03 +0000 Subject: [PATCH 201/221] powerpc/kgdb: add kgdb_arch_set/remove_breakpoint() Generic implementation fails to remove breakpoints after init when CONFIG_STRICT_KERNEL_RWX is selected: [ 13.251285] KGDB: BP remove failed: c001c338 [ 13.259587] kgdbts: ERROR PUT: end of test buffer on 'do_fork_test' line 8 expected OK got $E14#aa [ 13.268969] KGDB: re-enter exception: ALL breakpoints killed [ 13.275099] CPU: 0 PID: 1 Comm: init Not tainted 4.18.0-g82bbb913ffd8 #860 [ 13.282836] Call Trace: [ 13.285313] [c60e1ba0] [c0080ef0] kgdb_handle_exception+0x6f4/0x720 (unreliable) [ 13.292618] [c60e1c30] [c000e97c] kgdb_handle_breakpoint+0x3c/0x98 [ 13.298709] [c60e1c40] [c000af54] program_check_exception+0x104/0x700 [ 13.305083] [c60e1c60] [c000e45c] ret_from_except_full+0x0/0x4 [ 13.310845] [c60e1d20] [c02a22ac] run_simple_test+0x2b4/0x2d4 [ 13.316532] [c60e1d30] [c0081698] put_packet+0xb8/0x158 [ 13.321694] [c60e1d60] [c00820b4] gdb_serial_stub+0x230/0xc4c [ 13.327374] [c60e1dc0] [c0080af8] kgdb_handle_exception+0x2fc/0x720 [ 13.333573] [c60e1e50] [c000e928] kgdb_singlestep+0xb4/0xcc [ 13.339068] [c60e1e70] [c000ae1c] single_step_exception+0x90/0xac [ 13.345100] [c60e1e80] [c000e45c] ret_from_except_full+0x0/0x4 [ 13.350865] [c60e1f40] [c000e11c] ret_from_syscall+0x0/0x38 [ 13.356346] Kernel panic - not syncing: Recursive entry to debugger This patch creates powerpc specific version of kgdb_arch_set_breakpoint() and kgdb_arch_remove_breakpoint() using patch_instruction() Fixes: 1e0fc9d1eb2b ("powerpc/Kconfig: Enable STRICT_KERNEL_RWX for some configs") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kgdb.h | 5 +++- arch/powerpc/kernel/kgdb.c | 43 +++++++++++++++++++++++++++------ 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/kgdb.h b/arch/powerpc/include/asm/kgdb.h index 9db24e77b9f4..a9e098a3b881 100644 --- a/arch/powerpc/include/asm/kgdb.h +++ b/arch/powerpc/include/asm/kgdb.h @@ -26,9 +26,12 @@ #define BREAK_INSTR_SIZE 4 #define BUFMAX ((NUMREGBYTES * 2) + 512) #define OUTBUFMAX ((NUMREGBYTES * 2) + 512) + +#define BREAK_INSTR 0x7d821008 /* twge r2, r2 */ + static inline void arch_kgdb_breakpoint(void) { - asm(".long 0x7d821008"); /* twge r2, r2 */ + asm(stringify_in_c(.long BREAK_INSTR)); } #define CACHE_FLUSH_IS_SAFE 1 #define DBG_MAX_REG_NUM 70 diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c index 35e240a0a408..59c578f865aa 100644 --- a/arch/powerpc/kernel/kgdb.c +++ b/arch/powerpc/kernel/kgdb.c @@ -24,6 +24,7 @@ #include #include #include +#include #include /* @@ -144,7 +145,7 @@ static int kgdb_handle_breakpoint(struct pt_regs *regs) if (kgdb_handle_exception(1, SIGTRAP, 0, regs) != 0) return 0; - if (*(u32 *) (regs->nip) == *(u32 *) (&arch_kgdb_ops.gdb_bpt_instr)) + if (*(u32 *)regs->nip == BREAK_INSTR) regs->nip += BREAK_INSTR_SIZE; return 1; @@ -441,16 +442,42 @@ int kgdb_arch_handle_exception(int vector, int signo, int err_code, return -1; } +int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) +{ + int err; + unsigned int instr; + unsigned int *addr = (unsigned int *)bpt->bpt_addr; + + err = probe_kernel_address(addr, instr); + if (err) + return err; + + err = patch_instruction(addr, BREAK_INSTR); + if (err) + return -EFAULT; + + *(unsigned int *)bpt->saved_instr = 
instr; + + return 0; +} + +int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) +{ + int err; + unsigned int instr = *(unsigned int *)bpt->saved_instr; + unsigned int *addr = (unsigned int *)bpt->bpt_addr; + + err = patch_instruction(addr, instr); + if (err) + return -EFAULT; + + return 0; +} + /* * Global data */ -struct kgdb_arch arch_kgdb_ops = { -#ifdef __LITTLE_ENDIAN__ - .gdb_bpt_instr = {0x08, 0x10, 0x82, 0x7d}, -#else - .gdb_bpt_instr = {0x7d, 0x82, 0x10, 0x08}, -#endif -}; +struct kgdb_arch arch_kgdb_ops; static int kgdb_not_implemented(struct pt_regs *regs) { From b38a181c11d0b5e84b40732dbb06cc9d68140d60 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 2 Aug 2018 07:53:57 +0000 Subject: [PATCH 202/221] powerpc/time: isolate scaled cputime accounting in dedicated functions. scaled cputime is only meaningfull when the processor has SPURR and/or PURR, which means only on PPC64. In preparation of the following patch that will remove CONFIG_ARCH_HAS_SCALED_CPUTIME on PPC32, this patch moves all scaled cputing accounting logic into dedicated functions. This patch doesn't change any functionality. It's only code reorganisation. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/time.c | 69 ++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 68e8f963d108..cf0d5c2834d0 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -282,26 +282,16 @@ static inline u64 calculate_stolen_time(u64 stop_tb) * Account time for a transition between system, hard irq * or soft irq state. */ -static unsigned long vtime_delta(struct task_struct *tsk, - unsigned long *stime_scaled, - unsigned long *steal_time) +static unsigned long vtime_delta_scaled(struct cpu_accounting_data *acct, + unsigned long now, unsigned long stime) { - unsigned long now, nowscaled, deltascaled; - unsigned long stime; + unsigned long stime_scaled; + unsigned long nowscaled, deltascaled; unsigned long utime, utime_scaled; - struct cpu_accounting_data *acct = get_accounting(tsk); - WARN_ON_ONCE(!irqs_disabled()); - - now = mftb(); nowscaled = read_spurr(now); - stime = now - acct->starttime; - acct->starttime = now; deltascaled = nowscaled - acct->startspurr; acct->startspurr = nowscaled; - - *steal_time = calculate_stolen_time(now); - utime = acct->utime - acct->utime_sspurr; acct->utime_sspurr = acct->utime; @@ -315,18 +305,38 @@ static unsigned long vtime_delta(struct task_struct *tsk, * the user ticks get saved up in paca->user_time_scaled to be * used by account_process_tick. 
*/ - *stime_scaled = stime; + stime_scaled = stime; utime_scaled = utime; if (deltascaled != stime + utime) { if (utime) { - *stime_scaled = deltascaled * stime / (stime + utime); - utime_scaled = deltascaled - *stime_scaled; + stime_scaled = deltascaled * stime / (stime + utime); + utime_scaled = deltascaled - stime_scaled; } else { - *stime_scaled = deltascaled; + stime_scaled = deltascaled; } } acct->utime_scaled += utime_scaled; + return stime_scaled; +} + +static unsigned long vtime_delta(struct task_struct *tsk, + unsigned long *stime_scaled, + unsigned long *steal_time) +{ + unsigned long now, stime; + struct cpu_accounting_data *acct = get_accounting(tsk); + + WARN_ON_ONCE(!irqs_disabled()); + + now = mftb(); + stime = now - acct->starttime; + acct->starttime = now; + + *stime_scaled = vtime_delta_scaled(acct, now, stime); + + *steal_time = calculate_stolen_time(now); + return stime; } @@ -365,6 +375,19 @@ void vtime_account_idle(struct task_struct *tsk) acct->idle_time += stime + steal_time; } +static void vtime_flush_scaled(struct task_struct *tsk, + struct cpu_accounting_data *acct) +{ + if (acct->utime_scaled) + tsk->utimescaled += cputime_to_nsecs(acct->utime_scaled); + if (acct->stime_scaled) + tsk->stimescaled += cputime_to_nsecs(acct->stime_scaled); + + acct->utime_scaled = 0; + acct->utime_sspurr = 0; + acct->stime_scaled = 0; +} + /* * Account the whole cputime accumulated in the paca * Must be called with interrupts disabled. @@ -379,9 +402,6 @@ void vtime_flush(struct task_struct *tsk) if (acct->utime) account_user_time(tsk, cputime_to_nsecs(acct->utime)); - if (acct->utime_scaled) - tsk->utimescaled += cputime_to_nsecs(acct->utime_scaled); - if (acct->gtime) account_guest_time(tsk, cputime_to_nsecs(acct->gtime)); @@ -394,8 +414,6 @@ void vtime_flush(struct task_struct *tsk) if (acct->stime) account_system_index_time(tsk, cputime_to_nsecs(acct->stime), CPUTIME_SYSTEM); - if (acct->stime_scaled) - tsk->stimescaled += cputime_to_nsecs(acct->stime_scaled); if (acct->hardirq_time) account_system_index_time(tsk, cputime_to_nsecs(acct->hardirq_time), @@ -404,14 +422,13 @@ void vtime_flush(struct task_struct *tsk) account_system_index_time(tsk, cputime_to_nsecs(acct->softirq_time), CPUTIME_SOFTIRQ); + vtime_flush_scaled(tsk, acct); + acct->utime = 0; - acct->utime_scaled = 0; - acct->utime_sspurr = 0; acct->gtime = 0; acct->steal_time = 0; acct->idle_time = 0; acct->stime = 0; - acct->stime_scaled = 0; acct->hardirq_time = 0; acct->softirq_time = 0; } From abcff86df2d2ec0a0ca9470fa5d2a184af18928a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 2 Aug 2018 07:53:59 +0000 Subject: [PATCH 203/221] powerpc/time: Only set CONFIG_ARCH_HAS_SCALED_CPUTIME on PPC64 scaled cputime is only meaningfull when the processor has SPURR and/or PURR, which means only on PPC64. Removing it on PPC32 significantly reduces the size of vtime_account_system() and vtime_account_idle() on an 8xx: Before: 00000000 l F .text 000000a8 vtime_delta 00000280 g F .text 0000010c vtime_account_system 0000038c g F .text 00000048 vtime_account_idle After: (vtime_delta gets inlined inside the two functions) 000001d8 g F .text 000000a0 vtime_account_system 00000278 g F .text 00000038 vtime_account_idle In terms of performance, we also get approximatly 7% improvement on task switch. 
The following small benchmark app is run with perf stat: void *thread(void *arg) { int i; for (i = 0; i < atoi((char*)arg); i++) pthread_yield(); } int main(int argc, char **argv) { pthread_t th1, th2; pthread_create(&th1, NULL, thread, argv[1]); pthread_create(&th2, NULL, thread, argv[1]); pthread_join(th1, NULL); pthread_join(th2, NULL); return 0; } Before the patch: Performance counter stats for 'chrt -f 98 ./sched 100000' (50 runs): 8228.476465 task-clock (msec) # 0.954 CPUs utilized ( +- 0.23% ) 200004 context-switches # 0.024 M/sec ( +- 0.00% ) After the patch: Performance counter stats for 'chrt -f 98 ./sched 100000' (50 runs): 7649.070444 task-clock (msec) # 0.955 CPUs utilized ( +- 0.27% ) 200004 context-switches # 0.026 M/sec ( +- 0.00% ) Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/accounting.h | 4 ++++ arch/powerpc/include/asm/cputime.h | 1 - arch/powerpc/kernel/time.c | 12 ++++++++++-- arch/powerpc/xmon/xmon.c | 4 ++++ 5 files changed, 19 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 076e05ae9b04..e84943d24e5c 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -137,7 +137,7 @@ config PPC select ARCH_HAS_PMEM_API if PPC64 select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_MEMBARRIER_CALLBACKS - select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE + select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE && PPC64 select ARCH_HAS_SG_CHAIN select ARCH_HAS_STRICT_KERNEL_RWX if ((PPC_BOOK3S_64 || PPC32) && !RELOCATABLE && !HIBERNATION) select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST diff --git a/arch/powerpc/include/asm/accounting.h b/arch/powerpc/include/asm/accounting.h index 3abcf98ed2e0..c607c5d835cc 100644 --- a/arch/powerpc/include/asm/accounting.h +++ b/arch/powerpc/include/asm/accounting.h @@ -15,8 +15,10 @@ struct cpu_accounting_data { /* Accumulated cputime values to flush on ticks*/ unsigned long utime; unsigned long stime; +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME unsigned long utime_scaled; unsigned long stime_scaled; +#endif unsigned long gtime; unsigned long hardirq_time; unsigned long softirq_time; @@ -25,8 +27,10 @@ struct cpu_accounting_data { /* Internal counters */ unsigned long starttime; /* TB value snapshot */ unsigned long starttime_user; /* TB value on exit to usermode */ +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME unsigned long startspurr; /* SPURR value snapshot */ unsigned long utime_sspurr; /* ->user_time when ->startspurr set */ +#endif }; #endif diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index 133672744b2e..ae73dc8da2d4 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -61,7 +61,6 @@ static inline void arch_vtime_task_switch(struct task_struct *prev) struct cpu_accounting_data *acct0 = get_accounting(prev); acct->starttime = acct0->starttime; - acct->startspurr = acct0->startspurr; } #endif diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index cf0d5c2834d0..9289fac75af7 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -176,7 +176,7 @@ static void calc_cputime_factors(void) * Read the SPURR on systems that have it, otherwise the PURR, * or if that doesn't exist return the timebase value passed in. 
*/ -static unsigned long read_spurr(unsigned long tb) +static inline unsigned long read_spurr(unsigned long tb) { if (cpu_has_feature(CPU_FTR_SPURR)) return mfspr(SPRN_SPURR); @@ -285,7 +285,8 @@ static inline u64 calculate_stolen_time(u64 stop_tb) static unsigned long vtime_delta_scaled(struct cpu_accounting_data *acct, unsigned long now, unsigned long stime) { - unsigned long stime_scaled; + unsigned long stime_scaled = 0; +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME unsigned long nowscaled, deltascaled; unsigned long utime, utime_scaled; @@ -316,6 +317,7 @@ static unsigned long vtime_delta_scaled(struct cpu_accounting_data *acct, } } acct->utime_scaled += utime_scaled; +#endif return stime_scaled; } @@ -352,7 +354,9 @@ void vtime_account_system(struct task_struct *tsk) if ((tsk->flags & PF_VCPU) && !irq_count()) { acct->gtime += stime; +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME acct->utime_scaled += stime_scaled; +#endif } else { if (hardirq_count()) acct->hardirq_time += stime; @@ -361,7 +365,9 @@ void vtime_account_system(struct task_struct *tsk) else acct->stime += stime; +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME acct->stime_scaled += stime_scaled; +#endif } } EXPORT_SYMBOL_GPL(vtime_account_system); @@ -378,6 +384,7 @@ void vtime_account_idle(struct task_struct *tsk) static void vtime_flush_scaled(struct task_struct *tsk, struct cpu_accounting_data *acct) { +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME if (acct->utime_scaled) tsk->utimescaled += cputime_to_nsecs(acct->utime_scaled); if (acct->stime_scaled) @@ -386,6 +393,7 @@ static void vtime_flush_scaled(struct task_struct *tsk, acct->utime_scaled = 0; acct->utime_sspurr = 0; acct->stime_scaled = 0; +#endif } /* diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 58e67b67a97c..36b8dc47a3c3 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2454,11 +2454,15 @@ static void dump_one_paca(int cpu) DUMP(p, accounting.utime, "%#-*lx"); DUMP(p, accounting.stime, "%#-*lx"); +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME DUMP(p, accounting.utime_scaled, "%#-*lx"); +#endif DUMP(p, accounting.starttime, "%#-*lx"); DUMP(p, accounting.starttime_user, "%#-*lx"); +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME DUMP(p, accounting.startspurr, "%#-*lx"); DUMP(p, accounting.utime_sspurr, "%#-*lx"); +#endif DUMP(p, accounting.steal_time, "%#-*lx"); #undef DUMP From 51eeef9e135ac00cf706fad1a3bde775ca578462 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 2 Aug 2018 07:54:01 +0000 Subject: [PATCH 204/221] powerpc/time: no steal_time when CONFIG_PPC_SPLPAR is not selected If CONFIG_PPC_SPLPAR is not selected, steal_time will always be NUL, so accounting it is pointless Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/time.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 9289fac75af7..3646affae963 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -413,8 +413,10 @@ void vtime_flush(struct task_struct *tsk) if (acct->gtime) account_guest_time(tsk, cputime_to_nsecs(acct->gtime)); - if (acct->steal_time) + if (IS_ENABLED(CONFIG_PPC_SPLPAR) && acct->steal_time) { account_steal_time(cputime_to_nsecs(acct->steal_time)); + acct->steal_time = 0; + } if (acct->idle_time) account_idle_time(cputime_to_nsecs(acct->idle_time)); @@ -434,7 +436,6 @@ void vtime_flush(struct task_struct *tsk) acct->utime = 0; acct->gtime = 0; - acct->steal_time = 0; acct->idle_time = 0; acct->stime = 0; acct->hardirq_time = 0; 
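[Editor's note, not part of the series: the IS_ENABLED() guard in the hunk above keeps the steal-time branch visible to the compiler, so it is still parsed and type-checked, while letting it be discarded as dead code when CONFIG_PPC_SPLPAR is not set. A standalone sketch of the same idiom, using a simplified stand-in for the kernel macro:]

#include <stdio.h>

/* Simplified stand-in: the real kernel IS_ENABLED() tests CONFIG_* symbols. */
#define IS_ENABLED(option)	(option)
#define CONFIG_PPC_SPLPAR	0	/* pretend the option is disabled */

static unsigned long steal_time = 42;

static void account_steal_time(unsigned long t)
{
	printf("accounted %lu\n", t);
}

int main(void)
{
	/* compiled and type-checked, but optimised away when the option is 0 */
	if (IS_ENABLED(CONFIG_PPC_SPLPAR) && steal_time) {
		account_steal_time(steal_time);
		steal_time = 0;
	}
	return 0;
}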
From d8a2fe29d3c97038c8efcc328d5e7940c5310565 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 19 Oct 2018 00:11:33 +1100 Subject: [PATCH 205/221] selftests/powerpc: Fix out-of-tree build errors Some of our Makefiles don't do the right thing when building the selftests with O=, fix them up. Signed-off-by: Michael Ellerman --- tools/testing/selftests/powerpc/cache_shape/Makefile | 2 -- tools/testing/selftests/powerpc/ptrace/Makefile | 2 -- tools/testing/selftests/powerpc/signal/Makefile | 2 -- tools/testing/selftests/powerpc/switch_endian/Makefile | 1 + 4 files changed, 1 insertion(+), 6 deletions(-) diff --git a/tools/testing/selftests/powerpc/cache_shape/Makefile b/tools/testing/selftests/powerpc/cache_shape/Makefile index ede4d3dae750..62e947ca9921 100644 --- a/tools/testing/selftests/powerpc/cache_shape/Makefile +++ b/tools/testing/selftests/powerpc/cache_shape/Makefile @@ -1,8 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 TEST_PROGS := cache_shape -all: $(TEST_PROGS) - $(TEST_PROGS): ../harness.c ../utils.c top_srcdir = ../../../../.. diff --git a/tools/testing/selftests/powerpc/ptrace/Makefile b/tools/testing/selftests/powerpc/ptrace/Makefile index 9b35ca8e8f13..6ac71b629276 100644 --- a/tools/testing/selftests/powerpc/ptrace/Makefile +++ b/tools/testing/selftests/powerpc/ptrace/Makefile @@ -7,8 +7,6 @@ TEST_PROGS := ptrace-gpr ptrace-tm-gpr ptrace-tm-spd-gpr \ top_srcdir = ../../../../.. include ../../lib.mk -all: $(TEST_PROGS) - CFLAGS += -m64 -I../../../../../usr/include -I../tm -mhtm -fno-pie ptrace-pkey core-pkey: child.h diff --git a/tools/testing/selftests/powerpc/signal/Makefile b/tools/testing/selftests/powerpc/signal/Makefile index 1fca25c6ace0..d34a7c7710db 100644 --- a/tools/testing/selftests/powerpc/signal/Makefile +++ b/tools/testing/selftests/powerpc/signal/Makefile @@ -1,8 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 TEST_PROGS := signal signal_tm -all: $(TEST_PROGS) - $(TEST_PROGS): ../harness.c ../utils.c signal.S CFLAGS += -maltivec diff --git a/tools/testing/selftests/powerpc/switch_endian/Makefile b/tools/testing/selftests/powerpc/switch_endian/Makefile index fcd2dcb8972b..bdc081afedb0 100644 --- a/tools/testing/selftests/powerpc/switch_endian/Makefile +++ b/tools/testing/selftests/powerpc/switch_endian/Makefile @@ -8,6 +8,7 @@ EXTRA_CLEAN = $(OUTPUT)/*.o $(OUTPUT)/check-reversed.S top_srcdir = ../../../../.. 
include ../../lib.mk +$(OUTPUT)/switch_endian_test: ASFLAGS += -I $(OUTPUT) $(OUTPUT)/switch_endian_test: $(OUTPUT)/check-reversed.S $(OUTPUT)/check-reversed.o: $(OUTPUT)/check.o From dd0e144a632bcf2b5a6f04e2628e32b16d499277 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 13 Oct 2018 22:18:15 +0530 Subject: [PATCH 206/221] powerpc/mm: Fix WARN_ON with THP NUMA migration WARNING: CPU: 12 PID: 4322 at /arch/powerpc/mm/pgtable-book3s64.c:76 set_pmd_at+0x4c/0x2b0 Modules linked in: CPU: 12 PID: 4322 Comm: qemu-system-ppc Tainted: G W 4.19.0-rc3-00758-g8f0c636b0542 #36 NIP: c0000000000872fc LR: c000000000484eec CTR: 0000000000000000 REGS: c000003fba876fe0 TRAP: 0700 Tainted: G W (4.19.0-rc3-00758-g8f0c636b0542) MSR: 900000010282b033 CR: 24282884 XER: 00000000 CFAR: c000000000484ee8 IRQMASK: 0 GPR00: c000000000484eec c000003fba877268 c000000001f0ec00 c000003fbd229f80 GPR04: 00007c8fe8e00000 c000003f864c5a38 860300853e0000c0 0000000000000080 GPR08: 0000000080000000 0000000000000001 0401000000000080 0000000000000001 GPR12: 0000000000002000 c000003fffff5400 c000003fce292000 00007c9024570000 GPR16: 0000000000000000 0000000000ffffff 0000000000000001 c000000001885950 GPR20: 0000000000000000 001ffffc0004807c 0000000000000008 c000000001f49d05 GPR24: 00007c8fe8e00000 c0000000020f2468 ffffffffffffffff c000003fcd33b090 GPR28: 00007c8fe8e00000 c000003fbd229f80 c000003f864c5a38 860300853e0000c0 NIP [c0000000000872fc] set_pmd_at+0x4c/0x2b0 LR [c000000000484eec] do_huge_pmd_numa_page+0xb1c/0xc20 Call Trace: [c000003fba877268] [c00000000045931c] mpol_misplaced+0x1bc/0x230 (unreliable) [c000003fba8772c8] [c000000000484eec] do_huge_pmd_numa_page+0xb1c/0xc20 [c000003fba877398] [c00000000040d344] __handle_mm_fault+0x5e4/0x2300 [c000003fba8774d8] [c00000000040f400] handle_mm_fault+0x3a0/0x420 [c000003fba877528] [c0000000003ff6f4] __get_user_pages+0x2e4/0x560 [c000003fba877628] [c000000000400314] get_user_pages_unlocked+0x104/0x2a0 [c000003fba8776c8] [c000000000118f44] __gfn_to_pfn_memslot+0x284/0x6a0 [c000003fba877748] [c0000000001463a0] kvmppc_book3s_radix_page_fault+0x360/0x12d0 [c000003fba877838] [c000000000142228] kvmppc_book3s_hv_page_fault+0x48/0x1300 [c000003fba877988] [c00000000013dc08] kvmppc_vcpu_run_hv+0x1808/0x1b50 [c000003fba877af8] [c000000000126b44] kvmppc_vcpu_run+0x34/0x50 [c000003fba877b18] [c000000000123268] kvm_arch_vcpu_ioctl_run+0x288/0x2d0 [c000003fba877b98] [c00000000011253c] kvm_vcpu_ioctl+0x1fc/0x8c0 [c000003fba877d08] [c0000000004e9b24] do_vfs_ioctl+0xa44/0xae0 [c000003fba877db8] [c0000000004e9c44] ksys_ioctl+0x84/0xf0 [c000003fba877e08] [c0000000004e9cd8] sys_ioctl+0x28/0x80 We removed the pte_protnone check earlier with the understanding that we mark the pte invalid before the set_pte/set_pmd usage. But the huge pmd autonuma still use the set_pmd_at directly. This is ok because a protnone pte won't have translation cache in TLB. Fixes: da7ad366b497 ("powerpc/mm/book3s: Update pmd_present to look at _PAGE_PRESENT bit") Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable-book3s64.c | 3 ++- arch/powerpc/mm/pgtable.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c index 43e99e1d947b..9f93c9f985c5 100644 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -73,7 +73,8 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, * Make sure hardware valid bit is not set. We don't do * tlb flush for this update. 
*/ - WARN_ON(pte_val(pmd_pte(*pmdp)) & _PAGE_PRESENT); + + WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); assert_spin_locked(pmd_lockptr(mm, pmdp)); WARN_ON(!(pmd_large(pmd) || pmd_devmap(pmd))); #endif diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index ca4b1f7ac39d..010e1c616cb2 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -184,7 +184,7 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, * Make sure hardware valid bit is not set. We don't do * tlb flush for this update. */ - VM_WARN_ON(pte_hw_valid(*ptep)); + VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); /* Add the pte bit when trying to set a pte */ pte = pte_mkpte(pte); From 67361cf80712867329a9cd3ff0e3171545cfc867 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Wed, 17 Oct 2018 01:55:00 +0530 Subject: [PATCH 207/221] powerpc/ftrace: Handle large kernel configs Currently, we expect to be able to reach ftrace_caller() from all ftrace-enabled functions through a single relative branch. With large kernel configs, we see functions outside of 32MB of ftrace_caller() causing ftrace_init() to bail. In such configurations, gcc/ld emits two types of trampolines for mcount(): 1. A long_branch, which has a single branch to mcount() for functions that are one hop away from mcount(): c0000000019e8544 <00031b56.long_branch._mcount>: c0000000019e8544: 4a 69 3f ac b c00000000007c4f0 <._mcount> 2. A plt_branch, for functions that are farther away from mcount(): c0000000051f33f8 <0008ba04.plt_branch._mcount>: c0000000051f33f8: 3d 82 ff a4 addis r12,r2,-92 c0000000051f33fc: e9 8c 04 20 ld r12,1056(r12) c0000000051f3400: 7d 89 03 a6 mtctr r12 c0000000051f3404: 4e 80 04 20 bctr We can reuse those trampolines for ftrace if we can have those trampolines go to ftrace_caller() instead. However, with ABIv2, we cannot depend on r2 being valid. As such, we use only the long_branch trampolines by patching those to instead branch to ftrace_caller or ftrace_regs_caller. In addition, we add additional trampolines around .text and .init.text to catch locations that are covered by the plt branches. This allows ftrace to work with most large kernel configurations. For now, we always patch the trampolines to go to ftrace_regs_caller, which is slightly inefficient. This can be optimized further at a later point. Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/trace/ftrace.c | 261 +++++++++++++++++++++++++- arch/powerpc/kernel/trace/ftrace_64.S | 12 ++ arch/powerpc/kernel/vmlinux.lds.S | 13 +- 3 files changed, 281 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 4bfbb54dee51..4bf051d3e21e 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -30,6 +30,16 @@ #ifdef CONFIG_DYNAMIC_FTRACE + +/* + * We generally only have a single long_branch tramp and at most 2 or 3 plt + * tramps generated. But, we don't use the plt tramps currently. We also allot + * 2 tramps after .text and .init.text. So, we only end up with around 3 usable + * tramps in total. Set aside 8 just to be sure. 
+ */ +#define NUM_FTRACE_TRAMPS 8 +static unsigned long ftrace_tramps[NUM_FTRACE_TRAMPS]; + static unsigned int ftrace_call_replace(unsigned long ip, unsigned long addr, int link) { @@ -85,13 +95,16 @@ static int test_24bit_addr(unsigned long ip, unsigned long addr) return create_branch((unsigned int *)ip, addr, 0); } -#ifdef CONFIG_MODULES - static int is_bl_op(unsigned int op) { return (op & 0xfc000003) == 0x48000001; } +static int is_b_op(unsigned int op) +{ + return (op & 0xfc000003) == 0x48000000; +} + static unsigned long find_bl_target(unsigned long ip, unsigned int op) { static int offset; @@ -104,6 +117,7 @@ static unsigned long find_bl_target(unsigned long ip, unsigned int op) return ip + (long)offset; } +#ifdef CONFIG_MODULES #ifdef CONFIG_PPC64 static int __ftrace_make_nop(struct module *mod, @@ -270,6 +284,146 @@ __ftrace_make_nop(struct module *mod, #endif /* PPC64 */ #endif /* CONFIG_MODULES */ +static unsigned long find_ftrace_tramp(unsigned long ip) +{ + int i; + + /* + * We have the compiler generated long_branch tramps at the end + * and we prefer those + */ + for (i = NUM_FTRACE_TRAMPS - 1; i >= 0; i--) + if (!ftrace_tramps[i]) + continue; + else if (create_branch((void *)ip, ftrace_tramps[i], 0)) + return ftrace_tramps[i]; + + return 0; +} + +static int add_ftrace_tramp(unsigned long tramp) +{ + int i; + + for (i = 0; i < NUM_FTRACE_TRAMPS; i++) + if (!ftrace_tramps[i]) { + ftrace_tramps[i] = tramp; + return 0; + } + + return -1; +} + +/* + * If this is a compiler generated long_branch trampoline (essentially, a + * trampoline that has a branch to _mcount()), we re-write the branch to + * instead go to ftrace_[regs_]caller() and note down the location of this + * trampoline. + */ +static int setup_mcount_compiler_tramp(unsigned long tramp) +{ + int i, op; + unsigned long ptr; + static unsigned long ftrace_plt_tramps[NUM_FTRACE_TRAMPS]; + + /* Is this a known long jump tramp? */ + for (i = 0; i < NUM_FTRACE_TRAMPS; i++) + if (!ftrace_tramps[i]) + break; + else if (ftrace_tramps[i] == tramp) + return 0; + + /* Is this a known plt tramp? */ + for (i = 0; i < NUM_FTRACE_TRAMPS; i++) + if (!ftrace_plt_tramps[i]) + break; + else if (ftrace_plt_tramps[i] == tramp) + return -1; + + /* New trampoline -- read where this goes */ + if (probe_kernel_read(&op, (void *)tramp, sizeof(int))) { + pr_debug("Fetching opcode failed.\n"); + return -1; + } + + /* Is this a 24 bit branch? 
*/ + if (!is_b_op(op)) { + pr_debug("Trampoline is not a long branch tramp.\n"); + return -1; + } + + /* lets find where the pointer goes */ + ptr = find_bl_target(tramp, op); + + if (ptr != ppc_global_function_entry((void *)_mcount)) { + pr_debug("Trampoline target %p is not _mcount\n", (void *)ptr); + return -1; + } + + /* Let's re-write the tramp to go to ftrace_[regs_]caller */ +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + ptr = ppc_global_function_entry((void *)ftrace_regs_caller); +#else + ptr = ppc_global_function_entry((void *)ftrace_caller); +#endif + if (!create_branch((void *)tramp, ptr, 0)) { + pr_debug("%ps is not reachable from existing mcount tramp\n", + (void *)ptr); + return -1; + } + + if (patch_branch((unsigned int *)tramp, ptr, 0)) { + pr_debug("REL24 out of range!\n"); + return -1; + } + + if (add_ftrace_tramp(tramp)) { + pr_debug("No tramp locations left\n"); + return -1; + } + + return 0; +} + +static int __ftrace_make_nop_kernel(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long tramp, ip = rec->ip; + unsigned int op; + + /* Read where this goes */ + if (probe_kernel_read(&op, (void *)ip, sizeof(int))) { + pr_err("Fetching opcode failed.\n"); + return -EFAULT; + } + + /* Make sure that that this is still a 24bit jump */ + if (!is_bl_op(op)) { + pr_err("Not expected bl: opcode is %x\n", op); + return -EINVAL; + } + + /* Let's find where the pointer goes */ + tramp = find_bl_target(ip, op); + + pr_devel("ip:%lx jumps to %lx", ip, tramp); + + if (setup_mcount_compiler_tramp(tramp)) { + /* Are other trampolines reachable? */ + if (!find_ftrace_tramp(ip)) { + pr_err("No ftrace trampolines reachable from %ps\n", + (void *)ip); + return -EINVAL; + } + } + + if (patch_instruction((unsigned int *)ip, PPC_INST_NOP)) { + pr_err("Patching NOP failed.\n"); + return -EPERM; + } + + return 0; +} + int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { @@ -286,7 +440,8 @@ int ftrace_make_nop(struct module *mod, old = ftrace_call_replace(ip, addr, 1); new = PPC_INST_NOP; return ftrace_modify_code(ip, old, new); - } + } else if (core_kernel_text(ip)) + return __ftrace_make_nop_kernel(rec, addr); #ifdef CONFIG_MODULES /* @@ -456,6 +611,53 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) #endif /* CONFIG_PPC64 */ #endif /* CONFIG_MODULES */ +static int __ftrace_make_call_kernel(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned int op; + void *ip = (void *)rec->ip; + unsigned long tramp, entry, ptr; + + /* Make sure we're being asked to patch branch to a known ftrace addr */ + entry = ppc_global_function_entry((void *)ftrace_caller); + ptr = ppc_global_function_entry((void *)addr); + + if (ptr != entry) { +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + entry = ppc_global_function_entry((void *)ftrace_regs_caller); + if (ptr != entry) { +#endif + pr_err("Unknown ftrace addr to patch: %ps\n", (void *)ptr); + return -EINVAL; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + } +#endif + } + + /* Make sure we have a nop */ + if (probe_kernel_read(&op, ip, sizeof(op))) { + pr_err("Unable to read ftrace location %p\n", ip); + return -EFAULT; + } + + if (op != PPC_INST_NOP) { + pr_err("Unexpected call sequence at %p: %x\n", ip, op); + return -EINVAL; + } + + tramp = find_ftrace_tramp((unsigned long)ip); + if (!tramp) { + pr_err("No ftrace trampolines reachable from %ps\n", ip); + return -EINVAL; + } + + if (patch_branch(ip, tramp, BRANCH_SET_LINK)) { + pr_err("Error patching branch to ftrace tramp!\n"); + return -EINVAL; + } + + return 0; +} + 
int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { unsigned long ip = rec->ip; @@ -471,7 +673,8 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) old = PPC_INST_NOP; new = ftrace_call_replace(ip, addr, 1); return ftrace_modify_code(ip, old, new); - } + } else if (core_kernel_text(ip)) + return __ftrace_make_call_kernel(rec, addr); #ifdef CONFIG_MODULES /* @@ -603,6 +806,12 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, old = ftrace_call_replace(ip, old_addr, 1); new = ftrace_call_replace(ip, addr, 1); return ftrace_modify_code(ip, old, new); + } else if (core_kernel_text(ip)) { + /* + * We always patch out of range locations to go to the regs + * variant, so there is nothing to do here + */ + return 0; } #ifdef CONFIG_MODULES @@ -654,10 +863,54 @@ void arch_ftrace_update_code(int command) ftrace_modify_all_code(command); } +#ifdef CONFIG_PPC64 +#define PACATOC offsetof(struct paca_struct, kernel_toc) + +#define PPC_LO(v) ((v) & 0xffff) +#define PPC_HI(v) (((v) >> 16) & 0xffff) +#define PPC_HA(v) PPC_HI ((v) + 0x8000) + +extern unsigned int ftrace_tramp_text[], ftrace_tramp_init[]; + +int __init ftrace_dyn_arch_init(void) +{ + int i; + unsigned int *tramp[] = { ftrace_tramp_text, ftrace_tramp_init }; + u32 stub_insns[] = { + 0xe98d0000 | PACATOC, /* ld r12,PACATOC(r13) */ + 0x3d8c0000, /* addis r12,r12, */ + 0x398c0000, /* addi r12,r12, */ + 0x7d8903a6, /* mtctr r12 */ + 0x4e800420, /* bctr */ + }; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + unsigned long addr = ppc_global_function_entry((void *)ftrace_regs_caller); +#else + unsigned long addr = ppc_global_function_entry((void *)ftrace_caller); +#endif + long reladdr = addr - kernel_toc_addr(); + + if (reladdr > 0x7FFFFFFF || reladdr < -(0x80000000L)) { + pr_err("Address of %ps out of range of kernel_toc.\n", + (void *)addr); + return -1; + } + + for (i = 0; i < 2; i++) { + memcpy(tramp[i], stub_insns, sizeof(stub_insns)); + tramp[i][1] |= PPC_HA(reladdr); + tramp[i][2] |= PPC_LO(reladdr); + add_ftrace_tramp((unsigned long)tramp[i]); + } + + return 0; +} +#else int __init ftrace_dyn_arch_init(void) { return 0; } +#endif #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/arch/powerpc/kernel/trace/ftrace_64.S b/arch/powerpc/kernel/trace/ftrace_64.S index e25f77c10a72..1782af2d1496 100644 --- a/arch/powerpc/kernel/trace/ftrace_64.S +++ b/arch/powerpc/kernel/trace/ftrace_64.S @@ -14,6 +14,18 @@ #include #include +.pushsection ".tramp.ftrace.text","aw",@progbits; +.globl ftrace_tramp_text +ftrace_tramp_text: + .space 64 +.popsection + +.pushsection ".tramp.ftrace.init","aw",@progbits; +.globl ftrace_tramp_init +ftrace_tramp_init: + .space 64 +.popsection + _GLOBAL(mcount) _GLOBAL(_mcount) EXPORT_SYMBOL(_mcount) diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index ac0ceb31b336..2d50b362f835 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -102,6 +102,9 @@ SECTIONS #endif /* careful! __ftr_alt_* sections need to be close to .text */ *(.text.hot TEXT_MAIN .text.fixup .text.unlikely .fixup __ftr_alt_* .ref.text); +#ifdef CONFIG_PPC64 + *(.tramp.ftrace.text); +#endif SCHED_TEXT CPUIDLE_TEXT LOCK_TEXT @@ -184,7 +187,15 @@ SECTIONS */ . = ALIGN(STRICT_ALIGN_SIZE); __init_begin = .; - INIT_TEXT_SECTION(PAGE_SIZE) :kernel + . 
= ALIGN(PAGE_SIZE); + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + _sinittext = .; + INIT_TEXT + _einittext = .; +#ifdef CONFIG_PPC64 + *(.tramp.ftrace.init); +#endif + } :kernel /* .exit.text is discarded at runtime, not link time, * to deal with references from __bug_table From 5c6499b7041b43807dfaeda28aa87fc0e62558f7 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 10 Aug 2018 22:29:26 +1000 Subject: [PATCH 208/221] powerpc/mm/radix: Fix off-by-one in split mapping logic When we have CONFIG_STRICT_KERNEL_RWX enabled, we try to split the kernel linear (1:1) mapping so that the kernel text is in a separate page to kernel data, so we can mark the former read-only. We could achieve that just by always using 64K pages for the linear mapping, but we try to be smarter. Instead we use huge pages when possible, and only switch to smaller pages when necessary. However we have an off-by-one bug in that logic, which causes us to calculate the wrong boundary between text and data. For example with the end of the kernel text at 16M we see: radix-mmu: Mapped 0x0000000000000000-0x0000000001200000 with 64.0 KiB pages radix-mmu: Mapped 0x0000000001200000-0x0000000040000000 with 2.00 MiB pages radix-mmu: Mapped 0x0000000040000000-0x0000000100000000 with 1.00 GiB pages ie. we mapped from 0 to 18M with 64K pages, even though the boundary between text and data is at 16M. With the fix we see we're correctly hitting the 16M boundary: radix-mmu: Mapped 0x0000000000000000-0x0000000001000000 with 64.0 KiB pages radix-mmu: Mapped 0x0000000001000000-0x0000000040000000 with 2.00 MiB pages radix-mmu: Mapped 0x0000000040000000-0x0000000100000000 with 1.00 GiB pages Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable-radix.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index c879979faa73..d88d76231754 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -294,14 +294,14 @@ static int __meminit create_physical_mapping(unsigned long start, } if (split_text_mapping && (mapping_size == PUD_SIZE) && - (addr <= __pa_symbol(__init_begin)) && + (addr < __pa_symbol(__init_begin)) && (addr + mapping_size) >= __pa_symbol(_stext)) { max_mapping_size = PMD_SIZE; goto retry; } if (split_text_mapping && (mapping_size == PMD_SIZE) && - (addr <= __pa_symbol(__init_begin)) && + (addr < __pa_symbol(__init_begin)) && (addr + mapping_size) >= __pa_symbol(_stext)) { mapping_size = PAGE_SIZE; psize = mmu_virtual_psize; From 3b5657ed5b4e27ccf593a41ff3c5aa27dae8df18 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 14 Aug 2018 20:48:22 +1000 Subject: [PATCH 209/221] powerpc/mm/radix: Fix overuse of small pages in splitting logic When we have CONFIG_STRICT_KERNEL_RWX enabled, we want to split the linear mapping at the text/data boundary so we can map the kernel text read only. But the current logic uses small pages for the entire text section, regardless of whether a larger page size would fit. eg. with the boundary at 16M we could use 2M pages, but instead we use 64K pages up to the 16M boundary: Mapped 0x0000000000000000-0x0000000001000000 with 64.0 KiB pages Mapped 0x0000000001000000-0x0000000040000000 with 2.00 MiB pages Mapped 0x0000000040000000-0x0000000100000000 with 1.00 GiB pages This is because the test is checking if addr is < __init_begin and addr + mapping_size is >= _stext. But that is true for all pages between _stext and __init_begin. 
Instead what we want to check is if we are crossing the text/data boundary, which is at __init_begin. With that fixed we see: Mapped 0x0000000000000000-0x0000000000e00000 with 2.00 MiB pages Mapped 0x0000000000e00000-0x0000000001000000 with 64.0 KiB pages Mapped 0x0000000001000000-0x0000000040000000 with 2.00 MiB pages Mapped 0x0000000040000000-0x0000000100000000 with 1.00 GiB pages ie. we're correctly using 2MB pages below __init_begin, but we still drop down to 64K pages unnecessarily at the boundary. Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable-radix.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index d88d76231754..bb85c58b96c8 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -295,14 +295,14 @@ static int __meminit create_physical_mapping(unsigned long start, if (split_text_mapping && (mapping_size == PUD_SIZE) && (addr < __pa_symbol(__init_begin)) && - (addr + mapping_size) >= __pa_symbol(_stext)) { + (addr + mapping_size) >= __pa_symbol(__init_begin)) { max_mapping_size = PMD_SIZE; goto retry; } if (split_text_mapping && (mapping_size == PMD_SIZE) && (addr < __pa_symbol(__init_begin)) && - (addr + mapping_size) >= __pa_symbol(_stext)) { + (addr + mapping_size) >= __pa_symbol(__init_begin)) { mapping_size = PAGE_SIZE; psize = mmu_virtual_psize; } From 81d1b54dec95209ab5e5be2cf37182885f998753 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 14 Aug 2018 21:05:20 +1000 Subject: [PATCH 210/221] powerpc/mm/radix: Fix small page at boundary when splitting When we have CONFIG_STRICT_KERNEL_RWX enabled, we want to split the linear mapping at the text/data boundary so we can map the kernel text read only. Currently we always use a small page at the text/data boundary, even when that's not necessary: Mapped 0x0000000000000000-0x0000000000e00000 with 2.00 MiB pages Mapped 0x0000000000e00000-0x0000000001000000 with 64.0 KiB pages Mapped 0x0000000001000000-0x0000000040000000 with 2.00 MiB pages This is because the check that the mapping crosses the __init_begin boundary is too strict, it also returns true when we map exactly up to the boundary. 
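For example (again assuming a text/data boundary at 16M for illustration), a 2M mapping at 14M ends exactly at __init_begin:

    addr = 14M, mapping_size = 2M
    addr + mapping_size            =  16M, which is exactly __init_begin
    (addr + mapping_size) >= __init_begin  ->  true, so we split

even though the mapping stops exactly at the boundary and never maps past it.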
So fix it to check that the mapping would actually map past __init_begin, and with that we see: Mapped 0x0000000000000000-0x0000000040000000 with 2.00 MiB pages Mapped 0x0000000040000000-0x0000000100000000 with 1.00 GiB pages Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable-radix.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index bb85c58b96c8..7a44ec276290 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -295,14 +295,14 @@ static int __meminit create_physical_mapping(unsigned long start, if (split_text_mapping && (mapping_size == PUD_SIZE) && (addr < __pa_symbol(__init_begin)) && - (addr + mapping_size) >= __pa_symbol(__init_begin)) { + (addr + mapping_size) > __pa_symbol(__init_begin)) { max_mapping_size = PMD_SIZE; goto retry; } if (split_text_mapping && (mapping_size == PMD_SIZE) && (addr < __pa_symbol(__init_begin)) && - (addr + mapping_size) >= __pa_symbol(__init_begin)) { + (addr + mapping_size) > __pa_symbol(__init_begin)) { mapping_size = PAGE_SIZE; psize = mmu_virtual_psize; } From 57306c663d53f2e4fd856950703dc6bcfc98f7cb Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 14 Aug 2018 22:01:44 +1000 Subject: [PATCH 211/221] powerpc/mm/radix: Remove the retry in the split mapping logic When we have CONFIG_STRICT_KERNEL_RWX enabled, we want to split the linear mapping at the text/data boundary so we can map the kernel text read only. The current logic uses a goto inside the for loop, which works, but is hard to reason about. When we hit the goto retry case we set max_mapping_size to PMD_SIZE and go back to the start. Setting max_mapping_size means we skip the PUD case and go to the PMD case. We know we will pass the alignment and gap checks because the only reason we are there is we hit the goto retry, and that is guarded by mapping_size == PUD_SIZE, which means addr is PUD aligned and gap is greater or equal to PUD_SIZE. So the only part of the check that can fail is the mmu_psize_defs check for the 2M page size. If we just duplicate that check we can avoid the goto, and we get the same result. 
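To spell that reasoning out (a sketch of the existing selection logic, not new code):

    /* After "goto retry" with max_mapping_size = PMD_SIZE:
     *
     *   IS_ALIGNED(addr, PUD_SIZE)         still true  (addr unchanged)
     *   gap >= PUD_SIZE                    still true  (gap unchanged)
     *   PUD_SIZE <= max_mapping_size       now false   -> skip the 1G case
     *
     *   IS_ALIGNED(addr, PMD_SIZE)         true, implied by PUD alignment
     *   gap >= PMD_SIZE                    true, since gap >= PUD_SIZE
     *   mmu_psize_defs[MMU_PAGE_2M].shift  the only check that can fail
     */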
Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable-radix.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 7a44ec276290..030543451229 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -261,7 +261,6 @@ static int __meminit create_physical_mapping(unsigned long start, { unsigned long vaddr, addr, mapping_size = 0; pgprot_t prot; - unsigned long max_mapping_size; #ifdef CONFIG_STRICT_KERNEL_RWX int split_text_mapping = 1; #else @@ -276,12 +275,9 @@ static int __meminit create_physical_mapping(unsigned long start, gap = end - addr; previous_size = mapping_size; - max_mapping_size = PUD_SIZE; -retry: if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE && - mmu_psize_defs[MMU_PAGE_1G].shift && - PUD_SIZE <= max_mapping_size) { + mmu_psize_defs[MMU_PAGE_1G].shift) { mapping_size = PUD_SIZE; psize = MMU_PAGE_1G; } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE && @@ -296,8 +292,10 @@ static int __meminit create_physical_mapping(unsigned long start, if (split_text_mapping && (mapping_size == PUD_SIZE) && (addr < __pa_symbol(__init_begin)) && (addr + mapping_size) > __pa_symbol(__init_begin)) { - max_mapping_size = PMD_SIZE; - goto retry; + if (mmu_psize_defs[MMU_PAGE_2M].shift) + mapping_size = PMD_SIZE; + else + mapping_size = PAGE_SIZE; } if (split_text_mapping && (mapping_size == PMD_SIZE) && From 232aa407633cef2b43806c1603fc8689aecf826c Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 14 Aug 2018 22:37:32 +1000 Subject: [PATCH 212/221] powerpc/mm/radix: Simplify split mapping logic If we look closely at the logic in create_physical_mapping(), when we're doing STRICT_KERNEL_RWX, we do the following steps: - determine the gap from where we are to the end of the range - choose an appropriate mapping_size based on the gap - check if that mapping_size would overlap the __init_begin boundary, and if not choose an appropriate mapping_size We can simplify the logic by taking the __init_begin boundary into account when we calculate the initial gap. So add a next_boundary() function which tells us what the next boundary is, either the __init_begin boundary or end. In future we can add more boundaries. 
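For example (boundary address assumed for illustration), with __init_begin at 16M and a range being mapped from 0 to 1G:

    gap = next_boundary(0, 1G) - 0 = 16M

16M is smaller than PUD_SIZE (1G), so the 1G case is never selected for this stretch and the 2M case is chosen instead. Because mapping_size is always bounded by the gap to the next boundary, no mapping can be chosen that crosses __init_begin, and the explicit split checks become unnecessary.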
Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable-radix.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 030543451229..0e87733eed80 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -255,17 +255,21 @@ static inline void __meminit print_mapping(unsigned long start, pr_info("Mapped 0x%016lx-0x%016lx with %s pages\n", start, end, buf); } +static unsigned long next_boundary(unsigned long addr, unsigned long end) +{ +#ifdef CONFIG_STRICT_KERNEL_RWX + if (addr < __pa_symbol(__init_begin)) + return __pa_symbol(__init_begin); +#endif + return end; +} + static int __meminit create_physical_mapping(unsigned long start, unsigned long end, int nid) { unsigned long vaddr, addr, mapping_size = 0; pgprot_t prot; -#ifdef CONFIG_STRICT_KERNEL_RWX - int split_text_mapping = 1; -#else - int split_text_mapping = 0; -#endif int psize; start = _ALIGN_UP(start, PAGE_SIZE); @@ -273,7 +277,7 @@ static int __meminit create_physical_mapping(unsigned long start, unsigned long gap, previous_size; int rc; - gap = end - addr; + gap = next_boundary(addr, end) - addr; previous_size = mapping_size; if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE && @@ -289,22 +293,6 @@ static int __meminit create_physical_mapping(unsigned long start, psize = mmu_virtual_psize; } - if (split_text_mapping && (mapping_size == PUD_SIZE) && - (addr < __pa_symbol(__init_begin)) && - (addr + mapping_size) > __pa_symbol(__init_begin)) { - if (mmu_psize_defs[MMU_PAGE_2M].shift) - mapping_size = PMD_SIZE; - else - mapping_size = PAGE_SIZE; - } - - if (split_text_mapping && (mapping_size == PMD_SIZE) && - (addr < __pa_symbol(__init_begin)) && - (addr + mapping_size) > __pa_symbol(__init_begin)) { - mapping_size = PAGE_SIZE; - psize = mmu_virtual_psize; - } - if (mapping_size != previous_size) { print_mapping(start, addr, previous_size); start = addr; From afb6d0647fd250a068efd985987b5ff2c0d1b853 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 17 Oct 2018 23:53:38 +1100 Subject: [PATCH 213/221] powerpc/mm/radix: Display if mappings are exec or not At boot we print the ranges we've mapped for the linear mapping and what page size we've used. Also track whether the range is mapped executable or not and display that as well. Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable-radix.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 0e87733eed80..931156069a81 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -241,9 +241,8 @@ void radix__mark_initmem_nx(void) } #endif /* CONFIG_STRICT_KERNEL_RWX */ -static inline void __meminit print_mapping(unsigned long start, - unsigned long end, - unsigned long size) +static inline void __meminit +print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec) { char buf[10]; @@ -252,7 +251,8 @@ static inline void __meminit print_mapping(unsigned long start, string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf)); - pr_info("Mapped 0x%016lx-0x%016lx with %s pages\n", start, end, buf); + pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf, + exec ? 
" (exec)" : ""); } static unsigned long next_boundary(unsigned long addr, unsigned long end) @@ -269,6 +269,7 @@ static int __meminit create_physical_mapping(unsigned long start, int nid) { unsigned long vaddr, addr, mapping_size = 0; + bool prev_exec, exec = false; pgprot_t prot; int psize; @@ -279,6 +280,7 @@ static int __meminit create_physical_mapping(unsigned long start, gap = next_boundary(addr, end) - addr; previous_size = mapping_size; + prev_exec = exec; if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE && mmu_psize_defs[MMU_PAGE_1G].shift) { @@ -293,18 +295,21 @@ static int __meminit create_physical_mapping(unsigned long start, psize = mmu_virtual_psize; } - if (mapping_size != previous_size) { - print_mapping(start, addr, previous_size); - start = addr; - } - vaddr = (unsigned long)__va(addr); if (overlaps_kernel_text(vaddr, vaddr + mapping_size) || - overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) + overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) { prot = PAGE_KERNEL_X; - else + exec = true; + } else { prot = PAGE_KERNEL; + exec = false; + } + + if (mapping_size != previous_size || exec != prev_exec) { + print_mapping(start, addr, previous_size, prev_exec); + start = addr; + } rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end); if (rc) @@ -313,7 +318,7 @@ static int __meminit create_physical_mapping(unsigned long start, update_page_count(psize, 1); } - print_mapping(start, addr, mapping_size); + print_mapping(start, addr, mapping_size, exec); return 0; } From 0d923962ab69c27cca664a2d535e90ef655110ca Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 15 Aug 2018 21:29:45 +1000 Subject: [PATCH 214/221] powerpc/mm: Fix page table dump to work on Radix When we're running on Book3S with the Radix MMU enabled the page table dump currently prints the wrong addresses because it uses the wrong start address. Fix it to use PAGE_OFFSET rather than KERN_VIRT_START. Signed-off-by: Michael Ellerman --- arch/powerpc/mm/dump_linuxpagetables.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c index e60aa6d7456d..2b74f8adf4d0 100644 --- a/arch/powerpc/mm/dump_linuxpagetables.c +++ b/arch/powerpc/mm/dump_linuxpagetables.c @@ -267,12 +267,13 @@ static void walk_pagetables(struct pg_state *st) unsigned int i; unsigned long addr; + addr = st->start_address; + /* * Traverse the linux pagetable structure and dump pages that are in * the hash pagetable. */ - for (i = 0; i < PTRS_PER_PGD; i++, pgd++) { - addr = KERN_VIRT_START + i * PGDIR_SIZE; + for (i = 0; i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) { if (!pgd_none(*pgd) && !pgd_huge(*pgd)) /* pgd exists */ walk_pud(st, pgd, addr); @@ -321,9 +322,14 @@ static int ptdump_show(struct seq_file *m, void *v) { struct pg_state st = { .seq = m, - .start_address = KERN_VIRT_START, .marker = address_markers, }; + + if (radix_enabled()) + st.start_address = PAGE_OFFSET; + else + st.start_address = KERN_VIRT_START; + /* Traverse kernel page tables */ walk_pagetables(&st); note_page(&st, 0, 0, 0); From b7683fc66eba91674e52c30f4d8e596bfb5cbcf4 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 24 Jul 2018 15:53:22 +1000 Subject: [PATCH 215/221] selftests/powerpc: Add a test of wild bctr This tests that a bctr (Branch to counter and link), ie. a function call, to a wildly out-of-bounds address is handled correctly. 
Some old kernel versions didn't handle it correctly, see eg: "powerpc/slb: Force a full SLB flush when we insert for a bad EA" https://lists.ozlabs.org/pipermail/linuxppc-dev/2017-April/157397.html Signed-off-by: Michael Ellerman --- tools/testing/selftests/powerpc/include/reg.h | 1 + tools/testing/selftests/powerpc/mm/.gitignore | 3 +- tools/testing/selftests/powerpc/mm/Makefile | 4 +- .../testing/selftests/powerpc/mm/wild_bctr.c | 155 ++++++++++++++++++ 4 files changed, 161 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/powerpc/mm/wild_bctr.c diff --git a/tools/testing/selftests/powerpc/include/reg.h b/tools/testing/selftests/powerpc/include/reg.h index 7f348c059bc2..52b4710469d2 100644 --- a/tools/testing/selftests/powerpc/include/reg.h +++ b/tools/testing/selftests/powerpc/include/reg.h @@ -17,6 +17,7 @@ : "memory") #define mb() asm volatile("sync" : : : "memory"); +#define barrier() asm volatile("" : : : "memory"); #define SPRN_MMCR2 769 #define SPRN_MMCRA 770 diff --git a/tools/testing/selftests/powerpc/mm/.gitignore b/tools/testing/selftests/powerpc/mm/.gitignore index 7d7c42ed6de9..ba919308fe30 100644 --- a/tools/testing/selftests/powerpc/mm/.gitignore +++ b/tools/testing/selftests/powerpc/mm/.gitignore @@ -2,4 +2,5 @@ hugetlb_vs_thp_test subpage_prot tempfile prot_sao -segv_errors \ No newline at end of file +segv_errors +wild_bctr \ No newline at end of file diff --git a/tools/testing/selftests/powerpc/mm/Makefile b/tools/testing/selftests/powerpc/mm/Makefile index 33ced6e0ad25..43d68420e363 100644 --- a/tools/testing/selftests/powerpc/mm/Makefile +++ b/tools/testing/selftests/powerpc/mm/Makefile @@ -2,7 +2,7 @@ noarg: $(MAKE) -C ../ -TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot prot_sao segv_errors +TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot prot_sao segv_errors wild_bctr TEST_GEN_FILES := tempfile top_srcdir = ../../../../.. @@ -12,6 +12,8 @@ $(TEST_GEN_PROGS): ../harness.c $(OUTPUT)/prot_sao: ../utils.c +$(OUTPUT)/wild_bctr: CFLAGS += -m64 + $(OUTPUT)/tempfile: dd if=/dev/zero of=$@ bs=64k count=1 diff --git a/tools/testing/selftests/powerpc/mm/wild_bctr.c b/tools/testing/selftests/powerpc/mm/wild_bctr.c new file mode 100644 index 000000000000..1b0e9e9a2ddc --- /dev/null +++ b/tools/testing/selftests/powerpc/mm/wild_bctr.c @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright 2018, Michael Ellerman, IBM Corp. + * + * Test that an out-of-bounds branch to counter behaves as expected. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" + + +#define BAD_NIP 0x788c545a18000000ull + +static struct pt_regs signal_regs; +static jmp_buf setjmp_env; + +static void save_regs(ucontext_t *ctxt) +{ + struct pt_regs *regs = ctxt->uc_mcontext.regs; + + memcpy(&signal_regs, regs, sizeof(signal_regs)); +} + +static void segv_handler(int signum, siginfo_t *info, void *ctxt_v) +{ + save_regs(ctxt_v); + longjmp(setjmp_env, 1); +} + +static void usr2_handler(int signum, siginfo_t *info, void *ctxt_v) +{ + save_regs(ctxt_v); +} + +static int ok(void) +{ + printf("Everything is OK in here.\n"); + return 0; +} + +#define REG_POISON 0x5a5aUL +#define POISONED_REG(n) ((REG_POISON << 48) | ((n) << 32) | (REG_POISON << 16) | (n)) + +static inline void poison_regs(void) +{ + #define POISON_REG(n) \ + "lis " __stringify(n) "," __stringify(REG_POISON) ";" \ + "addi " __stringify(n) "," __stringify(n) "," __stringify(n) ";" \ + "sldi " __stringify(n) "," __stringify(n) ", 32 ;" \ + "oris " __stringify(n) "," __stringify(n) "," __stringify(REG_POISON) ";" \ + "addi " __stringify(n) "," __stringify(n) "," __stringify(n) ";" + + asm (POISON_REG(15) + POISON_REG(16) + POISON_REG(17) + POISON_REG(18) + POISON_REG(19) + POISON_REG(20) + POISON_REG(21) + POISON_REG(22) + POISON_REG(23) + POISON_REG(24) + POISON_REG(25) + POISON_REG(26) + POISON_REG(27) + POISON_REG(28) + POISON_REG(29) + : // inputs + : // outputs + : "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", + "26", "27", "28", "29" + ); + #undef POISON_REG +} + +static int check_regs(void) +{ + unsigned long i; + + for (i = 15; i <= 29; i++) + FAIL_IF(signal_regs.gpr[i] != POISONED_REG(i)); + + printf("Regs OK\n"); + return 0; +} + +static void dump_regs(void) +{ + for (int i = 0; i < 32; i += 4) { + printf("r%02d 0x%016lx r%02d 0x%016lx " \ + "r%02d 0x%016lx r%02d 0x%016lx\n", + i, signal_regs.gpr[i], + i+1, signal_regs.gpr[i+1], + i+2, signal_regs.gpr[i+2], + i+3, signal_regs.gpr[i+3]); + } +} + +int test_wild_bctr(void) +{ + int (*func_ptr)(void); + struct sigaction segv = { + .sa_sigaction = segv_handler, + .sa_flags = SA_SIGINFO + }; + struct sigaction usr2 = { + .sa_sigaction = usr2_handler, + .sa_flags = SA_SIGINFO + }; + + FAIL_IF(sigaction(SIGSEGV, &segv, NULL)); + FAIL_IF(sigaction(SIGUSR2, &usr2, NULL)); + + bzero(&signal_regs, sizeof(signal_regs)); + + if (setjmp(setjmp_env) == 0) { + func_ptr = ok; + func_ptr(); + + kill(getpid(), SIGUSR2); + printf("Regs before:\n"); + dump_regs(); + bzero(&signal_regs, sizeof(signal_regs)); + + poison_regs(); + + func_ptr = (int (*)(void))BAD_NIP; + func_ptr(); + + FAIL_IF(1); /* we didn't segv? 
*/ + } + + FAIL_IF(signal_regs.nip != BAD_NIP); + + printf("All good - took SEGV as expected branching to 0x%llx\n", BAD_NIP); + + dump_regs(); + FAIL_IF(check_regs()); + + return 0; +} + +int main(void) +{ + return test_harness(test_wild_bctr, "wild_bctr"); +} From dd76ff5af35350fd6d5bb5b069e73b6017f66893 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 28 Aug 2018 18:11:27 +1000 Subject: [PATCH 216/221] powerpc/64s/radix: Fix radix__flush_tlb_collapsed_pmd double flushing pmd Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/mm/tlb-radix.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index 4e798f33c530..6a6399108072 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -1008,7 +1008,6 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) goto local; } _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); - goto local; } else { local: _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); From b851ba02a6f3075f0f99c60c4bc30a4af80cf428 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 29 Aug 2018 21:56:56 +1000 Subject: [PATCH 217/221] powerpc/64/module: REL32 relocation range check The recent module relocation overflow crash demonstrated that we have no range checking on REL32 relative relocations. This patch implements a basic check, the same kernel that previously oopsed and rebooted now continues with some of these errors when loading the module: module_64: x_tables: REL32 527703503449812 out of range! Possibly other relocations (ADDR32, REL16, TOC16, etc.) should also have overflow checks. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/module_64.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index 2c53de9f3b6a..8661eea78503 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -680,7 +680,14 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_PPC64_REL32: /* 32 bits relative (used by relative exception tables) */ - *(u32 *)location = value - (unsigned long)location; + /* Convert value to relative */ + value -= (unsigned long)location; + if (value + 0x80000000 > 0xffffffff) { + pr_err("%s: REL32 %li out of range!\n", + me->name, (long int)value); + return -ENOEXEC; + } + *(u32 *)location = value; break; case R_PPC64_TOCSAVE: From daf00ae71dad8aa05965713c62558aeebf2df48e Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 13 Oct 2018 09:16:22 +0000 Subject: [PATCH 218/221] powerpc/traps: restore recoverability of machine_check interrupts commit b96672dd840f ("powerpc: Machine check interrupt is a non- maskable interrupt") added a call to nmi_enter() at the beginning of machine check restart exception handler. Due to that, in_interrupt() always returns true regardless of the state before entering the exception, and die() panics even when the system was not already in interrupt. 
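Roughly, as a simplified sketch (not the actual handler code):

    nmi_enter();            /* raises the NMI/HARDIRQ bits in preempt_count */
    ...
    die("Machine check", regs, SIGBUS);
                            /* the oops path tests in_interrupt(), which now
                             * always reports true, so it panics with
                             * "Fatal exception in interrupt" even when the
                             * machine check hit ordinary process context */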
This patch calls nmi_exit() before calling die() in order to restore the interrupt state we had before calling nmi_enter() Fixes: b96672dd840f ("powerpc: Machine check interrupt is a non-maskable interrupt") Signed-off-by: Christophe Leroy Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/traps.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index f1629a8acc4b..47904cd1ff46 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -763,12 +763,17 @@ void machine_check_exception(struct pt_regs *regs) if (check_io_access(regs)) goto bail; - die("Machine check", regs, SIGBUS); - /* Must die if the interrupt is not recoverable */ if (!(regs->msr & MSR_RI)) nmi_panic(regs, "Unrecoverable Machine check"); + if (!nested) + nmi_exit(); + + die("Machine check", regs, SIGBUS); + + return; + bail: if (!nested) nmi_exit(); From b6aeddea74b08518289fc86545297cf18a0b53a7 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 19 Oct 2018 16:19:10 +1100 Subject: [PATCH 219/221] powerpc: Fix stack protector crashes on CPU hotplug Recently in commit 7241d26e8175 ("powerpc/64: properly initialise the stackprotector canary on SMP.") we fixed a crash with stack protector on SMP by initialising the stack canary in cpu_idle_thread_init(). But this can also causes crashes, when a CPU comes back online after being offline: Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: pnv_smp_cpu_kill_self+0x2a0/0x2b0 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.19.0-rc3-gcc-7.3.1-00168-g4ffe713b7587 #94 Call Trace: dump_stack+0xb0/0xf4 (unreliable) panic+0x144/0x328 __stack_chk_fail+0x2c/0x30 pnv_smp_cpu_kill_self+0x2a0/0x2b0 cpu_die+0x48/0x70 arch_cpu_idle_dead+0x20/0x40 do_idle+0x274/0x390 cpu_startup_entry+0x38/0x50 start_secondary+0x5e4/0x600 start_secondary_prolog+0x10/0x14 Looking at the stack we see that the canary value in the stack frame doesn't match the canary in the task/paca. That is because we have reinitialised the task/paca value, but then the CPU coming online has returned into a function using the old canary value. That causes the comparison to fail. Instead we can call boot_init_stack_canary() from start_secondary() which never returns. This is essentially what the generic code does in cpu_startup_entry() under #ifdef X86, we should make that non-x86 specific in a future patch. 
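The failure mode, sketched as the pseudo-C the compiler effectively generates (names assumed for illustration; the real checks are emitted by GCC's stack protector):

    void pnv_smp_cpu_kill_self(void)
    {
            unsigned long frame_canary = current->stack_canary;  /* prologue */

            /* ... CPU is offlined here; when it is brought back online,
             * cpu_idle_thread_init() used to install a fresh random value
             * as the task/paca canary ... */

            if (frame_canary != current->stack_canary)           /* epilogue */
                    __stack_chk_fail();     /* -> the panic shown above */
    }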
Fixes: 7241d26e8175 ("powerpc/64: properly initialise the stackprotector canary on SMP.") Reported-by: Joel Stanley Signed-off-by: Michael Ellerman Reviewed-by: Christophe Leroy --- arch/powerpc/kernel/smp.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 8e3a5da24d59..3f15edf25a0d 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -1014,16 +1015,9 @@ static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle) { struct thread_info *ti = task_thread_info(idle); -#ifdef CONFIG_STACKPROTECTOR - idle->stack_canary = get_random_canary(); -#endif - #ifdef CONFIG_PPC64 paca_ptrs[cpu]->__current = idle; paca_ptrs[cpu]->kstack = (unsigned long)ti + THREAD_SIZE - STACK_FRAME_OVERHEAD; -#ifdef CONFIG_STACKPROTECTOR - paca_ptrs[cpu]->canary = idle->stack_canary; -#endif #endif ti->cpu = cpu; secondary_ti = current_set[cpu] = ti; @@ -1316,6 +1310,8 @@ void start_secondary(void *unused) notify_cpu_starting(cpu); set_cpu_online(cpu, true); + boot_init_stack_canary(); + local_irq_enable(); /* We can enable ftrace for secondary cpus now */ From 0f99153def98134403c9149128e59d3e1786cf04 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 19 Oct 2018 06:12:50 +0000 Subject: [PATCH 220/221] powerpc/msi: Fix compile error on mpc83xx mpic_get_primary_version() is not defined when not using MPIC. The compile error log like: arch/powerpc/sysdev/built-in.o: In function `fsl_of_msi_probe': fsl_msi.c:(.text+0x150c): undefined reference to `fsl_mpic_primary_get_version' Signed-off-by: Jia Hongtao Signed-off-by: Scott Wood Reported-by: Radu Rendec Fixes: 807d38b73b6 ("powerpc/mpic: Add get_version API both for internal and external use") Cc: stable@vger.kernel.org Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/mpic.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/powerpc/include/asm/mpic.h b/arch/powerpc/include/asm/mpic.h index fad8ddd697ac..0abf2e7fd222 100644 --- a/arch/powerpc/include/asm/mpic.h +++ b/arch/powerpc/include/asm/mpic.h @@ -393,7 +393,14 @@ extern struct bus_type mpic_subsys; #define MPIC_REGSET_TSI108 MPIC_REGSET(1) /* Tsi108/109 PIC */ /* Get the version of primary MPIC */ +#ifdef CONFIG_MPIC extern u32 fsl_mpic_primary_get_version(void); +#else +static inline u32 fsl_mpic_primary_get_version(void) +{ + return 0; +} +#endif /* Allocate the controller structure and setup the linux irq descs * for the range if interrupts passed in. No HW initialization is From 58cfbac25b1fd2b76f94566aed28a3662b0ff8c6 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 26 Oct 2018 04:24:15 +1100 Subject: [PATCH 221/221] Revert "selftests/powerpc: Fix out-of-tree build errors" This reverts commit d8a2fe29d3c97038c8efcc328d5e7940c5310565. That commit, by me, fixed the out of tree build errors by causing some of the tests not to build at all. 
--- tools/testing/selftests/powerpc/cache_shape/Makefile | 2 ++ tools/testing/selftests/powerpc/ptrace/Makefile | 2 ++ tools/testing/selftests/powerpc/signal/Makefile | 2 ++ tools/testing/selftests/powerpc/switch_endian/Makefile | 1 - 4 files changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/powerpc/cache_shape/Makefile b/tools/testing/selftests/powerpc/cache_shape/Makefile index 62e947ca9921..ede4d3dae750 100644 --- a/tools/testing/selftests/powerpc/cache_shape/Makefile +++ b/tools/testing/selftests/powerpc/cache_shape/Makefile @@ -1,6 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 TEST_PROGS := cache_shape +all: $(TEST_PROGS) + $(TEST_PROGS): ../harness.c ../utils.c top_srcdir = ../../../../.. diff --git a/tools/testing/selftests/powerpc/ptrace/Makefile b/tools/testing/selftests/powerpc/ptrace/Makefile index 6ac71b629276..9b35ca8e8f13 100644 --- a/tools/testing/selftests/powerpc/ptrace/Makefile +++ b/tools/testing/selftests/powerpc/ptrace/Makefile @@ -7,6 +7,8 @@ TEST_PROGS := ptrace-gpr ptrace-tm-gpr ptrace-tm-spd-gpr \ top_srcdir = ../../../../.. include ../../lib.mk +all: $(TEST_PROGS) + CFLAGS += -m64 -I../../../../../usr/include -I../tm -mhtm -fno-pie ptrace-pkey core-pkey: child.h diff --git a/tools/testing/selftests/powerpc/signal/Makefile b/tools/testing/selftests/powerpc/signal/Makefile index d34a7c7710db..1fca25c6ace0 100644 --- a/tools/testing/selftests/powerpc/signal/Makefile +++ b/tools/testing/selftests/powerpc/signal/Makefile @@ -1,6 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 TEST_PROGS := signal signal_tm +all: $(TEST_PROGS) + $(TEST_PROGS): ../harness.c ../utils.c signal.S CFLAGS += -maltivec diff --git a/tools/testing/selftests/powerpc/switch_endian/Makefile b/tools/testing/selftests/powerpc/switch_endian/Makefile index bdc081afedb0..fcd2dcb8972b 100644 --- a/tools/testing/selftests/powerpc/switch_endian/Makefile +++ b/tools/testing/selftests/powerpc/switch_endian/Makefile @@ -8,7 +8,6 @@ EXTRA_CLEAN = $(OUTPUT)/*.o $(OUTPUT)/check-reversed.S top_srcdir = ../../../../.. include ../../lib.mk -$(OUTPUT)/switch_endian_test: ASFLAGS += -I $(OUTPUT) $(OUTPUT)/switch_endian_test: $(OUTPUT)/check-reversed.S $(OUTPUT)/check-reversed.o: $(OUTPUT)/check.o