Borislav Petkov e2def7d49d x86/mce: Make mce_rdmsrl() panic on an inaccessible MSR
If an exception needs to be handled while reading an MSR - which is in
most cases caused by a #GP on a non-existent MSR - then this is most
likely the incarnation of a BIOS or a hardware bug. Such a bug violates
the architectural guarantee that MCA banks are present with all MSRs
belonging to them.

The proper fix belongs in the hardware/firmware - not in the kernel.

Handling an #MC exception which is raised while an NMI is being handled
would cause the nasty NMI-nesting issue because IRET has the shortcoming
of re-enabling NMIs when executed. And the machine is in an #MC context
already, so <Deity> be at its side.

Tracing MSR accesses while in #MC is another no-no because tracing is
inherently a bad idea in atomic context:

  vmlinux.o: warning: objtool: do_machine_check()+0x4a: call to mce_rdmsrl() leaves .noinstr.text section

so remove all that "additional" functionality from mce_rdmsrl() and
provide it with a special exception handler which panics the machine
when the MSR is not accessible.

The exception handler prints a human-readable message explaining what
the panic reason is. What is more, it panics while still in the #GP
handler, so the latter never executes an IRET and the NMI-nesting
window is never opened in the case where the #MC happened while
handling an NMI. (#MC itself won't be re-enabled until MCG_STATUS has
been cleared.)
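
For illustration, the new read path and its exception handler look
roughly like the sketch below. It is simplified - the error-injection
fallback and the stack dump in the handler are left out - and it relies
on the kernel's existing helpers: DECLARE_ARGS()/EAX_EDX_*() from
<asm/msr.h>, and _ASM_EXTABLE_HANDLE() to wire a faulting instruction
to a custom handler:

  static noinstr u64 mce_rdmsrl(u32 msr)
  {
          DECLARE_ARGS(val, low, high);

          /*
           * RDMSR on MCA MSRs should not fault. If it does, route the
           * #GP to ex_handler_rdmsr_fault() instead of a generic fixup.
           */
          asm volatile("1: rdmsr\n"
                       "2:\n"
                       _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_fault)
                       : EAX_EDX_RET(val, low, high) : "c" (msr));

          return EAX_EDX_VAL(val, low, high);
  }

  /* Report the inaccessible MSR, then halt the machine. */
  __visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup,
                                        struct pt_regs *regs, int trapnr,
                                        unsigned long error_code,
                                        unsigned long fault_addr)
  {
          /* RDMSR takes the MSR number in ECX. */
          pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
                   (unsigned int)regs->cx, regs->ip, (void *)regs->ip);

          panic("MCA architectural violation!\n");

          return true;
  }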

Suggested-by: Andy Lutomirski <luto@kernel.org>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
[ Add missing prototypes for ex_handler_* ]
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Link: https://lkml.kernel.org/r/20200906212130.GA28456@zn.tnic
2020-09-11 08:25:43 +02:00

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __X86_MCE_INTERNAL_H__
#define __X86_MCE_INTERNAL_H__

#undef pr_fmt
#define pr_fmt(fmt) "mce: " fmt

#include <linux/device.h>
#include <asm/mce.h>

/* Pointer to the installed machine check handler for this CPU setup. */
extern void (*machine_check_vector)(struct pt_regs *);

enum severity_level {
        MCE_NO_SEVERITY,
        MCE_DEFERRED_SEVERITY,
        MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY,
        MCE_KEEP_SEVERITY,
        MCE_SOME_SEVERITY,
        MCE_AO_SEVERITY,
        MCE_UC_SEVERITY,
        MCE_AR_SEVERITY,
        MCE_PANIC_SEVERITY,
};

extern struct blocking_notifier_head x86_mce_decoder_chain;

#define INITIAL_CHECK_INTERVAL  (5 * 60) /* 5 minutes */

struct mce_evt_llist {
        struct llist_node llnode;
        struct mce mce;
};

void mce_gen_pool_process(struct work_struct *__unused);
bool mce_gen_pool_empty(void);
int mce_gen_pool_add(struct mce *mce);
int mce_gen_pool_init(void);
struct llist_node *mce_gen_pool_prepare_records(void);

extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
struct dentry *mce_get_debugfs_dir(void);

extern mce_banks_t mce_banks_ce_disabled;

#ifdef CONFIG_X86_MCE_INTEL
unsigned long cmci_intel_adjust_timer(unsigned long interval);
bool mce_intel_cmci_poll(void);
void mce_intel_hcpu_update(unsigned long cpu);
void cmci_disable_bank(int bank);
void intel_init_cmci(void);
void intel_init_lmce(void);
void intel_clear_lmce(void);
bool intel_filter_mce(struct mce *m);
#else
# define cmci_intel_adjust_timer mce_adjust_timer_default
static inline bool mce_intel_cmci_poll(void) { return false; }
static inline void mce_intel_hcpu_update(unsigned long cpu) { }
static inline void cmci_disable_bank(int bank) { }
static inline void intel_init_cmci(void) { }
static inline void intel_init_lmce(void) { }
static inline void intel_clear_lmce(void) { }
static inline bool intel_filter_mce(struct mce *m) { return false; }
#endif

void mce_timer_kick(unsigned long interval);

#ifdef CONFIG_ACPI_APEI
int apei_write_mce(struct mce *m);
ssize_t apei_read_mce(struct mce *m, u64 *record_id);
int apei_check_mce(void);
int apei_clear_mce(u64 record_id);
#else
static inline int apei_write_mce(struct mce *m)
{
        return -EINVAL;
}
static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id)
{
        return 0;
}
static inline int apei_check_mce(void)
{
        return 0;
}
static inline int apei_clear_mce(u64 record_id)
{
        return -EINVAL;
}
#endif

/*
 * We consider records to be equivalent if bank+status+addr+misc all match.
 * This is only used when the system is going down because of a fatal error
 * to avoid cluttering the console log with essentially repeated information.
 * In normal processing all errors seen are logged.
 */
static inline bool mce_cmp(struct mce *m1, struct mce *m2)
{
        return m1->bank != m2->bank ||
               m1->status != m2->status ||
               m1->addr != m2->addr ||
               m1->misc != m2->misc;
}

extern struct device_attribute dev_attr_trigger;

#ifdef CONFIG_X86_MCELOG_LEGACY
void mce_work_trigger(void);
void mce_register_injector_chain(struct notifier_block *nb);
void mce_unregister_injector_chain(struct notifier_block *nb);
#else
static inline void mce_work_trigger(void) { }
static inline void mce_register_injector_chain(struct notifier_block *nb) { }
static inline void mce_unregister_injector_chain(struct notifier_block *nb) { }
#endif

struct mca_config {
        bool dont_log_ce;
        bool cmci_disabled;
        bool ignore_ce;
        bool print_all;

        __u64 lmce_disabled             : 1,
              disabled                  : 1,
              ser                       : 1,
              recovery                  : 1,
              bios_cmci_threshold       : 1,
              __reserved                : 59;

        s8 bootlog;
        int tolerant;
        int monarch_timeout;
        int panic_timeout;
        u32 rip_msr;
};

extern struct mca_config mca_cfg;
DECLARE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);

struct mce_vendor_flags {
        /*
         * Indicates that overflow conditions are not fatal, when set.
         */
        __u64 overflow_recov    : 1,

        /*
         * (AMD) SUCCOR stands for S/W UnCorrectable error COntainment and
         * Recovery. It indicates support for data poisoning in HW and deferred
         * error interrupts.
         */
        succor                  : 1,

        /*
         * (AMD) SMCA: This bit indicates support for Scalable MCA which expands
         * the register space for each MCA bank and also increases number of
         * banks. Also, to accommodate the new banks and registers, the MCA
         * register space is moved to a new MSR range.
         */
        smca                    : 1,

        /* AMD-style error thresholding banks present. */
        amd_threshold           : 1,

        __reserved_0            : 60;
};

extern struct mce_vendor_flags mce_flags;

struct mca_msr_regs {
        u32 (*ctl)    (int bank);
        u32 (*status) (int bank);
        u32 (*addr)   (int bank);
        u32 (*misc)   (int bank);
};

extern struct mca_msr_regs msr_ops;

/* Decide whether to add MCE record to MCE event pool or filter it out. */
extern bool filter_mce(struct mce *m);

#ifdef CONFIG_X86_MCE_AMD
extern bool amd_filter_mce(struct mce *m);
#else
static inline bool amd_filter_mce(struct mce *m) { return false; }
#endif

__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup,
                                      struct pt_regs *regs, int trapnr,
                                      unsigned long error_code,
                                      unsigned long fault_addr);

__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup,
                                      struct pt_regs *regs, int trapnr,
                                      unsigned long error_code,
                                      unsigned long fault_addr);

#endif /* __X86_MCE_INTERNAL_H__ */
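
For context on the mca_msr_regs indirection declared above: the four
function pointers exist so the MCE core can compute a bank's register
addresses without caring whether the CPU uses the legacy MCA MSR layout
or AMD's relocated SMCA range. A hypothetical sketch follows - the
resolver names and read_bank_status() are illustrative only, while
MSR_IA32_MCx_STATUS() and MSR_AMD64_SMCA_MCx_STATUS() are existing
<asm/msr-index.h> macros:

  /* Legacy layout: MC0_STATUS at 0x401, four MSRs per bank. */
  static u32 status_reg(int bank)
  {
          return MSR_IA32_MCx_STATUS(bank);
  }

  /* SMCA layout: banks relocated to a separate, wider MSR range. */
  static u32 smca_status_reg(int bank)
  {
          return MSR_AMD64_SMCA_MCx_STATUS(bank);
  }

  /* Callers read bank state without knowing which layout is active: */
  static u64 read_bank_status(int bank)
  {
          return mce_rdmsrl(msr_ops.status(bank));
  }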