From 1d44e8288a0557c28c447d7e511f50d06ff93a34 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Mon, 9 May 2011 11:35:19 -0500 Subject: [PATCH] x86, UV: Fix NMI handler for UV platforms This fixes problems seen on UV systems handling NMIs from the node controller. I isolated the "dazed..." messages that I saw earlier to a bug in the BMC on our platform. It was sending NMIs w/o properly setting a register that indicated the source of NMI. So rather than _assuming_ any unhandled NMI came from the UV system maintenance console (SMC), add a check to verify that the SMC actually sent the NMI. Signed-off-by: Jack Steiner Cc: gorcunov@gmail.com Cc: dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_hub.h | 2 ++ arch/x86/include/asm/uv/uv_mmrs.h | 16 +++++++++- arch/x86/kernel/apic/x2apic_uv_x.c | 48 ++++++++++++++++++++++++++---- 3 files changed, 60 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index a501741c2335..4298002d0c83 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -398,6 +398,8 @@ struct uv_blade_info { unsigned short nr_online_cpus; unsigned short pnode; short memory_nid; + spinlock_t nmi_lock; + unsigned long nmi_count; }; extern struct uv_blade_info *uv_blade_info; extern short *uv_node_to_blade; diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h index 20cafeac7455..f5bb64a823d7 100644 --- a/arch/x86/include/asm/uv/uv_mmrs.h +++ b/arch/x86/include/asm/uv/uv_mmrs.h @@ -5,7 +5,7 @@ * * SGI UV MMR definitions * - * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2011 Silicon Graphics, Inc. All rights reserved. */ #ifndef _ASM_X86_UV_UV_MMRS_H @@ -1099,5 +1099,19 @@ union uvh_rtc1_int_config_u { } s; }; +/* ========================================================================= */ +/* UVH_SCRATCH5 */ +/* ========================================================================= */ +#define UVH_SCRATCH5 0x2d0200UL +#define UVH_SCRATCH5_32 0x00778 + +#define UVH_SCRATCH5_SCRATCH5_SHFT 0 +#define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL +union uvh_scratch5_u { + unsigned long v; + struct uvh_scratch5_s { + unsigned long scratch5 : 64; /* RW, W1CS */ + } s; +}; #endif /* __ASM_UV_MMRS_X86_H__ */ diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 33b10a0fc095..7acd2d2ac965 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -37,6 +37,13 @@ #include #include #include +#include + +/* BMC sets a bit this MMR non-zero before sending an NMI */ +#define UVH_NMI_MMR UVH_SCRATCH5 +#define UVH_NMI_MMR_CLEAR (UVH_NMI_MMR + 8) +#define UV_NMI_PENDING_MASK (1UL << 63) +DEFINE_PER_CPU(unsigned long, cpu_last_nmi_count); DEFINE_PER_CPU(int, x2apic_extra_bits); @@ -642,18 +649,46 @@ void __cpuinit uv_cpu_init(void) */ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) { + unsigned long real_uv_nmi; + int bid; + if (reason != DIE_NMIUNKNOWN) return NOTIFY_OK; if (in_crash_kexec) /* do nothing if entering the crash kernel */ return NOTIFY_OK; + /* - * Use a lock so only one cpu prints at a time - * to prevent intermixed output. + * Each blade has an MMR that indicates when an NMI has been sent + * to cpus on the blade. If an NMI is detected, atomically + * clear the MMR and update a per-blade NMI count used to + * cause each cpu on the blade to notice a new NMI. + */ + bid = uv_numa_blade_id(); + real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK); + + if (unlikely(real_uv_nmi)) { + spin_lock(&uv_blade_info[bid].nmi_lock); + real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK); + if (real_uv_nmi) { + uv_blade_info[bid].nmi_count++; + uv_write_local_mmr(UVH_NMI_MMR_CLEAR, UV_NMI_PENDING_MASK); + } + spin_unlock(&uv_blade_info[bid].nmi_lock); + } + + if (likely(__get_cpu_var(cpu_last_nmi_count) == uv_blade_info[bid].nmi_count)) + return NOTIFY_DONE; + + __get_cpu_var(cpu_last_nmi_count) = uv_blade_info[bid].nmi_count; + + /* + * Use a lock so only one cpu prints at a time. + * This prevents intermixed output. */ spin_lock(&uv_nmi_lock); - pr_info("NMI stack dump cpu %u:\n", smp_processor_id()); + pr_info("UV NMI stack dump cpu %u:\n", smp_processor_id()); dump_stack(); spin_unlock(&uv_nmi_lock); @@ -661,7 +696,8 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) } static struct notifier_block uv_dump_stack_nmi_nb = { - .notifier_call = uv_handle_nmi + .notifier_call = uv_handle_nmi, + .priority = NMI_LOCAL_LOW_PRIOR - 1, }; void uv_register_nmi_notifier(void) @@ -720,8 +756,9 @@ void __init uv_system_init(void) printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades()); bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); - uv_blade_info = kmalloc(bytes, GFP_KERNEL); + uv_blade_info = kzalloc(bytes, GFP_KERNEL); BUG_ON(!uv_blade_info); + for (blade = 0; blade < uv_num_possible_blades(); blade++) uv_blade_info[blade].memory_nid = -1; @@ -747,6 +784,7 @@ void __init uv_system_init(void) uv_blade_info[blade].pnode = pnode; uv_blade_info[blade].nr_possible_cpus = 0; uv_blade_info[blade].nr_online_cpus = 0; + spin_lock_init(&uv_blade_info[blade].nmi_lock); max_pnode = max(pnode, max_pnode); blade++; }