x86, UV: Fix NMI handler for UV platforms

This fixes problems seen on UV systems handling NMIs from the
node controller.

I isolated the "dazed..." messages that I saw earlier to a bug in
the BMC on our platform. It was sending NMIs w/o properly setting
a register that indicated the source of NMI.

So rather than _assuming_ any unhandled NMI came from the UV system
maintenance console (SMC), add a check to verify that the SMC actually
sent the NMI.

Signed-off-by: Jack Steiner <steiner@sgi.com>
Cc: gorcunov@gmail.com
Cc: dzickus@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
This commit is contained in:
Jack Steiner 2011-05-09 11:35:19 -05:00 committed by Ingo Molnar
parent 693d92a1bb
commit 1d44e8288a
3 changed files with 60 additions and 6 deletions

View File

@ -398,6 +398,8 @@ struct uv_blade_info {
unsigned short nr_online_cpus;
unsigned short pnode;
short memory_nid;
spinlock_t nmi_lock;
unsigned long nmi_count;
};
extern struct uv_blade_info *uv_blade_info;
extern short *uv_node_to_blade;

View File

@ -5,7 +5,7 @@
*
* SGI UV MMR definitions
*
* Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved.
* Copyright (C) 2007-2011 Silicon Graphics, Inc. All rights reserved.
*/
#ifndef _ASM_X86_UV_UV_MMRS_H
@ -1099,5 +1099,19 @@ union uvh_rtc1_int_config_u {
} s;
};
/* ========================================================================= */
/* UVH_SCRATCH5 */
/* ========================================================================= */
#define UVH_SCRATCH5 0x2d0200UL
#define UVH_SCRATCH5_32 0x00778
#define UVH_SCRATCH5_SCRATCH5_SHFT 0
#define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL
union uvh_scratch5_u {
unsigned long v;
struct uvh_scratch5_s {
unsigned long scratch5 : 64; /* RW, W1CS */
} s;
};
#endif /* __ASM_UV_MMRS_X86_H__ */

View File

@ -37,6 +37,13 @@
#include <asm/smp.h>
#include <asm/x86_init.h>
#include <asm/emergency-restart.h>
#include <asm/nmi.h>
/* BMC sets a bit this MMR non-zero before sending an NMI */
#define UVH_NMI_MMR UVH_SCRATCH5
#define UVH_NMI_MMR_CLEAR (UVH_NMI_MMR + 8)
#define UV_NMI_PENDING_MASK (1UL << 63)
DEFINE_PER_CPU(unsigned long, cpu_last_nmi_count);
DEFINE_PER_CPU(int, x2apic_extra_bits);
@ -642,18 +649,46 @@ void __cpuinit uv_cpu_init(void)
*/
int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
{
unsigned long real_uv_nmi;
int bid;
if (reason != DIE_NMIUNKNOWN)
return NOTIFY_OK;
if (in_crash_kexec)
/* do nothing if entering the crash kernel */
return NOTIFY_OK;
/*
* Use a lock so only one cpu prints at a time
* to prevent intermixed output.
* Each blade has an MMR that indicates when an NMI has been sent
* to cpus on the blade. If an NMI is detected, atomically
* clear the MMR and update a per-blade NMI count used to
* cause each cpu on the blade to notice a new NMI.
*/
bid = uv_numa_blade_id();
real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK);
if (unlikely(real_uv_nmi)) {
spin_lock(&uv_blade_info[bid].nmi_lock);
real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK);
if (real_uv_nmi) {
uv_blade_info[bid].nmi_count++;
uv_write_local_mmr(UVH_NMI_MMR_CLEAR, UV_NMI_PENDING_MASK);
}
spin_unlock(&uv_blade_info[bid].nmi_lock);
}
if (likely(__get_cpu_var(cpu_last_nmi_count) == uv_blade_info[bid].nmi_count))
return NOTIFY_DONE;
__get_cpu_var(cpu_last_nmi_count) = uv_blade_info[bid].nmi_count;
/*
* Use a lock so only one cpu prints at a time.
* This prevents intermixed output.
*/
spin_lock(&uv_nmi_lock);
pr_info("NMI stack dump cpu %u:\n", smp_processor_id());
pr_info("UV NMI stack dump cpu %u:\n", smp_processor_id());
dump_stack();
spin_unlock(&uv_nmi_lock);
@ -661,7 +696,8 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
}
static struct notifier_block uv_dump_stack_nmi_nb = {
.notifier_call = uv_handle_nmi
.notifier_call = uv_handle_nmi,
.priority = NMI_LOCAL_LOW_PRIOR - 1,
};
void uv_register_nmi_notifier(void)
@ -720,8 +756,9 @@ void __init uv_system_init(void)
printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades());
bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
uv_blade_info = kmalloc(bytes, GFP_KERNEL);
uv_blade_info = kzalloc(bytes, GFP_KERNEL);
BUG_ON(!uv_blade_info);
for (blade = 0; blade < uv_num_possible_blades(); blade++)
uv_blade_info[blade].memory_nid = -1;
@ -747,6 +784,7 @@ void __init uv_system_init(void)
uv_blade_info[blade].pnode = pnode;
uv_blade_info[blade].nr_possible_cpus = 0;
uv_blade_info[blade].nr_online_cpus = 0;
spin_lock_init(&uv_blade_info[blade].nmi_lock);
max_pnode = max(pnode, max_pnode);
blade++;
}