[PATCH] x86-64: Make lockless machine check record passing a bit more robust.

One machine is constantly throwing NMI watchdog timeouts in mce_log This was one attempt to fix it. (AK: this doesn't actually fix the bug I'm seeing unfortunately, probably drop. I don't like it that the reader can spin forever now waiting for a writer) Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-12 18:49:24 +02:00 · 2005-09-12 18:49:24 +02:00 · 673242c10d
commit 673242c10d
parent a54e678b8f
1 changed files with 21 additions and 11 deletions
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@ -56,15 +56,19 @@ void mce_log(struct mce *mce)
 	smp_wmb();
 	for (;;) {
 		entry = rcu_dereference(mcelog.next);
-		/* When the buffer fills up discard new entries. Assume 
-		   that the earlier errors are the more interesting. */
-		if (entry >= MCE_LOG_LEN) {
-			set_bit(MCE_OVERFLOW, &mcelog.flags);
-			return;
+		for (;;) {
+			/* When the buffer fills up discard new entries. Assume
+			   that the earlier errors are the more interesting. */
+			if (entry >= MCE_LOG_LEN) {
+				set_bit(MCE_OVERFLOW, &mcelog.flags);
+				return;
+			}
+			/* Old left over entry. Skip. */
+			if (mcelog.entry[entry].finished) {
+				entry++;
+				continue;
+			}
 		}
-		/* Old left over entry. Skip. */
-		if (mcelog.entry[entry].finished)
-			continue;
 		smp_rmb();
 		next = entry + 1;
 		if (cmpxchg(&mcelog.next, entry, next) == entry)
@ -404,9 +408,15 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff
 	}

 	err = 0;
-	for (i = 0; i < next; i++) {
-		if (!mcelog.entry[i].finished)
-			continue;
+	for (i = 0; i < next; i++) {		
+		unsigned long start = jiffies;
+		while (!mcelog.entry[i].finished) {
+			if (!time_before(jiffies, start + 2)) {
+				memset(mcelog.entry + i,0, sizeof(struct mce));
+				continue;
+			}
+			cpu_relax();
+		}
 		smp_rmb();
 		err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
 		buf += sizeof(struct mce);