
TOMOYO Linux Cross Reference
Linux/arch/x86/kernel/cpu/mcheck/mce.c


  1 /*
  2  * Machine check handler.
  3  *
  4  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
  5  * Rest from unknown author(s).
  6  * 2004 Andi Kleen. Rewrote most of it.
  7  * Copyright 2008 Intel Corporation
  8  * Author: Andi Kleen
  9  */
 10 
 11 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 12 
 13 #include <linux/thread_info.h>
 14 #include <linux/capability.h>
 15 #include <linux/miscdevice.h>
 16 #include <linux/ratelimit.h>
 17 #include <linux/kallsyms.h>
 18 #include <linux/rcupdate.h>
 19 #include <linux/kobject.h>
 20 #include <linux/uaccess.h>
 21 #include <linux/kdebug.h>
 22 #include <linux/kernel.h>
 23 #include <linux/percpu.h>
 24 #include <linux/string.h>
 25 #include <linux/device.h>
 26 #include <linux/syscore_ops.h>
 27 #include <linux/delay.h>
 28 #include <linux/ctype.h>
 29 #include <linux/sched.h>
 30 #include <linux/sysfs.h>
 31 #include <linux/types.h>
 32 #include <linux/slab.h>
 33 #include <linux/init.h>
 34 #include <linux/kmod.h>
 35 #include <linux/poll.h>
 36 #include <linux/nmi.h>
 37 #include <linux/cpu.h>
 38 #include <linux/smp.h>
 39 #include <linux/fs.h>
 40 #include <linux/mm.h>
 41 #include <linux/debugfs.h>
 42 #include <linux/irq_work.h>
 43 #include <linux/export.h>
 44 #include <linux/jump_label.h>
 45 
 46 #include <asm/intel-family.h>
 47 #include <asm/processor.h>
 48 #include <asm/traps.h>
 49 #include <asm/tlbflush.h>
 50 #include <asm/mce.h>
 51 #include <asm/msr.h>
 52 
 53 #include "mce-internal.h"
 54 
 55 static DEFINE_MUTEX(mce_chrdev_read_mutex);
 56 
 57 static int mce_chrdev_open_count;       /* #times opened */
 58 
 59 #define mce_log_get_idx_check(p) \
 60 ({ \
 61         RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
 62                          !lockdep_is_held(&mce_chrdev_read_mutex), \
 63                          "suspicious mce_log_get_idx_check() usage"); \
 64         smp_load_acquire(&(p)); \
 65 })
 66 
 67 #define CREATE_TRACE_POINTS
 68 #include <trace/events/mce.h>
 69 
 70 #define SPINUNIT                100     /* 100ns */
 71 
 72 DEFINE_PER_CPU(unsigned, mce_exception_count);
 73 
 74 struct mce_bank *mce_banks __read_mostly;
 75 struct mce_vendor_flags mce_flags __read_mostly;
 76 
 77 struct mca_config mca_cfg __read_mostly = {
 78         .bootlog  = -1,
 79         /*
 80          * Tolerant levels:
 81          * 0: always panic on uncorrected errors, log corrected errors
 82          * 1: panic or SIGBUS on uncorrected errors, log corrected errors
 83          * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
 84          * 3: never panic or SIGBUS, log all errors (for testing only)
 85          */
 86         .tolerant = 1,
 87         .monarch_timeout = -1
 88 };
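/*
 * Usage sketch (assumptions, not taken from this file): the defaults above
 * can typically be overridden at boot with the "mce=" command line option
 * (a numeric tolerance level, optionally followed by the monarch timeout),
 * or adjusted at runtime through sysfs, e.g.:
 *
 *        echo 2 > /sys/devices/system/machinecheck/machinecheck0/tolerant
 *
 * Exact option spellings may differ between kernel versions.
 */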
 89 
 90 /* User mode helper program triggered by machine check event */
 91 static unsigned long            mce_need_notify;
 92 static char                     mce_helper[128];
 93 static char                     *mce_helper_argv[2] = { mce_helper, NULL };
 94 
 95 static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
 96 
 97 static DEFINE_PER_CPU(struct mce, mces_seen);
 98 static int                      cpu_missing;
 99 
100 /*
101  * MCA banks polled by the periodic polling timer for corrected events.
102  * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
103  */
104 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
105         [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
106 };
107 
108 /*
109  * MCA banks controlled through firmware first for corrected errors.
110  * This is a global list of banks for which we won't enable CMCI and we
111  * won't poll. Firmware controls these banks and is responsible for
112  * reporting corrected errors through GHES. Uncorrected/recoverable
113  * errors are still notified through a machine check.
114  */
115 mce_banks_t mce_banks_ce_disabled;
116 
117 static struct work_struct mce_work;
118 static struct irq_work mce_irq_work;
119 
120 static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
121 
122 /*
123  * CPU/chipset specific EDAC code can register a notifier call here to print
124  * MCE errors in a human-readable form.
125  */
126 BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
127 
128 /* Do initial initialization of a struct mce */
129 void mce_setup(struct mce *m)
130 {
131         memset(m, 0, sizeof(struct mce));
132         m->cpu = m->extcpu = smp_processor_id();
133         /* We hope get_seconds stays lockless */
134         m->time = get_seconds();
135         m->cpuvendor = boot_cpu_data.x86_vendor;
136         m->cpuid = cpuid_eax(1);
137         m->socketid = cpu_data(m->extcpu).phys_proc_id;
138         m->apicid = cpu_data(m->extcpu).initial_apicid;
139         rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
140 
141         if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
142                 rdmsrl(MSR_PPIN, m->ppin);
143 }
144 
145 DEFINE_PER_CPU(struct mce, injectm);
146 EXPORT_PER_CPU_SYMBOL_GPL(injectm);
147 
148 /*
149  * Lockless MCE logging infrastructure.
150  * This avoids deadlocks on printk locks without having to break locks. It
151  * also separates MCEs from kernel messages to avoid bogus bug reports.
152  */
153 
154 static struct mce_log mcelog = {
155         .signature      = MCE_LOG_SIGNATURE,
156         .len            = MCE_LOG_LEN,
157         .recordlen      = sizeof(struct mce),
158 };
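/*
 * In short, the lockless protocol used by mce_log() below: a writer reserves
 * a slot by advancing mcelog.next with cmpxchg(), copies the record in, and
 * only then sets ->finished behind a write barrier. Readers pair with that
 * through mce_log_get_idx_check()/smp_load_acquire() above. Once the
 * fixed-size buffer is full, new records are dropped and MCE_OVERFLOW is set.
 */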
159 
160 void mce_log(struct mce *mce)
161 {
162         unsigned next, entry;
163 
164         /* Emit the trace record: */
165         trace_mce_record(mce);
166 
167         if (!mce_gen_pool_add(mce))
168                 irq_work_queue(&mce_irq_work);
169 
170         wmb();
171         for (;;) {
172                 entry = mce_log_get_idx_check(mcelog.next);
173                 for (;;) {
174 
175                         /*
176                          * When the buffer fills up, discard new entries.
177                          * Assume that the earlier errors are the more
178                          * interesting ones:
179                          */
180                         if (entry >= MCE_LOG_LEN) {
181                                 set_bit(MCE_OVERFLOW,
182                                         (unsigned long *)&mcelog.flags);
183                                 return;
184                         }
185                         /* Old left over entry. Skip: */
186                         if (mcelog.entry[entry].finished) {
187                                 entry++;
188                                 continue;
189                         }
190                         break;
191                 }
192                 smp_rmb();
193                 next = entry + 1;
194                 if (cmpxchg(&mcelog.next, entry, next) == entry)
195                         break;
196         }
197         memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
198         wmb();
199         mcelog.entry[entry].finished = 1;
200         wmb();
201 
202         set_bit(0, &mce_need_notify);
203 }
204 
205 void mce_inject_log(struct mce *m)
206 {
207         mutex_lock(&mce_chrdev_read_mutex);
208         mce_log(m);
209         mutex_unlock(&mce_chrdev_read_mutex);
210 }
211 EXPORT_SYMBOL_GPL(mce_inject_log);
212 
213 static struct notifier_block mce_srao_nb;
214 
215 static atomic_t num_notifiers;
216 
217 void mce_register_decode_chain(struct notifier_block *nb)
218 {
219         atomic_inc(&num_notifiers);
220 
221         WARN_ON(nb->priority > MCE_PRIO_LOWEST && nb->priority < MCE_PRIO_EDAC);
222 
223         blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
224 }
225 EXPORT_SYMBOL_GPL(mce_register_decode_chain);
226 
227 void mce_unregister_decode_chain(struct notifier_block *nb)
228 {
229         atomic_dec(&num_notifiers);
230 
231         blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
232 }
233 EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
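/*
 * A minimal sketch of how a decoder (e.g. an EDAC driver) would hook into the
 * chain above; "my_decode" and "my_decode_nb" are illustrative names only:
 *
 *        static int my_decode(struct notifier_block *nb, unsigned long val,
 *                             void *data)
 *        {
 *                struct mce *m = data;
 *
 *                if (!m)
 *                        return NOTIFY_DONE;
 *
 *                pr_info("bank %d status 0x%llx addr 0x%llx\n",
 *                        m->bank, m->status, m->addr);
 *                return NOTIFY_OK;
 *        }
 *
 *        static struct notifier_block my_decode_nb = {
 *                .notifier_call  = my_decode,
 *                .priority       = MCE_PRIO_EDAC,
 *        };
 *
 *        mce_register_decode_chain(&my_decode_nb);
 */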
234 
235 static inline u32 ctl_reg(int bank)
236 {
237         return MSR_IA32_MCx_CTL(bank);
238 }
239 
240 static inline u32 status_reg(int bank)
241 {
242         return MSR_IA32_MCx_STATUS(bank);
243 }
244 
245 static inline u32 addr_reg(int bank)
246 {
247         return MSR_IA32_MCx_ADDR(bank);
248 }
249 
250 static inline u32 misc_reg(int bank)
251 {
252         return MSR_IA32_MCx_MISC(bank);
253 }
254 
255 static inline u32 smca_ctl_reg(int bank)
256 {
257         return MSR_AMD64_SMCA_MCx_CTL(bank);
258 }
259 
260 static inline u32 smca_status_reg(int bank)
261 {
262         return MSR_AMD64_SMCA_MCx_STATUS(bank);
263 }
264 
265 static inline u32 smca_addr_reg(int bank)
266 {
267         return MSR_AMD64_SMCA_MCx_ADDR(bank);
268 }
269 
270 static inline u32 smca_misc_reg(int bank)
271 {
272         return MSR_AMD64_SMCA_MCx_MISC(bank);
273 }
274 
275 struct mca_msr_regs msr_ops = {
276         .ctl    = ctl_reg,
277         .status = status_reg,
278         .addr   = addr_reg,
279         .misc   = misc_reg
280 };
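/*
 * msr_ops defaults to the legacy MCA register layout. On AMD Scalable MCA
 * (SMCA) systems the smca_*_reg() helpers above are expected to be swapped in
 * during early CPU init, which is why callers go through msr_ops rather than
 * using the MSR_IA32_MCx_* macros directly.
 */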
281 
282 static void __print_mce(struct mce *m)
283 {
284         pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
285                  m->extcpu,
286                  (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
287                  m->mcgstatus, m->bank, m->status);
288 
289         if (m->ip) {
290                 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
291                         !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
292                         m->cs, m->ip);
293 
294                 if (m->cs == __KERNEL_CS)
295                         print_symbol("{%s}", m->ip);
296                 pr_cont("\n");
297         }
298 
299         pr_emerg(HW_ERR "TSC %llx ", m->tsc);
300         if (m->addr)
301                 pr_cont("ADDR %llx ", m->addr);
302         if (m->misc)
303                 pr_cont("MISC %llx ", m->misc);
304 
305         if (mce_flags.smca) {
306                 if (m->synd)
307                         pr_cont("SYND %llx ", m->synd);
308                 if (m->ipid)
309                         pr_cont("IPID %llx ", m->ipid);
310         }
311 
312         pr_cont("\n");
313         /*
314          * Note this output is parsed by external tools and old fields
315          * should not be changed.
316          */
317         pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
318                 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
319                 cpu_data(m->extcpu).microcode);
320 }
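/*
 * Example of the fixed-format output produced above (values are illustrative),
 * which tools such as mcelog parse:
 *
 *        [Hardware Error]: CPU 1: Machine Check Exception: 5 Bank 4: b200000000070f0f
 *        [Hardware Error]: TSC 3b8d8 ADDR 1234 MISC d012000100000000
 *        [Hardware Error]: PROCESSOR 0:306c3 TIME 1339280319 SOCKET 0 APIC 2 microcode 19
 */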
321 
322 static void print_mce(struct mce *m)
323 {
324         __print_mce(m);
325         pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
326 }
327 
328 #define PANIC_TIMEOUT 5 /* 5 seconds */
329 
330 static atomic_t mce_panicked;
331 
332 static int fake_panic;
333 static atomic_t mce_fake_panicked;
334 
335 /* Panic in progress. Enable interrupts and wait for final IPI */
336 static void wait_for_panic(void)
337 {
338         long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
339 
340         preempt_disable();
341         local_irq_enable();
342         while (timeout-- > 0)
343                 udelay(1);
344         if (panic_timeout == 0)
345                 panic_timeout = mca_cfg.panic_timeout;
346         panic("Panicking machine check CPU died");
347 }
348 
349 static void mce_panic(const char *msg, struct mce *final, char *exp)
350 {
351         int apei_err = 0;
352         struct llist_node *pending;
353         struct mce_evt_llist *l;
354 
355         if (!fake_panic) {
356                 /*
357                  * Make sure only one CPU runs in machine check panic
358                  */
359                 if (atomic_inc_return(&mce_panicked) > 1)
360                         wait_for_panic();
361                 barrier();
362 
363                 bust_spinlocks(1);
364                 console_verbose();
365         } else {
366                 /* Don't log too much for fake panic */
367                 if (atomic_inc_return(&mce_fake_panicked) > 1)
368                         return;
369         }
370         pending = mce_gen_pool_prepare_records();
371         /* First print corrected ones that are still unlogged */
372         llist_for_each_entry(l, pending, llnode) {
373                 struct mce *m = &l->mce;
374                 if (!(m->status & MCI_STATUS_UC)) {
375                         print_mce(m);
376                         if (!apei_err)
377                                 apei_err = apei_write_mce(m);
378                 }
379         }
380         /* Now print uncorrected but with the final one last */
381         llist_for_each_entry(l, pending, llnode) {
382                 struct mce *m = &l->mce;
383                 if (!(m->status & MCI_STATUS_UC))
384                         continue;
385                 if (!final || mce_cmp(m, final)) {
386                         print_mce(m);
387                         if (!apei_err)
388                                 apei_err = apei_write_mce(m);
389                 }
390         }
391         if (final) {
392                 print_mce(final);
393                 if (!apei_err)
394                         apei_err = apei_write_mce(final);
395         }
396         if (cpu_missing)
397                 pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
398         if (exp)
399                 pr_emerg(HW_ERR "Machine check: %s\n", exp);
400         if (!fake_panic) {
401                 if (panic_timeout == 0)
402                         panic_timeout = mca_cfg.panic_timeout;
403                 panic(msg);
404         } else
405                 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
406 }
407 
408 /* Support code for software error injection */
409 
410 static int msr_to_offset(u32 msr)
411 {
412         unsigned bank = __this_cpu_read(injectm.bank);
413 
414         if (msr == mca_cfg.rip_msr)
415                 return offsetof(struct mce, ip);
416         if (msr == msr_ops.status(bank))
417                 return offsetof(struct mce, status);
418         if (msr == msr_ops.addr(bank))
419                 return offsetof(struct mce, addr);
420         if (msr == msr_ops.misc(bank))
421                 return offsetof(struct mce, misc);
422         if (msr == MSR_IA32_MCG_STATUS)
423                 return offsetof(struct mce, mcgstatus);
424         return -1;
425 }
426 
427 /* MSR access wrappers used for error injection */
428 static u64 mce_rdmsrl(u32 msr)
429 {
430         u64 v;
431 
432         if (__this_cpu_read(injectm.finished)) {
433                 int offset = msr_to_offset(msr);
434 
435                 if (offset < 0)
436                         return 0;
437                 return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
438         }
439 
440         if (rdmsrl_safe(msr, &v)) {
441                 WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr);
442                 /*
443                  * Return zero in case the access faulted. This should
444                  * not happen normally but can happen if the CPU does
445                  * something weird, or if the code is buggy.
446                  */
447                 v = 0;
448         }
449 
450         return v;
451 }
452 
453 static void mce_wrmsrl(u32 msr, u64 v)
454 {
455         if (__this_cpu_read(injectm.finished)) {
456                 int offset = msr_to_offset(msr);
457 
458                 if (offset >= 0)
459                         *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
460                 return;
461         }
462         wrmsrl(msr, v);
463 }
464 
465 /*
466  * Collect all global (w.r.t. this processor) status about this machine
467  * check into our "mce" struct so that we can use it later to assess
468  * the severity of the problem as we read per-bank specific details.
469  */
470 static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
471 {
472         mce_setup(m);
473 
474         m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
475         if (regs) {
476                 /*
477                  * Get the address of the instruction at the time of
478                  * the machine check error.
479                  */
480                 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
481                         m->ip = regs->ip;
482                         m->cs = regs->cs;
483 
484                         /*
485                          * When in VM86 mode, always make the cs look like
486                          * ring 3. This is a lie, but it's better than passing
487                          * the additional vm86 bit around everywhere.
488                          */
489                         if (v8086_mode(regs))
490                                 m->cs |= 3;
491                 }
492                 /* Use accurate RIP reporting if available. */
493                 if (mca_cfg.rip_msr)
494                         m->ip = mce_rdmsrl(mca_cfg.rip_msr);
495         }
496 }
497 
498 int mce_available(struct cpuinfo_x86 *c)
499 {
500         if (mca_cfg.disabled)
501                 return 0;
502         return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
503 }
504 
505 static void mce_schedule_work(void)
506 {
507         if (!mce_gen_pool_empty())
508                 schedule_work(&mce_work);
509 }
510 
511 static void mce_irq_work_cb(struct irq_work *entry)
512 {
513         mce_notify_irq();
514         mce_schedule_work();
515 }
516 
517 static void mce_report_event(struct pt_regs *regs)
518 {
519         if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
520                 mce_notify_irq();
521                 /*
522                  * Triggering the work queue here is just an insurance
523                  * policy in case the syscall exit notify handler
524                  * doesn't run soon enough or ends up running on the
525                  * wrong CPU (can happen when audit sleeps)
526                  */
527                 mce_schedule_work();
528                 return;
529         }
530 
531         irq_work_queue(&mce_irq_work);
532 }
533 
534 /*
535  * Check if the address reported by the CPU is in a format we can parse.
536  * It would be possible to add code for most other cases, but all would
537  * be somewhat complicated (e.g. segment offset would require an instruction
538  * parser). So only support physical addresses up to page granularity for now.
539  */
540 static int mce_usable_address(struct mce *m)
541 {
542         if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
543                 return 0;
544 
545         /* Checks after this one are Intel-specific: */
546         if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
547                 return 1;
548 
549         if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
550                 return 0;
551         if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
552                 return 0;
553         return 1;
554 }
555 
556 static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
557                                 void *data)
558 {
559         struct mce *mce = (struct mce *)data;
560         unsigned long pfn;
561 
562         if (!mce)
563                 return NOTIFY_DONE;
564 
565         if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
566                 pfn = mce->addr >> PAGE_SHIFT;
567                 memory_failure(pfn, MCE_VECTOR, 0);
568         }
569 
570         return NOTIFY_OK;
571 }
572 static struct notifier_block mce_srao_nb = {
573         .notifier_call  = srao_decode_notifier,
574         .priority       = MCE_PRIO_SRAO,
575 };
576 
577 static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
578                                 void *data)
579 {
580         struct mce *m = (struct mce *)data;
581 
582         if (!m)
583                 return NOTIFY_DONE;
584 
585         /*
586          * Run the default notifier if only the SRAO notifier and
587          * this one are registered.
588          */
589         if (atomic_read(&num_notifiers) > 2)
590                 return NOTIFY_DONE;
591 
592         /* Don't print when mcelog is running */
593         if (mce_chrdev_open_count > 0)
594                 return NOTIFY_DONE;
595 
596         __print_mce(m);
597 
598         return NOTIFY_DONE;
599 }
600 
601 static struct notifier_block mce_default_nb = {
602         .notifier_call  = mce_default_notifier,
603         /* lowest prio, we want it to run last. */
604         .priority       = MCE_PRIO_LOWEST,
605 };
606 
607 /*
608  * Read ADDR and MISC registers.
609  */
610 static void mce_read_aux(struct mce *m, int i)
611 {
612         if (m->status & MCI_STATUS_MISCV)
613                 m->misc = mce_rdmsrl(msr_ops.misc(i));
614 
615         if (m->status & MCI_STATUS_ADDRV) {
616                 m->addr = mce_rdmsrl(msr_ops.addr(i));
617 
618                 /*
619                  * Mask the reported address by the reported granularity.
620                  */
621                 if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
622                         u8 shift = MCI_MISC_ADDR_LSB(m->misc);
623                         m->addr >>= shift;
624                         m->addr <<= shift;
625                 }
626 
627                 /*
628                  * Extract [55:<lsb>] where lsb is the least significant
629                  * *valid* bit of the address bits.
630                  */
631                 if (mce_flags.smca) {
632                         u8 lsb = (m->addr >> 56) & 0x3f;
633 
634                         m->addr &= GENMASK_ULL(55, lsb);
635                 }
636         }
637 
638         if (mce_flags.smca) {
639                 m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
640 
641                 if (m->status & MCI_STATUS_SYNDV)
642                         m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
643         }
644 }
645 
646 bool mce_is_memory_error(struct mce *m)
647 {
648         if (m->cpuvendor == X86_VENDOR_AMD) {
649                 /* ErrCodeExt[20:16] */
650                 u8 xec = (m->status >> 16) & 0x1f;
651 
652                 return (xec == 0x0 || xec == 0x8);
653         } else if (m->cpuvendor == X86_VENDOR_INTEL) {
654                 /*
655                  * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
656                  *
657                  * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
658                  * indicating a memory error. Bit 8 is used for indicating a
659                  * cache hierarchy error. The combination of bit 2 and bit 3
660                  * is used for indicating a `generic' cache hierarchy error
661                  * But we can't just blindly check the above bits, because if
662                  * bit 11 is set, then it is a bus/interconnect error - and
663                  * either way the above bits just gives more detail on what
664                  * bus/interconnect error happened. Note that bit 12 can be
665                  * ignored, as it's the "filter" bit.
666                  */
667                 return (m->status & 0xef80) == BIT(7) ||
668                        (m->status & 0xef00) == BIT(8) ||
669                        (m->status & 0xeffc) == 0xc;
670         }
671 
672         return false;
673 }
674 EXPORT_SYMBOL_GPL(mce_is_memory_error);
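/*
 * Worked example for the Intel branch above (values illustrative): a status
 * whose MCACOD is 0x009f (binary 1001 1111) has bit 7 set and bits 15:13 and
 * 11:8 clear, so (status & 0xef80) == BIT(7) matches and the error is treated
 * as a memory error, while a bus/interconnect code such as 0x0e0b (bit 11
 * set) does not match.
 */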
675 
676 DEFINE_PER_CPU(unsigned, mce_poll_count);
677 
678 /*
679  * Poll for corrected events or events that happened before reset.
680  * Those are just logged through /dev/mcelog.
681  *
682  * This is executed in standard interrupt context.
683  *
684  * Note: the spec recommends panicking for fatal unsignalled
685  * errors here. However, this would be quite problematic --
686  * we would need to reimplement the Monarch handling and
687  * it would mess up the exclusion between the exception handler
688  * and the poll handler -- so we skip this for now.
689  * These cases should not happen anyway, or only when the CPU
690  * is already totally confused. In that case it's likely it will
691  * not fully execute the machine check handler either.
692  */
693 bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
694 {
695         bool error_seen = false;
696         struct mce m;
697         int severity;
698         int i;
699 
700         this_cpu_inc(mce_poll_count);
701 
702         mce_gather_info(&m, NULL);
703 
704         if (flags & MCP_TIMESTAMP)
705                 m.tsc = rdtsc();
706 
707         for (i = 0; i < mca_cfg.banks; i++) {
708                 if (!mce_banks[i].ctl || !test_bit(i, *b))
709                         continue;
710 
711                 m.misc = 0;
712                 m.addr = 0;
713                 m.bank = i;
714 
715                 barrier();
716                 m.status = mce_rdmsrl(msr_ops.status(i));
717                 if (!(m.status & MCI_STATUS_VAL))
718                         continue;
719 
720                 /*
721                  * Uncorrected or signalled events are handled by the exception
722                  * handler when it is enabled, so don't process those here.
723                  *
724                  * TBD do the same check for MCI_STATUS_EN here?
725                  */
726                 if (!(flags & MCP_UC) &&
727                     (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
728                         continue;
729 
730                 error_seen = true;
731 
732                 mce_read_aux(&m, i);
733 
734                 severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
735 
736                 if (severity == MCE_DEFERRED_SEVERITY && mce_is_memory_error(&m))
737                         if (m.status & MCI_STATUS_ADDRV)
738                                 m.severity = severity;
739 
740                 /*
741                  * Don't get the IP here because it's unlikely to
742                  * have anything to do with the actual error location.
743                  */
744                 if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
745                         mce_log(&m);
746                 else if (mce_usable_address(&m)) {
747                         /*
748                          * Although we skipped logging this, we still want
749                          * to take action. Add to the pool so the registered
750                          * notifiers will see it.
751                          */
752                         if (!mce_gen_pool_add(&m))
753                                 mce_schedule_work();
754                 }
755 
756                 /*
757                  * Clear state for this bank.
758                  */
759                 mce_wrmsrl(msr_ops.status(i), 0);
760         }
761 
762         /*
763          * Don't clear MCG_STATUS here because it's only defined for
764          * exceptions.
765          */
766 
767         sync_core();
768 
769         return error_seen;
770 }
771 EXPORT_SYMBOL_GPL(machine_check_poll);
772 
773 /*
774  * Do a quick check if any of the events requires a panic.
775  * This decides if we keep the events around or clear them.
776  */
777 static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
778                           struct pt_regs *regs)
779 {
780         int i, ret = 0;
781         char *tmp;
782 
783         for (i = 0; i < mca_cfg.banks; i++) {
784                 m->status = mce_rdmsrl(msr_ops.status(i));
785                 if (m->status & MCI_STATUS_VAL) {
786                         __set_bit(i, validp);
787                         if (quirk_no_way_out)
788                                 quirk_no_way_out(i, m, regs);
789                 }
790 
791                 if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
792                         *msg = tmp;
793                         ret = 1;
794                 }
795         }
796         return ret;
797 }
798 
799 /*
800  * Variable to establish order between CPUs while scanning.
801  * Each CPU spins initially until mce_executing is equal to its number.
802  */
803 static atomic_t mce_executing;
804 
805 /*
806  * Defines order of CPUs on entry. First CPU becomes Monarch.
807  */
808 static atomic_t mce_callin;
809 
810 /*
811  * Check if a timeout waiting for other CPUs happened.
812  */
813 static int mce_timed_out(u64 *t, const char *msg)
814 {
815         /*
816          * Some other CPU already panicked for some reason.
817          * Bail out as if we had timed out.
818          * rmb() to tell the compiler that system_state
819          * might have been modified by someone else.
820          */
821         rmb();
822         if (atomic_read(&mce_panicked))
823                 wait_for_panic();
824         if (!mca_cfg.monarch_timeout)
825                 goto out;
826         if ((s64)*t < SPINUNIT) {
827                 if (mca_cfg.tolerant <= 1)
828                         mce_panic(msg, NULL, NULL);
829                 cpu_missing = 1;
830                 return 1;
831         }
832         *t -= SPINUNIT;
833 out:
834         touch_nmi_watchdog();
835         return 0;
836 }
837 
838 /*
839  * The Monarch's reign.  The Monarch is the CPU who entered
840  * the machine check handler first. It waits for the others to
841  * raise the exception too and then grades them. If any
842  * error is fatal, it panics. Only then does it let the others continue.
843  *
844  * The other CPUs entering the MCE handler will be controlled by the
845  * Monarch. They are called Subjects.
846  *
847  * This way we prevent any potential data corruption in an unrecoverable case
848  * and also make sure that all CPUs' errors are examined.
849  *
850  * This also detects the case of a machine check event coming from outer
851  * space (not detected by any CPU). In this case some external agent wants
852  * us to shut down, so panic too.
853  *
854  * The other CPUs might still decide to panic if the handler happens
855  * in an unrecoverable place, but in this case the system is in a semi-stable
856  * state and won't corrupt anything by itself. It's ok to let the others
857  * continue for a bit first.
858  *
859  * All the spin loops have timeouts; when a timeout happens a CPU
860  * typically elects itself to be Monarch.
861  */
862 static void mce_reign(void)
863 {
864         int cpu;
865         struct mce *m = NULL;
866         int global_worst = 0;
867         char *msg = NULL;
868         char *nmsg = NULL;
869 
870         /*
871          * This CPU is the Monarch and the other CPUs have run
872          * through their handlers.
873          * Grade the severity of the errors of all the CPUs.
874          */
875         for_each_possible_cpu(cpu) {
876                 int severity = mce_severity(&per_cpu(mces_seen, cpu),
877                                             mca_cfg.tolerant,
878                                             &nmsg, true);
879                 if (severity > global_worst) {
880                         msg = nmsg;
881                         global_worst = severity;
882                         m = &per_cpu(mces_seen, cpu);
883                 }
884         }
885 
886         /*
887          * Cannot recover? Panic here then.
888          * This dumps all the mces in the log buffer and stops the
889          * other CPUs.
890          */
891         if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
892                 mce_panic("Fatal machine check", m, msg);
893 
894         /*
895          * For a UC error somewhere, we let the CPU that detected it handle it.
896          * We must also let the others continue, otherwise the handling
897          * CPU could deadlock on a lock.
898          */
899 
900         /*
901          * No machine check event found. Must be some external
902          * source or one CPU is hung. Panic.
903          */
904         if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
905                 mce_panic("Fatal machine check from unknown source", NULL, NULL);
906 
907         /*
908          * Now clear all the mces_seen so that they don't reappear on
909          * the next mce.
910          */
911         for_each_possible_cpu(cpu)
912                 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
913 }
914 
915 static atomic_t global_nwo;
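/*
 * Shape of the rendezvous implemented by mce_start()/mce_end() below: every
 * CPU bumps mce_callin and waits until all online CPUs have arrived; the
 * first caller (order == 1) becomes the Monarch and scans its banks first;
 * each Subject is then released in callin order as the previous CPU's
 * mce_end() advances mce_executing; finally the Monarch grades everything in
 * mce_reign() above and resets the counters, releasing the Subjects.
 */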
916 
917 /*
918  * Start of Monarch synchronization. This waits until all CPUs have
919  * entered the exception handler and then determines if any of them
920  * saw a fatal event that requires panic. Then it executes them
921  * in the entry order.
922  * TBD double check parallel CPU hotunplug
923  */
924 static int mce_start(int *no_way_out)
925 {
926         int order;
927         int cpus = num_online_cpus();
928         u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
929 
930         if (!timeout)
931                 return -1;
932 
933         atomic_add(*no_way_out, &global_nwo);
934         /*
935          * Rely on the implied barrier below, such that global_nwo
936          * is updated before mce_callin.
937          */
938         order = atomic_inc_return(&mce_callin);
939 
940         /*
941          * Wait for everyone.
942          */
943         while (atomic_read(&mce_callin) != cpus) {
944                 if (mce_timed_out(&timeout,
945                                   "Timeout: Not all CPUs entered broadcast exception handler")) {
946                         atomic_set(&global_nwo, 0);
947                         return -1;
948                 }
949                 ndelay(SPINUNIT);
950         }
951 
952         /*
953          * mce_callin should be read before global_nwo
954          */
955         smp_rmb();
956 
957         if (order == 1) {
958                 /*
959                  * Monarch: Starts executing now, the others wait.
960                  */
961                 atomic_set(&mce_executing, 1);
962         } else {
963                 /*
964                  * Subject: Now start the scanning loop one by one in
965                  * the original callin order.
966                  * This way, when there are shared banks, an error will be
967                  * seen by only one CPU before being cleared, avoiding duplicates.
968                  */
969                 while (atomic_read(&mce_executing) < order) {
970                         if (mce_timed_out(&timeout,
971                                           "Timeout: Subject CPUs unable to finish machine check processing")) {
972                                 atomic_set(&global_nwo, 0);
973                                 return -1;
974                         }
975                         ndelay(SPINUNIT);
976                 }
977         }
978 
979         /*
980          * Cache the global no_way_out state.
981          */
982         *no_way_out = atomic_read(&global_nwo);
983 
984         return order;
985 }
986 
987 /*
988  * Synchronize between CPUs after main scanning loop.
989  * This invokes the bulk of the Monarch processing.
990  */
991 static int mce_end(int order)
992 {
993         int ret = -1;
994         u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
995 
996         if (!timeout)
997                 goto reset;
998         if (order < 0)
999                 goto reset;
1000 
1001         /*
1002          * Allow others to run.
1003          */
1004         atomic_inc(&mce_executing);
1005 
1006         if (order == 1) {
1007                 /* CHECKME: Can this race with a parallel hotplug? */
1008                 int cpus = num_online_cpus();
1009 
1010                 /*
1011                  * Monarch: Wait for everyone to go through their scanning
1012                  * loops.
1013                  */
1014                 while (atomic_read(&mce_executing) <= cpus) {
1015                         if (mce_timed_out(&timeout,
1016                                           "Timeout: Monarch CPU unable to finish machine check processing"))
1017                                 goto reset;
1018                         ndelay(SPINUNIT);
1019                 }
1020 
1021                 mce_reign();
1022                 barrier();
1023                 ret = 0;
1024         } else {
1025                 /*
1026                  * Subject: Wait for Monarch to finish.
1027                  */
1028                 while (atomic_read(&mce_executing) != 0) {
1029                         if (mce_timed_out(&timeout,
1030                                           "Timeout: Monarch CPU did not finish machine check processing"))
1031                                 goto reset;
1032                         ndelay(SPINUNIT);
1033                 }
1034 
1035                 /*
1036                  * Don't reset anything. That's done by the Monarch.
1037                  */
1038                 return 0;
1039         }
1040 
1041         /*
1042          * Reset all global state.
1043          */
1044 reset:
1045         atomic_set(&global_nwo, 0);
1046         atomic_set(&mce_callin, 0);
1047         barrier();
1048 
1049         /*
1050          * Let others run again.
1051          */
1052         atomic_set(&mce_executing, 0);
1053         return ret;
1054 }
1055 
1056 static void mce_clear_state(unsigned long *toclear)
1057 {
1058         int i;
1059 
1060         for (i = 0; i < mca_cfg.banks; i++) {
1061                 if (test_bit(i, toclear))
1062                         mce_wrmsrl(msr_ops.status(i), 0);
1063         }
1064 }
1065 
1066 static int do_memory_failure(struct mce *m)
1067 {
1068         int flags = MF_ACTION_REQUIRED;
1069         int ret;
1070 
1071         pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
1072         if (!(m->mcgstatus & MCG_STATUS_RIPV))
1073                 flags |= MF_MUST_KILL;
1074         ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags);
1075         if (ret)
1076                 pr_err("Memory error not recovered");
1077         return ret;
1078 }
1079 
1080 /*
1081  * The actual machine check handler. This only handles real
1082  * exceptions when something got corrupted coming in through int 18.
1083  *
1084  * This is executed in NMI context not subject to normal locking rules. This
1085  * implies that most kernel services cannot be safely used. Don't even
1086  * think about putting a printk in there!
1087  *
1088  * On Intel systems this is entered on all CPUs in parallel through
1089  * MCE broadcast. However some CPUs might be broken beyond repair,
1090  * so always be careful when synchronizing with others.
1091  */
1092 void do_machine_check(struct pt_regs *regs, long error_code)
1093 {
1094         struct mca_config *cfg = &mca_cfg;
1095         struct mce m, *final;
1096         int i;
1097         int worst = 0;
1098         int severity;
1099 
1100         /*
1101          * Establish sequential order between the CPUs entering the machine
1102          * check handler.
1103          */
1104         int order = -1;
1105         /*
1106          * If no_way_out gets set, there is no safe way to recover from this
1107          * MCE.  If mca_cfg.tolerant is cranked up, we'll try anyway.
1108          */
1109         int no_way_out = 0;
1110         /*
1111          * If kill_it gets set, there might be a way to recover from this
1112          * error.
1113          */
1114         int kill_it = 0;
1115         DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1116         DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1117         char *msg = "Unknown";
1118 
1119         /*
1120          * MCEs are always local on AMD. On Intel the same is indicated
1121          * by MCG_STATUS_LMCES.
1122          */
1123         int lmce = 1;
1124 
1125         /* If this CPU is offline, just bail out. */
1126         if (cpu_is_offline(smp_processor_id())) {
1127                 u64 mcgstatus;
1128 
1129                 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
1130                 if (mcgstatus & MCG_STATUS_RIPV) {
1131                         mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1132                         return;
1133                 }
1134         }
1135 
1136         ist_enter(regs);
1137 
1138         this_cpu_inc(mce_exception_count);
1139 
1140         if (!cfg->banks)
1141                 goto out;
1142 
1143         mce_gather_info(&m, regs);
1144         m.tsc = rdtsc();
1145 
1146         final = this_cpu_ptr(&mces_seen);
1147         *final = m;
1148 
1149         memset(valid_banks, 0, sizeof(valid_banks));
1150         no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
1151 
1152         barrier();
1153 
1154         /*
1155          * If there is no valid restart IP we might need to kill the
1156          * process or panic. Assume the worst for now, but if we find the
1157          * severity is MCE_AR_SEVERITY we have other options.
1158          */
1159         if (!(m.mcgstatus & MCG_STATUS_RIPV))
1160                 kill_it = 1;
1161 
1162         /*
1163          * Check if this MCE is signaled to only this logical processor,
1164          * on Intel only.
1165          */
1166         if (m.cpuvendor == X86_VENDOR_INTEL)
1167                 lmce = m.mcgstatus & MCG_STATUS_LMCES;
1168 
1169         /*
1170          * Go through all banks in exclusion of the other CPUs. This way we
1171          * don't report duplicated events on shared banks because the first one
1172          * to see it will clear it. If this is a Local MCE, then no need to
1173          * perform rendezvous.
1174          */
1175         if (!lmce)
1176                 order = mce_start(&no_way_out);
1177 
1178         for (i = 0; i < cfg->banks; i++) {
1179                 __clear_bit(i, toclear);
1180                 if (!test_bit(i, valid_banks))
1181                         continue;
1182                 if (!mce_banks[i].ctl)
1183                         continue;
1184 
1185                 m.misc = 0;
1186                 m.addr = 0;
1187                 m.bank = i;
1188 
1189                 m.status = mce_rdmsrl(msr_ops.status(i));
1190                 if ((m.status & MCI_STATUS_VAL) == 0)
1191                         continue;
1192 
1193                 /*
1194                  * Corrected or non-signaled errors are handled by
1195                  * machine_check_poll(). Leave them alone, unless this panics.
1196                  */
1197                 if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1198                         !no_way_out)
1199                         continue;
1200 
1201                 /*
1202                  * Set taint even when machine check was not enabled.
1203                  */
1204                 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1205 
1206                 severity = mce_severity(&m, cfg->tolerant, NULL, true);
1207 
1208                 /*
1209                  * When the machine check was for a corrected/deferred error,
1210                  * don't touch it here, unless we're panicking.
1211                  */
1212                 if ((severity == MCE_KEEP_SEVERITY ||
1213                      severity == MCE_UCNA_SEVERITY) && !no_way_out)
1214                         continue;
1215                 __set_bit(i, toclear);
1216                 if (severity == MCE_NO_SEVERITY) {
1217                         /*
1218                          * Machine check event was not enabled. Clear, but
1219                          * ignore.
1220                          */
1221                         continue;
1222                 }
1223 
1224                 mce_read_aux(&m, i);
1225 
1226                 /* assuming valid severity level != 0 */
1227                 m.severity = severity;
1228 
1229                 mce_log(&m);
1230 
1231                 if (severity > worst) {
1232                         *final = m;
1233                         worst = severity;
1234                 }
1235         }
1236 
1237         /* mce_clear_state will clear *final, save locally for use later */
1238         m = *final;
1239 
1240         if (!no_way_out)
1241                 mce_clear_state(toclear);
1242 
1243         /*
1244          * Do most of the synchronization with other CPUs.
1245          * When there's any problem use only local no_way_out state.
1246          */
1247         if (!lmce) {
1248                 if (mce_end(order) < 0)
1249                         no_way_out = worst >= MCE_PANIC_SEVERITY;
1250         } else {
1251                 /*
1252                  * A local MCE skipped calling mce_reign().
1253                  * If we found a fatal error, we need to panic here.
1254                  */
1255                  if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
1256                         mce_panic("Machine check from unknown source",
1257                                 NULL, NULL);
1258         }
1259 
1260         /*
1261          * If tolerant is at an insane level we drop requests to kill
1262          * processes and continue even when there is no way out.
1263          */
1264         if (cfg->tolerant == 3)
1265                 kill_it = 0;
1266         else if (no_way_out)
1267                 mce_panic("Fatal machine check on current CPU", &m, msg);
1268 
1269         if (worst > 0)
1270                 mce_report_event(regs);
1271         mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1272 out:
1273         sync_core();
1274 
1275         if (worst != MCE_AR_SEVERITY && !kill_it)
1276                 goto out_ist;
1277 
1278         /* Fault was in user mode and we need to take some action */
1279         if ((m.cs & 3) == 3) {
1280                 ist_begin_non_atomic(regs);
1281                 local_irq_enable();
1282 
1283                 if (kill_it || do_memory_failure(&m))
1284                         force_sig(SIGBUS, current);
1285                 local_irq_disable();
1286                 ist_end_non_atomic();
1287         } else {
1288                 if (!fixup_exception(regs, X86_TRAP_MC))
1289                         mce_panic("Failed kernel mode recovery", &m, NULL);
1290         }
1291 
1292 out_ist:
1293         ist_exit(regs);
1294 }
1295 EXPORT_SYMBOL_GPL(do_machine_check);
1296 
1297 #ifndef CONFIG_MEMORY_FAILURE
1298 int memory_failure(unsigned long pfn, int vector, int flags)
1299 {
1300         /* mce_severity() should not hand us an ACTION_REQUIRED error */
1301         BUG_ON(flags & MF_ACTION_REQUIRED);
1302         pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1303                "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1304                pfn);
1305 
1306         return 0;
1307 }
1308 #endif
1309 
1310 /*
1311  * Periodic polling timer for "silent" machine check errors.  If the
1312  * poller finds an MCE, poll 2x faster.  When the poller finds no more
1313  * errors, poll 2x slower (up to check_interval seconds).
1314  */
1315 static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
1316 
1317 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
1318 static DEFINE_PER_CPU(struct timer_list, mce_timer);
1319 
1320 static unsigned long mce_adjust_timer_default(unsigned long interval)
1321 {
1322         return interval;
1323 }
1324 
1325 static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
1326 
1327 static void __start_timer(struct timer_list *t, unsigned long interval)
1328 {
1329         unsigned long when = jiffies + interval;
1330         unsigned long flags;
1331 
1332         local_irq_save(flags);
1333 
1334         if (!timer_pending(t) || time_before(when, t->expires))
1335                 mod_timer(t, round_jiffies(when));
1336 
1337         local_irq_restore(flags);
1338 }
1339 
1340 static void mce_timer_fn(unsigned long data)
1341 {
1342         struct timer_list *t = this_cpu_ptr(&mce_timer);
1343         int cpu = smp_processor_id();
1344         unsigned long iv;
1345 
1346         WARN_ON(cpu != data);
1347 
1348         iv = __this_cpu_read(mce_next_interval);
1349 
1350         if (mce_available(this_cpu_ptr(&cpu_info))) {
1351                 machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
1352 
1353                 if (mce_intel_cmci_poll()) {
1354                         iv = mce_adjust_timer(iv);
1355                         goto done;
1356                 }
1357         }
1358 
1359         /*
1360          * Alert userspace if needed. If we logged an MCE, reduce the polling
1361          * interval, otherwise increase the polling interval.
1362          */
1363         if (mce_notify_irq())
1364                 iv = max(iv / 2, (unsigned long) HZ/100);
1365         else
1366                 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1367 
1368 done:
1369         __this_cpu_write(mce_next_interval, iv);
1370         __start_timer(t, iv);
1371 }
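/*
 * Worked example of the adaptive interval above: after a logged event the
 * next poll fires in iv/2 jiffies, but never sooner than HZ/100 (about 10ms);
 * on quiet runs the interval doubles each time, capped at check_interval
 * seconds (INITIAL_CHECK_INTERVAL, commonly 5 minutes).
 */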
1372 
1373 /*
1374  * Ensure that the timer is firing in @interval from now.
1375  */
1376 void mce_timer_kick(unsigned long interval)
1377 {
1378         struct timer_list *t = this_cpu_ptr(&mce_timer);
1379         unsigned long iv = __this_cpu_read(mce_next_interval);
1380 
1381         __start_timer(t, interval);
1382 
1383         if (interval < iv)
1384                 __this_cpu_write(mce_next_interval, interval);
1385 }
1386 
1387 /* Must not be called in IRQ context where del_timer_sync() can deadlock */
1388 static void mce_timer_delete_all(void)
1389 {
1390         int cpu;
1391 
1392         for_each_online_cpu(cpu)
1393                 del_timer_sync(&per_cpu(mce_timer, cpu));
1394 }
1395 
1396 static void mce_do_trigger(struct work_struct *work)
1397 {
1398         call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
1399 }
1400 
1401 static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1402 
1403 /*
1404  * Notify the user(s) about new machine check events.
1405  * Can be called from interrupt context, but not from machine check/NMI
1406  * context.
1407  */
1408 int mce_notify_irq(void)
1409 {
1410         /* Not more than two messages every minute */
1411         static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1412 
1413         if (test_and_clear_bit(0, &mce_need_notify)) {
1414                 /* wake processes polling /dev/mcelog */
1415                 wake_up_interruptible(&mce_chrdev_wait);
1416 
1417                 if (mce_helper[0])
1418                         schedule_work(&mce_trigger_work);
1419 
1420                 if (__ratelimit(&ratelimit))
1421                         pr_info(HW_ERR "Machine check events logged\n");
1422 
1423                 return 1;
1424         }
1425         return 0;
1426 }
1427 EXPORT_SYMBOL_GPL(mce_notify_irq);
1428 
1429 static int __mcheck_cpu_mce_banks_init(void)
1430 {
1431         int i;
1432         u8 num_banks = mca_cfg.banks;
1433 
1434         mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
1435         if (!mce_banks)
1436                 return -ENOMEM;
1437 
1438         for (i = 0; i < num_banks; i++) {
1439                 struct mce_bank *b = &mce_banks[i];
1440 
1441                 b->ctl = -1ULL;
1442                 b->init = 1;
1443         }
1444         return 0;
1445 }
1446 
1447 /*
1448  * Initialize Machine Checks for a CPU.
1449  */
1450 static int __mcheck_cpu_cap_init(void)
1451 {
1452         unsigned b;
1453         u64 cap;
1454 
1455         rdmsrl(MSR_IA32_MCG_CAP, cap);
1456 
1457         b = cap & MCG_BANKCNT_MASK;
1458         if (!mca_cfg.banks)
1459                 pr_info("CPU supports %d MCE banks\n", b);
1460 
1461         if (b > MAX_NR_BANKS) {
1462                 pr_warn("Using only %u machine check banks out of %u\n",
1463                         MAX_NR_BANKS, b);
1464                 b = MAX_NR_BANKS;
1465         }
1466 
1467         /* Don't support asymmetric configurations today */
1468         WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
1469         mca_cfg.banks = b;
1470 
1471         if (!mce_banks) {
1472                 int err = __mcheck_cpu_mce_banks_init();
1473 
1474                 if (err)
1475                         return err;
1476         }
1477 
1478         /* Use accurate RIP reporting if available. */
1479         if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1480                 mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
1481 
1482         if (cap & MCG_SER_P)
1483                 mca_cfg.ser = true;
1484 
1485         return 0;
1486 }
1487 
1488 static void __mcheck_cpu_init_generic(void)
1489 {
1490         enum mcp_flags m_fl = 0;
1491         mce_banks_t all_banks;
1492         u64 cap;
1493 
1494         if (!mca_cfg.bootlog)
1495                 m_fl = MCP_DONTLOG;
1496 
1497         /*
1498          * Log the machine checks left over from the previous reset.
1499          */
1500         bitmap_fill(all_banks, MAX_NR_BANKS);
1501         machine_check_poll(MCP_UC | m_fl, &all_banks);
1502 
1503         cr4_set_bits(X86_CR4_MCE);
1504 
1505         rdmsrl(MSR_IA32_MCG_CAP, cap);
1506         if (cap & MCG_CTL_P)
1507                 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1508 }
1509 
1510 static void __mcheck_cpu_init_clear_banks(void)
1511 {
1512         int i;
1513 
1514         for (i = 0; i < mca_cfg.banks; i++) {
1515                 struct mce_bank *b = &mce_banks[i];
1516 
1517                 if (!b->init)
1518                         continue;
1519                 wrmsrl(msr_ops.ctl(i), b->ctl);
1520                 wrmsrl(msr_ops.status(i), 0);
1521         }
1522 }
1523 
1524 /*
1525  * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
1526  * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
1527  * Vol 3B Table 15-20). But this confuses both the code that determines
1528  * whether the machine check occurred in kernel or user mode, and also
1529  * the severity assessment code. Pretend that EIPV was set, and take the
1530  * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
1531  */
1532 static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
1533 {
1534         if (bank != 0)
1535                 return;
1536         if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
1537                 return;
1538         if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
1539                           MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
1540                           MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
1541                           MCACOD)) !=
1542                          (MCI_STATUS_UC|MCI_STATUS_EN|
1543                           MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
1544                           MCI_STATUS_AR|MCACOD_INSTR))
1545                 return;
1546 
1547         m->mcgstatus |= MCG_STATUS_EIPV;
1548         m->ip = regs->ip;
1549         m->cs = regs->cs;
1550 }
1551 
1552 /* Add per CPU specific workarounds here */
1553 static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1554 {
1555         struct mca_config *cfg = &mca_cfg;
1556 
1557         if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1558                 pr_info("unknown CPU type - not enabling MCE support\n");
1559                 return -EOPNOTSUPP;
1560         }
1561 
1562         /* This should be disabled by the BIOS, but isn't always */
1563         if (c->x86_vendor == X86_VENDOR_AMD) {
1564                 if (c->x86 == 15 && cfg->banks > 4) {
1565                         /*
1566                          * disable GART TBL walk error reporting, which
1567                          * trips off incorrectly with the IOMMU & 3ware
1568                          * & Cerberus:
1569                          */
1570                         clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1571                 }
1572                 if (c->x86 < 17 && cfg->bootlog < 0) {
1573                         /*
1574                          * Lots of broken BIOSes around don't clear the MCA
1575                          * banks by default and leave stale junk in them. Don't log:
1576                          */
1577                         cfg->bootlog = 0;
1578                 }
1579                 /*
1580                  * Various K7s in the field have a broken bank 0. Always
1581                  * disable it by default.
1582                  */
1583                 if (c->x86 == 6 && cfg->banks > 0)
1584                         mce_banks[0].ctl = 0;
1585 
1586                 /*
1587                  * overflow_recov is supported for F15h Models 00h-0fh
1588                  * even though we don't have a CPUID bit for it.
1589                  */
1590                 if (c->x86 == 0x15 && c->x86_model <= 0xf)
1591                         mce_flags.overflow_recov = 1;
1592 
1593                 /*
1594                  * Turn off MC4_MISC thresholding banks on those models since
1595                  * they're not supported there.
1596                  */
1597                 if (c->x86 == 0x15 &&
1598                     (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
1599                         int i;
1600                         u64 hwcr;
1601                         bool need_toggle;
1602                         u32 msrs[] = {
1603                                 0x00000413, /* MC4_MISC0 */
1604                                 0xc0000408, /* MC4_MISC1 */
1605                         };
1606 
1607                         rdmsrl(MSR_K7_HWCR, hwcr);
1608 
1609                         /* McStatusWrEn has to be set */
1610                         need_toggle = !(hwcr & BIT(18));
1611 
1612                         if (need_toggle)
1613                                 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
1614 
1615                         /* Clear CntP bit safely */
1616                         for (i = 0; i < ARRAY_SIZE(msrs); i++)
1617                                 msr_clear_bit(msrs[i], 62);
1618 
1619                         /* restore old settings */
1620                         if (need_toggle)
1621                                 wrmsrl(MSR_K7_HWCR, hwcr);
1622                 }
1623         }
1624 
1625         if (c->x86_vendor == X86_VENDOR_INTEL) {
1626                 /*
1627                  * The SDM documents that on family 6, bank 0 should not be
1628                  * written because it aliases to another special,
1629                  * BIOS-controlled register.
1630                  * However, it is no longer aliased as of model 0x1a+.
1631                  * Don't ignore bank 0 completely because there could be a
1632                  * valid event later; merely don't write CTL0.
1633                  */
1634 
1635                 if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
1636                         mce_banks[0].init = 0;
1637 
1638                 /*
1639                  * All newer Intel systems support MCE broadcasting. Enable
1640                  * synchronization with a one second timeout.
1641                  */
1642                 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1643                         cfg->monarch_timeout < 0)
1644                         cfg->monarch_timeout = USEC_PER_SEC;
1645 
1646                 /*
1647                  * There are also broken BIOSes on some Pentium M and
1648                  * earlier systems:
1649                  */
1650                 if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
1651                         cfg->bootlog = 0;
1652 
1653                 if (c->x86 == 6 && c->x86_model == 45)
1654                         quirk_no_way_out = quirk_sandybridge_ifu;
1655         }
1656         if (cfg->monarch_timeout < 0)
1657                 cfg->monarch_timeout = 0;
1658         if (cfg->bootlog != 0)
1659                 cfg->panic_timeout = 30;
1660 
1661         return 0;
1662 }
1663 
1664 static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1665 {
1666         if (c->x86 != 5)
1667                 return 0;
1668 
1669         switch (c->x86_vendor) {
1670         case X86_VENDOR_INTEL:
1671                 intel_p5_mcheck_init(c);
1672                 return 1;
1674         case X86_VENDOR_CENTAUR:
1675                 winchip_mcheck_init(c);
1676                 return 1;
1678         default:
1679                 return 0;
1680         }
1681 
1682         return 0;
1683 }
1684 
1685 static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1686 {
1687         switch (c->x86_vendor) {
1688         case X86_VENDOR_INTEL:
1689                 mce_intel_feature_init(c);
1690                 mce_adjust_timer = cmci_intel_adjust_timer;
1691                 break;
1692 
1693         case X86_VENDOR_AMD: {
1694                 mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
1695                 mce_flags.succor         = !!cpu_has(c, X86_FEATURE_SUCCOR);
1696                 mce_flags.smca           = !!cpu_has(c, X86_FEATURE_SMCA);
1697 
1698                 /*
1699                  * Install proper ops for Scalable MCA enabled processors
1700                  */
1701                 if (mce_flags.smca) {
1702                         msr_ops.ctl     = smca_ctl_reg;
1703                         msr_ops.status  = smca_status_reg;
1704                         msr_ops.addr    = smca_addr_reg;
1705                         msr_ops.misc    = smca_misc_reg;
1706                 }
1707                 mce_amd_feature_init(c);
1708 
1709                 break;
1710                 }
1711 
1712         default:
1713                 break;
1714         }
1715 }
1716 
1717 static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
1718 {
1719         switch (c->x86_vendor) {
1720         case X86_VENDOR_INTEL:
1721                 mce_intel_feature_clear(c);
1722                 break;
1723         default:
1724                 break;
1725         }
1726 }
1727 
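     /*
      * Arm the per-CPU polling timer, unless corrected errors are being
      * ignored or polling is disabled (check_interval == 0).
      */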
1728 static void mce_start_timer(struct timer_list *t)
1729 {
1730         unsigned long iv = check_interval * HZ;
1731 
1732         if (mca_cfg.ignore_ce || !iv)
1733                 return;
1734 
1735         this_cpu_write(mce_next_interval, iv);
1736         __start_timer(t, iv);
1737 }
1738 
1739 static void __mcheck_cpu_setup_timer(void)
1740 {
1741         struct timer_list *t = this_cpu_ptr(&mce_timer);
1742         unsigned int cpu = smp_processor_id();
1743 
1744         setup_pinned_timer(t, mce_timer_fn, cpu);
1745 }
1746 
1747 static void __mcheck_cpu_init_timer(void)
1748 {
1749         struct timer_list *t = this_cpu_ptr(&mce_timer);
1750         unsigned int cpu = smp_processor_id();
1751 
1752         setup_pinned_timer(t, mce_timer_fn, cpu);
1753         mce_start_timer(t);
1754 }
1755 
1756 /* Handle unconfigured int18 (should never happen) */
1757 static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1758 {
1759         pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
1760                smp_processor_id());
1761 }
1762 
1763 /* Call the installed machine check handler for this CPU setup. */
1764 void (*machine_check_vector)(struct pt_regs *, long error_code) =
1765                                                 unexpected_machine_check;
1766 
1767 /*
1768  * Called for each booted CPU to set up machine checks.
1769  * Must be called with preempt off:
1770  */
1771 void mcheck_cpu_init(struct cpuinfo_x86 *c)
1772 {
1773         if (mca_cfg.disabled)
1774                 return;
1775 
1776         if (__mcheck_cpu_ancient_init(c))
1777                 return;
1778 
1779         if (!mce_available(c))
1780                 return;
1781 
1782         if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1783                 mca_cfg.disabled = true;
1784                 return;
1785         }
1786 
1787         if (mce_gen_pool_init()) {
1788                 mca_cfg.disabled = true;
1789                 pr_emerg("Couldn't allocate MCE records pool!\n");
1790                 return;
1791         }
1792 
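             /* From here on, #MC (int18) is handled by do_machine_check(). */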
1793         machine_check_vector = do_machine_check;
1794 
1795         __mcheck_cpu_init_generic();
1796         __mcheck_cpu_init_vendor(c);
1797         __mcheck_cpu_init_clear_banks();
1798         __mcheck_cpu_setup_timer();
1799 }
1800 
1801 /*
1802  * Called for each booted CPU to clear some machine check opt-ins.
1803  */
1804 void mcheck_cpu_clear(struct cpuinfo_x86 *c)
1805 {
1806         if (mca_cfg.disabled)
1807                 return;
1808 
1809         if (!mce_available(c))
1810                 return;
1811 
1812         /*
1813          * A generic x86-wide clear step, e.g. __mcheck_cpu_clear_generic(c),
1814          * could be added here if ever needed.
1815          */
1816         __mcheck_cpu_clear_vendor(c);
1817 
1818 }
1819 
1820 /*
1821  * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
1822  */
1823 
1824 static DEFINE_SPINLOCK(mce_chrdev_state_lock);
1825 static int mce_chrdev_open_exclu;       /* already open exclusive? */
1826 
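     /*
      * An exclusive (O_EXCL) open fails if the device is already open, and
      * any open fails while an exclusive opener holds the device.
      */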
1827 static int mce_chrdev_open(struct inode *inode, struct file *file)
1828 {
1829         spin_lock(&mce_chrdev_state_lock);
1830 
1831         if (mce_chrdev_open_exclu ||
1832             (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
1833                 spin_unlock(&mce_chrdev_state_lock);
1834 
1835                 return -EBUSY;
1836         }
1837 
1838         if (file->f_flags & O_EXCL)
1839                 mce_chrdev_open_exclu = 1;
1840         mce_chrdev_open_count++;
1841 
1842         spin_unlock(&mce_chrdev_state_lock);
1843 
1844         return nonseekable_open(inode, file);
1845 }
1846 
1847 static int mce_chrdev_release(struct inode *inode, struct file *file)
1848 {
1849         spin_lock(&mce_chrdev_state_lock);
1850 
1851         mce_chrdev_open_count--;
1852         mce_chrdev_open_exclu = 0;
1853 
1854         spin_unlock(&mce_chrdev_state_lock);
1855 
1856         return 0;
1857 }
1858 
1859 static void collect_tscs(void *data)
1860 {
1861         unsigned long *cpu_tsc = (unsigned long *)data;
1862 
1863         cpu_tsc[smp_processor_id()] = rdtsc();
1864 }
1865 
1866 static int mce_apei_read_done;
1867 
1868 /* Read an MCE record left over from a previous boot out of persistent storage via APEI ERST. */
1869 static int __mce_read_apei(char __user **ubuf, size_t usize)
1870 {
1871         int rc;
1872         u64 record_id;
1873         struct mce m;
1874 
1875         if (usize < sizeof(struct mce))
1876                 return -EINVAL;
1877 
1878         rc = apei_read_mce(&m, &record_id);
1879         /* Error or no more MCE records */
1880         if (rc <= 0) {
1881                 mce_apei_read_done = 1;
1882                 /*
1883                  * When ERST is disabled, mce_chrdev_read() should return
1884                  * "no record" instead of "no device."
1885                  */
1886                 if (rc == -ENODEV)
1887                         return 0;
1888                 return rc;
1889         }
1890         rc = -EFAULT;
1891         if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1892                 return rc;
1893         /*
1894          * Ideally the record would only be cleared after /sbin/mcelog has
1895          * flushed it to disk or sent it over the network, but there is no
1896          * interface for that yet, so just clear it now to avoid
1897          * duplication.
1898          */
1899         rc = apei_clear_mce(record_id);
1900         if (rc) {
1901                 mce_apei_read_done = 1;
1902                 return rc;
1903         }
1904         *ubuf += sizeof(struct mce);
1905 
1906         return 0;
1907 }
1908 
1909 static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
1910                                 size_t usize, loff_t *off)
1911 {
1912         char __user *buf = ubuf;
1913         unsigned long *cpu_tsc;
1914         unsigned prev, next;
1915         int i, err;
1916 
1917         cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1918         if (!cpu_tsc)
1919                 return -ENOMEM;
1920 
1921         mutex_lock(&mce_chrdev_read_mutex);
1922 
1923         if (!mce_apei_read_done) {
1924                 err = __mce_read_apei(&buf, usize);
1925                 if (err || buf != ubuf)
1926                         goto out;
1927         }
1928 
1929         next = mce_log_get_idx_check(mcelog.next);
1930 
1931         /* Only supports full reads right now */
1932         err = -EINVAL;
1933         if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
1934                 goto out;
1935 
1936         err = 0;
1937         prev = 0;
1938         do {
1939                 for (i = prev; i < next; i++) {
1940                         unsigned long start = jiffies;
1941                         struct mce *m = &mcelog.entry[i];
1942 
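                             /*
                              * Entries are published in two steps; give a slow
                              * writer up to two jiffies to set ->finished
                              * before giving up and skipping the entry.
                              */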
1943                         while (!m->finished) {
1944                                 if (time_after_eq(jiffies, start + 2)) {
1945                                         memset(m, 0, sizeof(*m));
1946                                         goto timeout;
1947                                 }
1948                                 cpu_relax();
1949                         }
1950                         smp_rmb();
1951                         err |= copy_to_user(buf, m, sizeof(*m));
1952                         buf += sizeof(*m);
1953 timeout:
1954                         ;
1955                 }
1956 
1957                 memset(mcelog.entry + prev, 0,
1958                        (next - prev) * sizeof(struct mce));
1959                 prev = next;
1960                 next = cmpxchg(&mcelog.next, prev, 0);
1961         } while (next != prev);
1962 
1963         synchronize_sched();
1964 
1965         /*
1966          * Collect entries that were still getting written before the
1967          * synchronize.
1968          */
1969         on_each_cpu(collect_tscs, cpu_tsc, 1);
1970 
1971         for (i = next; i < MCE_LOG_LEN; i++) {
1972                 struct mce *m = &mcelog.entry[i];
1973 
1974                 if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
1975                         err |= copy_to_user(buf, m, sizeof(*m));
1976                         smp_rmb();
1977                         buf += sizeof(*m);
1978                         memset(m, 0, sizeof(*m));
1979                 }
1980         }
1981 
1982         if (err)
1983                 err = -EFAULT;
1984 
1985 out:
1986         mutex_unlock(&mce_chrdev_read_mutex);
1987         kfree(cpu_tsc);
1988 
1989         return err ? err : buf - ubuf;
1990 }
1991 
1992 static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
1993 {
1994         poll_wait(file, &mce_chrdev_wait, wait);
1995         if (READ_ONCE(mcelog.next))
1996                 return POLLIN | POLLRDNORM;
1997         if (!mce_apei_read_done && apei_check_mce())
1998                 return POLLIN | POLLRDNORM;
1999         return 0;
2000 }
2001 
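     /*
      * Legacy mcelog ioctls: report the record and log sizes, and atomically
      * fetch-and-clear the log flags.
      */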
2002 static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
2003                                 unsigned long arg)
2004 {
2005         int __user *p = (int __user *)arg;
2006 
2007         if (!capable(CAP_SYS_ADMIN))
2008                 return -EPERM;
2009 
2010         switch (cmd) {
2011         case MCE_GET_RECORD_LEN:
2012                 return put_user(sizeof(struct mce), p);
2013         case MCE_GET_LOG_LEN:
2014                 return put_user(MCE_LOG_LEN, p);
2015         case MCE_GETCLEAR_FLAGS: {
2016                 unsigned flags;
2017 
2018                 do {
2019                         flags = mcelog.flags;
2020                 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
2021 
2022                 return put_user(flags, p);
2023         }
2024         default:
2025                 return -ENOTTY;
2026         }
2027 }
2028 
2029 static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
2030                             size_t usize, loff_t *off);
2031 
2032 void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
2033                              const char __user *ubuf,
2034                              size_t usize, loff_t *off))
2035 {
2036         mce_write = fn;
2037 }
2038 EXPORT_SYMBOL_GPL(register_mce_write_callback);
2039 
2040 static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
2041                                 size_t usize, loff_t *off)
2042 {
2043         if (mce_write)
2044                 return mce_write(filp, ubuf, usize, off);
2045         else
2046                 return -EINVAL;
2047 }
2048 
2049 static const struct file_operations mce_chrdev_ops = {
2050         .open                   = mce_chrdev_open,
2051         .release                = mce_chrdev_release,
2052         .read                   = mce_chrdev_read,
2053         .write                  = mce_chrdev_write,
2054         .poll                   = mce_chrdev_poll,
2055         .unlocked_ioctl         = mce_chrdev_ioctl,
2056         .llseek                 = no_llseek,
2057 };
2058 
2059 static struct miscdevice mce_chrdev_device = {
2060         MISC_MCELOG_MINOR,
2061         "mcelog",
2062         &mce_chrdev_ops,
2063 };
2064 
2065 static void __mce_disable_bank(void *arg)
2066 {
2067         int bank = *((int *)arg);
2068         __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
2069         cmci_disable_bank(bank);
2070 }
2071 
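     /* Stop both CMCI and polling of the given bank on every CPU. */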
2072 void mce_disable_bank(int bank)
2073 {
2074         if (bank >= mca_cfg.banks) {
2075                 pr_warn(FW_BUG
2076                         "Ignoring request to disable invalid MCA bank %d.\n",
2077                         bank);
2078                 return;
2079         }
2080         set_bit(bank, mce_banks_ce_disabled);
2081         on_each_cpu(__mce_disable_bank, &bank, 1);
2082 }
2083 
2084 /*
2085  * mce=off Disables machine check
2086  * mce=no_cmci Disables CMCI
2087  * mce=no_lmce Disables LMCE
2088  * mce=dont_log_ce Clears corrected events silently; no log created for CEs.
2089  * mce=ignore_ce Disables polling and CMCI; corrected events are not cleared.
2090  * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
2091  *      monarchtimeout is how long to wait for other CPUs on a machine
2092  *      check, or 0 to not wait.
2093  * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
2094  * mce=nobootlog Don't log MCEs from before booting.
2095  * mce=bios_cmci_threshold Don't program the CMCI threshold.
2096  * mce=recovery Force-enable memcpy_mcsafe().
2097  */
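     /*
      * Example (illustrative): booting with "mce=2,500000" sets the tolerance
      * level to 2 and the monarch timeout to 500000 microseconds.
      */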
2098 static int __init mcheck_enable(char *str)
2099 {
2100         struct mca_config *cfg = &mca_cfg;
2101 
2102         if (*str == 0) {
2103                 enable_p5_mce();
2104                 return 1;
2105         }
2106         if (*str == '=')
2107                 str++;
2108         if (!strcmp(str, "off"))
2109                 cfg->disabled = true;
2110         else if (!strcmp(str, "no_cmci"))
2111                 cfg->cmci_disabled = true;
2112         else if (!strcmp(str, "no_lmce"))
2113                 cfg->lmce_disabled = true;
2114         else if (!strcmp(str, "dont_log_ce"))
2115                 cfg->dont_log_ce = true;
2116         else if (!strcmp(str, "ignore_ce"))
2117                 cfg->ignore_ce = true;
2118         else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
2119                 cfg->bootlog = (str[0] == 'b');
2120         else if (!strcmp(str, "bios_cmci_threshold"))
2121                 cfg->bios_cmci_threshold = true;
2122         else if (!strcmp(str, "recovery"))
2123                 cfg->recovery = true;
2124         else if (isdigit(str[0])) {
2125                 if (get_option(&str, &cfg->tolerant) == 2)
2126                         get_option(&str, &(cfg->monarch_timeout));
2127         } else {
2128                 pr_info("mce argument %s ignored. Please use /sys\n", str);
2129                 return 0;
2130         }
2131         return 1;
2132 }
2133 __setup("mce", mcheck_enable);
2134 
2135 int __init mcheck_init(void)
2136 {
2137         mcheck_intel_therm_init();
2138         mce_register_decode_chain(&mce_srao_nb);
2139         mce_register_decode_chain(&mce_default_nb);
2140         mcheck_vendor_init_severity();
2141 
2142         INIT_WORK(&mce_work, mce_gen_pool_process);
2143         init_irq_work(&mce_irq_work, mce_irq_work_cb);
2144 
2145         return 0;
2146 }
2147 
2148 /*
2149  * mce_syscore: PM support
2150  */
2151 
2152 /*
2153  * Disable machine checks on suspend and shutdown. We can't really handle
2154  * them later.
2155  */
2156 static void mce_disable_error_reporting(void)
2157 {
2158         int i;
2159 
2160         for (i = 0; i < mca_cfg.banks; i++) {
2161                 struct mce_bank *b = &mce_banks[i];
2162 
2163                 if (b->init)
2164                         wrmsrl(msr_ops.ctl(i), 0);
2165         }
2166         return;
2167 }
2168 
2169 static void vendor_disable_error_reporting(void)
2170 {
2171         /*
2172          * Don't clear on Intel CPUs. Some of these MSRs are socket-wide.
2173          * Disabling them for just a single offlined CPU is bad, since it will
2174          * inhibit reporting for all shared resources on the socket like the
2175          * last level cache (LLC), the integrated memory controller (iMC), etc.
2176          */
2177         if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
2178                 return;
2179 
2180         mce_disable_error_reporting();
2181 }
2182 
2183 static int mce_syscore_suspend(void)
2184 {
2185         vendor_disable_error_reporting();
2186         return 0;
2187 }
2188 
2189 static void mce_syscore_shutdown(void)
2190 {
2191         vendor_disable_error_reporting();
2192 }
2193 
2194 /*
2195  * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
2196  * Only one CPU is active at this time, the others get re-added later using
2197  * CPU hotplug:
2198  */
2199 static void mce_syscore_resume(void)
2200 {
2201         __mcheck_cpu_init_generic();
2202         __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
2203         __mcheck_cpu_init_clear_banks();
2204 }
2205 
2206 static struct syscore_ops mce_syscore_ops = {
2207         .suspend        = mce_syscore_suspend,
2208         .shutdown       = mce_syscore_shutdown,
2209         .resume         = mce_syscore_resume,
2210 };
2211 
2212 /*
2213  * mce_device: Sysfs support
2214  */
2215 
2216 static void mce_cpu_restart(void *data)
2217 {
2218         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2219                 return;
2220         __mcheck_cpu_init_generic();
2221         __mcheck_cpu_init_clear_banks();
2222         __mcheck_cpu_init_timer();
2223 }
2224 
2225 /* Reinit MCEs after user configuration changes */
2226 static void mce_restart(void)
2227 {
2228         mce_timer_delete_all();
2229         on_each_cpu(mce_cpu_restart, NULL, 1);
2230 }
2231 
2232 /* Toggle features for corrected errors */
2233 static void mce_disable_cmci(void *data)
2234 {
2235         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2236                 return;
2237         cmci_clear();
2238 }
2239 
2240 static void mce_enable_ce(void *all)
2241 {
2242         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2243                 return;
2244         cmci_reenable();
2245         cmci_recheck();
2246         if (all)
2247                 __mcheck_cpu_init_timer();
2248 }
2249 
2250 static struct bus_type mce_subsys = {
2251         .name           = "machinecheck",
2252         .dev_name       = "machinecheck",
2253 };
2254 
2255 DEFINE_PER_CPU(struct device *, mce_device);
2256 
2257 static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
2258 {
2259         return container_of(attr, struct mce_bank, attr);
2260 }
2261 
2262 static ssize_t show_bank(struct device *s, struct device_attribute *attr,
2263                          char *buf)
2264 {
2265         return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
2266 }
2267 
2268 static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2269                         const char *buf, size_t size)
2270 {
2271         u64 new;
2272 
2273         if (kstrtou64(buf, 0, &new) < 0)
2274                 return -EINVAL;
2275 
2276         attr_to_bank(attr)->ctl = new;
2277         mce_restart();
2278 
2279         return size;
2280 }
2281 
2282 static ssize_t
2283 show_trigger(struct device *s, struct device_attribute *attr, char *buf)
2284 {
2285         strcpy(buf, mce_helper);
2286         strcat(buf, "\n");
2287         return strlen(mce_helper) + 1;
2288 }
2289 
2290 static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
2291                                 const char *buf, size_t siz)
2292 {
2293         char *p;
2294 
2295         strncpy(mce_helper, buf, sizeof(mce_helper));
2296         mce_helper[sizeof(mce_helper)-1] = 0;
2297         p = strchr(mce_helper, '\n');
2298 
2299         if (p)
2300                 *p = 0;
2301 
2302         return strlen(mce_helper) + !!p;
2303 }
2304 
2305 static ssize_t set_ignore_ce(struct device *s,
2306                              struct device_attribute *attr,
2307                              const char *buf, size_t size)
2308 {
2309         u64 new;
2310 
2311         if (kstrtou64(buf, 0, &new) < 0)
2312                 return -EINVAL;
2313 
2314         if (mca_cfg.ignore_ce ^ !!new) {
2315                 if (new) {
2316                         /* disable ce features */
2317                         mce_timer_delete_all();
2318                         on_each_cpu(mce_disable_cmci, NULL, 1);
2319                         mca_cfg.ignore_ce = true;
2320                 } else {
2321                         /* enable ce features */
2322                         mca_cfg.ignore_ce = false;
2323                         on_each_cpu(mce_enable_ce, (void *)1, 1);
2324                 }
2325         }
2326         return size;
2327 }
2328 
2329 static ssize_t set_cmci_disabled(struct device *s,
2330                                  struct device_attribute *attr,
2331                                  const char *buf, size_t size)
2332 {
2333         u64 new;
2334 
2335         if (kstrtou64(buf, 0, &new) < 0)
2336                 return -EINVAL;
2337 
2338         if (mca_cfg.cmci_disabled ^ !!new) {
2339                 if (new) {
2340                         /* disable cmci */
2341                         on_each_cpu(mce_disable_cmci, NULL, 1);
2342                         mca_cfg.cmci_disabled = true;
2343                 } else {
2344                         /* enable cmci */
2345                         mca_cfg.cmci_disabled = false;
2346                         on_each_cpu(mce_enable_ce, NULL, 1);
2347                 }
2348         }
2349         return size;
2350 }
2351 
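     /*
      * Store the integer attribute, then re-initialize MCE on all CPUs so the
      * new value takes effect.
      */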
2352 static ssize_t store_int_with_restart(struct device *s,
2353                                       struct device_attribute *attr,
2354                                       const char *buf, size_t size)
2355 {
2356         ssize_t ret = device_store_int(s, attr, buf, size);
2357         mce_restart();
2358         return ret;
2359 }
2360 
2361 static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
2362 static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
2363 static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
2364 static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
2365 
2366 static struct dev_ext_attribute dev_attr_check_interval = {
2367         __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
2368         &check_interval
2369 };
2370 
2371 static struct dev_ext_attribute dev_attr_ignore_ce = {
2372         __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
2373         &mca_cfg.ignore_ce
2374 };
2375 
2376 static struct dev_ext_attribute dev_attr_cmci_disabled = {
2377         __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
2378         &mca_cfg.cmci_disabled
2379 };
2380 
2381 static struct device_attribute *mce_device_attrs[] = {
2382         &dev_attr_tolerant.attr,
2383         &dev_attr_check_interval.attr,
2384         &dev_attr_trigger,
2385         &dev_attr_monarch_timeout.attr,
2386         &dev_attr_dont_log_ce.attr,
2387         &dev_attr_ignore_ce.attr,
2388         &dev_attr_cmci_disabled.attr,
2389         NULL
2390 };
2391 
2392 static cpumask_var_t mce_device_initialized;
2393 
2394 static void mce_device_release(struct device *dev)
2395 {
2396         kfree(dev);
2397 }
2398 
2399 /* Per cpu device init. All of the cpus still share the same ctrl bank: */
2400 static int mce_device_create(unsigned int cpu)
2401 {
2402         struct device *dev;
2403         int err;
2404         int i, j;
2405 
2406         if (!mce_available(&boot_cpu_data))
2407                 return -EIO;
2408 
2409         dev = per_cpu(mce_device, cpu);
2410         if (dev)
2411                 return 0;
2412 
2413         dev = kzalloc(sizeof *dev, GFP_KERNEL);
2414         if (!dev)
2415                 return -ENOMEM;
2416         dev->id  = cpu;
2417         dev->bus = &mce_subsys;
2418         dev->release = &mce_device_release;
2419 
2420         err = device_register(dev);
2421         if (err) {
2422                 put_device(dev);
2423                 return err;
2424         }
2425 
2426         for (i = 0; mce_device_attrs[i]; i++) {
2427                 err = device_create_file(dev, mce_device_attrs[i]);
2428                 if (err)
2429                         goto error;
2430         }
2431         for (j = 0; j < mca_cfg.banks; j++) {
2432                 err = device_create_file(dev, &mce_banks[j].attr);
2433                 if (err)
2434                         goto error2;
2435         }
2436         cpumask_set_cpu(cpu, mce_device_initialized);
2437         per_cpu(mce_device, cpu) = dev;
2438 
2439         return 0;
2440 error2:
2441         while (--j >= 0)
2442                 device_remove_file(dev, &mce_banks[j].attr);
2443 error:
2444         while (--i >= 0)
2445                 device_remove_file(dev, mce_device_attrs[i]);
2446 
2447         device_unregister(dev);
2448 
2449         return err;
2450 }
2451 
2452 static void mce_device_remove(unsigned int cpu)
2453 {
2454         struct device *dev = per_cpu(mce_device, cpu);
2455         int i;
2456 
2457         if (!cpumask_test_cpu(cpu, mce_device_initialized))
2458                 return;
2459 
2460         for (i = 0; mce_device_attrs[i]; i++)
2461                 device_remove_file(dev, mce_device_attrs[i]);
2462 
2463         for (i = 0; i < mca_cfg.banks; i++)
2464                 device_remove_file(dev, &mce_banks[i].attr);
2465 
2466         device_unregister(dev);
2467         cpumask_clear_cpu(cpu, mce_device_initialized);
2468         per_cpu(mce_device, cpu) = NULL;
2469 }
2470 
2471 /* Make sure there are no machine checks on offlined CPUs. */
2472 static void mce_disable_cpu(void)
2473 {
2474         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2475                 return;
2476 
2477         if (!cpuhp_tasks_frozen)
2478                 cmci_clear();
2479 
2480         vendor_disable_error_reporting();
2481 }
2482 
2483 static void mce_reenable_cpu(void)
2484 {
2485         int i;
2486 
2487         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2488                 return;
2489 
2490         if (!cpuhp_tasks_frozen)
2491                 cmci_reenable();
2492         for (i = 0; i < mca_cfg.banks; i++) {
2493                 struct mce_bank *b = &mce_banks[i];
2494 
2495                 if (b->init)
2496                         wrmsrl(msr_ops.ctl(i), b->ctl);
2497         }
2498 }
2499 
2500 static int mce_cpu_dead(unsigned int cpu)
2501 {
2502         mce_intel_hcpu_update(cpu);
2503 
2504         /* intentionally ignoring frozen here */
2505         if (!cpuhp_tasks_frozen)
2506                 cmci_rediscover();
2507         return 0;
2508 }
2509 
2510 static int mce_cpu_online(unsigned int cpu)
2511 {
2512         struct timer_list *t = this_cpu_ptr(&mce_timer);
2513         int ret;
2514 
2515         mce_device_create(cpu);
2516 
2517         ret = mce_threshold_create_device(cpu);
2518         if (ret) {
2519                 mce_device_remove(cpu);
2520                 return ret;
2521         }
2522         mce_reenable_cpu();
2523         mce_start_timer(t);
2524         return 0;
2525 }
2526 
2527 static int mce_cpu_pre_down(unsigned int cpu)
2528 {
2529         struct timer_list *t = this_cpu_ptr(&mce_timer);
2530 
2531         mce_disable_cpu();
2532         del_timer_sync(t);
2533         mce_threshold_remove_device(cpu);
2534         mce_device_remove(cpu);
2535         return 0;
2536 }
2537 
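     /*
      * Create the per-bank sysfs attributes ("bank0" ... "bankN") that expose
      * each bank's control mask via show_bank()/set_bank().
      */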
2538 static __init void mce_init_banks(void)
2539 {
2540         int i;
2541 
2542         for (i = 0; i < mca_cfg.banks; i++) {
2543                 struct mce_bank *b = &mce_banks[i];
2544                 struct device_attribute *a = &b->attr;
2545 
2546                 sysfs_attr_init(&a->attr);
2547                 a->attr.name    = b->attrname;
2548                 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2549 
2550                 a->attr.mode    = 0644;
2551                 a->show         = show_bank;
2552                 a->store        = set_bank;
2553         }
2554 }
2555 
2556 static __init int mcheck_init_device(void)
2557 {
2558         enum cpuhp_state hp_online;
2559         int err;
2560 
2561         if (!mce_available(&boot_cpu_data)) {
2562                 err = -EIO;
2563                 goto err_out;
2564         }
2565 
2566         if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2567                 err = -ENOMEM;
2568                 goto err_out;
2569         }
2570 
2571         mce_init_banks();
2572 
2573         err = subsys_system_register(&mce_subsys, NULL);
2574         if (err)
2575                 goto err_out_mem;
2576 
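             /*
              * Hotplug callbacks: hand off CMCI bank ownership when a CPU
              * dies, and create/remove the per-CPU MCE devices and timers as
              * CPUs come and go.
              */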
2577         err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
2578                                 mce_cpu_dead);
2579         if (err)
2580                 goto err_out_mem;
2581 
2582         err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
2583                                 mce_cpu_online, mce_cpu_pre_down);
2584         if (err < 0)
2585                 goto err_out_online;
2586         hp_online = err;
2587 
2588         register_syscore_ops(&mce_syscore_ops);
2589 
2590         /* register character device /dev/mcelog */
2591         err = misc_register(&mce_chrdev_device);
2592         if (err)
2593                 goto err_register;
2594 
2595         return 0;
2596 
2597 err_register:
2598         unregister_syscore_ops(&mce_syscore_ops);
2599         cpuhp_remove_state(hp_online);
2600 
2601 err_out_online:
2602         cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
2603 
2604 err_out_mem:
2605         free_cpumask_var(mce_device_initialized);
2606 
2607 err_out:
2608         pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
2609 
2610         return err;
2611 }
2612 device_initcall_sync(mcheck_init_device);
2613 
2614 /*
2615  * Old style boot options parsing. Only for compatibility.
2616  */
2617 static int __init mcheck_disable(char *str)
2618 {
2619         mca_cfg.disabled = true;
2620         return 1;
2621 }
2622 __setup("nomce", mcheck_disable);
2623 
2624 #ifdef CONFIG_DEBUG_FS
2625 struct dentry *mce_get_debugfs_dir(void)
2626 {
2627         static struct dentry *dmce;
2628 
2629         if (!dmce)
2630                 dmce = debugfs_create_dir("mce", NULL);
2631 
2632         return dmce;
2633 }
2634 
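     /* Reset the global rendezvous state used by the MCE handler (for fake panic testing). */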
2635 static void mce_reset(void)
2636 {
2637         cpu_missing = 0;
2638         atomic_set(&mce_fake_panicked, 0);
2639         atomic_set(&mce_executing, 0);
2640         atomic_set(&mce_callin, 0);
2641         atomic_set(&global_nwo, 0);
2642 }
2643 
2644 static int fake_panic_get(void *data, u64 *val)
2645 {
2646         *val = fake_panic;
2647         return 0;
2648 }
2649 
2650 static int fake_panic_set(void *data, u64 val)
2651 {
2652         mce_reset();
2653         fake_panic = val;
2654         return 0;
2655 }
2656 
2657 DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2658                         fake_panic_set, "%llu\n");
2659 
2660 static int __init mcheck_debugfs_init(void)
2661 {
2662         struct dentry *dmce, *ffake_panic;
2663 
2664         dmce = mce_get_debugfs_dir();
2665         if (!dmce)
2666                 return -ENOMEM;
2667         ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
2668                                           &fake_panic_fops);
2669         if (!ffake_panic)
2670                 return -ENOMEM;
2671 
2672         return 0;
2673 }
2674 #else
2675 static int __init mcheck_debugfs_init(void) { return -EINVAL; }
2676 #endif
2677 
2678 DEFINE_STATIC_KEY_FALSE(mcsafe_key);
2679 EXPORT_SYMBOL_GPL(mcsafe_key);
2680 
2681 static int __init mcheck_late_init(void)
2682 {
2683         if (mca_cfg.recovery)
2684                 static_branch_inc(&mcsafe_key);
2685 
2686         mcheck_debugfs_init();
2687 
2688         /*
2689          * Flush out everything that has been logged during early boot, now that
2690          * everything has been initialized (workqueues, decoders, ...).
2691          */
2692         mce_schedule_work();
2693 
2694         return 0;
2695 }
2696 late_initcall(mcheck_late_init);
2697 
