[PATCH] x86: i386/x86-64 Add nmi watchdog support for new Intel CPUs
arch/i386/kernel/nmi.c
1 /*
2  *  linux/arch/i386/nmi.c
3  *
4  *  NMI watchdog support on APIC systems
5  *
6  *  Started by Ingo Molnar <mingo@redhat.com>
7  *
8  *  Fixes:
9  *  Mikael Pettersson   : AMD K7 support for local APIC NMI watchdog.
10  *  Mikael Pettersson   : Power Management for local APIC NMI watchdog.
11  *  Mikael Pettersson   : Pentium 4 support for local APIC NMI watchdog.
12  *  Pavel Machek and
13  *  Mikael Pettersson   : PM converted to driver model. Disable/enable API.
14  */
15
16 #include <linux/config.h>
17 #include <linux/delay.h>
18 #include <linux/interrupt.h>
19 #include <linux/module.h>
20 #include <linux/nmi.h>
21 #include <linux/sysdev.h>
22 #include <linux/sysctl.h>
23 #include <linux/percpu.h>
24 #include <linux/dmi.h>
25
26 #include <asm/smp.h>
27 #include <asm/nmi.h>
28 #include <asm/kdebug.h>
29 #include <asm/intel_arch_perfmon.h>
30
31 #include "mach_traps.h"
32
33 /* perfctr_nmi_owner tracks the ownership of the perfctr registers;
34  * evntsel_nmi_owner tracks the ownership of the event selection registers.
35  * Different performance counters / event selection registers may be
36  * reserved by different subsystems; this reservation system just tries
37  * to coordinate things a little.
38  */
39 static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner);
40 static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]);
41
42 /* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and its
43  * offset from MSR_P4_BSU_ESCR0.  It will be the max for all platforms (for now).
44  */
45 #define NMI_MAX_COUNTER_BITS 66
46
47 /* nmi_active:
48  * >0: the lapic NMI watchdog is active, but can be disabled
49  * <0: the lapic NMI watchdog has not been set up, and cannot
50  *     be enabled
51  *  0: the lapic NMI watchdog is disabled, but can be enabled
52  */
53 atomic_t nmi_active = ATOMIC_INIT(0);           /* oprofile uses this */
54
55 unsigned int nmi_watchdog = NMI_DEFAULT;
56 static unsigned int nmi_hz = HZ;
57
58 struct nmi_watchdog_ctlblk {
59         int enabled;
60         u64 check_bit;
61         unsigned int cccr_msr;
62         unsigned int perfctr_msr;  /* the MSR to reset in NMI handler */
63         unsigned int evntsel_msr;  /* the MSR to select the events to handle */
64 };
65 static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
66
67 /* local prototypes */
68 static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
69
70 extern void show_registers(struct pt_regs *regs);
71 extern int unknown_nmi_panic;
72
73 /* converts an msr to an appropriate reservation bit */
74 static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
75 {
76         /* returns the bit offset of the performance counter register */
77         switch (boot_cpu_data.x86_vendor) {
78         case X86_VENDOR_AMD:
79                 return (msr - MSR_K7_PERFCTR0);
80         case X86_VENDOR_INTEL:
81                 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
82                         return (msr - MSR_ARCH_PERFMON_PERFCTR0);
83
84                 switch (boot_cpu_data.x86) {
85                 case 6:
86                         return (msr - MSR_P6_PERFCTR0);
87                 case 15:
88                         return (msr - MSR_P4_BPU_PERFCTR0);
89                 }
90         }
91         return 0;
92 }
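/* For example, on a P6 family CPU MSR_P6_PERFCTR1 (0xc2) maps to bit 1,
 * since MSR_P6_PERFCTR0 is 0xc1; the K7, P4 and architectural perfmon
 * cases work the same way relative to their own PERFCTR0 base. */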
93
94 /* converts an msr to an appropriate reservation bit */
95 static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
96 {
97         /* returns the bit offset of the event selection register */
98         switch (boot_cpu_data.x86_vendor) {
99         case X86_VENDOR_AMD:
100                 return (msr - MSR_K7_EVNTSEL0);
101         case X86_VENDOR_INTEL:
102                 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
103                         return (msr - MSR_ARCH_PERFMON_EVENTSEL0);
104
105                 switch (boot_cpu_data.x86) {
106                 case 6:
107                         return (msr - MSR_P6_EVNTSEL0);
108                 case 15:
109                         return (msr - MSR_P4_BSU_ESCR0);
110                 }
111         }
112         return 0;
113 }
114
115 /* checks whether a counter bit is available (hack for oprofile) */
116 int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
117 {
118         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
119
120         return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
121 }
122
123 /* checks an msr for availability */
124 int avail_to_resrv_perfctr_nmi(unsigned int msr)
125 {
126         unsigned int counter;
127
128         counter = nmi_perfctr_msr_to_bit(msr);
129         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
130
131         return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
132 }
133
134 int reserve_perfctr_nmi(unsigned int msr)
135 {
136         unsigned int counter;
137
138         counter = nmi_perfctr_msr_to_bit(msr);
139         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
140
141         if (!test_and_set_bit(counter, &__get_cpu_var(perfctr_nmi_owner)))
142                 return 1;
143         return 0;
144 }
145
146 void release_perfctr_nmi(unsigned int msr)
147 {
148         unsigned int counter;
149
150         counter = nmi_perfctr_msr_to_bit(msr);
151         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
152
153         clear_bit(counter, &__get_cpu_var(perfctr_nmi_owner));
154 }
155
156 int reserve_evntsel_nmi(unsigned int msr)
157 {
158         unsigned int counter;
159
160         counter = nmi_evntsel_msr_to_bit(msr);
161         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
162
163         if (!test_and_set_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]))
164                 return 1;
165         return 0;
166 }
167
168 void release_evntsel_nmi(unsigned int msr)
169 {
170         unsigned int counter;
171
172         counter = nmi_evntsel_msr_to_bit(msr);
173         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
174
175         clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]);
176 }
177
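/*
 * Example (sketch only, never compiled): a perfctr user such as oprofile
 * is expected to pair these reservations and drop them in reverse order
 * on failure, just as the setup_*_watchdog() routines below do.
 */
#if 0
static int example_claim_counter(unsigned int perfctr_msr,
				 unsigned int evntsel_msr)
{
	if (!reserve_perfctr_nmi(perfctr_msr))
		return 0;			/* counter already owned */
	if (!reserve_evntsel_nmi(evntsel_msr)) {
		release_perfctr_nmi(perfctr_msr);
		return 0;			/* evntsel already owned */
	}
	return 1;				/* both registers reserved */
}
#endif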
178 static __cpuinit inline int nmi_known_cpu(void)
179 {
180         switch (boot_cpu_data.x86_vendor) {
181         case X86_VENDOR_AMD:
182                 return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
183         case X86_VENDOR_INTEL:
184                 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
185                         return 1;
186                 else
187                         return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
188         }
189         return 0;
190 }
191
192 #ifdef CONFIG_SMP
193 /* The performance counters used by NMI_LOCAL_APIC don't trigger when
194  * the CPU is idle. To make sure the NMI watchdog really ticks on all
195  * CPUs during the test make them busy.
196  */
197 static __init void nmi_cpu_busy(void *data)
198 {
199         volatile int *endflag = data;
200         local_irq_enable_in_hardirq();
201         /* Intentionally don't use cpu_relax here. This is
202            to make sure that the performance counter really ticks,
203            even if there is a simulator or similar that catches the
204            pause instruction. On a real HT machine this is fine because
205            all other CPUs are busy with "useless" delay loops and don't
206            care if they get somewhat fewer cycles. */
207         while (*endflag == 0)
208                 barrier();
209 }
210 #endif
211
212 static int __init check_nmi_watchdog(void)
213 {
214         volatile int endflag = 0;
215         unsigned int *prev_nmi_count;
216         int cpu;
217
218         /* Enable NMI watchdog for newer systems.
219            Actually it should be safe for most systems before 2004 too except
220            for some IBM systems that corrupt registers when NMI happens
221            during SMM. Unfortunately we don't have more exact information
222            on these, so we use this coarse check. */
223         if (nmi_watchdog == NMI_DEFAULT && dmi_get_year(DMI_BIOS_DATE) >= 2004)
224                 nmi_watchdog = NMI_LOCAL_APIC;
225
226         if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT))
227                 return 0;
228
229         if (!atomic_read(&nmi_active))
230                 return 0;
231
232         prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
233         if (!prev_nmi_count)
234                 return -1;
235
236         printk(KERN_INFO "Testing NMI watchdog ... ");
237
238         if (nmi_watchdog == NMI_LOCAL_APIC)
239                 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
240
241         for_each_possible_cpu(cpu)
242                 prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
243         local_irq_enable();
244         mdelay((10*1000)/nmi_hz); /* wait 10 ticks */
245
246         for_each_possible_cpu(cpu) {
247 #ifdef CONFIG_SMP
248                 /* Check cpu_callin_map here because that is set
249                    after the timer is started. */
250                 if (!cpu_isset(cpu, cpu_callin_map))
251                         continue;
252 #endif
253                 if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled)
254                         continue;
255                 if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
256                         printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
257                                 cpu,
258                                 prev_nmi_count[cpu],
259                                 nmi_count(cpu));
260                         per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0;
261                         atomic_dec(&nmi_active);
262                 }
263         }
264         if (!atomic_read(&nmi_active)) {
265                 kfree(prev_nmi_count);
266                 atomic_set(&nmi_active, -1);
267                 return -1;
268         }
269         endflag = 1;
270         printk("OK.\n");
271
272         /* now that we know it works we can reduce NMI frequency to
273            something more reasonable; makes a difference in some configs */
274         if (nmi_watchdog == NMI_LOCAL_APIC) {
275                 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
276
277                 nmi_hz = 1;
278                 /*
279                  * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter
280                  * are writable, with higher bits sign extending from bit 31.
281                  * So we can only program the counter with 31 bit values;
282                  * the 32nd bit must be 1 so that bits 33 and above sign extend to 1.
283                  * Find the appropriate nmi_hz.
284                  */
285                 if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 &&
286                         ((u64)cpu_khz * 1000) > 0x7fffffffULL) {
287                         u64 count = (u64)cpu_khz * 1000;
288                         do_div(count, 0x7fffffffUL);
289                         nmi_hz = count + 1;
290                 }
291         }
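        /* Worked example: on a hypothetical 3GHz CPU, cpu_khz * 1000 is
         * 3,000,000,000 (> 0x7fffffff), so the division above yields 1 and
         * nmi_hz becomes 2; each period then counts 1,500,000,000 cycles,
         * which fits in the writable 31 bits. */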
292
293         kfree(prev_nmi_count);
294         return 0;
295 }
296 /* This needs to happen later in boot so counters are working */
297 late_initcall(check_nmi_watchdog);
298
299 static int __init setup_nmi_watchdog(char *str)
300 {
301         int nmi;
302
303         get_option(&str, &nmi);
304
305         if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
306                 return 0;
307         /*
308          * If any other x86 CPU has a local APIC, then
309          * please test the NMI stuff there and send me the
310          * missing bits. Right now Intel P6/P4/arch perfmon and AMD K7 only.
311          */
312         if ((nmi == NMI_LOCAL_APIC) && (nmi_known_cpu() == 0))
313                 return 0;  /* no lapic support */
314         nmi_watchdog = nmi;
315         return 1;
316 }
317
318 __setup("nmi_watchdog=", setup_nmi_watchdog);
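/* With the usual NMI_* encoding (NMI_IO_APIC == 1, NMI_LOCAL_APIC == 2),
 * booting with "nmi_watchdog=2" selects the local APIC watchdog set up
 * below and "nmi_watchdog=0" disables the watchdog entirely. */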
319
320 static void disable_lapic_nmi_watchdog(void)
321 {
322         BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
323
324         if (atomic_read(&nmi_active) <= 0)
325                 return;
326
327         on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
328
329         BUG_ON(atomic_read(&nmi_active) != 0);
330 }
331
332 static void enable_lapic_nmi_watchdog(void)
333 {
334         BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
335
336         /* are we already enabled */
337         if (atomic_read(&nmi_active) != 0)
338                 return;
339
340         /* are we lapic aware */
341         if (nmi_known_cpu() <= 0)
342                 return;
343
344         on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
345         touch_nmi_watchdog();
346 }
347
348 void disable_timer_nmi_watchdog(void)
349 {
350         BUG_ON(nmi_watchdog != NMI_IO_APIC);
351
352         if (atomic_read(&nmi_active) <= 0)
353                 return;
354
355         disable_irq(0);
356         on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
357
358         BUG_ON(atomic_read(&nmi_active) != 0);
359 }
360
361 void enable_timer_nmi_watchdog(void)
362 {
363         BUG_ON(nmi_watchdog != NMI_IO_APIC);
364
365         if (atomic_read(&nmi_active) == 0) {
366                 touch_nmi_watchdog();
367                 on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
368                 enable_irq(0);
369         }
370 }
371
372 #ifdef CONFIG_PM
373
374 static int nmi_pm_active; /* nmi_active before suspend */
375
376 static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
377 {
378         /* only CPU0 goes here, other CPUs should be offline */
379         nmi_pm_active = atomic_read(&nmi_active);
380         stop_apic_nmi_watchdog(NULL);
381         BUG_ON(atomic_read(&nmi_active) != 0);
382         return 0;
383 }
384
385 static int lapic_nmi_resume(struct sys_device *dev)
386 {
387         /* only CPU0 goes here, other CPUs should be offline */
388         if (nmi_pm_active > 0) {
389                 setup_apic_nmi_watchdog(NULL);
390                 touch_nmi_watchdog();
391         }
392         return 0;
393 }
394
395
396 static struct sysdev_class nmi_sysclass = {
397         set_kset_name("lapic_nmi"),
398         .resume         = lapic_nmi_resume,
399         .suspend        = lapic_nmi_suspend,
400 };
401
402 static struct sys_device device_lapic_nmi = {
403         .id     = 0,
404         .cls    = &nmi_sysclass,
405 };
406
407 static int __init init_lapic_nmi_sysfs(void)
408 {
409         int error;
410
411         /* should really be a BUG_ON but because this is an
412          * init call, it just doesn't work.  -dcz
413          */
414         if (nmi_watchdog != NMI_LOCAL_APIC)
415                 return 0;
416
417         if (atomic_read(&nmi_active) < 0)
418                 return 0;
419
420         error = sysdev_class_register(&nmi_sysclass);
421         if (!error)
422                 error = sysdev_register(&device_lapic_nmi);
423         return error;
424 }
425 /* must come after the local APIC's device_initcall() */
426 late_initcall(init_lapic_nmi_sysfs);
427
428 #endif  /* CONFIG_PM */
429
430 /*
431  * Activate the NMI watchdog via the local APIC.
432  * Original code written by Keith Owens.
433  */
434
435 static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr)
436 {
437         u64 count = (u64)cpu_khz * 1000;
438
439         do_div(count, nmi_hz);
440         if (descr)
441                 Dprintk("setting %s to -0x%08Lx\n", descr, count);
442         wrmsrl(perfctr_msr, 0 - count);
443 }
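/* Worked example: with nmi_hz == 1 (the value chosen after the self test)
 * on a hypothetical 2GHz CPU, count is 2,000,000,000, so the MSR is loaded
 * with -2,000,000,000 and overflows, raising the NMI, after roughly one
 * second of unhalted cycles. */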
444
445 /* Note that these events don't tick when the CPU idles. This means
446    the frequency varies with CPU load. */
447
448 #define K7_EVNTSEL_ENABLE       (1 << 22)
449 #define K7_EVNTSEL_INT          (1 << 20)
450 #define K7_EVNTSEL_OS           (1 << 17)
451 #define K7_EVNTSEL_USR          (1 << 16)
452 #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING    0x76
453 #define K7_NMI_EVENT            K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
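/* For reference, the evntsel value programmed below works out to
 * 0x00130076 (INT | OS | USR | event 0x76); setting K7_EVNTSEL_ENABLE
 * afterwards turns that into 0x00530076. */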
454
455 static int setup_k7_watchdog(void)
456 {
457         unsigned int perfctr_msr, evntsel_msr;
458         unsigned int evntsel;
459         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
460
461         perfctr_msr = MSR_K7_PERFCTR0;
462         evntsel_msr = MSR_K7_EVNTSEL0;
463         if (!reserve_perfctr_nmi(perfctr_msr))
464                 goto fail;
465
466         if (!reserve_evntsel_nmi(evntsel_msr))
467                 goto fail1;
468
469         wrmsrl(perfctr_msr, 0UL);
470
471         evntsel = K7_EVNTSEL_INT
472                 | K7_EVNTSEL_OS
473                 | K7_EVNTSEL_USR
474                 | K7_NMI_EVENT;
475
476         /* setup the timer */
477         wrmsr(evntsel_msr, evntsel, 0);
478         write_watchdog_counter(perfctr_msr, "K7_PERFCTR0");
479         apic_write(APIC_LVTPC, APIC_DM_NMI);
480         evntsel |= K7_EVNTSEL_ENABLE;
481         wrmsr(evntsel_msr, evntsel, 0);
482
483         wd->perfctr_msr = perfctr_msr;
484         wd->evntsel_msr = evntsel_msr;
485         wd->cccr_msr = 0;  /* unused */
486         wd->check_bit = 1ULL<<63;
487         return 1;
488 fail1:
489         release_perfctr_nmi(perfctr_msr);
490 fail:
491         return 0;
492 }
493
494 static void stop_k7_watchdog(void)
495 {
496         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
497
498         wrmsr(wd->evntsel_msr, 0, 0);
499
500         release_evntsel_nmi(wd->evntsel_msr);
501         release_perfctr_nmi(wd->perfctr_msr);
502 }
503
504 #define P6_EVNTSEL0_ENABLE      (1 << 22)
505 #define P6_EVNTSEL_INT          (1 << 20)
506 #define P6_EVNTSEL_OS           (1 << 17)
507 #define P6_EVNTSEL_USR          (1 << 16)
508 #define P6_EVENT_CPU_CLOCKS_NOT_HALTED  0x79
509 #define P6_NMI_EVENT            P6_EVENT_CPU_CLOCKS_NOT_HALTED
510
511 static int setup_p6_watchdog(void)
512 {
513         unsigned int perfctr_msr, evntsel_msr;
514         unsigned int evntsel;
515         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
516
517         perfctr_msr = MSR_P6_PERFCTR0;
518         evntsel_msr = MSR_P6_EVNTSEL0;
519         if (!reserve_perfctr_nmi(perfctr_msr))
520                 goto fail;
521
522         if (!reserve_evntsel_nmi(evntsel_msr))
523                 goto fail1;
524
525         wrmsrl(perfctr_msr, 0UL);
526
527         evntsel = P6_EVNTSEL_INT
528                 | P6_EVNTSEL_OS
529                 | P6_EVNTSEL_USR
530                 | P6_NMI_EVENT;
531
532         /* setup the timer */
533         wrmsr(evntsel_msr, evntsel, 0);
534         write_watchdog_counter(perfctr_msr, "P6_PERFCTR0");
535         apic_write(APIC_LVTPC, APIC_DM_NMI);
536         evntsel |= P6_EVNTSEL0_ENABLE;
537         wrmsr(evntsel_msr, evntsel, 0);
538
539         wd->perfctr_msr = perfctr_msr;
540         wd->evntsel_msr = evntsel_msr;
541         wd->cccr_msr = 0;  /* unused */
542         wd->check_bit = 1ULL<<39;
543         return 1;
544 fail1:
545         release_perfctr_nmi(perfctr_msr);
546 fail:
547         return 0;
548 }
549
550 static void stop_p6_watchdog(void)
551 {
552         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
553
554         wrmsr(wd->evntsel_msr, 0, 0);
555
556         release_evntsel_nmi(wd->evntsel_msr);
557         release_perfctr_nmi(wd->perfctr_msr);
558 }
559
560 /* Note that these events don't tick when the CPU idles. This means
561    the frequency varies with CPU load. */
562
563 #define MSR_P4_MISC_ENABLE_PERF_AVAIL   (1<<7)
564 #define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
565 #define P4_ESCR_OS              (1<<3)
566 #define P4_ESCR_USR             (1<<2)
567 #define P4_CCCR_OVF_PMI0        (1<<26)
568 #define P4_CCCR_OVF_PMI1        (1<<27)
569 #define P4_CCCR_THRESHOLD(N)    ((N)<<20)
570 #define P4_CCCR_COMPLEMENT      (1<<19)
571 #define P4_CCCR_COMPARE         (1<<18)
572 #define P4_CCCR_REQUIRED        (3<<16)
573 #define P4_CCCR_ESCR_SELECT(N)  ((N)<<13)
574 #define P4_CCCR_ENABLE          (1<<12)
575 #define P4_CCCR_OVF             (1<<31)
576 /* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
577    CRU_ESCR0 (with any non-null event selector) through a complemented
578    max threshold. [IA32-Vol3, Section 14.9.9] */
579
580 static int setup_p4_watchdog(void)
581 {
582         unsigned int perfctr_msr, evntsel_msr, cccr_msr;
583         unsigned int evntsel, cccr_val;
584         unsigned int misc_enable, dummy;
585         unsigned int ht_num;
586         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
587
588         rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
589         if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
590                 return 0;
591
592 #ifdef CONFIG_SMP
593         /* detect which hyperthread we are on */
594         if (smp_num_siblings == 2) {
595                 unsigned int ebx, apicid;
596
597                 ebx = cpuid_ebx(1);
598                 apicid = (ebx >> 24) & 0xff;
599                 ht_num = apicid & 1;
600         } else
601 #endif
602                 ht_num = 0;
603
604         /* performance counters are shared resources;
605          * assign each hyperthread its own set
606          * (re-using the ESCR0 register seems safe
607          * and keeps the cccr_val the same)
608          */
609         if (!ht_num) {
610                 /* logical cpu 0 */
611                 perfctr_msr = MSR_P4_IQ_PERFCTR0;
612                 evntsel_msr = MSR_P4_CRU_ESCR0;
613                 cccr_msr = MSR_P4_IQ_CCCR0;
614                 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
615         } else {
616                 /* logical cpu 1 */
617                 perfctr_msr = MSR_P4_IQ_PERFCTR1;
618                 evntsel_msr = MSR_P4_CRU_ESCR0;
619                 cccr_msr = MSR_P4_IQ_CCCR1;
620                 cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
621         }
622
623         if (!reserve_perfctr_nmi(perfctr_msr))
624                 goto fail;
625
626         if (!reserve_evntsel_nmi(evntsel_msr))
627                 goto fail1;
628
629         evntsel = P4_ESCR_EVENT_SELECT(0x3F)
630                 | P4_ESCR_OS
631                 | P4_ESCR_USR;
632
633         cccr_val |= P4_CCCR_THRESHOLD(15)
634                  | P4_CCCR_COMPLEMENT
635                  | P4_CCCR_COMPARE
636                  | P4_CCCR_REQUIRED;
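        /* For reference: evntsel works out to 0x7e00000c and, for logical
         * cpu 0, cccr_val to 0x04ff8000 before P4_CCCR_ENABLE is set below.
         * With COMPARE and COMPLEMENT set and the threshold at its maximum
         * (15), the condition holds on every cycle, so IQ_COUNTER0
         * effectively counts cycles. */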
637
638         wrmsr(evntsel_msr, evntsel, 0);
639         wrmsr(cccr_msr, cccr_val, 0);
640         write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0");
641         apic_write(APIC_LVTPC, APIC_DM_NMI);
642         cccr_val |= P4_CCCR_ENABLE;
643         wrmsr(cccr_msr, cccr_val, 0);
644         wd->perfctr_msr = perfctr_msr;
645         wd->evntsel_msr = evntsel_msr;
646         wd->cccr_msr = cccr_msr;
647         wd->check_bit = 1ULL<<39;
648         return 1;
649 fail1:
650         release_perfctr_nmi(perfctr_msr);
651 fail:
652         return 0;
653 }
654
655 static void stop_p4_watchdog(void)
656 {
657         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
658
659         wrmsr(wd->cccr_msr, 0, 0);
660         wrmsr(wd->evntsel_msr, 0, 0);
661
662         release_evntsel_nmi(wd->evntsel_msr);
663         release_perfctr_nmi(wd->perfctr_msr);
664 }
665
666 #define ARCH_PERFMON_NMI_EVENT_SEL      ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
667 #define ARCH_PERFMON_NMI_EVENT_UMASK    ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
668
669 static int setup_intel_arch_watchdog(void)
670 {
671         unsigned int ebx;
672         union cpuid10_eax eax;
673         unsigned int unused;
674         unsigned int perfctr_msr, evntsel_msr;
675         unsigned int evntsel;
676         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
677
678         /*
679          * Check whether the Architectural PerfMon supports
680          * Unhalted Core Cycles Event or not.
681          * NOTE: Corresponding bit = 0 in ebx indicates event present.
682          */
683         cpuid(10, &(eax.full), &ebx, &unused, &unused);
684         if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
685             (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
686                 goto fail;
687
688         perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
689         evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0;
690
691         if (!reserve_perfctr_nmi(perfctr_msr))
692                 goto fail;
693
694         if (!reserve_evntsel_nmi(evntsel_msr))
695                 goto fail1;
696
697         wrmsrl(perfctr_msr, 0UL);
698
699         evntsel = ARCH_PERFMON_EVENTSEL_INT
700                 | ARCH_PERFMON_EVENTSEL_OS
701                 | ARCH_PERFMON_EVENTSEL_USR
702                 | ARCH_PERFMON_NMI_EVENT_SEL
703                 | ARCH_PERFMON_NMI_EVENT_UMASK;
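        /* Assuming the usual architectural perfmon bit layout (INT bit 20,
         * OS bit 17, USR bit 16, event 0x3c with umask 0), this works out
         * to 0x0013003c, and to 0x0053003c once
         * ARCH_PERFMON_EVENTSEL0_ENABLE (bit 22) is set below. */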
704
705         /* setup the timer */
706         wrmsr(evntsel_msr, evntsel, 0);
707         write_watchdog_counter(perfctr_msr, "INTEL_ARCH_PERFCTR0");
708         apic_write(APIC_LVTPC, APIC_DM_NMI);
709         evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
710         wrmsr(evntsel_msr, evntsel, 0);
711
712         wd->perfctr_msr = perfctr_msr;
713         wd->evntsel_msr = evntsel_msr;
714         wd->cccr_msr = 0;  /* unused */
715         wd->check_bit = 1ULL << (eax.split.bit_width - 1);
716         return 1;
717 fail1:
718         release_perfctr_nmi(perfctr_msr);
719 fail:
720         return 0;
721 }
722
723 static void stop_intel_arch_watchdog(void)
724 {
725         unsigned int ebx;
726         union cpuid10_eax eax;
727         unsigned int unused;
728         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
729
730         /*
731          * Check whether the Architectural PerfMon supports
732          * Unhalted Core Cycles Event or not.
733          * NOTE: Corresponding bit = 0 in ebx indicates event present.
734          */
735         cpuid(10, &(eax.full), &ebx, &unused, &unused);
736         if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
737             (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
738                 return;
739
740         wrmsr(wd->evntsel_msr, 0, 0);
741         release_evntsel_nmi(wd->evntsel_msr);
742         release_perfctr_nmi(wd->perfctr_msr);
743 }
744
745 void setup_apic_nmi_watchdog (void *unused)
746 {
747         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
748
749         /* only support LOCAL and IO APICs for now */
750         if ((nmi_watchdog != NMI_LOCAL_APIC) &&
751             (nmi_watchdog != NMI_IO_APIC))
752                 return;
753
754         if (wd->enabled == 1)
755                 return;
756
757         /* cheap hack to support suspend/resume:
758          * if cpu0 is not active, the other cpus should not be either */
759         if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
760                 return;
761
762         if (nmi_watchdog == NMI_LOCAL_APIC) {
763                 switch (boot_cpu_data.x86_vendor) {
764                 case X86_VENDOR_AMD:
765                         if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15)
766                                 return;
767                         if (!setup_k7_watchdog())
768                                 return;
769                         break;
770                 case X86_VENDOR_INTEL:
771                         if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
772                                 if (!setup_intel_arch_watchdog())
773                                         return;
774                                 break;
775                         }
776                         switch (boot_cpu_data.x86) {
777                         case 6:
778                                 if (boot_cpu_data.x86_model > 0xd)
779                                         return;
780
781                                 if (!setup_p6_watchdog())
782                                         return;
783                                 break;
784                         case 15:
785                                 if (boot_cpu_data.x86_model > 0x4)
786                                         return;
787
788                                 if (!setup_p4_watchdog())
789                                         return;
790                                 break;
791                         default:
792                                 return;
793                         }
794                         break;
795                 default:
796                         return;
797                 }
798         }
799         wd->enabled = 1;
800         atomic_inc(&nmi_active);
801 }
802
803 void stop_apic_nmi_watchdog(void *unused)
804 {
805         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
806
807         /* only support LOCAL and IO APICs for now */
808         if ((nmi_watchdog != NMI_LOCAL_APIC) &&
809             (nmi_watchdog != NMI_IO_APIC))
810                 return;
811
812         if (wd->enabled == 0)
813                 return;
814
815         if (nmi_watchdog == NMI_LOCAL_APIC) {
816                 switch (boot_cpu_data.x86_vendor) {
817                 case X86_VENDOR_AMD:
818                         stop_k7_watchdog();
819                         break;
820                 case X86_VENDOR_INTEL:
821                         if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
822                                 stop_intel_arch_watchdog();
823                                 break;
824                         }
825                         switch (boot_cpu_data.x86) {
826                         case 6:
827                                 if (boot_cpu_data.x86_model > 0xd)
828                                         break;
829                                 stop_p6_watchdog();
830                                 break;
831                         case 15:
832                                 if (boot_cpu_data.x86_model > 0x4)
833                                         break;
834                                 stop_p4_watchdog();
835                                 break;
836                         }
837                         break;
838                 default:
839                         return;
840                 }
841         }
842         wd->enabled = 0;
843         atomic_dec(&nmi_active);
844 }
845
846 /*
847  * the best way to detect whether a CPU has a 'hard lockup' problem
848  * is to check its local APIC timer IRQ counts. If they are not
849  * changing then that CPU has some problem.
850  *
851  * as these watchdog NMI IRQs are generated on every CPU, we only
852  * have to check the current processor.
853  *
854  * since NMIs don't listen to _any_ locks, we have to be extremely
855  * careful not to rely on unsafe variables. The printk might lock
856  * up though, so we have to break up any console locks first ...
857  * [if more tty-related locks are ever added, break them up
858  *  here too!]
859  */
860
861 static unsigned int
862         last_irq_sums [NR_CPUS],
863         alert_counter [NR_CPUS];
864
865 void touch_nmi_watchdog (void)
866 {
867         int i;
868
869         /*
870          * Just reset the alert counters (other CPUs might be
871          * spinning on locks we hold):
872          */
873         for_each_possible_cpu(i)
874                 alert_counter[i] = 0;
875
876         /*
877          * Tickle the softlockup detector too:
878          */
879         touch_softlockup_watchdog();
880 }
881 EXPORT_SYMBOL(touch_nmi_watchdog);
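/*
 * Example (sketch only, never compiled): long polling loops that may run
 * with interrupts disabled should call touch_nmi_watchdog() so a healthy
 * but busy CPU is not reported as locked up.
 */
#if 0
static void example_busy_wait(volatile int *done)
{
	while (!*done) {
		touch_nmi_watchdog();
		cpu_relax();
	}
}
#endif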
882
883 extern void die_nmi(struct pt_regs *, const char *msg);
884
885 int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
886 {
887
888         /*
889          * Since current_thread_info() is always on the stack, and we
890          * always switch the stack NMI-atomically, it's safe to use
891          * smp_processor_id().
892          */
893         unsigned int sum;
894         int touched = 0;
895         int cpu = smp_processor_id();
896         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
897         u64 dummy;
898         int rc = 0;
899
900         /* check for other users first */
901         if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
902                         == NOTIFY_STOP) {
903                 rc = 1;
904                 touched = 1;
905         }
906
907         sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
908
909         /* if the apic timer isn't firing, this cpu isn't doing much */
910         if (!touched && last_irq_sums[cpu] == sum) {
911                 /*
912                  * Ayiee, looks like this CPU is stuck ...
913                  * wait a few IRQs (5 seconds) before doing the oops ...
914                  */
915                 alert_counter[cpu]++;
916                 if (alert_counter[cpu] == 5*nmi_hz)
917                         /*
918                          * die_nmi will return ONLY if NOTIFY_STOP happens..
919                          */
920                         die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP");
921         } else {
922                 last_irq_sums[cpu] = sum;
923                 alert_counter[cpu] = 0;
924         }
925         /* see if the nmi watchdog went off */
926         if (wd->enabled) {
927                 if (nmi_watchdog == NMI_LOCAL_APIC) {
928                         rdmsrl(wd->perfctr_msr, dummy);
929                         if (dummy & wd->check_bit) {
930                                 /* this wasn't a watchdog timer interrupt */
931                                 goto done;
932                         }
933
934                         /* only Intel P4 uses the cccr msr */
935                         if (wd->cccr_msr != 0) {
936                                 /*
937                                  * P4 quirks:
938                                  * - An overflown perfctr will assert its interrupt
939                                  *   until the OVF flag in its CCCR is cleared.
940                                  * - LVTPC is masked on interrupt and must be
941                                  *   unmasked by the LVTPC handler.
942                                  */
943                                 rdmsrl(wd->cccr_msr, dummy);
944                                 dummy &= ~P4_CCCR_OVF;
945                                 wrmsrl(wd->cccr_msr, dummy);
946                                 apic_write(APIC_LVTPC, APIC_DM_NMI);
947                         }
948                         else if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
949                                  wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
950                                 /* P6 based Pentium M needs to re-unmask
951                                  * the apic vector but it doesn't hurt
952                                  * other P6 variants.
953                                  * ArchPerfmon/Core Duo also needs this */
954                                 apic_write(APIC_LVTPC, APIC_DM_NMI);
955                         }
956                         /* start the cycle over again */
957                         write_watchdog_counter(wd->perfctr_msr, NULL);
958                         rc = 1;
959                 } else if (nmi_watchdog == NMI_IO_APIC) {
960                         /* don't know how to accurately check for this;
961                          * just assume it was a watchdog timer interrupt.
962                          * This matches the old behaviour.
963                          */
964                         rc = 1;
965                 } else
966                         printk(KERN_WARNING "Unknown enabled NMI hardware?!\n");
967         }
968 done:
969         return rc;
970 }
971
972 int do_nmi_callback(struct pt_regs * regs, int cpu)
973 {
974 #ifdef CONFIG_SYSCTL
975         if (unknown_nmi_panic)
976                 return unknown_nmi_panic_callback(regs, cpu);
977 #endif
978         return 0;
979 }
980
981 #ifdef CONFIG_SYSCTL
982
983 static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
984 {
985         unsigned char reason = get_nmi_reason();
986         char buf[64];
987
988         sprintf(buf, "NMI received for unknown reason %02x\n", reason);
989         die_nmi(regs, buf);
990         return 0;
991 }
992
993 /*
994  * proc handler for /proc/sys/kernel/nmi
995  */
996 int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
997                         void __user *buffer, size_t *length, loff_t *ppos)
998 {
999         int old_state;
1000
1001         nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
1002         old_state = nmi_watchdog_enabled;
1003         proc_dointvec(table, write, file, buffer, length, ppos);
1004         if (!!old_state == !!nmi_watchdog_enabled)
1005                 return 0;
1006
1007         if (atomic_read(&nmi_active) < 0) {
1008                 printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
1009                 return -EIO;
1010         }
1011
1012         if (nmi_watchdog == NMI_DEFAULT) {
1013                 if (nmi_known_cpu() > 0)
1014                         nmi_watchdog = NMI_LOCAL_APIC;
1015                 else
1016                         nmi_watchdog = NMI_IO_APIC;
1017         }
1018
1019         if (nmi_watchdog == NMI_LOCAL_APIC) {
1020                 if (nmi_watchdog_enabled)
1021                         enable_lapic_nmi_watchdog();
1022                 else
1023                         disable_lapic_nmi_watchdog();
1024         } else {
1025                 printk( KERN_WARNING
1026                         "NMI watchdog doesn't know what hardware to touch\n");
1027                 return -EIO;
1028         }
1029         return 0;
1030 }
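/* In other words, assuming this handler is wired up as /proc/sys/kernel/nmi
 * as the comment above says, writing 1 to it (re)enables the lapic NMI
 * watchdog and writing 0 disables it; if the watchdog was never set up
 * (nmi_active < 0), attempts to change the state fail with -EIO. */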
1031
1032 #endif
1033
1034 EXPORT_SYMBOL(nmi_active);
1035 EXPORT_SYMBOL(nmi_watchdog);
1036 EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
1037 EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
1038 EXPORT_SYMBOL(reserve_perfctr_nmi);
1039 EXPORT_SYMBOL(release_perfctr_nmi);
1040 EXPORT_SYMBOL(reserve_evntsel_nmi);
1041 EXPORT_SYMBOL(release_evntsel_nmi);
1042 EXPORT_SYMBOL(disable_timer_nmi_watchdog);
1043 EXPORT_SYMBOL(enable_timer_nmi_watchdog);