TOMOYO Linux Cross Reference
Linux/arch/powerpc/perf/core-book3s.c

  1 /*
  2  * Performance event support - powerpc architecture code
  3  *
  4  * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
  5  *
  6  * This program is free software; you can redistribute it and/or
  7  * modify it under the terms of the GNU General Public License
  8  * as published by the Free Software Foundation; either version
  9  * 2 of the License, or (at your option) any later version.
 10  */
 11 #include <linux/kernel.h>
 12 #include <linux/sched.h>
 13 #include <linux/perf_event.h>
 14 #include <linux/percpu.h>
 15 #include <linux/hardirq.h>
 16 #include <linux/uaccess.h>
 17 #include <asm/reg.h>
 18 #include <asm/pmc.h>
 19 #include <asm/machdep.h>
 20 #include <asm/firmware.h>
 21 #include <asm/ptrace.h>
 22 #include <asm/code-patching.h>
 23 
 24 #define BHRB_MAX_ENTRIES        32
 25 #define BHRB_TARGET             0x0000000000000002
 26 #define BHRB_PREDICTION         0x0000000000000001
 27 #define BHRB_EA                 0xFFFFFFFFFFFFFFFC
 28 
 29 struct cpu_hw_events {
 30         int n_events;
 31         int n_percpu;
 32         int disabled;
 33         int n_added;
 34         int n_limited;
 35         u8  pmcs_enabled;
 36         struct perf_event *event[MAX_HWEVENTS];
 37         u64 events[MAX_HWEVENTS];
 38         unsigned int flags[MAX_HWEVENTS];
 39         unsigned long mmcr[3];
 40         struct perf_event *limited_counter[MAX_LIMITED_HWCOUNTERS];
 41         u8  limited_hwidx[MAX_LIMITED_HWCOUNTERS];
 42         u64 alternatives[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
 43         unsigned long amasks[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
 44         unsigned long avalues[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
 45 
 46         unsigned int group_flag;
 47         int n_txn_start;
 48 
 49         /* BHRB bits */
 50         u64                             bhrb_filter;    /* BHRB HW branch filter */
 51         int                             bhrb_users;
 52         void                            *bhrb_context;
 53         struct  perf_branch_stack       bhrb_stack;
 54         struct  perf_branch_entry       bhrb_entries[BHRB_MAX_ENTRIES];
 55 };
 56 
 57 DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
 58 
 59 struct power_pmu *ppmu;
 60 
 61 /*
 62  * Normally, to ignore kernel events we set the FCS (freeze counters
 63  * in supervisor mode) bit in MMCR0, but if the kernel runs with the
 64  * hypervisor bit set in the MSR, or if we are running on a processor
 65  * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
 66  * then we need to use the FCHV bit to ignore kernel events.
 67  */
 68 static unsigned int freeze_events_kernel = MMCR0_FCS;
 69 
 70 /*
 71  * 32-bit doesn't have MMCRA but does have an MMCR2,
 72  * and a few other names are different.
 73  */
 74 #ifdef CONFIG_PPC32
 75 
 76 #define MMCR0_FCHV              0
 77 #define MMCR0_PMCjCE            MMCR0_PMCnCE
 78 #define MMCR0_FC56              0
 79 #define MMCR0_PMAO              0
 80 
 81 #define SPRN_MMCRA              SPRN_MMCR2
 82 #define MMCRA_SAMPLE_ENABLE     0
 83 
 84 static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
 85 {
 86         return 0;
 87 }
 88 static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) { }
 89 static inline u32 perf_get_misc_flags(struct pt_regs *regs)
 90 {
 91         return 0;
 92 }
 93 static inline void perf_read_regs(struct pt_regs *regs)
 94 {
 95         regs->result = 0;
 96 }
 97 static inline int perf_intr_is_nmi(struct pt_regs *regs)
 98 {
 99         return 0;
100 }
101 
102 static inline int siar_valid(struct pt_regs *regs)
103 {
104         return 1;
105 }
106 
107 static inline void power_pmu_bhrb_enable(struct perf_event *event) {}
108 static inline void power_pmu_bhrb_disable(struct perf_event *event) {}
109 void power_pmu_flush_branch_stack(void) {}
110 static inline void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw) {}
111 #endif /* CONFIG_PPC32 */
112 
113 static bool regs_use_siar(struct pt_regs *regs)
114 {
115         /*
116          * When we take a performance monitor exception the regs are setup
117          * using perf_read_regs() which overloads some fields, in particular
118          * regs->result to tell us whether to use SIAR.
119          *
120          * However if the regs are from another exception, eg. a syscall, then
121          * they have not been setup using perf_read_regs() and so regs->result
122          * is something random.
123          */
124         return ((TRAP(regs) == 0xf00) && regs->result);
125 }
126 
127 /*
128  * Things that are specific to 64-bit implementations.
129  */
130 #ifdef CONFIG_PPC64
131 
132 static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
133 {
134         unsigned long mmcra = regs->dsisr;
135 
136         if ((ppmu->flags & PPMU_HAS_SSLOT) && (mmcra & MMCRA_SAMPLE_ENABLE)) {
137                 unsigned long slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT;
138                 if (slot > 1)
139                         return 4 * (slot - 1);
140         }
141 
142         return 0;
143 }
144 
145 /*
146  * The user wants a data address recorded.
147  * If we're not doing instruction sampling, give them the SDAR
148  * (sampled data address).  If we are doing instruction sampling, then
149  * only give them the SDAR if it corresponds to the instruction
150  * pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC, the
151  * [POWER7P_]MMCRA_SDAR_VALID bit in MMCRA, or the SDAR_VALID bit in SIER.
152  */
153 static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp)
154 {
155         unsigned long mmcra = regs->dsisr;
156         bool sdar_valid;
157 
158         if (ppmu->flags & PPMU_HAS_SIER)
159                 sdar_valid = regs->dar & SIER_SDAR_VALID;
160         else {
161                 unsigned long sdsync;
162 
163                 if (ppmu->flags & PPMU_SIAR_VALID)
164                         sdsync = POWER7P_MMCRA_SDAR_VALID;
165                 else if (ppmu->flags & PPMU_ALT_SIPR)
166                         sdsync = POWER6_MMCRA_SDSYNC;
167                 else
168                         sdsync = MMCRA_SDSYNC;
169 
170                 sdar_valid = mmcra & sdsync;
171         }
172 
173         if (!(mmcra & MMCRA_SAMPLE_ENABLE) || sdar_valid)
174                 *addrp = mfspr(SPRN_SDAR);
175 }
176 
177 static bool regs_sihv(struct pt_regs *regs)
178 {
179         unsigned long sihv = MMCRA_SIHV;
180 
181         if (ppmu->flags & PPMU_HAS_SIER)
182                 return !!(regs->dar & SIER_SIHV);
183 
184         if (ppmu->flags & PPMU_ALT_SIPR)
185                 sihv = POWER6_MMCRA_SIHV;
186 
187         return !!(regs->dsisr & sihv);
188 }
189 
190 static bool regs_sipr(struct pt_regs *regs)
191 {
192         unsigned long sipr = MMCRA_SIPR;
193 
194         if (ppmu->flags & PPMU_HAS_SIER)
195                 return !!(regs->dar & SIER_SIPR);
196 
197         if (ppmu->flags & PPMU_ALT_SIPR)
198                 sipr = POWER6_MMCRA_SIPR;
199 
200         return !!(regs->dsisr & sipr);
201 }
202 
203 static inline u32 perf_flags_from_msr(struct pt_regs *regs)
204 {
205         if (regs->msr & MSR_PR)
206                 return PERF_RECORD_MISC_USER;
207         if ((regs->msr & MSR_HV) && freeze_events_kernel != MMCR0_FCHV)
208                 return PERF_RECORD_MISC_HYPERVISOR;
209         return PERF_RECORD_MISC_KERNEL;
210 }
211 
212 static inline u32 perf_get_misc_flags(struct pt_regs *regs)
213 {
214         bool use_siar = regs_use_siar(regs);
215 
216         if (!use_siar)
217                 return perf_flags_from_msr(regs);
218 
219         /*
220          * If we don't have flags in MMCRA, rather than using
221          * the MSR, we intuit the flags from the address in
222          * SIAR which should give slightly more reliable
223          * results
224          */
225         if (ppmu->flags & PPMU_NO_SIPR) {
226                 unsigned long siar = mfspr(SPRN_SIAR);
227                 if (siar >= PAGE_OFFSET)
228                         return PERF_RECORD_MISC_KERNEL;
229                 return PERF_RECORD_MISC_USER;
230         }
231 
232         /* PR has priority over HV, so order below is important */
233         if (regs_sipr(regs))
234                 return PERF_RECORD_MISC_USER;
235 
236         if (regs_sihv(regs) && (freeze_events_kernel != MMCR0_FCHV))
237                 return PERF_RECORD_MISC_HYPERVISOR;
238 
239         return PERF_RECORD_MISC_KERNEL;
240 }
241 
242 /*
243  * Overload regs->dsisr to store MMCRA so we only need to read it once
244  * on each interrupt.
245  * Overload regs->dar to store SIER if we have it.
246  * Overload regs->result to specify whether we should use the MSR (result
247  * is zero) or the SIAR (result is non zero).
248  */
249 static inline void perf_read_regs(struct pt_regs *regs)
250 {
251         unsigned long mmcra = mfspr(SPRN_MMCRA);
252         int marked = mmcra & MMCRA_SAMPLE_ENABLE;
253         int use_siar;
254 
255         regs->dsisr = mmcra;
256 
257         if (ppmu->flags & PPMU_HAS_SIER)
258                 regs->dar = mfspr(SPRN_SIER);
259 
260         /*
261          * If this isn't a PMU exception (eg a software event) the SIAR is
262          * not valid. Use pt_regs.
263          *
264          * If it is a marked event use the SIAR.
265          *
266          * If the PMU doesn't update the SIAR for non marked events use
267          * pt_regs.
268          *
269          * If the PMU has HV/PR flags then check to see if they
270          * place the exception in userspace. If so, use pt_regs. In
271          * continuous sampling mode the SIAR and the PMU exception are
272          * not synchronised, so they may be many instructions apart.
273          * This can result in confusing backtraces. We still want
274          * hypervisor samples as well as samples in the kernel with
275          * interrupts off hence the userspace check.
276          */
277         if (TRAP(regs) != 0xf00)
278                 use_siar = 0;
279         else if (marked)
280                 use_siar = 1;
281         else if ((ppmu->flags & PPMU_NO_CONT_SAMPLING))
282                 use_siar = 0;
283         else if (!(ppmu->flags & PPMU_NO_SIPR) && regs_sipr(regs))
284                 use_siar = 0;
285         else
286                 use_siar = 1;
287 
288         regs->result = use_siar;
289 }
290 
291 /*
292  * If interrupts were soft-disabled when a PMU interrupt occurs, treat
293  * it as an NMI.
294  */
295 static inline int perf_intr_is_nmi(struct pt_regs *regs)
296 {
297         return !regs->softe;
298 }
299 
300 /*
301  * On processors like P7+ that have the SIAR-Valid bit, marked instructions
302  * must be sampled only if the SIAR-valid bit is set.
303  *
304  * For unmarked instructions and for processors that don't have the SIAR-Valid
305  * bit, assume that SIAR is valid.
306  */
307 static inline int siar_valid(struct pt_regs *regs)
308 {
309         unsigned long mmcra = regs->dsisr;
310         int marked = mmcra & MMCRA_SAMPLE_ENABLE;
311 
312         if (marked) {
313                 if (ppmu->flags & PPMU_HAS_SIER)
314                         return regs->dar & SIER_SIAR_VALID;
315 
316                 if (ppmu->flags & PPMU_SIAR_VALID)
317                         return mmcra & POWER7P_MMCRA_SIAR_VALID;
318         }
319 
320         return 1;
321 }
322 
323 
324 /* Reset all possible BHRB entries */
325 static void power_pmu_bhrb_reset(void)
326 {
327         asm volatile(PPC_CLRBHRB);
328 }
329 
330 static void power_pmu_bhrb_enable(struct perf_event *event)
331 {
332         struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
333 
334         if (!ppmu->bhrb_nr)
335                 return;
336 
337         /* Clear BHRB if we changed task context to avoid data leaks */
338         if (event->ctx->task && cpuhw->bhrb_context != event->ctx) {
339                 power_pmu_bhrb_reset();
340                 cpuhw->bhrb_context = event->ctx;
341         }
342         cpuhw->bhrb_users++;
343 }
344 
345 static void power_pmu_bhrb_disable(struct perf_event *event)
346 {
347         struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
348 
349         if (!ppmu->bhrb_nr)
350                 return;
351 
352         cpuhw->bhrb_users--;
353         WARN_ON_ONCE(cpuhw->bhrb_users < 0);
354 
355         if (!cpuhw->disabled && !cpuhw->bhrb_users) {
356                 /* BHRB cannot be turned off when other
357                  * events are active on the PMU.
358                  */
359 
360                 /* avoid stale pointer */
361                 cpuhw->bhrb_context = NULL;
362         }
363 }
364 
365 /* Called from ctxsw to prevent one process's branch entries from
366  * mingling with the other process's entries during context switch.
367  */
368 void power_pmu_flush_branch_stack(void)
369 {
370         if (ppmu->bhrb_nr)
371                 power_pmu_bhrb_reset();
372 }
373 /* Calculate the to address for a branch */
374 static __u64 power_pmu_bhrb_to(u64 addr)
375 {
376         unsigned int instr;
377         int ret;
378         __u64 target;
379 
380         if (is_kernel_addr(addr))
381                 return branch_target((unsigned int *)addr);
382 
383         /* Userspace: need to copy the instruction here, then translate it */
384         pagefault_disable();
385         ret = __get_user_inatomic(instr, (unsigned int __user *)addr);
386         if (ret) {
387                 pagefault_enable();
388                 return 0;
389         }
390         pagefault_enable();
391 
392         target = branch_target(&instr);
393         if ((!target) || (instr & BRANCH_ABSOLUTE))
394                 return target;
395 
396         /* Translate relative branch target from kernel to user address */
397         return target - (unsigned long)&instr + addr;
398 }
399 
400 /* Processing BHRB entries */
401 void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw)
402 {
403         u64 val;
404         u64 addr;
405         int r_index, u_index, pred;
406 
407         r_index = 0;
408         u_index = 0;
409         while (r_index < ppmu->bhrb_nr) {
410                 /* Assembly read function */
411                 val = read_bhrb(r_index++);
412                 if (!val)
413                         /* Terminal marker: End of valid BHRB entries */
414                         break;
415                 else {
416                         addr = val & BHRB_EA;
417                         pred = val & BHRB_PREDICTION;
418 
419                         if (!addr)
420                                 /* invalid entry */
421                                 continue;
422 
423                         /* Branches are read most recent first (ie. mfbhrb 0 is
424                          * the most recent branch).
425                          * There are two types of valid entries:
426                          * 1) a target entry which is the to address of a
427                          *    computed goto like a blr,bctr,btar.  The next
428                          *    entry read from the bhrb will be the branch
429                          *    corresponding to this target (ie. the actual
430                          *    blr/bctr/btar instruction).
431                          * 2) a from address which is an actual branch.  If a
432                          *    target entry precedes this, then this is the
433                          *    matching branch for that target.  If this is not
434                          *    following a target entry, then this is a branch
435                          *    where the target is given as an immediate field
436                          *    in the instruction (ie. an i or b form branch).
437                          *    In this case we need to read the instruction from
438                          *    memory to determine the target/to address.
439                          */
440 
441                         if (val & BHRB_TARGET) {
442                                 /* Target branches use two entries
443                                  * (ie. computed gotos/XL form)
444                                  */
445                                 cpuhw->bhrb_entries[u_index].to = addr;
446                                 cpuhw->bhrb_entries[u_index].mispred = pred;
447                                 cpuhw->bhrb_entries[u_index].predicted = ~pred;
448 
449                                 /* Get from address in next entry */
450                                 val = read_bhrb(r_index++);
451                                 addr = val & BHRB_EA;
452                                 if (val & BHRB_TARGET) {
453                                         /* Shouldn't have two targets in a
454                                            row; reset index and try again */
455                                         r_index--;
456                                         addr = 0;
457                                 }
458                                 cpuhw->bhrb_entries[u_index].from = addr;
459                         } else {
460                                 /* Branches to immediate field 
461                                    (ie I or B form) */
462                                 cpuhw->bhrb_entries[u_index].from = addr;
463                                 cpuhw->bhrb_entries[u_index].to =
464                                         power_pmu_bhrb_to(addr);
465                                 cpuhw->bhrb_entries[u_index].mispred = pred;
466                                 cpuhw->bhrb_entries[u_index].predicted = ~pred;
467                         }
468                         u_index++;
469 
470                 }
471         }
472         cpuhw->bhrb_stack.nr = u_index;
473         return;
474 }
475 
476 #endif /* CONFIG_PPC64 */
477 
478 static void perf_event_interrupt(struct pt_regs *regs);
479 
480 void perf_event_print_debug(void)
481 {
482 }
483 
484 /*
485  * Read one performance monitor counter (PMC).
486  */
487 static unsigned long read_pmc(int idx)
488 {
489         unsigned long val;
490 
491         switch (idx) {
492         case 1:
493                 val = mfspr(SPRN_PMC1);
494                 break;
495         case 2:
496                 val = mfspr(SPRN_PMC2);
497                 break;
498         case 3:
499                 val = mfspr(SPRN_PMC3);
500                 break;
501         case 4:
502                 val = mfspr(SPRN_PMC4);
503                 break;
504         case 5:
505                 val = mfspr(SPRN_PMC5);
506                 break;
507         case 6:
508                 val = mfspr(SPRN_PMC6);
509                 break;
510 #ifdef CONFIG_PPC64
511         case 7:
512                 val = mfspr(SPRN_PMC7);
513                 break;
514         case 8:
515                 val = mfspr(SPRN_PMC8);
516                 break;
517 #endif /* CONFIG_PPC64 */
518         default:
519                 printk(KERN_ERR "oops trying to read PMC%d\n", idx);
520                 val = 0;
521         }
522         return val;
523 }
524 
525 /*
526  * Write one PMC.
527  */
528 static void write_pmc(int idx, unsigned long val)
529 {
530         switch (idx) {
531         case 1:
532                 mtspr(SPRN_PMC1, val);
533                 break;
534         case 2:
535                 mtspr(SPRN_PMC2, val);
536                 break;
537         case 3:
538                 mtspr(SPRN_PMC3, val);
539                 break;
540         case 4:
541                 mtspr(SPRN_PMC4, val);
542                 break;
543         case 5:
544                 mtspr(SPRN_PMC5, val);
545                 break;
546         case 6:
547                 mtspr(SPRN_PMC6, val);
548                 break;
549 #ifdef CONFIG_PPC64
550         case 7:
551                 mtspr(SPRN_PMC7, val);
552                 break;
553         case 8:
554                 mtspr(SPRN_PMC8, val);
555                 break;
556 #endif /* CONFIG_PPC64 */
557         default:
558                 printk(KERN_ERR "oops trying to write PMC%d\n", idx);
559         }
560 }
561 
562 /*
563  * Check if a set of events can all go on the PMU at once.
564  * If they can't, this will look at alternative codes for the events
565  * and see if any combination of alternative codes is feasible.
566  * The feasible set is returned in event_id[].
567  */
568 static int power_check_constraints(struct cpu_hw_events *cpuhw,
569                                    u64 event_id[], unsigned int cflags[],
570                                    int n_ev)
571 {
572         unsigned long mask, value, nv;
573         unsigned long smasks[MAX_HWEVENTS], svalues[MAX_HWEVENTS];
574         int n_alt[MAX_HWEVENTS], choice[MAX_HWEVENTS];
575         int i, j;
576         unsigned long addf = ppmu->add_fields;
577         unsigned long tadd = ppmu->test_adder;
578 
579         if (n_ev > ppmu->n_counter)
580                 return -1;
581 
582         /* First see if the events will go on as-is */
583         for (i = 0; i < n_ev; ++i) {
584                 if ((cflags[i] & PPMU_LIMITED_PMC_REQD)
585                     && !ppmu->limited_pmc_event(event_id[i])) {
586                         ppmu->get_alternatives(event_id[i], cflags[i],
587                                                cpuhw->alternatives[i]);
588                         event_id[i] = cpuhw->alternatives[i][0];
589                 }
590                 if (ppmu->get_constraint(event_id[i], &cpuhw->amasks[i][0],
591                                          &cpuhw->avalues[i][0]))
592                         return -1;
593         }
594         value = mask = 0;
595         for (i = 0; i < n_ev; ++i) {
596                 nv = (value | cpuhw->avalues[i][0]) +
597                         (value & cpuhw->avalues[i][0] & addf);
598                 if ((((nv + tadd) ^ value) & mask) != 0 ||
599                     (((nv + tadd) ^ cpuhw->avalues[i][0]) &
600                      cpuhw->amasks[i][0]) != 0)
601                         break;
602                 value = nv;
603                 mask |= cpuhw->amasks[i][0];
604         }
605         if (i == n_ev)
606                 return 0;       /* all OK */
607 
608         /* doesn't work, gather alternatives... */
609         if (!ppmu->get_alternatives)
610                 return -1;
611         for (i = 0; i < n_ev; ++i) {
612                 choice[i] = 0;
613                 n_alt[i] = ppmu->get_alternatives(event_id[i], cflags[i],
614                                                   cpuhw->alternatives[i]);
615                 for (j = 1; j < n_alt[i]; ++j)
616                         ppmu->get_constraint(cpuhw->alternatives[i][j],
617                                              &cpuhw->amasks[i][j],
618                                              &cpuhw->avalues[i][j]);
619         }
620 
621         /* enumerate all possibilities and see if any will work */
622         i = 0;
623         j = -1;
624         value = mask = nv = 0;
625         while (i < n_ev) {
626                 if (j >= 0) {
627                         /* we're backtracking, restore context */
628                         value = svalues[i];
629                         mask = smasks[i];
630                         j = choice[i];
631                 }
632                 /*
633                  * See if any alternative k for event_id i,
634                  * where k > j, will satisfy the constraints.
635                  */
636                 while (++j < n_alt[i]) {
637                         nv = (value | cpuhw->avalues[i][j]) +
638                                 (value & cpuhw->avalues[i][j] & addf);
639                         if ((((nv + tadd) ^ value) & mask) == 0 &&
640                             (((nv + tadd) ^ cpuhw->avalues[i][j])
641                              & cpuhw->amasks[i][j]) == 0)
642                                 break;
643                 }
644                 if (j >= n_alt[i]) {
645                         /*
646                          * No feasible alternative, backtrack
647                          * to event_id i-1 and continue enumerating its
648                          * alternatives from where we got up to.
649                          */
650                         if (--i < 0)
651                                 return -1;
652                 } else {
653                         /*
654                          * Found a feasible alternative for event_id i,
655                          * remember where we got up to with this event_id,
656                          * go on to the next event_id, and start with
657                          * the first alternative for it.
658                          */
659                         choice[i] = j;
660                         svalues[i] = value;
661                         smasks[i] = mask;
662                         value = nv;
663                         mask |= cpuhw->amasks[i][j];
664                         ++i;
665                         j = -1;
666                 }
667         }
668 
669         /* OK, we have a feasible combination, tell the caller the solution */
670         for (i = 0; i < n_ev; ++i)
671                 event_id[i] = cpuhw->alternatives[i][choice[i]];
672         return 0;
673 }
674 
675 /*
676  * Check if newly-added events have consistent settings for
677  * exclude_{user,kernel,hv} with each other and any previously
678  * added events.
679  */
680 static int check_excludes(struct perf_event **ctrs, unsigned int cflags[],
681                           int n_prev, int n_new)
682 {
683         int eu = 0, ek = 0, eh = 0;
684         int i, n, first;
685         struct perf_event *event;
686 
687         n = n_prev + n_new;
688         if (n <= 1)
689                 return 0;
690 
691         first = 1;
692         for (i = 0; i < n; ++i) {
693                 if (cflags[i] & PPMU_LIMITED_PMC_OK) {
694                         cflags[i] &= ~PPMU_LIMITED_PMC_REQD;
695                         continue;
696                 }
697                 event = ctrs[i];
698                 if (first) {
699                         eu = event->attr.exclude_user;
700                         ek = event->attr.exclude_kernel;
701                         eh = event->attr.exclude_hv;
702                         first = 0;
703                 } else if (event->attr.exclude_user != eu ||
704                            event->attr.exclude_kernel != ek ||
705                            event->attr.exclude_hv != eh) {
706                         return -EAGAIN;
707                 }
708         }
709 
710         if (eu || ek || eh)
711                 for (i = 0; i < n; ++i)
712                         if (cflags[i] & PPMU_LIMITED_PMC_OK)
713                                 cflags[i] |= PPMU_LIMITED_PMC_REQD;
714 
715         return 0;
716 }
717 
718 static u64 check_and_compute_delta(u64 prev, u64 val)
719 {
720         u64 delta = (val - prev) & 0xfffffffful;
721 
722         /*
723          * POWER7 can roll back counter values, if the new value is smaller
724          * than the previous value it will cause the delta and the counter to
725          * have bogus values unless we rolled a counter over.  If a counter is
726          * rolled back, it will be smaller, but within 256, which is the maximum
727          * number of events to roll back at once.  If we detect a rollback
728          * return 0.  This can lead to a small lack of precision in the
729          * counters.
730          */
731         if (prev > val && (prev - val) < 256)
732                 delta = 0;
733 
734         return delta;
735 }
736 
737 static void power_pmu_read(struct perf_event *event)
738 {
739         s64 val, delta, prev;
740 
741         if (event->hw.state & PERF_HES_STOPPED)
742                 return;
743 
744         if (!event->hw.idx)
745                 return;
746         /*
747          * Performance monitor interrupts come even when interrupts
748          * are soft-disabled, as long as interrupts are hard-enabled.
749          * Therefore we treat them like NMIs.
750          */
751         do {
752                 prev = local64_read(&event->hw.prev_count);
753                 barrier();
754                 val = read_pmc(event->hw.idx);
755                 delta = check_and_compute_delta(prev, val);
756                 if (!delta)
757                         return;
758         } while (local64_cmpxchg(&event->hw.prev_count, prev, val) != prev);
759 
760         local64_add(delta, &event->count);
761 
762         /*
763          * A number of places program the PMC with (0x80000000 - period_left).
764          * We never want period_left to be less than 1 because we will program
765          * the PMC with a value >= 0x80000000 and an edge detected PMC will
766          * roll around to 0 before taking an exception. We have seen this
767          * on POWER8.
768          *
769          * To fix this, clamp the minimum value of period_left to 1.
770          */
771         do {
772                 prev = local64_read(&event->hw.period_left);
773                 val = prev - delta;
774                 if (val < 1)
775                         val = 1;
776         } while (local64_cmpxchg(&event->hw.period_left, prev, val) != prev);
777 }
778 
779 /*
780  * On some machines, PMC5 and PMC6 can't be written, don't respect
781  * the freeze conditions, and don't generate interrupts.  This tells
782  * us if `event' is using such a PMC.
783  */
784 static int is_limited_pmc(int pmcnum)
785 {
786         return (ppmu->flags & PPMU_LIMITED_PMC5_6)
787                 && (pmcnum == 5 || pmcnum == 6);
788 }
789 
790 static void freeze_limited_counters(struct cpu_hw_events *cpuhw,
791                                     unsigned long pmc5, unsigned long pmc6)
792 {
793         struct perf_event *event;
794         u64 val, prev, delta;
795         int i;
796 
797         for (i = 0; i < cpuhw->n_limited; ++i) {
798                 event = cpuhw->limited_counter[i];
799                 if (!event->hw.idx)
800                         continue;
801                 val = (event->hw.idx == 5) ? pmc5 : pmc6;
802                 prev = local64_read(&event->hw.prev_count);
803                 event->hw.idx = 0;
804                 delta = check_and_compute_delta(prev, val);
805                 if (delta)
806                         local64_add(delta, &event->count);
807         }
808 }
809 
810 static void thaw_limited_counters(struct cpu_hw_events *cpuhw,
811                                   unsigned long pmc5, unsigned long pmc6)
812 {
813         struct perf_event *event;
814         u64 val, prev;
815         int i;
816 
817         for (i = 0; i < cpuhw->n_limited; ++i) {
818                 event = cpuhw->limited_counter[i];
819                 event->hw.idx = cpuhw->limited_hwidx[i];
820                 val = (event->hw.idx == 5) ? pmc5 : pmc6;
821                 prev = local64_read(&event->hw.prev_count);
822                 if (check_and_compute_delta(prev, val))
823                         local64_set(&event->hw.prev_count, val);
824                 perf_event_update_userpage(event);
825         }
826 }
827 
828 /*
829  * Since limited events don't respect the freeze conditions, we
830  * have to read them immediately after freezing or unfreezing the
831  * other events.  We try to keep the values from the limited
832  * events as consistent as possible by keeping the delay (in
833  * cycles and instructions) between freezing/unfreezing and reading
834  * the limited events as small and consistent as possible.
835  * Therefore, if any limited events are in use, we read them
836  * both, and always in the same order, to minimize variability,
837  * and do it inside the same asm that writes MMCR0.
838  */
839 static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0)
840 {
841         unsigned long pmc5, pmc6;
842 
843         if (!cpuhw->n_limited) {
844                 mtspr(SPRN_MMCR0, mmcr0);
845                 return;
846         }
847 
848         /*
849          * Write MMCR0, then read PMC5 and PMC6 immediately.
850          * To ensure we don't get a performance monitor interrupt
851          * between writing MMCR0 and freezing/thawing the limited
852          * events, we first write MMCR0 with the event overflow
853          * interrupt enable bits turned off.
854          */
855         asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
856                      : "=&r" (pmc5), "=&r" (pmc6)
857                      : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)),
858                        "i" (SPRN_MMCR0),
859                        "i" (SPRN_PMC5), "i" (SPRN_PMC6));
860 
861         if (mmcr0 & MMCR0_FC)
862                 freeze_limited_counters(cpuhw, pmc5, pmc6);
863         else
864                 thaw_limited_counters(cpuhw, pmc5, pmc6);
865 
866         /*
867          * Write the full MMCR0 including the event overflow interrupt
868          * enable bits, if necessary.
869          */
870         if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE))
871                 mtspr(SPRN_MMCR0, mmcr0);
872 }
873 
874 /*
875  * Disable all events to prevent PMU interrupts and to allow
876  * events to be added or removed.
877  */
878 static void power_pmu_disable(struct pmu *pmu)
879 {
880         struct cpu_hw_events *cpuhw;
881         unsigned long flags, val;
882 
883         if (!ppmu)
884                 return;
885         local_irq_save(flags);
886         cpuhw = &__get_cpu_var(cpu_hw_events);
887 
888         if (!cpuhw->disabled) {
889                 /*
890                  * Check if we ever enabled the PMU on this cpu.
891                  */
892                 if (!cpuhw->pmcs_enabled) {
893                         ppc_enable_pmcs();
894                         cpuhw->pmcs_enabled = 1;
895                 }
896 
897                 /*
898                  * Set the 'freeze counters' bit, clear PMAO/FC56.
899                  */
900                 val  = mfspr(SPRN_MMCR0);
901                 val |= MMCR0_FC;
902                 val &= ~(MMCR0_PMAO | MMCR0_FC56);
903 
904                 /*
905                  * The barrier is to make sure the mtspr has been
906                  * executed and the PMU has frozen the events etc.
907                  * before we return.
908                  */
909                 write_mmcr0(cpuhw, val);
910                 mb();
911 
912                 /*
913                  * Disable instruction sampling if it was enabled
914                  */
915                 if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
916                         mtspr(SPRN_MMCRA,
917                               cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
918                         mb();
919                 }
920 
921                 cpuhw->disabled = 1;
922                 cpuhw->n_added = 0;
923         }
924         local_irq_restore(flags);
925 }
926 
927 /*
928  * Re-enable all events if disable == 0.
929  * If we were previously disabled and events were added, then
930  * put the new config on the PMU.
931  */
932 static void power_pmu_enable(struct pmu *pmu)
933 {
934         struct perf_event *event;
935         struct cpu_hw_events *cpuhw;
936         unsigned long flags;
937         long i;
938         unsigned long val;
939         s64 left;
940         unsigned int hwc_index[MAX_HWEVENTS];
941         int n_lim;
942         int idx;
943 
944         if (!ppmu)
945                 return;
946 
947         local_irq_save(flags);
948 
949         cpuhw = &__get_cpu_var(cpu_hw_events);
950         if (!cpuhw->disabled)
951                 goto out;
952 
953         if (cpuhw->n_events == 0) {
954                 ppc_set_pmu_inuse(0);
955                 goto out;
956         }
957 
958         cpuhw->disabled = 0;
959 
960         /*
961          * If we didn't change anything, or only removed events,
962          * no need to recalculate MMCR* settings and reset the PMCs.
963          * Just reenable the PMU with the current MMCR* settings
964          * (possibly updated for removal of events).
965          */
966         if (!cpuhw->n_added) {
967                 mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
968                 mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
969                 goto out_enable;
970         }
971 
972         /*
973          * Compute MMCR* values for the new set of events
974          */
975         if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_events, hwc_index,
976                                cpuhw->mmcr)) {
977                 /* shouldn't ever get here */
978                 printk(KERN_ERR "oops compute_mmcr failed\n");
979                 goto out;
980         }
981 
982         /*
983          * Add in MMCR0 freeze bits corresponding to the
984          * attr.exclude_* bits for the first event.
985          * We have already checked that all events have the
986          * same values for these bits as the first event.
987          */
988         event = cpuhw->event[0];
989         if (event->attr.exclude_user)
990                 cpuhw->mmcr[0] |= MMCR0_FCP;
991         if (event->attr.exclude_kernel)
992                 cpuhw->mmcr[0] |= freeze_events_kernel;
993         if (event->attr.exclude_hv)
994                 cpuhw->mmcr[0] |= MMCR0_FCHV;
995 
996         /*
997          * Write the new configuration to MMCR* with the freeze
998          * bit set and set the hardware events to their initial values.
999          * Then unfreeze the events.
1000          */
1001         ppc_set_pmu_inuse(1);
1002         mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
1003         mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
1004         mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
1005                                 | MMCR0_FC);
1006 
1007         /*
1008          * Read off any pre-existing events that need to move
1009          * to another PMC.
1010          */
1011         for (i = 0; i < cpuhw->n_events; ++i) {
1012                 event = cpuhw->event[i];
1013                 if (event->hw.idx && event->hw.idx != hwc_index[i] + 1) {
1014                         power_pmu_read(event);
1015                         write_pmc(event->hw.idx, 0);
1016                         event->hw.idx = 0;
1017                 }
1018         }
1019 
1020         /*
1021          * Initialize the PMCs for all the new and moved events.
1022          */
1023         cpuhw->n_limited = n_lim = 0;
1024         for (i = 0; i < cpuhw->n_events; ++i) {
1025                 event = cpuhw->event[i];
1026                 if (event->hw.idx)
1027                         continue;
1028                 idx = hwc_index[i] + 1;
1029                 if (is_limited_pmc(idx)) {
1030                         cpuhw->limited_counter[n_lim] = event;
1031                         cpuhw->limited_hwidx[n_lim] = idx;
1032                         ++n_lim;
1033                         continue;
1034                 }
1035                 val = 0;
1036                 if (event->hw.sample_period) {
1037                         left = local64_read(&event->hw.period_left);
1038                         if (left < 0x80000000L)
1039                                 val = 0x80000000L - left;
1040                 }
1041                 local64_set(&event->hw.prev_count, val);
1042                 event->hw.idx = idx;
1043                 if (event->hw.state & PERF_HES_STOPPED)
1044                         val = 0;
1045                 write_pmc(idx, val);
1046                 perf_event_update_userpage(event);
1047         }
1048         cpuhw->n_limited = n_lim;
1049         cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
1050 
1051  out_enable:
1052         mb();
1053         write_mmcr0(cpuhw, cpuhw->mmcr[0]);
1054 
1055         /*
1056          * Enable instruction sampling if necessary
1057          */
1058         if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
1059                 mb();
1060                 mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
1061         }
1062 
1063  out:
1064         if (cpuhw->bhrb_users)
1065                 ppmu->config_bhrb(cpuhw->bhrb_filter);
1066 
1067         local_irq_restore(flags);
1068 }
1069 
1070 static int collect_events(struct perf_event *group, int max_count,
1071                           struct perf_event *ctrs[], u64 *events,
1072                           unsigned int *flags)
1073 {
1074         int n = 0;
1075         struct perf_event *event;
1076 
1077         if (!is_software_event(group)) {
1078                 if (n >= max_count)
1079                         return -1;
1080                 ctrs[n] = group;
1081                 flags[n] = group->hw.event_base;
1082                 events[n++] = group->hw.config;
1083         }
1084         list_for_each_entry(event, &group->sibling_list, group_entry) {
1085                 if (!is_software_event(event) &&
1086                     event->state != PERF_EVENT_STATE_OFF) {
1087                         if (n >= max_count)
1088                                 return -1;
1089                         ctrs[n] = event;
1090                         flags[n] = event->hw.event_base;
1091                         events[n++] = event->hw.config;
1092                 }
1093         }
1094         return n;
1095 }
1096 
1097 /*
1098  * Add an event to the PMU.
1099  * If all events are not already frozen, then we disable and
1100  * re-enable the PMU in order to get hw_perf_enable to do the
1101  * actual work of reconfiguring the PMU.
1102  */
1103 static int power_pmu_add(struct perf_event *event, int ef_flags)
1104 {
1105         struct cpu_hw_events *cpuhw;
1106         unsigned long flags;
1107         int n0;
1108         int ret = -EAGAIN;
1109 
1110         local_irq_save(flags);
1111         perf_pmu_disable(event->pmu);
1112 
1113         /*
1114          * Add the event to the list (if there is room)
1115          * and check whether the total set is still feasible.
1116          */
1117         cpuhw = &__get_cpu_var(cpu_hw_events);
1118         n0 = cpuhw->n_events;
1119         if (n0 >= ppmu->n_counter)
1120                 goto out;
1121         cpuhw->event[n0] = event;
1122         cpuhw->events[n0] = event->hw.config;
1123         cpuhw->flags[n0] = event->hw.event_base;
1124 
1125         /*
1126          * This event may have been disabled/stopped in record_and_restart()
1127          * because we exceeded the ->event_limit. If re-starting the event,
1128          * clear the ->hw.state (STOPPED and UPTODATE flags), so the user
1129          * notification is re-enabled.
1130          */
1131         if (!(ef_flags & PERF_EF_START))
1132                 event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
1133         else
1134                 event->hw.state = 0;
1135 
1136         /*
1137          * If group events scheduling transaction was started,
1138          * skip the schedulability test here, it will be performed
1139          * at commit time(->commit_txn) as a whole
1140          */
1141         if (cpuhw->group_flag & PERF_EVENT_TXN)
1142                 goto nocheck;
1143 
1144         if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1))
1145                 goto out;
1146         if (power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n0 + 1))
1147                 goto out;
1148         event->hw.config = cpuhw->events[n0];
1149 
1150 nocheck:
1151         ++cpuhw->n_events;
1152         ++cpuhw->n_added;
1153 
1154         ret = 0;
1155  out:
1156         if (has_branch_stack(event))
1157                 power_pmu_bhrb_enable(event);
1158 
1159         perf_pmu_enable(event->pmu);
1160         local_irq_restore(flags);
1161         return ret;
1162 }
1163 
1164 /*
1165  * Remove an event from the PMU.
1166  */
1167 static void power_pmu_del(struct perf_event *event, int ef_flags)
1168 {
1169         struct cpu_hw_events *cpuhw;
1170         long i;
1171         unsigned long flags;
1172 
1173         local_irq_save(flags);
1174         perf_pmu_disable(event->pmu);
1175 
1176         power_pmu_read(event);
1177 
1178         cpuhw = &__get_cpu_var(cpu_hw_events);
1179         for (i = 0; i < cpuhw->n_events; ++i) {
1180                 if (event == cpuhw->event[i]) {
1181                         while (++i < cpuhw->n_events) {
1182                                 cpuhw->event[i-1] = cpuhw->event[i];
1183                                 cpuhw->events[i-1] = cpuhw->events[i];
1184                                 cpuhw->flags[i-1] = cpuhw->flags[i];
1185                         }
1186                         --cpuhw->n_events;
1187                         ppmu->disable_pmc(event->hw.idx - 1, cpuhw->mmcr);
1188                         if (event->hw.idx) {
1189                                 write_pmc(event->hw.idx, 0);
1190                                 event->hw.idx = 0;
1191                         }
1192                         perf_event_update_userpage(event);
1193                         break;
1194                 }
1195         }
1196         for (i = 0; i < cpuhw->n_limited; ++i)
1197                 if (event == cpuhw->limited_counter[i])
1198                         break;
1199         if (i < cpuhw->n_limited) {
1200                 while (++i < cpuhw->n_limited) {
1201                         cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i];
1202                         cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i];
1203                 }
1204                 --cpuhw->n_limited;
1205         }
1206         if (cpuhw->n_events == 0) {
1207                 /* disable exceptions if no events are running */
1208                 cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
1209         }
1210 
1211         if (has_branch_stack(event))
1212                 power_pmu_bhrb_disable(event);
1213 
1214         perf_pmu_enable(event->pmu);
1215         local_irq_restore(flags);
1216 }
1217 
1218 /*
1219  * POWER-PMU does not support disabling individual counters, hence
1220  * program their cycle counter to their max value and ignore the interrupts.
1221  */
1222 
1223 static void power_pmu_start(struct perf_event *event, int ef_flags)
1224 {
1225         unsigned long flags;
1226         s64 left;
1227         unsigned long val;
1228 
1229         if (!event->hw.idx || !event->hw.sample_period)
1230                 return;
1231 
1232         if (!(event->hw.state & PERF_HES_STOPPED))
1233                 return;
1234 
1235         if (ef_flags & PERF_EF_RELOAD)
1236                 WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
1237 
1238         local_irq_save(flags);
1239         perf_pmu_disable(event->pmu);
1240 
1241         event->hw.state = 0;
1242         left = local64_read(&event->hw.period_left);
1243 
1244         val = 0;
1245         if (left < 0x80000000L)
1246                 val = 0x80000000L - left;
1247 
1248         write_pmc(event->hw.idx, val);
1249 
1250         perf_event_update_userpage(event);
1251         perf_pmu_enable(event->pmu);
1252         local_irq_restore(flags);
1253 }
1254 
1255 static void power_pmu_stop(struct perf_event *event, int ef_flags)
1256 {
1257         unsigned long flags;
1258 
1259         if (!event->hw.idx || !event->hw.sample_period)
1260                 return;
1261 
1262         if (event->hw.state & PERF_HES_STOPPED)
1263                 return;
1264 
1265         local_irq_save(flags);
1266         perf_pmu_disable(event->pmu);
1267 
1268         power_pmu_read(event);
1269         event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
1270         write_pmc(event->hw.idx, 0);
1271 
1272         perf_event_update_userpage(event);
1273         perf_pmu_enable(event->pmu);
1274         local_irq_restore(flags);
1275 }
1276 
1277 /*
1278  * Start group events scheduling transaction
1279  * Set the flag to make pmu::enable() not perform the
1280  * schedulability test, it will be performed at commit time
1281  */
1282 void power_pmu_start_txn(struct pmu *pmu)
1283 {
1284         struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
1285 
1286         perf_pmu_disable(pmu);
1287         cpuhw->group_flag |= PERF_EVENT_TXN;
1288         cpuhw->n_txn_start = cpuhw->n_events;
1289 }
1290 
1291 /*
1292  * Stop group events scheduling transaction
1293  * Clear the flag and pmu::enable() will perform the
1294  * schedulability test.
1295  */
1296 void power_pmu_cancel_txn(struct pmu *pmu)
1297 {
1298         struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
1299 
1300         cpuhw->group_flag &= ~PERF_EVENT_TXN;
1301         perf_pmu_enable(pmu);
1302 }
1303 
1304 /*
1305  * Commit group events scheduling transaction
1306  * Perform the group schedulability test as a whole
1307  * Return 0 if success
1308  */
1309 int power_pmu_commit_txn(struct pmu *pmu)
1310 {
1311         struct cpu_hw_events *cpuhw;
1312         long i, n;
1313 
1314         if (!ppmu)
1315                 return -EAGAIN;
1316         cpuhw = &__get_cpu_var(cpu_hw_events);
1317         n = cpuhw->n_events;
1318         if (check_excludes(cpuhw->event, cpuhw->flags, 0, n))
1319                 return -EAGAIN;
1320         i = power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n);
1321         if (i < 0)
1322                 return -EAGAIN;
1323 
1324         for (i = cpuhw->n_txn_start; i < n; ++i)
1325                 cpuhw->event[i]->hw.config = cpuhw->events[i];
1326 
1327         cpuhw->group_flag &= ~PERF_EVENT_TXN;
1328         perf_pmu_enable(pmu);
1329         return 0;
1330 }
1331 
1332 /*
1333  * Return 1 if we might be able to put event on a limited PMC,
1334  * or 0 if not.
1335  * An event can only go on a limited PMC if it counts something
1336  * that a limited PMC can count, doesn't require interrupts, and
1337  * doesn't exclude any processor mode.
1338  */
1339 static int can_go_on_limited_pmc(struct perf_event *event, u64 ev,
1340                                  unsigned int flags)
1341 {
1342         int n;
1343         u64 alt[MAX_EVENT_ALTERNATIVES];
1344 
1345         if (event->attr.exclude_user
1346             || event->attr.exclude_kernel
1347             || event->attr.exclude_hv
1348             || event->attr.sample_period)
1349                 return 0;
1350 
1351         if (ppmu->limited_pmc_event(ev))
1352                 return 1;
1353 
1354         if (ppmu->flags & PPMU_ARCH_207S)
1355                 mtspr(SPRN_MMCR2, 0);
1356 
1357         /*
1358          * The requested event_id isn't on a limited PMC already;
1359          * see if any alternative code goes on a limited PMC.
1360          */
1361         if (!ppmu->get_alternatives)
1362                 return 0;
1363 
1364         flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD;
1365         n = ppmu->get_alternatives(ev, flags, alt);
1366 
1367         return n > 0;
1368 }
1369 
1370 /*
1371  * Find an alternative event_id that goes on a normal PMC, if possible,
1372  * and return the event_id code, or 0 if there is no such alternative.
1373  * (Note: event_id code 0 is "don't count" on all machines.)
1374  */
1375 static u64 normal_pmc_alternative(u64 ev, unsigned long flags)
1376 {
1377         u64 alt[MAX_EVENT_ALTERNATIVES];
1378         int n;
1379 
1380         flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
1381         n = ppmu->get_alternatives(ev, flags, alt);
1382         if (!n)
1383                 return 0;
1384         return alt[0];
1385 }
1386 
1387 /* Number of perf_events counting hardware events */
1388 static atomic_t num_events;
1389 /* Used to avoid races in calling reserve/release_pmc_hardware */
1390 static DEFINE_MUTEX(pmc_reserve_mutex);
1391 
1392 /*
1393  * Release the PMU if this is the last perf_event.
1394  */
1395 static void hw_perf_event_destroy(struct perf_event *event)
1396 {
1397         if (!atomic_add_unless(&num_events, -1, 1)) {
1398                 mutex_lock(&pmc_reserve_mutex);
1399                 if (atomic_dec_return(&num_events) == 0)
1400                         release_pmc_hardware();
1401                 mutex_unlock(&pmc_reserve_mutex);
1402         }
1403 }
1404 
1405 /*
1406  * Translate a generic cache event_id config to a raw event_id code.
1407  */
1408 static int hw_perf_cache_event(u64 config, u64 *eventp)
1409 {
1410         unsigned long type, op, result;
1411         int ev;
1412 
1413         if (!ppmu->cache_events)
1414                 return -EINVAL;
1415 
1416         /* unpack config */
1417         type = config & 0xff;
1418         op = (config >> 8) & 0xff;
1419         result = (config >> 16) & 0xff;
1420 
1421         if (type >= PERF_COUNT_HW_CACHE_MAX ||
1422             op >= PERF_COUNT_HW_CACHE_OP_MAX ||
1423             result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
1424                 return -EINVAL;
1425 
1426         ev = (*ppmu->cache_events)[type][op][result];
1427         if (ev == 0)
1428                 return -EOPNOTSUPP;
1429         if (ev == -1)
1430                 return -EINVAL;
1431         *eventp = ev;
1432         return 0;
1433 }
1434 
1435 static int power_pmu_event_init(struct perf_event *event)
1436 {
1437         u64 ev;
1438         unsigned long flags;
1439         struct perf_event *ctrs[MAX_HWEVENTS];
1440         u64 events[MAX_HWEVENTS];
1441         unsigned int cflags[MAX_HWEVENTS];
1442         int n;
1443         int err;
1444         struct cpu_hw_events *cpuhw;
1445 
1446         if (!ppmu)
1447                 return -ENOENT;
1448 
1449         if (has_branch_stack(event)) {
1450                 /* PMU has BHRB enabled */
1451                 if (!(ppmu->flags & PPMU_ARCH_207S))
1452                         return -EOPNOTSUPP;
1453         }
1454 
1455         switch (event->attr.type) {
1456         case PERF_TYPE_HARDWARE:
1457                 ev = event->attr.config;
1458                 if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
1459                         return -EOPNOTSUPP;
1460                 ev = ppmu->generic_events[ev];
1461                 break;
1462         case PERF_TYPE_HW_CACHE:
1463                 err = hw_perf_cache_event(event->attr.config, &ev);
1464                 if (err)
1465                         return err;
1466                 break;
1467         case PERF_TYPE_RAW:
1468                 ev = event->attr.config;
1469                 break;
1470         default:
1471                 return -ENOENT;
1472         }
1473 
1474         event->hw.config_base = ev;
1475         event->hw.idx = 0;
1476 
1477         /*
1478          * If we are not running on a hypervisor, force the
1479          * exclude_hv bit to 0 so that we don't care what
1480          * the user set it to.
1481          */
1482         if (!firmware_has_feature(FW_FEATURE_LPAR))
1483                 event->attr.exclude_hv = 0;
1484 
1485         /*
1486          * If this is a per-task event, then we can use
1487          * PM_RUN_* events interchangeably with their non RUN_*
1488          * equivalents, e.g. PM_RUN_CYC instead of PM_CYC.
1489          * XXX we should check if the task is an idle task.
1490          */
1491         flags = 0;
1492         if (event->attach_state & PERF_ATTACH_TASK)
1493                 flags |= PPMU_ONLY_COUNT_RUN;
1494 
1495         /*
1496          * If this machine has limited events, check whether this
1497          * event_id could go on a limited event.
1498          */
1499         if (ppmu->flags & PPMU_LIMITED_PMC5_6) {
1500                 if (can_go_on_limited_pmc(event, ev, flags)) {
1501                         flags |= PPMU_LIMITED_PMC_OK;
1502                 } else if (ppmu->limited_pmc_event(ev)) {
1503                         /*
1504                          * The requested event_id is on a limited PMC,
1505                          * but we can't use a limited PMC; see if any
1506                          * alternative goes on a normal PMC.
1507                          */
1508                         ev = normal_pmc_alternative(ev, flags);
1509                         if (!ev)
1510                                 return -EINVAL;
1511                 }
1512         }
1513 
1514         /*
1515          * If this is in a group, check if it can go on with all the
1516          * other hardware events in the group.  We assume the event
1517          * hasn't been linked into its leader's sibling list at this point.
1518          */
1519         n = 0;
1520         if (event->group_leader != event) {
1521                 n = collect_events(event->group_leader, ppmu->n_counter - 1,
1522                                    ctrs, events, cflags);
1523                 if (n < 0)
1524                         return -EINVAL;
1525         }
1526         events[n] = ev;
1527         ctrs[n] = event;
1528         cflags[n] = flags;
1529         if (check_excludes(ctrs, cflags, n, 1))
1530                 return -EINVAL;
1531 
1532         cpuhw = &get_cpu_var(cpu_hw_events);
1533         err = power_check_constraints(cpuhw, events, cflags, n + 1);
1534 
1535         if (has_branch_stack(event)) {
1536                 cpuhw->bhrb_filter = ppmu->bhrb_filter_map(
1537                                         event->attr.branch_sample_type);
1538 
1539                 if (cpuhw->bhrb_filter == -1) {
                             /* balance the get_cpu_var() taken above */
                             put_cpu_var(cpu_hw_events);
1540                         return -EOPNOTSUPP;
                     }
1541         }
1542 
1543         put_cpu_var(cpu_hw_events);
1544         if (err)
1545                 return -EINVAL;
1546 
1547         event->hw.config = events[n];
1548         event->hw.event_base = cflags[n];
1549         event->hw.last_period = event->hw.sample_period;
1550         local64_set(&event->hw.period_left, event->hw.last_period);
1551 
1552         /*
1553          * See if we need to reserve the PMU.
1554          * If no events are currently in use, then we have to take a
1555          * mutex to ensure that we don't race with another task doing
1556          * reserve_pmc_hardware or release_pmc_hardware.
1557          */
1558         err = 0;
1559         if (!atomic_inc_not_zero(&num_events)) {
1560                 mutex_lock(&pmc_reserve_mutex);
1561                 if (atomic_read(&num_events) == 0 &&
1562                     reserve_pmc_hardware(perf_event_interrupt))
1563                         err = -EBUSY;
1564                 else
1565                         atomic_inc(&num_events);
1566                 mutex_unlock(&pmc_reserve_mutex);
1567         }
1568         event->destroy = hw_perf_event_destroy;
1569 
1570         return err;
1571 }
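
power_pmu_event_init() is only ever reached through the generic perf core, which invokes the pmu's event_init callback when an event of a matching type is created. A minimal, hypothetical sketch (not part of this file) of an in-kernel caller that would exercise the PERF_TYPE_HARDWARE path above:

        /* Hypothetical sketch: a pinned per-CPU cycle counter. */
        static struct perf_event *create_cycle_counter(int cpu)
        {
                struct perf_event_attr attr = {
                        .type   = PERF_TYPE_HARDWARE,
                        .config = PERF_COUNT_HW_CPU_CYCLES,
                        .size   = sizeof(attr),
                        .pinned = 1,
                };

                /* NULL task => per-CPU event; NULL handler => pure counting. */
                return perf_event_create_kernel_counter(&attr, cpu, NULL,
                                                        NULL, NULL);
        }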
1572 
1573 static int power_pmu_event_idx(struct perf_event *event)
1574 {
1575         return event->hw.idx;
1576 }
1577 
1578 ssize_t power_events_sysfs_show(struct device *dev,
1579                                 struct device_attribute *attr, char *page)
1580 {
1581         struct perf_pmu_events_attr *pmu_attr;
1582 
1583         pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
1584 
1585         return sprintf(page, "event=0x%02llx\n", pmu_attr->id);
1586 }
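
This show routine is what the CPU-specific drivers (power7-pmu.c, power8-pmu.c, ...) wire into their sysfs event attributes, so that each entry under /sys/bus/event_source/devices/cpu/events/ prints its raw code as "event=0x...". A hedged sketch using the generic PMU_EVENT_ATTR() helper (the event name and code below are invented for illustration):

        /* Illustrative only: expose "example-cycles" as event=0x1e in sysfs. */
        PMU_EVENT_ATTR(example-cycles, event_attr_example_cycles, 0x1e,
                       power_events_sysfs_show);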
1587 
1588 struct pmu power_pmu = {
1589         .pmu_enable     = power_pmu_enable,
1590         .pmu_disable    = power_pmu_disable,
1591         .event_init     = power_pmu_event_init,
1592         .add            = power_pmu_add,
1593         .del            = power_pmu_del,
1594         .start          = power_pmu_start,
1595         .stop           = power_pmu_stop,
1596         .read           = power_pmu_read,
1597         .start_txn      = power_pmu_start_txn,
1598         .cancel_txn     = power_pmu_cancel_txn,
1599         .commit_txn     = power_pmu_commit_txn,
1600         .event_idx      = power_pmu_event_idx,
1601         .flush_branch_stack = power_pmu_flush_branch_stack,
1602 };
1603 
1604 /*
1605  * A counter has overflowed; update its count and record
1606  * things if requested.  Note that interrupts are hard-disabled
1607  * here so there is no possibility of being interrupted.
1608  */
1609 static void record_and_restart(struct perf_event *event, unsigned long val,
1610                                struct pt_regs *regs)
1611 {
1612         u64 period = event->hw.sample_period;
1613         s64 prev, delta, left;
1614         int record = 0;
1615 
1616         if (event->hw.state & PERF_HES_STOPPED) {
1617                 write_pmc(event->hw.idx, 0);
1618                 return;
1619         }
1620 
1621         /* we don't have to worry about interrupts here */
1622         prev = local64_read(&event->hw.prev_count);
1623         delta = check_and_compute_delta(prev, val);
1624         local64_add(delta, &event->count);
1625 
1626         /*
1627          * See if the total period for this event has expired,
1628          * and update for the next period.
1629          */
1630         val = 0;
1631         left = local64_read(&event->hw.period_left) - delta;
1632         if (delta == 0)
1633                 left++;
1634         if (period) {
1635                 if (left <= 0) {
1636                         left += period;
1637                         if (left <= 0)
1638                                 left = period;
1639                         record = siar_valid(regs);
1640                         event->hw.last_period = event->hw.sample_period;
1641                 }
1642                 if (left < 0x80000000LL)
1643                         val = 0x80000000LL - left;
1644         }
1645 
1646         write_pmc(event->hw.idx, val);
1647         local64_set(&event->hw.prev_count, val);
1648         local64_set(&event->hw.period_left, left);
1649         perf_event_update_userpage(event);
1650 
1651         /*
1652          * Finally record data if requested.
1653          */
1654         if (record) {
1655                 struct perf_sample_data data;
1656 
1657                 perf_sample_data_init(&data, ~0ULL, event->hw.last_period);
1658 
1659                 if (event->attr.sample_type & PERF_SAMPLE_ADDR)
1660                         perf_get_data_addr(regs, &data.addr);
1661 
1662                 if (event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK) {
1663                         struct cpu_hw_events *cpuhw;
1664                         cpuhw = &__get_cpu_var(cpu_hw_events);
1665                         power_pmu_bhrb_read(cpuhw);
1666                         data.br_stack = &cpuhw->bhrb_stack;
1667                 }
1668 
1669                 if (perf_event_overflow(event, &data, regs))
1670                         power_pmu_stop(event, 0);
1671         }
1672 }
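
The reload arithmetic above relies on the PMCs being 32-bit counters that raise the performance monitor exception when bit 31 becomes set, so writing 0x80000000 - left arranges the next interrupt after exactly "left" more counted events. A worked example, assuming a sample_period of 100000:

        /*
         * Worked example (sample_period = 100000):
         *   left = 100000                     events still to go
         *   val  = 0x80000000 - 100000        = 0x7ffe7960
         * After 100000 more counted events the PMC crosses 0x80000000
         * and the next interrupt fires.  Periods of 2^31 or more leave
         * val = 0, i.e. a full 2^31 count before the next interrupt.
         */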
1673 
1674 /*
1675  * Called from generic code to get the misc flags (i.e. processor mode)
1676  * for an event_id.
1677  */
1678 unsigned long perf_misc_flags(struct pt_regs *regs)
1679 {
1680         u32 flags = perf_get_misc_flags(regs);
1681 
1682         if (flags)
1683                 return flags;
1684         return user_mode(regs) ? PERF_RECORD_MISC_USER :
1685                 PERF_RECORD_MISC_KERNEL;
1686 }
1687 
1688 /*
1689  * Called from generic code to get the instruction pointer
1690  * for an event_id.
1691  */
1692 unsigned long perf_instruction_pointer(struct pt_regs *regs)
1693 {
1694         bool use_siar = regs_use_siar(regs);
1695 
1696         if (use_siar && siar_valid(regs))
1697                 return mfspr(SPRN_SIAR) + perf_ip_adjust(regs);
1698         else if (use_siar)
1699                 return 0;               /* no valid instruction pointer */
1700         else
1701                 return regs->nip;
1702 }
1703 
1704 static bool pmc_overflow_power7(unsigned long val)
1705 {
1706         /*
1707          * Events on POWER7 can roll back if a speculative event doesn't
1708          * eventually complete. Unfortunately in some rare cases they will
1709          * raise a performance monitor exception. We need to catch this to
1710          * ensure we reset the PMC. In all cases the PMC will be 256 or less
1711          * cycles from overflow.
1712          *
1713          * We only do this if the first pass fails to find any overflowing
1714          * PMCs because a user might set a period of less than 256 and we
1715          * don't want to mistakenly reset them.
1716          */
1717         if ((0x80000000 - val) <= 256)
1718                 return true;
1719 
1720         return false;
1721 }
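
Put differently, the heuristic treats any PMC reading within 256 counts below the 2^31 overflow boundary as having already taken (and rolled back past) the exception. For example:

        /*
         * val = 0x7fffff80: 0x80000000 - val = 128 <= 256  -> treated as overflowed
         * val = 0x7ffffe00: 0x80000000 - val = 512  > 256  -> left alone
         */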
1722 
1723 static bool pmc_overflow(unsigned long val)
1724 {
1725         if ((int)val < 0)
1726                 return true;
1727 
1728         return false;
1729 }
1730 
1731 /*
1732  * Performance monitor interrupt stuff
1733  */
1734 static void perf_event_interrupt(struct pt_regs *regs)
1735 {
1736         int i, j;
1737         struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
1738         struct perf_event *event;
1739         unsigned long val[8];
1740         int found, active;
1741         int nmi;
1742 
1743         if (cpuhw->n_limited)
1744                 freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
1745                                         mfspr(SPRN_PMC6));
1746 
1747         perf_read_regs(regs);
1748 
1749         nmi = perf_intr_is_nmi(regs);
1750         if (nmi)
1751                 nmi_enter();
1752         else
1753                 irq_enter();
1754 
1755         /* Read all the PMCs since we'll need them a bunch of times */
1756         for (i = 0; i < ppmu->n_counter; ++i)
1757                 val[i] = read_pmc(i + 1);
1758 
1759         /* Try to find what caused the IRQ */
1760         found = 0;
1761         for (i = 0; i < ppmu->n_counter; ++i) {
1762                 if (!pmc_overflow(val[i]))
1763                         continue;
1764                 if (is_limited_pmc(i + 1))
1765                         continue; /* these won't generate IRQs */
1766                 /*
1767                  * We've found one that's overflowed.  For active
1768                  * counters we need to log this.  For inactive
1769                  * counters, we need to reset them anyway.
1770                  */
1771                 found = 1;
1772                 active = 0;
1773                 for (j = 0; j < cpuhw->n_events; ++j) {
1774                         event = cpuhw->event[j];
1775                         if (event->hw.idx == (i + 1)) {
1776                                 active = 1;
1777                                 record_and_restart(event, val[i], regs);
1778                                 break;
1779                         }
1780                 }
1781                 if (!active)
1782                         /* reset non active counters that have overflowed */
1783                         write_pmc(i + 1, 0);
1784         }
1785         if (!found && pvr_version_is(PVR_POWER7)) {
1786                 /* check active counters for special buggy p7 overflow */
1787                 for (i = 0; i < cpuhw->n_events; ++i) {
1788                         event = cpuhw->event[i];
1789                         if (!event->hw.idx || is_limited_pmc(event->hw.idx))
1790                                 continue;
1791                         if (pmc_overflow_power7(val[event->hw.idx - 1])) {
1792                                 /* event has overflowed in a buggy way */
1793                                 found = 1;
1794                                 record_and_restart(event,
1795                                                    val[event->hw.idx - 1],
1796                                                    regs);
1797                         }
1798                 }
1799         }
1800         if (!found && !nmi && printk_ratelimit())
1801                 printk(KERN_WARNING "Can't find PMC that caused IRQ\n");
1802 
1803         /*
1804          * Reset MMCR0 to its normal value.  This will set PMXE and
1805          * clear FC (freeze counters) and PMAO (perf mon alert occurred)
1806          * and thus allow interrupts to occur again.
1807          * XXX might want to use MSR.PM to keep the events frozen until
1808          * we get back out of this interrupt.
1809          */
1810         write_mmcr0(cpuhw, cpuhw->mmcr[0]);
1811 
1812         if (nmi)
1813                 nmi_exit();
1814         else
1815                 irq_exit();
1816 }
1817 
1818 static void power_pmu_setup(int cpu)
1819 {
1820         struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);
1821 
1822         if (!ppmu)
1823                 return;
1824         memset(cpuhw, 0, sizeof(*cpuhw));
1825         cpuhw->mmcr[0] = MMCR0_FC;
1826 }
1827 
1828 static int __cpuinit
1829 power_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
1830 {
1831         unsigned int cpu = (long)hcpu;
1832 
1833         switch (action & ~CPU_TASKS_FROZEN) {
1834         case CPU_UP_PREPARE:
1835                 power_pmu_setup(cpu);
1836                 break;
1837 
1838         default:
1839                 break;
1840         }
1841 
1842         return NOTIFY_OK;
1843 }
1844 
1845 int __cpuinit register_power_pmu(struct power_pmu *pmu)
1846 {
1847         if (ppmu)
1848                 return -EBUSY;          /* something's already registered */
1849 
1850         ppmu = pmu;
1851         pr_info("%s performance monitor hardware support registered\n",
1852                 pmu->name);
1853 
1854         power_pmu.attr_groups = ppmu->attr_groups;
1855 
1856 #ifdef MSR_HV
1857         /*
1858          * Use FCHV to ignore kernel events if MSR.HV is set.
1859          */
1860         if (mfmsr() & MSR_HV)
1861                 freeze_events_kernel = MMCR0_FCHV;
1862 #endif /* MSR_HV */
1863 
1864         perf_pmu_register(&power_pmu, "cpu", PERF_TYPE_RAW);
1865         perf_cpu_notifier(power_pmu_notifier);
1866 
1867         return 0;
1868 }
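
register_power_pmu() is called once, by whichever CPU-specific driver matches the running processor. A sketch of the usual registration pattern, modeled on arch/powerpc/perf/power7-pmu.c (the power_pmu contents are elided here):

        static struct power_pmu power7_pmu = {
                .name = "POWER7",
                /* ... counter limits, event tables, flags ... */
        };

        static int __init init_power7_pmu(void)
        {
                if (!cur_cpu_spec->oprofile_cpu_type ||
                    strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power7"))
                        return -ENODEV;

                return register_power_pmu(&power7_pmu);
        }
        early_initcall(init_power7_pmu);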
1869 
