TOMOYO Linux Cross Reference
Linux/kernel/events/core.c

  1 /*
  2  * Performance events core code:
  3  *
  4  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
  5  *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
  6  *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  7  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  8  *
  9  * For licensing details see kernel-base/COPYING
 10  */
 11 
 12 #include <linux/fs.h>
 13 #include <linux/mm.h>
 14 #include <linux/cpu.h>
 15 #include <linux/smp.h>
 16 #include <linux/idr.h>
 17 #include <linux/file.h>
 18 #include <linux/poll.h>
 19 #include <linux/slab.h>
 20 #include <linux/hash.h>
 21 #include <linux/tick.h>
 22 #include <linux/sysfs.h>
 23 #include <linux/dcache.h>
 24 #include <linux/percpu.h>
 25 #include <linux/ptrace.h>
 26 #include <linux/reboot.h>
 27 #include <linux/vmstat.h>
 28 #include <linux/device.h>
 29 #include <linux/export.h>
 30 #include <linux/vmalloc.h>
 31 #include <linux/hardirq.h>
 32 #include <linux/rculist.h>
 33 #include <linux/uaccess.h>
 34 #include <linux/syscalls.h>
 35 #include <linux/anon_inodes.h>
 36 #include <linux/kernel_stat.h>
 37 #include <linux/perf_event.h>
 38 #include <linux/ftrace_event.h>
 39 #include <linux/hw_breakpoint.h>
 40 #include <linux/mm_types.h>
 41 #include <linux/cgroup.h>
 42 #include <linux/module.h>
 43 #include <linux/mman.h>
 44 #include <linux/compat.h>
 45 
 46 #include "internal.h"
 47 
 48 #include <asm/irq_regs.h>
 49 
 50 struct remote_function_call {
 51         struct task_struct      *p;
 52         int                     (*func)(void *info);
 53         void                    *info;
 54         int                     ret;
 55 };
 56 
 57 static void remote_function(void *data)
 58 {
 59         struct remote_function_call *tfc = data;
 60         struct task_struct *p = tfc->p;
 61 
 62         if (p) {
 63                 tfc->ret = -EAGAIN;
 64                 if (task_cpu(p) != smp_processor_id() || !task_curr(p))
 65                         return;
 66         }
 67 
 68         tfc->ret = tfc->func(tfc->info);
 69 }
 70 
 71 /**
 72  * task_function_call - call a function on the cpu on which a task runs
 73  * @p:          the task to evaluate
 74  * @func:       the function to be called
 75  * @info:       the function call argument
 76  *
 77  * Calls the function @func when the task is currently running. This might
 78  * be on the current CPU, which just calls the function directly
 79  *
 80  * returns: @func return value, or
 81  *          -ESRCH  - when the process isn't running
 82  *          -EAGAIN - when the process moved away
 83  */
 84 static int
 85 task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
 86 {
 87         struct remote_function_call data = {
 88                 .p      = p,
 89                 .func   = func,
 90                 .info   = info,
 91                 .ret    = -ESRCH, /* No such (running) process */
 92         };
 93 
 94         if (task_curr(p))
 95                 smp_call_function_single(task_cpu(p), remote_function, &data, 1);
 96 
 97         return data.ret;
 98 }
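
/*
 * Editor's note -- illustrative sketch only, not part of core.c.
 * Callers hand task_function_call() a callback that runs with IRQs
 * disabled on the CPU where @p is currently running; -ESRCH means the
 * task wasn't running at all and -EAGAIN means it migrated away, in
 * which case the callers below (e.g. perf_remove_from_context())
 * re-check state under ctx->lock and retry.  The names my_remote_op
 * and my_cookie are hypothetical:
 *
 *      static int my_remote_op(void *info)
 *      {
 *              struct my_cookie *c = info;     // runs on task_cpu(p)
 *              c->seen = 1;
 *              return 0;
 *      }
 *
 *      ret = task_function_call(p, my_remote_op, &cookie);
 */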
 99 
100 /**
101  * cpu_function_call - call a function on the cpu
102  * @func:       the function to be called
103  * @info:       the function call argument
104  *
105  * Calls the function @func on the remote cpu.
106  *
107  * returns: @func return value or -ENXIO when the cpu is offline
108  */
109 static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
110 {
111         struct remote_function_call data = {
112                 .p      = NULL,
113                 .func   = func,
114                 .info   = info,
115                 .ret    = -ENXIO, /* No such CPU */
116         };
117 
118         smp_call_function_single(cpu, remote_function, &data, 1);
119 
120         return data.ret;
121 }
122 
123 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
124                        PERF_FLAG_FD_OUTPUT  |\
125                        PERF_FLAG_PID_CGROUP |\
126                        PERF_FLAG_FD_CLOEXEC)
127 
128 /*
129  * branch priv levels that need permission checks
130  */
131 #define PERF_SAMPLE_BRANCH_PERM_PLM \
132         (PERF_SAMPLE_BRANCH_KERNEL |\
133          PERF_SAMPLE_BRANCH_HV)
134 
135 enum event_type_t {
136         EVENT_FLEXIBLE = 0x1,
137         EVENT_PINNED = 0x2,
138         EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
139 };
140 
141 /*
142  * perf_sched_events : >0 events exist
143  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
144  */
145 struct static_key_deferred perf_sched_events __read_mostly;
146 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
147 static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
148 
149 static atomic_t nr_mmap_events __read_mostly;
150 static atomic_t nr_comm_events __read_mostly;
151 static atomic_t nr_task_events __read_mostly;
152 static atomic_t nr_freq_events __read_mostly;
153 
154 static LIST_HEAD(pmus);
155 static DEFINE_MUTEX(pmus_lock);
156 static struct srcu_struct pmus_srcu;
157 
158 /*
159  * perf event paranoia level:
160  *  -1 - not paranoid at all
161  *   0 - disallow raw tracepoint access for unpriv
162  *   1 - disallow cpu events for unpriv
163  *   2 - disallow kernel profiling for unpriv
164  */
165 int sysctl_perf_event_paranoid __read_mostly = 1;
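
/*
 * Editor's note -- for orientation only (sketch, not a verbatim copy):
 * in this kernel series the actual permission checks live in
 * include/linux/perf_event.h as small helpers that compare against the
 * thresholds documented above, roughly:
 *
 *      perf_paranoid_tracepoint_raw()  ->  sysctl_perf_event_paranoid > -1
 *      perf_paranoid_cpu()             ->  sysctl_perf_event_paranoid >  0
 *      perf_paranoid_kernel()          ->  sysctl_perf_event_paranoid >  1
 *
 * The value is tunable at runtime via /proc/sys/kernel/perf_event_paranoid.
 */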
166 
167 /* Minimum for 512 kiB + 1 user control page */
168 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
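
/*
 * Editor's note -- worked example: with 4 KiB pages this evaluates to
 * 512 + (4096 / 1024) = 516 kiB per user, which is the default value
 * reported by /proc/sys/kernel/perf_event_mlock_kb.
 */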
169 
170 /*
171  * max perf event sample rate
172  */
173 #define DEFAULT_MAX_SAMPLE_RATE         100000
174 #define DEFAULT_SAMPLE_PERIOD_NS        (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
175 #define DEFAULT_CPU_TIME_MAX_PERCENT    25
176 
177 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
178 
179 static int max_samples_per_tick __read_mostly   = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
180 static int perf_sample_period_ns __read_mostly  = DEFAULT_SAMPLE_PERIOD_NS;
181 
182 static int perf_sample_allowed_ns __read_mostly =
183         DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
184 
185 void update_perf_cpu_limits(void)
186 {
187         u64 tmp = perf_sample_period_ns;
188 
189         tmp *= sysctl_perf_cpu_time_max_percent;
190         do_div(tmp, 100);
191         ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
192 }
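
/*
 * Editor's note -- worked example using the defaults defined above
 * (HZ = 250 assumed for the per-tick figure):
 *
 *      perf_sample_period_ns  = NSEC_PER_SEC / 100000     = 10000 ns
 *      perf_sample_allowed_ns = 10000 * 25 / 100          =  2500 ns
 *      max_samples_per_tick   = DIV_ROUND_UP(100000, 250) =   400
 *
 * i.e. with kernel.perf_event_max_sample_rate = 100000 and
 * kernel.perf_cpu_time_max_percent = 25, a sampling interrupt may take
 * about 2.5 us on average before perf_sample_event_took() below starts
 * lowering the sample rate.
 */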
193 
194 static int perf_rotate_context(struct perf_cpu_context *cpuctx);
195 
196 int perf_proc_update_handler(struct ctl_table *table, int write,
197                 void __user *buffer, size_t *lenp,
198                 loff_t *ppos)
199 {
200         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
201 
202         if (ret || !write)
203                 return ret;
204 
205         max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
206         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
207         update_perf_cpu_limits();
208 
209         return 0;
210 }
211 
212 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
213 
214 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
215                                 void __user *buffer, size_t *lenp,
216                                 loff_t *ppos)
217 {
218         int ret = proc_dointvec(table, write, buffer, lenp, ppos);
219 
220         if (ret || !write)
221                 return ret;
222 
223         update_perf_cpu_limits();
224 
225         return 0;
226 }
227 
228 /*
229  * perf samples are done in some very critical code paths (NMIs).
230  * If they take too much CPU time, the system can lock up and not
231  * get any real work done.  This will drop the sample rate when
232  * we detect that events are taking too long.
233  */
234 #define NR_ACCUMULATED_SAMPLES 128
235 static DEFINE_PER_CPU(u64, running_sample_length);
236 
237 static void perf_duration_warn(struct irq_work *w)
238 {
239         u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
240         u64 avg_local_sample_len;
241         u64 local_samples_len;
242 
243         local_samples_len = __get_cpu_var(running_sample_length);
244         avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
245 
246         printk_ratelimited(KERN_WARNING
247                         "perf interrupt took too long (%lld > %lld), lowering "
248                         "kernel.perf_event_max_sample_rate to %d\n",
249                         avg_local_sample_len, allowed_ns >> 1,
250                         sysctl_perf_event_sample_rate);
251 }
252 
253 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
254 
255 void perf_sample_event_took(u64 sample_len_ns)
256 {
257         u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
258         u64 avg_local_sample_len;
259         u64 local_samples_len;
260 
261         if (allowed_ns == 0)
262                 return;
263 
264         /* decay the counter by 1 average sample */
265         local_samples_len = __get_cpu_var(running_sample_length);
266         local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
267         local_samples_len += sample_len_ns;
268         __get_cpu_var(running_sample_length) = local_samples_len;
269 
270         /*
271          * note: this will be biased artificially low until we have
272          * seen NR_ACCUMULATED_SAMPLES.  Doing it this way keeps us
273          * from having to maintain a count.
274          */
275         avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
276 
277         if (avg_local_sample_len <= allowed_ns)
278                 return;
279 
280         if (max_samples_per_tick <= 1)
281                 return;
282 
283         max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
284         sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
285         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
286 
287         update_perf_cpu_limits();
288 
289         if (!irq_work_queue(&perf_duration_work)) {
290                 early_printk("perf interrupt took too long (%lld > %lld), lowering "
291                              "kernel.perf_event_max_sample_rate to %d\n",
292                              avg_local_sample_len, allowed_ns >> 1,
293                              sysctl_perf_event_sample_rate);
294         }
295 }
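
/*
 * Editor's note -- numeric illustration of the decaying average above.
 * Suppose running_sample_length has settled at 320000 ns (an average
 * of exactly 2500 ns over NR_ACCUMULATED_SAMPLES = 128) and the next
 * sample takes 3000 ns:
 *
 *      local_samples_len    = 320000 - 320000/128 + 3000 = 320500 ns
 *      avg_local_sample_len = 320500 / 128               =   2503 ns
 *
 * With the default allowed_ns of 2500 this exceeds the budget, so
 * max_samples_per_tick is halved and the sysctl sample rate drops
 * accordingly.
 */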
296 
297 static atomic64_t perf_event_id;
298 
299 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
300                               enum event_type_t event_type);
301 
302 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
303                              enum event_type_t event_type,
304                              struct task_struct *task);
305 
306 static void update_context_time(struct perf_event_context *ctx);
307 static u64 perf_event_time(struct perf_event *event);
308 
309 void __weak perf_event_print_debug(void)        { }
310 
311 extern __weak const char *perf_pmu_name(void)
312 {
313         return "pmu";
314 }
315 
316 static inline u64 perf_clock(void)
317 {
318         return local_clock();
319 }
320 
321 static inline struct perf_cpu_context *
322 __get_cpu_context(struct perf_event_context *ctx)
323 {
324         return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
325 }
326 
327 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
328                           struct perf_event_context *ctx)
329 {
330         raw_spin_lock(&cpuctx->ctx.lock);
331         if (ctx)
332                 raw_spin_lock(&ctx->lock);
333 }
334 
335 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
336                             struct perf_event_context *ctx)
337 {
338         if (ctx)
339                 raw_spin_unlock(&ctx->lock);
340         raw_spin_unlock(&cpuctx->ctx.lock);
341 }
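
/*
 * Editor's note -- the lock order these two helpers encode (the cpuctx
 * lock is taken first, the task context lock nests inside it) is used
 * further down, e.g. in perf_cgroup_switch():
 *
 *      perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 *      perf_pmu_disable(cpuctx->ctx.pmu);
 *      ... schedule events out/in ...
 *      perf_pmu_enable(cpuctx->ctx.pmu);
 *      perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 */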
342 
343 #ifdef CONFIG_CGROUP_PERF
344 
345 /*
346  * perf_cgroup_info keeps track of time_enabled for a cgroup.
347  * This is a per-cpu dynamically allocated data structure.
348  */
349 struct perf_cgroup_info {
350         u64                             time;
351         u64                             timestamp;
352 };
353 
354 struct perf_cgroup {
355         struct cgroup_subsys_state      css;
356         struct perf_cgroup_info __percpu *info;
357 };
358 
359 /*
360  * Must ensure cgroup is pinned (css_get) before calling
361  * this function. In other words, we cannot call this function
362  * if there is no cgroup event for the current CPU context.
363  */
364 static inline struct perf_cgroup *
365 perf_cgroup_from_task(struct task_struct *task)
366 {
367         return container_of(task_css(task, perf_event_cgrp_id),
368                             struct perf_cgroup, css);
369 }
370 
371 static inline bool
372 perf_cgroup_match(struct perf_event *event)
373 {
374         struct perf_event_context *ctx = event->ctx;
375         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
376 
377         /* @event doesn't care about cgroup */
378         if (!event->cgrp)
379                 return true;
380 
381         /* wants specific cgroup scope but @cpuctx isn't associated with any */
382         if (!cpuctx->cgrp)
383                 return false;
384 
385         /*
386          * Cgroup scoping is recursive.  An event enabled for a cgroup is
387          * also enabled for all its descendant cgroups.  If @cpuctx's
388          * cgroup is a descendant of @event's (the test covers identity
389          * case), it's a match.
390          */
391         return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
392                                     event->cgrp->css.cgroup);
393 }
394 
395 static inline void perf_put_cgroup(struct perf_event *event)
396 {
397         css_put(&event->cgrp->css);
398 }
399 
400 static inline void perf_detach_cgroup(struct perf_event *event)
401 {
402         perf_put_cgroup(event);
403         event->cgrp = NULL;
404 }
405 
406 static inline int is_cgroup_event(struct perf_event *event)
407 {
408         return event->cgrp != NULL;
409 }
410 
411 static inline u64 perf_cgroup_event_time(struct perf_event *event)
412 {
413         struct perf_cgroup_info *t;
414 
415         t = per_cpu_ptr(event->cgrp->info, event->cpu);
416         return t->time;
417 }
418 
419 static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
420 {
421         struct perf_cgroup_info *info;
422         u64 now;
423 
424         now = perf_clock();
425 
426         info = this_cpu_ptr(cgrp->info);
427 
428         info->time += now - info->timestamp;
429         info->timestamp = now;
430 }
431 
432 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
433 {
434         struct perf_cgroup *cgrp_out = cpuctx->cgrp;
435         if (cgrp_out)
436                 __update_cgrp_time(cgrp_out);
437 }
438 
439 static inline void update_cgrp_time_from_event(struct perf_event *event)
440 {
441         struct perf_cgroup *cgrp;
442 
443         /*
444          * ensure we access cgroup data only when needed and
445          * when we know the cgroup is pinned (css_get)
446          */
447         if (!is_cgroup_event(event))
448                 return;
449 
450         cgrp = perf_cgroup_from_task(current);
451         /*
452          * Do not update time when cgroup is not active
453          */
454         if (cgrp == event->cgrp)
455                 __update_cgrp_time(event->cgrp);
456 }
457 
458 static inline void
459 perf_cgroup_set_timestamp(struct task_struct *task,
460                           struct perf_event_context *ctx)
461 {
462         struct perf_cgroup *cgrp;
463         struct perf_cgroup_info *info;
464 
465         /*
466          * ctx->lock held by caller
467          * ensure we do not access cgroup data
468          * unless we have the cgroup pinned (css_get)
469          */
470         if (!task || !ctx->nr_cgroups)
471                 return;
472 
473         cgrp = perf_cgroup_from_task(task);
474         info = this_cpu_ptr(cgrp->info);
475         info->timestamp = ctx->timestamp;
476 }
477 
478 #define PERF_CGROUP_SWOUT       0x1 /* cgroup switch out every event */
479 #define PERF_CGROUP_SWIN        0x2 /* cgroup switch in events based on task */
480 
481 /*
482  * reschedule events based on the cgroup constraint of task.
483  *
484  * mode SWOUT : schedule out everything
485  * mode SWIN : schedule in based on cgroup for next
486  */
487 void perf_cgroup_switch(struct task_struct *task, int mode)
488 {
489         struct perf_cpu_context *cpuctx;
490         struct pmu *pmu;
491         unsigned long flags;
492 
493         /*
494          * disable interrupts to avoid getting nr_cgroup
495          * changes via __perf_event_disable(). Also
496          * avoids preemption.
497          */
498         local_irq_save(flags);
499 
500         /*
501          * we reschedule only in the presence of cgroup
502          * constrained events.
503          */
504         rcu_read_lock();
505 
506         list_for_each_entry_rcu(pmu, &pmus, entry) {
507                 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
508                 if (cpuctx->unique_pmu != pmu)
509                         continue; /* ensure we process each cpuctx once */
510 
511                 /*
512                  * perf_cgroup_events says at least one
513                  * context on this CPU has cgroup events.
514                  *
515                  * ctx->nr_cgroups reports the number of cgroup
516                  * events for a context.
517                  */
518                 if (cpuctx->ctx.nr_cgroups > 0) {
519                         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
520                         perf_pmu_disable(cpuctx->ctx.pmu);
521 
522                         if (mode & PERF_CGROUP_SWOUT) {
523                                 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
524                                 /*
525                                  * must not be done before ctxswout due
526                                  * to event_filter_match() in event_sched_out()
527                                  */
528                                 cpuctx->cgrp = NULL;
529                         }
530 
531                         if (mode & PERF_CGROUP_SWIN) {
532                                 WARN_ON_ONCE(cpuctx->cgrp);
533                                 /*
534                                  * set cgrp before ctxsw in to allow
535                                  * event_filter_match() to not have to pass
536                                  * task around
537                                  */
538                                 cpuctx->cgrp = perf_cgroup_from_task(task);
539                                 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
540                         }
541                         perf_pmu_enable(cpuctx->ctx.pmu);
542                         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
543                 }
544         }
545 
546         rcu_read_unlock();
547 
548         local_irq_restore(flags);
549 }
550 
551 static inline void perf_cgroup_sched_out(struct task_struct *task,
552                                          struct task_struct *next)
553 {
554         struct perf_cgroup *cgrp1;
555         struct perf_cgroup *cgrp2 = NULL;
556 
557         /*
558          * we come here when we know perf_cgroup_events > 0
559          */
560         cgrp1 = perf_cgroup_from_task(task);
561 
562         /*
563          * next is NULL when called from perf_event_enable_on_exec()
564          * that will systematically cause a cgroup_switch()
565          */
566         if (next)
567                 cgrp2 = perf_cgroup_from_task(next);
568 
569         /*
570          * only schedule out current cgroup events if we know
571          * that we are switching to a different cgroup. Otherwise,
572          * do not touch the cgroup events.
573          */
574         if (cgrp1 != cgrp2)
575                 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
576 }
577 
578 static inline void perf_cgroup_sched_in(struct task_struct *prev,
579                                         struct task_struct *task)
580 {
581         struct perf_cgroup *cgrp1;
582         struct perf_cgroup *cgrp2 = NULL;
583 
584         /*
585          * we come here when we know perf_cgroup_events > 0
586          */
587         cgrp1 = perf_cgroup_from_task(task);
588 
589         /* prev can never be NULL */
590         cgrp2 = perf_cgroup_from_task(prev);
591 
592         /*
593          * only need to schedule in cgroup events if we are changing
594          * cgroup during ctxsw. Cgroup events were not scheduled
595          * out during ctxsw if that was not the case.
596          */
597         if (cgrp1 != cgrp2)
598                 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
599 }
600 
601 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
602                                       struct perf_event_attr *attr,
603                                       struct perf_event *group_leader)
604 {
605         struct perf_cgroup *cgrp;
606         struct cgroup_subsys_state *css;
607         struct fd f = fdget(fd);
608         int ret = 0;
609 
610         if (!f.file)
611                 return -EBADF;
612 
613         css = css_tryget_online_from_dir(f.file->f_dentry,
614                                          &perf_event_cgrp_subsys);
615         if (IS_ERR(css)) {
616                 ret = PTR_ERR(css);
617                 goto out;
618         }
619 
620         cgrp = container_of(css, struct perf_cgroup, css);
621         event->cgrp = cgrp;
622 
623         /*
624          * all events in a group must monitor
625          * the same cgroup because a task belongs
626          * to only one perf cgroup at a time
627          */
628         if (group_leader && group_leader->cgrp != cgrp) {
629                 perf_detach_cgroup(event);
630                 ret = -EINVAL;
631         }
632 out:
633         fdput(f);
634         return ret;
635 }
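
/*
 * Editor's note -- hedged userspace sketch (not kernel code) of how
 * this function is reached: with PERF_FLAG_PID_CGROUP the "pid"
 * argument of perf_event_open(2) is reinterpreted as a file descriptor
 * of a perf_event cgroup directory, and the event counts only while
 * tasks of that cgroup run on the chosen CPU.  The path and attr setup
 * below are illustrative:
 *
 *      int cgrp_fd = open("/sys/fs/cgroup/perf_event/mygrp", O_RDONLY);
 *      struct perf_event_attr attr = {
 *              .size   = sizeof(attr),
 *              .type   = PERF_TYPE_HARDWARE,
 *              .config = PERF_COUNT_HW_CPU_CYCLES,
 *      };
 *      // cpu 0, no group leader
 *      int fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, 0, -1,
 *                       PERF_FLAG_PID_CGROUP);
 */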
636 
637 static inline void
638 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
639 {
640         struct perf_cgroup_info *t;
641         t = per_cpu_ptr(event->cgrp->info, event->cpu);
642         event->shadow_ctx_time = now - t->timestamp;
643 }
644 
645 static inline void
646 perf_cgroup_defer_enabled(struct perf_event *event)
647 {
648         /*
649          * when the current task's perf cgroup does not match
650          * the event's, we need to remember to call the
651          * perf_cgroup_mark_enabled() function the first time a task with
652          * a matching perf cgroup is scheduled in.
653          */
654         if (is_cgroup_event(event) && !perf_cgroup_match(event))
655                 event->cgrp_defer_enabled = 1;
656 }
657 
658 static inline void
659 perf_cgroup_mark_enabled(struct perf_event *event,
660                          struct perf_event_context *ctx)
661 {
662         struct perf_event *sub;
663         u64 tstamp = perf_event_time(event);
664 
665         if (!event->cgrp_defer_enabled)
666                 return;
667 
668         event->cgrp_defer_enabled = 0;
669 
670         event->tstamp_enabled = tstamp - event->total_time_enabled;
671         list_for_each_entry(sub, &event->sibling_list, group_entry) {
672                 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
673                         sub->tstamp_enabled = tstamp - sub->total_time_enabled;
674                         sub->cgrp_defer_enabled = 0;
675                 }
676         }
677 }
678 #else /* !CONFIG_CGROUP_PERF */
679 
680 static inline bool
681 perf_cgroup_match(struct perf_event *event)
682 {
683         return true;
684 }
685 
686 static inline void perf_detach_cgroup(struct perf_event *event)
687 {}
688 
689 static inline int is_cgroup_event(struct perf_event *event)
690 {
691         return 0;
692 }
693 
694 static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
695 {
696         return 0;
697 }
698 
699 static inline void update_cgrp_time_from_event(struct perf_event *event)
700 {
701 }
702 
703 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
704 {
705 }
706 
707 static inline void perf_cgroup_sched_out(struct task_struct *task,
708                                          struct task_struct *next)
709 {
710 }
711 
712 static inline void perf_cgroup_sched_in(struct task_struct *prev,
713                                         struct task_struct *task)
714 {
715 }
716 
717 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
718                                       struct perf_event_attr *attr,
719                                       struct perf_event *group_leader)
720 {
721         return -EINVAL;
722 }
723 
724 static inline void
725 perf_cgroup_set_timestamp(struct task_struct *task,
726                           struct perf_event_context *ctx)
727 {
728 }
729 
730 void
731 perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
732 {
733 }
734 
735 static inline void
736 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
737 {
738 }
739 
740 static inline u64 perf_cgroup_event_time(struct perf_event *event)
741 {
742         return 0;
743 }
744 
745 static inline void
746 perf_cgroup_defer_enabled(struct perf_event *event)
747 {
748 }
749 
750 static inline void
751 perf_cgroup_mark_enabled(struct perf_event *event,
752                          struct perf_event_context *ctx)
753 {
754 }
755 #endif
756 
757 /*
758  * set default to be dependent on timer tick just
759  * like original code
760  */
761 #define PERF_CPU_HRTIMER (1000 / HZ)
762 /*
763  * function must be called with interrupts disabled
764  */
765 static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
766 {
767         struct perf_cpu_context *cpuctx;
768         enum hrtimer_restart ret = HRTIMER_NORESTART;
769         int rotations = 0;
770 
771         WARN_ON(!irqs_disabled());
772 
773         cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
774 
775         rotations = perf_rotate_context(cpuctx);
776 
777         /*
778          * arm timer if needed
779          */
780         if (rotations) {
781                 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
782                 ret = HRTIMER_RESTART;
783         }
784 
785         return ret;
786 }
787 
788 /* CPU is going down */
789 void perf_cpu_hrtimer_cancel(int cpu)
790 {
791         struct perf_cpu_context *cpuctx;
792         struct pmu *pmu;
793         unsigned long flags;
794 
795         if (WARN_ON(cpu != smp_processor_id()))
796                 return;
797 
798         local_irq_save(flags);
799 
800         rcu_read_lock();
801 
802         list_for_each_entry_rcu(pmu, &pmus, entry) {
803                 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
804 
805                 if (pmu->task_ctx_nr == perf_sw_context)
806                         continue;
807 
808                 hrtimer_cancel(&cpuctx->hrtimer);
809         }
810 
811         rcu_read_unlock();
812 
813         local_irq_restore(flags);
814 }
815 
816 static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
817 {
818         struct hrtimer *hr = &cpuctx->hrtimer;
819         struct pmu *pmu = cpuctx->ctx.pmu;
820         int timer;
821 
822         /* no multiplexing needed for SW PMU */
823         if (pmu->task_ctx_nr == perf_sw_context)
824                 return;
825 
826         /*
827          * check default is sane, if not set then force to
828          * default interval (1/tick)
829          */
830         timer = pmu->hrtimer_interval_ms;
831         if (timer < 1)
832                 timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
833 
834         cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
835 
836         hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
837         hr->function = perf_cpu_hrtimer_handler;
838 }
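
/*
 * Editor's note -- worked example of the interval chosen above: with
 * HZ = 250 (assumed), PERF_CPU_HRTIMER = 1000 / 250 = 4, so unless a
 * PMU sets hrtimer_interval_ms itself the multiplexing timer fires
 * every 4 * NSEC_PER_MSEC = 4000000 ns, i.e. roughly once per tick as
 * intended by the comment above PERF_CPU_HRTIMER.
 */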
839 
840 static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
841 {
842         struct hrtimer *hr = &cpuctx->hrtimer;
843         struct pmu *pmu = cpuctx->ctx.pmu;
844 
845         /* not for SW PMU */
846         if (pmu->task_ctx_nr == perf_sw_context)
847                 return;
848 
849         if (hrtimer_active(hr))
850                 return;
851 
852         if (!hrtimer_callback_running(hr))
853                 __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
854                                          0, HRTIMER_MODE_REL_PINNED, 0);
855 }
856 
857 void perf_pmu_disable(struct pmu *pmu)
858 {
859         int *count = this_cpu_ptr(pmu->pmu_disable_count);
860         if (!(*count)++)
861                 pmu->pmu_disable(pmu);
862 }
863 
864 void perf_pmu_enable(struct pmu *pmu)
865 {
866         int *count = this_cpu_ptr(pmu->pmu_disable_count);
867         if (!--(*count))
868                 pmu->pmu_enable(pmu);
869 }
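
/*
 * Editor's note -- illustrative nesting of the two helpers above; the
 * per-cpu pmu_disable_count makes the pair reference counted, so only
 * the outermost calls reach the pmu callbacks:
 *
 *      perf_pmu_disable(pmu);          // count 0 -> 1, pmu->pmu_disable()
 *        perf_pmu_disable(pmu);        // count 1 -> 2, no callback
 *        perf_pmu_enable(pmu);         // count 2 -> 1, no callback
 *      perf_pmu_enable(pmu);           // count 1 -> 0, pmu->pmu_enable()
 */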
870 
871 static DEFINE_PER_CPU(struct list_head, rotation_list);
872 
873 /*
874  * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
875  * because they're strictly cpu affine and rotate_start is called with IRQs
876  * disabled, while rotate_context is called from IRQ context.
877  */
878 static void perf_pmu_rotate_start(struct pmu *pmu)
879 {
880         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
881         struct list_head *head = &__get_cpu_var(rotation_list);
882 
883         WARN_ON(!irqs_disabled());
884 
885         if (list_empty(&cpuctx->rotation_list))
886                 list_add(&cpuctx->rotation_list, head);
887 }
888 
889 static void get_ctx(struct perf_event_context *ctx)
890 {
891         WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
892 }
893 
894 static void put_ctx(struct perf_event_context *ctx)
895 {
896         if (atomic_dec_and_test(&ctx->refcount)) {
897                 if (ctx->parent_ctx)
898                         put_ctx(ctx->parent_ctx);
899                 if (ctx->task)
900                         put_task_struct(ctx->task);
901                 kfree_rcu(ctx, rcu_head);
902         }
903 }
904 
905 /*
906  * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
907  * perf_pmu_migrate_context() we need some magic.
908  *
909  * Those places that change perf_event::ctx will hold both
910  * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
911  *
912  * Lock ordering is by mutex address. There is one other site where
913  * perf_event_context::mutex nests and that is put_event(). But remember that
914  * that is a parent<->child context relation, and migration does not affect
915  * children, therefore these two orderings should not interact.
916  *
917  * The change in perf_event::ctx does not affect children (as claimed above)
918  * because the sys_perf_event_open() case will install a new event and break
919  * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
920  * concerned with cpuctx and that doesn't have children.
921  *
922  * The places that change perf_event::ctx will issue:
923  *
924  *   perf_remove_from_context();
925  *   synchronize_rcu();
926  *   perf_install_in_context();
927  *
928  * to effect the change. The remove_from_context() + synchronize_rcu() should
929  * quiesce the event, after which we can install it in the new location. This
930  * means that only external vectors (perf_fops, prctl) can perturb the event
931  * while in transit. Therefore all such accessors should also acquire
932  * perf_event_context::mutex to serialize against this.
933  *
934  * However; because event->ctx can change while we're waiting to acquire
935  * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
936  * function.
937  *
938  * Lock order:
939  *    cred_guard_mutex
940  *      task_struct::perf_event_mutex
941  *        perf_event_context::mutex
942  *          perf_event_context::lock
943  *          perf_event::child_mutex;
944  *          perf_event::mmap_mutex
945  *          mmap_sem
946  */
947 static struct perf_event_context *perf_event_ctx_lock(struct perf_event *event)
948 {
949         struct perf_event_context *ctx;
950 
951 again:
952         rcu_read_lock();
953         ctx = ACCESS_ONCE(event->ctx);
954         if (!atomic_inc_not_zero(&ctx->refcount)) {
955                 rcu_read_unlock();
956                 goto again;
957         }
958         rcu_read_unlock();
959 
960         mutex_lock(&ctx->mutex);
961         if (event->ctx != ctx) {
962                 mutex_unlock(&ctx->mutex);
963                 put_ctx(ctx);
964                 goto again;
965         }
966 
967         return ctx;
968 }
969 
970 static void perf_event_ctx_unlock(struct perf_event *event,
971                                   struct perf_event_context *ctx)
972 {
973         mutex_unlock(&ctx->mutex);
974         put_ctx(ctx);
975 }
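
/*
 * Editor's note -- sketch of the intended use of the pair above (the
 * pattern the long comment before perf_event_ctx_lock() argues for):
 * take a reference on whatever ctx the event currently points at, then
 * re-validate under ctx->mutex before touching it:
 *
 *      ctx = perf_event_ctx_lock(event);
 *      ... operate on the event with ctx->mutex held ...
 *      perf_event_ctx_unlock(event, ctx);
 */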
976 
977 /*
978  * This must be done under the ctx->lock, so as to serialize against
979  * context_equiv(), therefore we cannot call put_ctx() since that might end up
980  * calling scheduler related locks and ctx->lock nests inside those.
981  */
982 static __must_check struct perf_event_context *
983 unclone_ctx(struct perf_event_context *ctx)
984 {
985         struct perf_event_context *parent_ctx = ctx->parent_ctx;
986 
987         lockdep_assert_held(&ctx->lock);
988 
989         if (parent_ctx)
990                 ctx->parent_ctx = NULL;
991         ctx->generation++;
992 
993         return parent_ctx;
994 }
995 
996 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
997 {
998         /*
999          * only top level events have the pid namespace they were created in
1000          */
1001         if (event->parent)
1002                 event = event->parent;
1003 
1004         return task_tgid_nr_ns(p, event->ns);
1005 }
1006 
1007 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1008 {
1009         /*
1010          * only top level events have the pid namespace they were created in
1011          */
1012         if (event->parent)
1013                 event = event->parent;
1014 
1015         return task_pid_nr_ns(p, event->ns);
1016 }
1017 
1018 /*
1019  * If we inherit events we want to return the parent event id
1020  * to userspace.
1021  */
1022 static u64 primary_event_id(struct perf_event *event)
1023 {
1024         u64 id = event->id;
1025 
1026         if (event->parent)
1027                 id = event->parent->id;
1028 
1029         return id;
1030 }
1031 
1032 /*
1033  * Get the perf_event_context for a task and lock it.
1034  * This has to cope with the fact that until it is locked,
1035  * the context could get moved to another task.
1036  */
1037 static struct perf_event_context *
1038 perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1039 {
1040         struct perf_event_context *ctx;
1041 
1042 retry:
1043         /*
1044          * One of the few rules of preemptible RCU is that one cannot do
1045          * rcu_read_unlock() while holding a scheduler (or nested) lock when
1046          * part of the read side critical section was preemptible -- see
1047          * rcu_read_unlock_special().
1048          *
1049          * Since ctx->lock nests under rq->lock we must ensure the entire read
1050          * side critical section is non-preemptible.
1051          */
1052         preempt_disable();
1053         rcu_read_lock();
1054         ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1055         if (ctx) {
1056                 /*
1057                  * If this context is a clone of another, it might
1058                  * get swapped for another underneath us by
1059                  * perf_event_task_sched_out, though the
1060                  * rcu_read_lock() protects us from any context
1061                  * getting freed.  Lock the context and check if it
1062                  * got swapped before we could get the lock, and retry
1063                  * if so.  If we locked the right context, then it
1064                  * can't get swapped on us any more.
1065                  */
1066                 raw_spin_lock_irqsave(&ctx->lock, *flags);
1067                 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1068                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
1069                         rcu_read_unlock();
1070                         preempt_enable();
1071                         goto retry;
1072                 }
1073 
1074                 if (!atomic_inc_not_zero(&ctx->refcount)) {
1075                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
1076                         ctx = NULL;
1077                 }
1078         }
1079         rcu_read_unlock();
1080         preempt_enable();
1081         return ctx;
1082 }
1083 
1084 /*
1085  * Get the context for a task and increment its pin_count so it
1086  * can't get swapped to another task.  This also increments its
1087  * reference count so that the context can't get freed.
1088  */
1089 static struct perf_event_context *
1090 perf_pin_task_context(struct task_struct *task, int ctxn)
1091 {
1092         struct perf_event_context *ctx;
1093         unsigned long flags;
1094 
1095         ctx = perf_lock_task_context(task, ctxn, &flags);
1096         if (ctx) {
1097                 ++ctx->pin_count;
1098                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1099         }
1100         return ctx;
1101 }
1102 
1103 static void perf_unpin_context(struct perf_event_context *ctx)
1104 {
1105         unsigned long flags;
1106 
1107         raw_spin_lock_irqsave(&ctx->lock, flags);
1108         --ctx->pin_count;
1109         raw_spin_unlock_irqrestore(&ctx->lock, flags);
1110 }
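
/*
 * Editor's note -- illustrative pairing of the helpers above.  A
 * caller that must keep a task context from being swapped to another
 * task while it (possibly) sleeps typically does:
 *
 *      ctx = perf_pin_task_context(task, ctxn);
 *      if (ctx) {
 *              ... ctx->pin_count keeps the context in place ...
 *              perf_unpin_context(ctx);
 *              put_ctx(ctx);           // drop the reference taken above
 *      }
 */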
1111 
1112 /*
1113  * Update the record of the current time in a context.
1114  */
1115 static void update_context_time(struct perf_event_context *ctx)
1116 {
1117         u64 now = perf_clock();
1118 
1119         ctx->time += now - ctx->timestamp;
1120         ctx->timestamp = now;
1121 }
1122 
1123 static u64 perf_event_time(struct perf_event *event)
1124 {
1125         struct perf_event_context *ctx = event->ctx;
1126 
1127         if (is_cgroup_event(event))
1128                 return perf_cgroup_event_time(event);
1129 
1130         return ctx ? ctx->time : 0;
1131 }
1132 
1133 /*
1134  * Update the total_time_enabled and total_time_running fields for an event.
1135  * The caller of this function needs to hold the ctx->lock.
1136  */
1137 static void update_event_times(struct perf_event *event)
1138 {
1139         struct perf_event_context *ctx = event->ctx;
1140         u64 run_end;
1141 
1142         if (event->state < PERF_EVENT_STATE_INACTIVE ||
1143             event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
1144                 return;
1145         /*
1146          * in cgroup mode, time_enabled represents
1147          * the time the event was enabled AND active
1148          * tasks were in the monitored cgroup. This is
1149          * independent of the activity of the context as
1150          * there may be a mix of cgroup and non-cgroup events.
1151          *
1152          * That is why we treat cgroup events differently
1153          * here.
1154          */
1155         if (is_cgroup_event(event))
1156                 run_end = perf_cgroup_event_time(event);
1157         else if (ctx->is_active)
1158                 run_end = ctx->time;
1159         else
1160                 run_end = event->tstamp_stopped;
1161 
1162         event->total_time_enabled = run_end - event->tstamp_enabled;
1163 
1164         if (event->state == PERF_EVENT_STATE_INACTIVE)
1165                 run_end = event->tstamp_stopped;
1166         else
1167                 run_end = perf_event_time(event);
1168 
1169         event->total_time_running = run_end - event->tstamp_running;
1170 
1171 }
1172 
1173 /*
1174  * Update total_time_enabled and total_time_running for all events in a group.
1175  */
1176 static void update_group_times(struct perf_event *leader)
1177 {
1178         struct perf_event *event;
1179 
1180         update_event_times(leader);
1181         list_for_each_entry(event, &leader->sibling_list, group_entry)
1182                 update_event_times(event);
1183 }
1184 
1185 static struct list_head *
1186 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
1187 {
1188         if (event->attr.pinned)
1189                 return &ctx->pinned_groups;
1190         else
1191                 return &ctx->flexible_groups;
1192 }
1193 
1194 /*
1195  * Add an event to the lists for its context.
1196  * Must be called with ctx->mutex and ctx->lock held.
1197  */
1198 static void
1199 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1200 {
1201         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1202         event->attach_state |= PERF_ATTACH_CONTEXT;
1203 
1204         /*
1205          * If we're a stand-alone event or group leader, we go to the context
1206          * list; group events are kept attached to the group so that
1207          * perf_group_detach can, at all times, locate all siblings.
1208          */
1209         if (event->group_leader == event) {
1210                 struct list_head *list;
1211 
1212                 if (is_software_event(event))
1213                         event->group_flags |= PERF_GROUP_SOFTWARE;
1214 
1215                 list = ctx_group_list(event, ctx);
1216                 list_add_tail(&event->group_entry, list);
1217         }
1218 
1219         if (is_cgroup_event(event))
1220                 ctx->nr_cgroups++;
1221 
1222         if (has_branch_stack(event))
1223                 ctx->nr_branch_stack++;
1224 
1225         list_add_rcu(&event->event_entry, &ctx->event_list);
1226         if (!ctx->nr_events)
1227                 perf_pmu_rotate_start(ctx->pmu);
1228         ctx->nr_events++;
1229         if (event->attr.inherit_stat)
1230                 ctx->nr_stat++;
1231 
1232         ctx->generation++;
1233 }
1234 
1235 /*
1236  * Initialize event state based on the perf_event_attr::disabled.
1237  */
1238 static inline void perf_event__state_init(struct perf_event *event)
1239 {
1240         event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1241                                               PERF_EVENT_STATE_INACTIVE;
1242 }
1243 
1244 /*
1245  * Called at perf_event creation and when events are attached/detached from a
1246  * group.
1247  */
1248 static void perf_event__read_size(struct perf_event *event)
1249 {
1250         int entry = sizeof(u64); /* value */
1251         int size = 0;
1252         int nr = 1;
1253 
1254         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1255                 size += sizeof(u64);
1256 
1257         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1258                 size += sizeof(u64);
1259 
1260         if (event->attr.read_format & PERF_FORMAT_ID)
1261                 entry += sizeof(u64);
1262 
1263         if (event->attr.read_format & PERF_FORMAT_GROUP) {
1264                 nr += event->group_leader->nr_siblings;
1265                 size += sizeof(u64);
1266         }
1267 
1268         size += entry * nr;
1269         event->read_size = size;
1270 }
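
/*
 * Editor's note -- worked example of the sizing above for a
 * hypothetical group leader with two siblings and
 * read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID:
 *
 *      entry = 8 (value) + 8 (id)              = 16 bytes
 *      nr    = 1 + 2 siblings                  =  3
 *      size  = 8 (the nr field itself)         =  8 bytes
 *      read_size = 8 + 16 * 3                  = 56 bytes
 *
 * which matches the { nr; { value; id; }[3] } layout that a read() on
 * the group leader returns.
 */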
1271 
1272 static void perf_event__header_size(struct perf_event *event)
1273 {
1274         struct perf_sample_data *data;
1275         u64 sample_type = event->attr.sample_type;
1276         u16 size = 0;
1277 
1278         perf_event__read_size(event);
1279 
1280         if (sample_type & PERF_SAMPLE_IP)
1281                 size += sizeof(data->ip);
1282 
1283         if (sample_type & PERF_SAMPLE_ADDR)
1284                 size += sizeof(data->addr);
1285 
1286         if (sample_type & PERF_SAMPLE_PERIOD)
1287                 size += sizeof(data->period);
1288 
1289         if (sample_type & PERF_SAMPLE_WEIGHT)
1290                 size += sizeof(data->weight);
1291 
1292         if (sample_type & PERF_SAMPLE_READ)
1293                 size += event->read_size;
1294 
1295         if (sample_type & PERF_SAMPLE_DATA_SRC)
1296                 size += sizeof(data->data_src.val);
1297 
1298         if (sample_type & PERF_SAMPLE_TRANSACTION)
1299                 size += sizeof(data->txn);
1300 
1301         event->header_size = size;
1302 }
1303 
1304 static void perf_event__id_header_size(struct perf_event *event)
1305 {
1306         struct perf_sample_data *data;
1307         u64 sample_type = event->attr.sample_type;
1308         u16 size = 0;
1309 
1310         if (sample_type & PERF_SAMPLE_TID)
1311                 size += sizeof(data->tid_entry);
1312 
1313         if (sample_type & PERF_SAMPLE_TIME)
1314                 size += sizeof(data->time);
1315 
1316         if (sample_type & PERF_SAMPLE_IDENTIFIER)
1317                 size += sizeof(data->id);
1318 
1319         if (sample_type & PERF_SAMPLE_ID)
1320                 size += sizeof(data->id);
1321 
1322         if (sample_type & PERF_SAMPLE_STREAM_ID)
1323                 size += sizeof(data->stream_id);
1324 
1325         if (sample_type & PERF_SAMPLE_CPU)
1326                 size += sizeof(data->cpu_entry);
1327 
1328         event->id_header_size = size;
1329 }
1330 
1331 static void perf_group_attach(struct perf_event *event)
1332 {
1333         struct perf_event *group_leader = event->group_leader, *pos;
1334 
1335         /*
1336          * We can have double attach due to group movement in perf_event_open.
1337          */
1338         if (event->attach_state & PERF_ATTACH_GROUP)
1339                 return;
1340 
1341         event->attach_state |= PERF_ATTACH_GROUP;
1342 
1343         if (group_leader == event)
1344                 return;
1345 
1346         if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
1347                         !is_software_event(event))
1348                 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
1349 
1350         list_add_tail(&event->group_entry, &group_leader->sibling_list);
1351         group_leader->nr_siblings++;
1352 
1353         perf_event__header_size(group_leader);
1354 
1355         list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
1356                 perf_event__header_size(pos);
1357 }
1358 
1359 /*
1360  * Remove an event from the lists for its context.
1361  * Must be called with ctx->mutex and ctx->lock held.
1362  */
1363 static void
1364 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1365 {
1366         struct perf_cpu_context *cpuctx;
1367         /*
1368          * We can have double detach due to exit/hot-unplug + close.
1369          */
1370         if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1371                 return;
1372 
1373         event->attach_state &= ~PERF_ATTACH_CONTEXT;
1374 
1375         if (is_cgroup_event(event)) {
1376                 ctx->nr_cgroups--;
1377                 cpuctx = __get_cpu_context(ctx);
1378                 /*
1379                  * if there are no more cgroup events
1380          * then clear cgrp to avoid a stale pointer
1381                  * in update_cgrp_time_from_cpuctx()
1382                  */
1383                 if (!ctx->nr_cgroups)
1384                         cpuctx->cgrp = NULL;
1385         }
1386 
1387         if (has_branch_stack(event))
1388                 ctx->nr_branch_stack--;
1389 
1390         ctx->nr_events--;
1391         if (event->attr.inherit_stat)
1392                 ctx->nr_stat--;
1393 
1394         list_del_rcu(&event->event_entry);
1395 
1396         if (event->group_leader == event)
1397                 list_del_init(&event->group_entry);
1398 
1399         update_group_times(event);
1400 
1401         /*
1402          * If event was in error state, then keep it
1403          * that way, otherwise bogus counts will be
1404          * returned on read(). The only way to get out
1405          * of error state is by explicit re-enabling
1406          * of the event
1407          */
1408         if (event->state > PERF_EVENT_STATE_OFF)
1409                 event->state = PERF_EVENT_STATE_OFF;
1410 
1411         ctx->generation++;
1412 }
1413 
1414 static void perf_group_detach(struct perf_event *event)
1415 {
1416         struct perf_event *sibling, *tmp;
1417         struct list_head *list = NULL;
1418 
1419         /*
1420          * We can have double detach due to exit/hot-unplug + close.
1421          */
1422         if (!(event->attach_state & PERF_ATTACH_GROUP))
1423                 return;
1424 
1425         event->attach_state &= ~PERF_ATTACH_GROUP;
1426 
1427         /*
1428          * If this is a sibling, remove it from its group.
1429          */
1430         if (event->group_leader != event) {
1431                 list_del_init(&event->group_entry);
1432                 event->group_leader->nr_siblings--;
1433                 goto out;
1434         }
1435 
1436         if (!list_empty(&event->group_entry))
1437                 list = &event->group_entry;
1438 
1439         /*
1440          * If this was a group event with sibling events then
1441          * upgrade the siblings to singleton events by adding them
1442          * to whatever list we are on.
1443          */
1444         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
1445                 if (list)
1446                         list_move_tail(&sibling->group_entry, list);
1447                 sibling->group_leader = sibling;
1448 
1449                 /* Inherit group flags from the previous leader */
1450                 sibling->group_flags = event->group_flags;
1451         }
1452 
1453 out:
1454         perf_event__header_size(event->group_leader);
1455 
1456         list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1457                 perf_event__header_size(tmp);
1458 }
1459 
1460 static inline int
1461 event_filter_match(struct perf_event *event)
1462 {
1463         return (event->cpu == -1 || event->cpu == smp_processor_id())
1464             && perf_cgroup_match(event);
1465 }
1466 
1467 static void
1468 event_sched_out(struct perf_event *event,
1469                   struct perf_cpu_context *cpuctx,
1470                   struct perf_event_context *ctx)
1471 {
1472         u64 tstamp = perf_event_time(event);
1473         u64 delta;
1474         /*
1475          * An event which could not be activated because of
1476          * filter mismatch still needs to have its timings
1477          * maintained, otherwise bogus information is returned
1478          * via read() for time_enabled, time_running:
1479          */
1480         if (event->state == PERF_EVENT_STATE_INACTIVE
1481             && !event_filter_match(event)) {
1482                 delta = tstamp - event->tstamp_stopped;
1483                 event->tstamp_running += delta;
1484                 event->tstamp_stopped = tstamp;
1485         }
1486 
1487         if (event->state != PERF_EVENT_STATE_ACTIVE)
1488                 return;
1489 
1490         perf_pmu_disable(event->pmu);
1491 
1492         event->state = PERF_EVENT_STATE_INACTIVE;
1493         if (event->pending_disable) {
1494                 event->pending_disable = 0;
1495                 event->state = PERF_EVENT_STATE_OFF;
1496         }
1497         event->tstamp_stopped = tstamp;
1498         event->pmu->del(event, 0);
1499         event->oncpu = -1;
1500 
1501         if (!is_software_event(event))
1502                 cpuctx->active_oncpu--;
1503         ctx->nr_active--;
1504         if (event->attr.freq && event->attr.sample_freq)
1505                 ctx->nr_freq--;
1506         if (event->attr.exclusive || !cpuctx->active_oncpu)
1507                 cpuctx->exclusive = 0;
1508 
1509         perf_pmu_enable(event->pmu);
1510 }
1511 
1512 static void
1513 group_sched_out(struct perf_event *group_event,
1514                 struct perf_cpu_context *cpuctx,
1515                 struct perf_event_context *ctx)
1516 {
1517         struct perf_event *event;
1518         int state = group_event->state;
1519 
1520         event_sched_out(group_event, cpuctx, ctx);
1521 
1522         /*
1523          * Schedule out siblings (if any):
1524          */
1525         list_for_each_entry(event, &group_event->sibling_list, group_entry)
1526                 event_sched_out(event, cpuctx, ctx);
1527 
1528         if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
1529                 cpuctx->exclusive = 0;
1530 }
1531 
1532 struct remove_event {
1533         struct perf_event *event;
1534         bool detach_group;
1535 };
1536 
1537 /*
1538  * Cross CPU call to remove a performance event
1539  *
1540  * We disable the event on the hardware level first. After that we
1541  * remove it from the context list.
1542  */
1543 static int __perf_remove_from_context(void *info)
1544 {
1545         struct remove_event *re = info;
1546         struct perf_event *event = re->event;
1547         struct perf_event_context *ctx = event->ctx;
1548         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1549 
1550         raw_spin_lock(&ctx->lock);
1551         event_sched_out(event, cpuctx, ctx);
1552         if (re->detach_group)
1553                 perf_group_detach(event);
1554         list_del_event(event, ctx);
1555         if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
1556                 ctx->is_active = 0;
1557                 cpuctx->task_ctx = NULL;
1558         }
1559         raw_spin_unlock(&ctx->lock);
1560 
1561         return 0;
1562 }
1563 
1564 
1565 /*
1566  * Remove the event from a task's (or a CPU's) list of events.
1567  *
1568  * CPU events are removed with an smp call. For task events we only
1569  * call when the task is on a CPU.
1570  *
1571  * If event->ctx is a cloned context, callers must make sure that
1572  * every task struct that event->ctx->task could possibly point to
1573  * remains valid.  This is OK when called from perf_release since
1574  * that only calls us on the top-level context, which can't be a clone.
1575  * When called from perf_event_exit_task, it's OK because the
1576  * context has been detached from its task.
1577  */
1578 static void perf_remove_from_context(struct perf_event *event, bool detach_group)
1579 {
1580         struct perf_event_context *ctx = event->ctx;
1581         struct task_struct *task = ctx->task;
1582         struct remove_event re = {
1583                 .event = event,
1584                 .detach_group = detach_group,
1585         };
1586 
1587         lockdep_assert_held(&ctx->mutex);
1588 
1589         if (!task) {
1590                 /*
1591                  * Per cpu events are removed via an smp call and
1592                  * the removal is always successful.
1593                  */
1594                 cpu_function_call(event->cpu, __perf_remove_from_context, &re);
1595                 return;
1596         }
1597 
1598 retry:
1599         if (!task_function_call(task, __perf_remove_from_context, &re))
1600                 return;
1601 
1602         raw_spin_lock_irq(&ctx->lock);
1603         /*
1604          * If we failed to find a running task, but find the context active now
1605          * that we've acquired the ctx->lock, retry.
1606          */
1607         if (ctx->is_active) {
1608                 raw_spin_unlock_irq(&ctx->lock);
1609                 /*
1610                  * Reload the task pointer, it might have been changed by
1611                  * a concurrent perf_event_context_sched_out().
1612                  */
1613                 task = ctx->task;
1614                 goto retry;
1615         }
1616 
1617         /*
1618          * Since the task isn't running, it's safe to remove the event; our
1619          * holding the ctx->lock ensures the task won't get scheduled in.
1620          */
1621         if (detach_group)
1622                 perf_group_detach(event);
1623         list_del_event(event, ctx);
1624         raw_spin_unlock_irq(&ctx->lock);
1625 }
1626 
1627 /*
1628  * Cross CPU call to disable a performance event
1629  */
1630 int __perf_event_disable(void *info)
1631 {
1632         struct perf_event *event = info;
1633         struct perf_event_context *ctx = event->ctx;
1634         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1635 
1636         /*
1637          * If this is a per-task event, we need to check whether this
1638          * event's task is the current task on this cpu.
1639          *
1640          * Can trigger due to concurrent perf_event_context_sched_out()
1641          * flipping contexts around.
1642          */
1643         if (ctx->task && cpuctx->task_ctx != ctx)
1644                 return -EINVAL;
1645 
1646         raw_spin_lock(&ctx->lock);
1647 
1648         /*
1649          * If the event is on, turn it off.
1650          * If it is in error state, leave it in error state.
1651          */
1652         if (event->state >= PERF_EVENT_STATE_INACTIVE) {
1653                 update_context_time(ctx);
1654                 update_cgrp_time_from_event(event);
1655                 update_group_times(event);
1656                 if (event == event->group_leader)
1657                         group_sched_out(event, cpuctx, ctx);
1658                 else
1659                         event_sched_out(event, cpuctx, ctx);
1660                 event->state = PERF_EVENT_STATE_OFF;
1661         }
1662 
1663         raw_spin_unlock(&ctx->lock);
1664 
1665         return 0;
1666 }
1667 
1668 /*
1669  * Disable an event.
1670  *
1671  * If event->ctx is a cloned context, callers must make sure that
1672  * every task struct that event->ctx->task could possibly point to
1673  * remains valid.  This condition is satisfied when called through
1674  * perf_event_for_each_child or perf_event_for_each because they
1675  * hold the top-level event's child_mutex, so any descendant that
1676  * goes to exit will block in sync_child_event.
1677  * When called from perf_pending_event it's OK because event->ctx
1678  * is the current context on this CPU and preemption is disabled,
1679  * hence we can't get into perf_event_task_sched_out for this context.
1680  */
1681 static void _perf_event_disable(struct perf_event *event)
1682 {
1683         struct perf_event_context *ctx = event->ctx;
1684         struct task_struct *task = ctx->task;
1685 
1686         if (!task) {
1687                 /*
1688                  * Disable the event on the cpu that it's on
1689                  */
1690                 cpu_function_call(event->cpu, __perf_event_disable, event);
1691                 return;
1692         }
1693 
1694 retry:
1695         if (!task_function_call(task, __perf_event_disable, event))
1696                 return;
1697 
1698         raw_spin_lock_irq(&ctx->lock);
1699         /*
1700          * If the event is still active, we need to retry the cross-call.
1701          */
1702         if (event->state == PERF_EVENT_STATE_ACTIVE) {
1703                 raw_spin_unlock_irq(&ctx->lock);
1704                 /*
1705                  * Reload the task pointer, it might have been changed by
1706                  * a concurrent perf_event_context_sched_out().
1707                  */
1708                 task = ctx->task;
1709                 goto retry;
1710         }
1711 
1712         /*
1713          * Since we have the lock this context can't be scheduled
1714          * in, so we can change the state safely.
1715          */
1716         if (event->state == PERF_EVENT_STATE_INACTIVE) {
1717                 update_group_times(event);
1718                 event->state = PERF_EVENT_STATE_OFF;
1719         }
1720         raw_spin_unlock_irq(&ctx->lock);
1721 }
1722 
1723 /*
1724  * Strictly speaking kernel users cannot create groups and therefore this
1725  * interface does not need the perf_event_ctx_lock() magic.
1726  */
1727 void perf_event_disable(struct perf_event *event)
1728 {
1729         struct perf_event_context *ctx;
1730 
1731         ctx = perf_event_ctx_lock(event);
1732         _perf_event_disable(event);
1733         perf_event_ctx_unlock(event, ctx);
1734 }
1735 EXPORT_SYMBOL_GPL(perf_event_disable);
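
A minimal sketch (not part of core.c) of how an in-kernel user might drive the exported perf_event_disable() together with perf_event_enable(), which is defined further down in this file: the counter is created through perf_event_create_kernel_counter() and read back with perf_event_read_value(). The module name, CPU number and event choice are arbitrary, and error handling is trimmed.

#include <linux/module.h>
#include <linux/err.h>
#include <linux/perf_event.h>

static struct perf_event *cycles_event;

static int __init counter_sketch_init(void)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_HARDWARE,
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.size		= sizeof(attr),
		.disabled	= 1,	/* start off; toggle around the region */
	};

	/* CPU-bound counter on CPU 0, no overflow handler. */
	cycles_event = perf_event_create_kernel_counter(&attr, 0, NULL,
							NULL, NULL);
	if (IS_ERR(cycles_event))
		return PTR_ERR(cycles_event);

	perf_event_enable(cycles_event);	/* cross-call to the event's CPU */
	/* ... region of interest ... */
	perf_event_disable(cycles_event);	/* ends up in __perf_event_disable() */

	return 0;
}

static void __exit counter_sketch_exit(void)
{
	u64 enabled, running;
	u64 count = perf_event_read_value(cycles_event, &enabled, &running);

	pr_info("cycles: %llu (enabled %llu ns, running %llu ns)\n",
		(unsigned long long)count, (unsigned long long)enabled,
		(unsigned long long)running);
	perf_event_release_kernel(cycles_event);
}

module_init(counter_sketch_init);
module_exit(counter_sketch_exit);
MODULE_LICENSE("GPL");
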
1736 
1737 static void perf_set_shadow_time(struct perf_event *event,
1738                                  struct perf_event_context *ctx,
1739                                  u64 tstamp)
1740 {
1741         /*
1742          * use the correct time source for the time snapshot
1743          *
1744          * We could get by without this by leveraging the
1745          * fact that to get to this function, the caller
1746          * has most likely already called update_context_time()
1747          * and update_cgrp_time_xx() and thus both timestamps
1748          * are identical (or very close). Given that tstamp is
1749          * already adjusted for cgroup, we could say that:
1750          *    tstamp - ctx->timestamp
1751          * is equivalent to
1752          *    tstamp - cgrp->timestamp.
1753          *
1754          * Then, in perf_output_read(), the calculation would
1755          * work with no changes because:
1756          * - event is guaranteed scheduled in
1757          * - no scheduled out in between
1758          * - thus the timestamp would be the same
1759          *
1760          * But this is a bit hairy.
1761          *
1762          * So instead, we have an explicit cgroup call to remain
1763          * within the same time source all along. We believe it
1764          * is cleaner and simpler to understand.
1765          */
1766         if (is_cgroup_event(event))
1767                 perf_cgroup_set_shadow_time(event, tstamp);
1768         else
1769                 event->shadow_ctx_time = tstamp - ctx->timestamp;
1770 }
1771 
1772 #define MAX_INTERRUPTS (~0ULL)
1773 
1774 static void perf_log_throttle(struct perf_event *event, int enable);
1775 
1776 static int
1777 event_sched_in(struct perf_event *event,
1778                  struct perf_cpu_context *cpuctx,
1779                  struct perf_event_context *ctx)
1780 {
1781         u64 tstamp = perf_event_time(event);
1782         int ret = 0;
1783 
1784         lockdep_assert_held(&ctx->lock);
1785 
1786         if (event->state <= PERF_EVENT_STATE_OFF)
1787                 return 0;
1788 
1789         event->state = PERF_EVENT_STATE_ACTIVE;
1790         event->oncpu = smp_processor_id();
1791 
1792         /*
1793          * Unthrottle events: since we just got scheduled in, we might have
1794          * missed several ticks already, and for a heavily scheduling task
1795          * there is little guarantee it'll get a tick in a timely manner.
1796          */
1797         if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
1798                 perf_log_throttle(event, 1);
1799                 event->hw.interrupts = 0;
1800         }
1801 
1802         /*
1803          * The new state must be visible before we turn it on in the hardware:
1804          */
1805         smp_wmb();
1806 
1807         perf_pmu_disable(event->pmu);
1808 
1809         if (event->pmu->add(event, PERF_EF_START)) {
1810                 event->state = PERF_EVENT_STATE_INACTIVE;
1811                 event->oncpu = -1;
1812                 ret = -EAGAIN;
1813                 goto out;
1814         }
1815 
1816         event->tstamp_running += tstamp - event->tstamp_stopped;
1817 
1818         perf_set_shadow_time(event, ctx, tstamp);
1819 
1820         if (!is_software_event(event))
1821                 cpuctx->active_oncpu++;
1822         ctx->nr_active++;
1823         if (event->attr.freq && event->attr.sample_freq)
1824                 ctx->nr_freq++;
1825 
1826         if (event->attr.exclusive)
1827                 cpuctx->exclusive = 1;
1828 
1829 out:
1830         perf_pmu_enable(event->pmu);
1831 
1832         return ret;
1833 }
1834 
1835 static int
1836 group_sched_in(struct perf_event *group_event,
1837                struct perf_cpu_context *cpuctx,
1838                struct perf_event_context *ctx)
1839 {
1840         struct perf_event *event, *partial_group = NULL;
1841         struct pmu *pmu = ctx->pmu;
1842         u64 now = ctx->time;
1843         bool simulate = false;
1844 
1845         if (group_event->state == PERF_EVENT_STATE_OFF)
1846                 return 0;
1847 
1848         pmu->start_txn(pmu);
1849 
1850         if (event_sched_in(group_event, cpuctx, ctx)) {
1851                 pmu->cancel_txn(pmu);
1852                 perf_cpu_hrtimer_restart(cpuctx);
1853                 return -EAGAIN;
1854         }
1855 
1856         /*
1857          * Schedule in siblings as one group (if any):
1858          */
1859         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1860                 if (event_sched_in(event, cpuctx, ctx)) {
1861                         partial_group = event;
1862                         goto group_error;
1863                 }
1864         }
1865 
1866         if (!pmu->commit_txn(pmu))
1867                 return 0;
1868 
1869 group_error:
1870         /*
1871          * Groups can be scheduled in as one unit only, so undo any
1872          * partial group before returning:
1873          * The events up to the failed event are scheduled out normally,
1874          * tstamp_stopped will be updated.
1875          *
1876          * The failed events and the remaining siblings need to have
1877          * their timings updated as if they had gone through event_sched_in()
1878          * and event_sched_out(). This is required to get consistent timings
1879          * across the group. This also takes care of the case where the group
1880          * could never be scheduled by ensuring tstamp_stopped is set to mark
1881          * the time the event was actually stopped, such that time delta
1882          * calculation in update_event_times() is correct.
1883          */
1884         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1885                 if (event == partial_group)
1886                         simulate = true;
1887 
1888                 if (simulate) {
1889                         event->tstamp_running += now - event->tstamp_stopped;
1890                         event->tstamp_stopped = now;
1891                 } else {
1892                         event_sched_out(event, cpuctx, ctx);
1893                 }
1894         }
1895         event_sched_out(group_event, cpuctx, ctx);
1896 
1897         pmu->cancel_txn(pmu);
1898 
1899         perf_cpu_hrtimer_restart(cpuctx);
1900 
1901         return -EAGAIN;
1902 }
1903 
1904 /*
1905  * Work out whether we can put this event group on the CPU now.
1906  */
1907 static int group_can_go_on(struct perf_event *event,
1908                            struct perf_cpu_context *cpuctx,
1909                            int can_add_hw)
1910 {
1911         /*
1912          * Groups consisting entirely of software events can always go on.
1913          */
1914         if (event->group_flags & PERF_GROUP_SOFTWARE)
1915                 return 1;
1916         /*
1917          * If an exclusive group is already on, no other hardware
1918          * events can go on.
1919          */
1920         if (cpuctx->exclusive)
1921                 return 0;
1922         /*
1923          * If this group is exclusive and there are already
1924          * events on the CPU, it can't go on.
1925          */
1926         if (event->attr.exclusive && cpuctx->active_oncpu)
1927                 return 0;
1928         /*
1929          * Otherwise, try to add it if all previous groups were able
1930          * to go on.
1931          */
1932         return can_add_hw;
1933 }
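
To make the exclusive rule above concrete, here is a hedged userspace sketch (event choices and the perf_event_open() wrapper are illustrative): the leader requests attr.exclusive, so once this group is scheduled, event_sched_in() sets cpuctx->exclusive and group_can_go_on() keeps other hardware groups off until it is scheduled out again.

#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

/* Thin wrapper; glibc provides no perf_event_open() stub. */
static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int open_exclusive_group(void)
{
	struct perf_event_attr attr;
	int leader, sibling;

	memset(&attr, 0, sizeof(attr));
	attr.size      = sizeof(attr);
	attr.type      = PERF_TYPE_HARDWARE;
	attr.config    = PERF_COUNT_HW_CPU_CYCLES;
	attr.exclusive = 1;		/* cpuctx->exclusive while this group is on */
	attr.disabled  = 1;

	leader = perf_event_open(&attr, 0, -1, -1, 0);	/* this task, any CPU */

	attr.exclusive = 0;
	attr.config    = PERF_COUNT_HW_INSTRUCTIONS;
	attr.disabled  = 0;
	sibling = perf_event_open(&attr, 0, -1, leader, 0);	/* same group */

	return (leader >= 0 && sibling >= 0) ? leader : -1;
}
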
1934 
1935 static void add_event_to_ctx(struct perf_event *event,
1936                                struct perf_event_context *ctx)
1937 {
1938         u64 tstamp = perf_event_time(event);
1939 
1940         list_add_event(event, ctx);
1941         perf_group_attach(event);
1942         event->tstamp_enabled = tstamp;
1943         event->tstamp_running = tstamp;
1944         event->tstamp_stopped = tstamp;
1945 }
1946 
1947 static void task_ctx_sched_out(struct perf_event_context *ctx);
1948 static void
1949 ctx_sched_in(struct perf_event_context *ctx,
1950              struct perf_cpu_context *cpuctx,
1951              enum event_type_t event_type,
1952              struct task_struct *task);
1953 
1954 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
1955                                 struct perf_event_context *ctx,
1956                                 struct task_struct *task)
1957 {
1958         cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
1959         if (ctx)
1960                 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
1961         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
1962         if (ctx)
1963                 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
1964 }
1965 
1966 /*
1967  * Cross CPU call to install and enable a performance event
1968  *
1969  * Must be called with ctx->mutex held
1970  */
1971 static int  __perf_install_in_context(void *info)
1972 {
1973         struct perf_event *event = info;
1974         struct perf_event_context *ctx = event->ctx;
1975         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1976         struct perf_event_context *task_ctx = cpuctx->task_ctx;
1977         struct task_struct *task = current;
1978 
1979         perf_ctx_lock(cpuctx, task_ctx);
1980         perf_pmu_disable(cpuctx->ctx.pmu);
1981 
1982         /*
1983          * If there was an active task_ctx schedule it out.
1984          */
1985         if (task_ctx)
1986                 task_ctx_sched_out(task_ctx);
1987 
1988         /*
1989          * If the context we're installing events in is not the
1990          * active task_ctx, flip them.
1991          */
1992         if (ctx->task && task_ctx != ctx) {
1993                 if (task_ctx)
1994                         raw_spin_unlock(&task_ctx->lock);
1995                 raw_spin_lock(&ctx->lock);
1996                 task_ctx = ctx;
1997         }
1998 
1999         if (task_ctx) {
2000                 cpuctx->task_ctx = task_ctx;
2001                 task = task_ctx->task;
2002         }
2003 
2004         cpu_ctx_sched_out(cpuctx, EVENT_ALL);
2005 
2006         update_context_time(ctx);
2007         /*
2008          * update cgrp time only if current cgrp
2009          * matches event->cgrp. Must be done before
2010          * calling add_event_to_ctx()
2011          */
2012         update_cgrp_time_from_event(event);
2013 
2014         add_event_to_ctx(event, ctx);
2015 
2016         /*
2017          * Schedule everything back in
2018          */
2019         perf_event_sched_in(cpuctx, task_ctx, task);
2020 
2021         perf_pmu_enable(cpuctx->ctx.pmu);
2022         perf_ctx_unlock(cpuctx, task_ctx);
2023 
2024         return 0;
2025 }
2026 
2027 /*
2028  * Attach a performance event to a context
2029  *
2030  * First we add the event to the list with the hardware enable bit
2031  * in event->hw_config cleared.
2032  *
2033  * If the event is attached to a task which is on a CPU we use an smp
2034  * call to enable it in the task context. The task might have been
2035  * scheduled away, but we check this in the smp call again.
2036  */
2037 static void
2038 perf_install_in_context(struct perf_event_context *ctx,
2039                         struct perf_event *event,
2040                         int cpu)
2041 {
2042         struct task_struct *task = ctx->task;
2043 
2044         lockdep_assert_held(&ctx->mutex);
2045 
2046         event->ctx = ctx;
2047         if (event->cpu != -1)
2048                 event->cpu = cpu;
2049 
2050         if (!task) {
2051                 /*
2052                  * Per cpu events are installed via an smp call and
2053                  * the install is always successful.
2054                  */
2055                 cpu_function_call(cpu, __perf_install_in_context, event);
2056                 return;
2057         }
2058 
2059 retry:
2060         if (!task_function_call(task, __perf_install_in_context, event))
2061                 return;
2062 
2063         raw_spin_lock_irq(&ctx->lock);
2064         /*
2065          * If we failed to find a running task, but find the context active now
2066          * that we've acquired the ctx->lock, retry.
2067          */
2068         if (ctx->is_active) {
2069                 raw_spin_unlock_irq(&ctx->lock);
2070                 /*
2071                  * Reload the task pointer, it might have been changed by
2072                  * a concurrent perf_event_context_sched_out().
2073                  */
2074                 task = ctx->task;
2075                 goto retry;
2076         }
2077 
2078         /*
2079          * Since the task isn't running, it's safe to add the event; our holding
2080          * the ctx->lock ensures the task won't get scheduled in.
2081          */
2082         add_event_to_ctx(event, ctx);
2083         raw_spin_unlock_irq(&ctx->lock);
2084 }
2085 
2086 /*
2087  * Put an event into inactive state and update its time fields.
2088  * Enabling the leader of a group effectively enables all
2089  * the group members that aren't explicitly disabled, so we
2090  * have to update their ->tstamp_enabled also.
2091  * Note: this works for group members as well as group leaders
2092  * since the non-leader members' sibling_lists will be empty.
2093  */
2094 static void __perf_event_mark_enabled(struct perf_event *event)
2095 {
2096         struct perf_event *sub;
2097         u64 tstamp = perf_event_time(event);
2098 
2099         event->state = PERF_EVENT_STATE_INACTIVE;
2100         event->tstamp_enabled = tstamp - event->total_time_enabled;
2101         list_for_each_entry(sub, &event->sibling_list, group_entry) {
2102                 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
2103                         sub->tstamp_enabled = tstamp - sub->total_time_enabled;
2104         }
2105 }
2106 
2107 /*
2108  * Cross CPU call to enable a performance event
2109  */
2110 static int __perf_event_enable(void *info)
2111 {
2112         struct perf_event *event = info;
2113         struct perf_event_context *ctx = event->ctx;
2114         struct perf_event *leader = event->group_leader;
2115         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2116         int err;
2117 
2118         /*
2119          * There's a time window between 'ctx->is_active' check
2120          * in the perf_event_enable() function and this place having:
2121          *   - IRQs on
2122          *   - ctx->lock unlocked
2123          *
2124          * where the task could be killed and 'ctx' deactivated
2125          * by perf_event_exit_task.
2126          */
2127         if (!ctx->is_active)
2128                 return -EINVAL;
2129 
2130         raw_spin_lock(&ctx->lock);
2131         update_context_time(ctx);
2132 
2133         if (event->state >= PERF_EVENT_STATE_INACTIVE)
2134                 goto unlock;
2135 
2136         /*
2137          * set current task's cgroup time reference point
2138          */
2139         perf_cgroup_set_timestamp(current, ctx);
2140 
2141         __perf_event_mark_enabled(event);
2142 
2143         if (!event_filter_match(event)) {
2144                 if (is_cgroup_event(event))
2145                         perf_cgroup_defer_enabled(event);
2146                 goto unlock;
2147         }
2148 
2149         /*
2150          * If the event is in a group and isn't the group leader,
2151          * then don't put it on unless the group is on.
2152          */
2153         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
2154                 goto unlock;
2155 
2156         if (!group_can_go_on(event, cpuctx, 1)) {
2157                 err = -EEXIST;
2158         } else {
2159                 if (event == leader)
2160                         err = group_sched_in(event, cpuctx, ctx);
2161                 else
2162                         err = event_sched_in(event, cpuctx, ctx);
2163         }
2164 
2165         if (err) {
2166                 /*
2167                  * If this event can't go on and it's part of a
2168                  * group, then the whole group has to come off.
2169                  */
2170                 if (leader != event) {
2171                         group_sched_out(leader, cpuctx, ctx);
2172                         perf_cpu_hrtimer_restart(cpuctx);
2173                 }
2174                 if (leader->attr.pinned) {
2175                         update_group_times(leader);
2176                         leader->state = PERF_EVENT_STATE_ERROR;
2177                 }
2178         }
2179 
2180 unlock:
2181         raw_spin_unlock(&ctx->lock);
2182 
2183         return 0;
2184 }
2185 
2186 /*
2187  * Enable an event.
2188  *
2189  * If event->ctx is a cloned context, callers must make sure that
2190  * every task struct that event->ctx->task could possibly point to
2191  * remains valid.  This condition is satisfied when called through
2192  * perf_event_for_each_child or perf_event_for_each as described
2193  * for perf_event_disable.
2194  */
2195 static void _perf_event_enable(struct perf_event *event)
2196 {
2197         struct perf_event_context *ctx = event->ctx;
2198         struct task_struct *task = ctx->task;
2199 
2200         if (!task) {
2201                 /*
2202                  * Enable the event on the cpu that it's on
2203                  */
2204                 cpu_function_call(event->cpu, __perf_event_enable, event);
2205                 return;
2206         }
2207 
2208         raw_spin_lock_irq(&ctx->lock);
2209         if (event->state >= PERF_EVENT_STATE_INACTIVE)
2210                 goto out;
2211 
2212         /*
2213          * If the event is in error state, clear that first.
2214          * That way, if we see the event in error state below, we
2215          * know that it has gone back into error state, as distinct
2216          * from the task having been scheduled away before the
2217          * cross-call arrived.
2218          */
2219         if (event->state == PERF_EVENT_STATE_ERROR)
2220                 event->state = PERF_EVENT_STATE_OFF;
2221 
2222 retry:
2223         if (!ctx->is_active) {
2224                 __perf_event_mark_enabled(event);
2225                 goto out;
2226         }
2227 
2228         raw_spin_unlock_irq(&ctx->lock);
2229 
2230         if (!task_function_call(task, __perf_event_enable, event))
2231                 return;
2232 
2233         raw_spin_lock_irq(&ctx->lock);
2234 
2235         /*
2236          * If the context is active and the event is still off,
2237          * we need to retry the cross-call.
2238          */
2239         if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
2240                 /*
2241                  * task could have been flipped by a concurrent
2242                  * perf_event_context_sched_out()
2243                  */
2244                 task = ctx->task;
2245                 goto retry;
2246         }
2247 
2248 out:
2249         raw_spin_unlock_irq(&ctx->lock);
2250 }
2251 
2252 /*
2253  * See perf_event_disable();
2254  */
2255 void perf_event_enable(struct perf_event *event)
2256 {
2257         struct perf_event_context *ctx;
2258 
2259         ctx = perf_event_ctx_lock(event);
2260         _perf_event_enable(event);
2261         perf_event_ctx_unlock(event, ctx);
2262 }
2263 EXPORT_SYMBOL_GPL(perf_event_enable);
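
Seen from userspace, the same on/off toggle is reached through the perf ioctls, which end up in _perf_event_disable()/_perf_event_enable() above; a small sketch, assuming 'fd' was returned by perf_event_open():

#include <sys/ioctl.h>
#include <linux/perf_event.h>

/* Exclude an uninteresting region from an already-open counter 'fd'. */
static void count_around(int fd, void (*uninteresting)(void))
{
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);	/* -> _perf_event_disable() */
	uninteresting();
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);	/* -> _perf_event_enable()  */
}
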
2264 
2265 static int _perf_event_refresh(struct perf_event *event, int refresh)
2266 {
2267         /*
2268          * not supported on inherited events
2269          */
2270         if (event->attr.inherit || !is_sampling_event(event))
2271                 return -EINVAL;
2272 
2273         atomic_add(refresh, &event->event_limit);
2274         _perf_event_enable(event);
2275 
2276         return 0;
2277 }
2278 
2279 /*
2280  * See perf_event_disable()
2281  */
2282 int perf_event_refresh(struct perf_event *event, int refresh)
2283 {
2284         struct perf_event_context *ctx;
2285         int ret;
2286 
2287         ctx = perf_event_ctx_lock(event);
2288         ret = _perf_event_refresh(event, refresh);
2289         perf_event_ctx_unlock(event, ctx);
2290 
2291         return ret;
2292 }
2293 EXPORT_SYMBOL_GPL(perf_event_refresh);
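
A hedged userspace sketch of the refresh semantics enforced above: the event must be a sampling event and must not be inherited, and each overflow consumes one unit of the refresh count before the event is automatically disabled again. The period and event choice are arbitrary.

#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

/* Arm an instruction counter for exactly three overflow notifications. */
int arm_for_three_overflows(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size          = sizeof(attr);
	attr.type          = PERF_TYPE_HARDWARE;
	attr.config        = PERF_COUNT_HW_INSTRUCTIONS;
	attr.sample_period = 1000000;	/* sampling event: is_sampling_event() */
	attr.disabled      = 1;		/* stays off until the refresh below   */
	/* attr.inherit left 0: inherited events are rejected above (-EINVAL) */

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		return -1;

	/* event_limit += 3, then enable; auto-disabled after 3 overflows */
	ioctl(fd, PERF_EVENT_IOC_REFRESH, 3);
	return fd;
}
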
2294 
2295 static void ctx_sched_out(struct perf_event_context *ctx,
2296                           struct perf_cpu_context *cpuctx,
2297                           enum event_type_t event_type)
2298 {
2299         struct perf_event *event;
2300         int is_active = ctx->is_active;
2301 
2302         ctx->is_active &= ~event_type;
2303         if (likely(!ctx->nr_events))
2304                 return;
2305 
2306         update_context_time(ctx);
2307         update_cgrp_time_from_cpuctx(cpuctx);
2308         if (!ctx->nr_active)
2309                 return;
2310 
2311         perf_pmu_disable(ctx->pmu);
2312         if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
2313                 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
2314                         group_sched_out(event, cpuctx, ctx);
2315         }
2316 
2317         if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
2318                 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
2319                         group_sched_out(event, cpuctx, ctx);
2320         }
2321         perf_pmu_enable(ctx->pmu);
2322 }
2323 
2324 /*
2325  * Test whether two contexts are equivalent, i.e. whether they have both been
2326  * cloned from the same version of the same context.
2327  *
2328  * Equivalence is measured using a generation number in the context that is
2329  * incremented on each modification to it; see unclone_ctx(), list_add_event()
2330  * and list_del_event().
2331  */
2332 static int context_equiv(struct perf_event_context *ctx1,
2333                          struct perf_event_context *ctx2)
2334 {
2335         lockdep_assert_held(&ctx1->lock);
2336         lockdep_assert_held(&ctx2->lock);
2337 
2338         /* Pinning disables the swap optimization */
2339         if (ctx1->pin_count || ctx2->pin_count)
2340                 return 0;
2341 
2342         /* If ctx1 is the parent of ctx2 */
2343         if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2344                 return 1;
2345 
2346         /* If ctx2 is the parent of ctx1 */
2347         if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2348                 return 1;
2349 
2350         /*
2351          * If ctx1 and ctx2 have the same parent; we flatten the parent
2352          * hierarchy, see perf_event_init_context().
2353          */
2354         if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2355                         ctx1->parent_gen == ctx2->parent_gen)
2356                 return 1;
2357 
2358         /* Unmatched */
2359         return 0;
2360 }
2361 
2362 static void __perf_event_sync_stat(struct perf_event *event,
2363                                      struct perf_event *next_event)
2364 {
2365         u64 value;
2366 
2367         if (!event->attr.inherit_stat)
2368                 return;
2369 
2370         /*
2371          * Update the event value; we cannot use perf_event_read()
2372          * because we're in the middle of a context switch and have IRQs
2373          * disabled, which upsets smp_call_function_single(). However,
2374          * we know the event must be on the current CPU, therefore we
2375          * don't need to use it.
2376          */
2377         switch (event->state) {
2378         case PERF_EVENT_STATE_ACTIVE:
2379                 event->pmu->read(event);
2380                 /* fall-through */
2381 
2382         case PERF_EVENT_STATE_INACTIVE:
2383                 update_event_times(event);
2384                 break;
2385 
2386         default:
2387                 break;
2388         }
2389 
2390         /*
2391          * In order to keep per-task stats reliable we need to flip the event
2392          * values when we flip the contexts.
2393          */
2394         value = local64_read(&next_event->count);
2395         value = local64_xchg(&event->count, value);
2396         local64_set(&next_event->count, value);
2397 
2398         swap(event->total_time_enabled, next_event->total_time_enabled);
2399         swap(event->total_time_running, next_event->total_time_running);
2400 
2401         /*
2402          * Since we swizzled the values, update the user visible data too.
2403          */
2404         perf_event_update_userpage(event);
2405         perf_event_update_userpage(next_event);
2406 }
2407 
2408 static void perf_event_sync_stat(struct perf_event_context *ctx,
2409                                    struct perf_event_context *next_ctx)
2410 {
2411         struct perf_event *event, *next_event;
2412 
2413         if (!ctx->nr_stat)
2414                 return;
2415 
2416         update_context_time(ctx);
2417 
2418         event = list_first_entry(&ctx->event_list,
2419                                    struct perf_event, event_entry);
2420 
2421         next_event = list_first_entry(&next_ctx->event_list,
2422                                         struct perf_event, event_entry);
2423 
2424         while (&event->event_entry != &ctx->event_list &&
2425                &next_event->event_entry != &next_ctx->event_list) {
2426 
2427                 __perf_event_sync_stat(event, next_event);
2428 
2429                 event = list_next_entry(event, event_entry);
2430                 next_event = list_next_entry(next_event, event_entry);
2431         }
2432 }
2433 
2434 static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2435                                          struct task_struct *next)
2436 {
2437         struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
2438         struct perf_event_context *next_ctx;
2439         struct perf_event_context *parent, *next_parent;
2440         struct perf_cpu_context *cpuctx;
2441         int do_switch = 1;
2442 
2443         if (likely(!ctx))
2444                 return;
2445 
2446         cpuctx = __get_cpu_context(ctx);
2447         if (!cpuctx->task_ctx)
2448                 return;
2449 
2450         rcu_read_lock();
2451         next_ctx = next->perf_event_ctxp[ctxn];
2452         if (!next_ctx)
2453                 goto unlock;
2454 
2455         parent = rcu_dereference(ctx->parent_ctx);
2456         next_parent = rcu_dereference(next_ctx->parent_ctx);
2457 
2458         /* If neither context has a parent context, they cannot be clones. */
2459         if (!parent || !next_parent)
2460                 goto unlock;
2461 
2462         if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
2463                 /*
2464                  * Looks like the two contexts are clones, so we might be
2465                  * able to optimize the context switch.  We lock both
2466                  * contexts and check that they are clones under the
2467                  * lock (including re-checking that neither has been
2468                  * uncloned in the meantime).  It doesn't matter which
2469                  * order we take the locks because no other cpu could
2470                  * be trying to lock both of these tasks.
2471                  */
2472                 raw_spin_lock(&ctx->lock);
2473                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
2474                 if (context_equiv(ctx, next_ctx)) {
2475                         /*
2476                          * XXX do we need a memory barrier of sorts
2477                          * wrt to rcu_dereference() of perf_event_ctxp
2478                          */
2479                         task->perf_event_ctxp[ctxn] = next_ctx;
2480                         next->perf_event_ctxp[ctxn] = ctx;
2481                         ctx->task = next;
2482                         next_ctx->task = task;
2483                         do_switch = 0;
2484 
2485                         perf_event_sync_stat(ctx, next_ctx);
2486                 }
2487                 raw_spin_unlock(&next_ctx->lock);
2488                 raw_spin_unlock(&ctx->lock);
2489         }
2490 unlock:
2491         rcu_read_unlock();
2492 
2493         if (do_switch) {
2494                 raw_spin_lock(&ctx->lock);
2495                 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2496                 cpuctx->task_ctx = NULL;
2497                 raw_spin_unlock(&ctx->lock);
2498         }
2499 }
2500 
2501 #define for_each_task_context_nr(ctxn)                                  \
2502         for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2503 
2504 /*
2505  * Called from scheduler to remove the events of the current task,
2506  * with interrupts disabled.
2507  *
2508  * We stop each event and update the event value in event->count.
2509  *
2510  * This does not protect us against NMI, but disable()
2511  * sets the disabled bit in the control field of event _before_
2512  * accessing the event control register. If an NMI hits, then it will
2513  * not restart the event.
2514  */
2515 void __perf_event_task_sched_out(struct task_struct *task,
2516                                  struct task_struct *next)
2517 {
2518         int ctxn;
2519 
2520         for_each_task_context_nr(ctxn)
2521                 perf_event_context_sched_out(task, ctxn, next);
2522 
2523         /*
2524          * if cgroup events exist on this CPU, then we need
2525          * to check if we have to switch out PMU state.
2526          * cgroup events are system-wide mode only
2527          */
2528         if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2529                 perf_cgroup_sched_out(task, next);
2530 }
2531 
2532 static void task_ctx_sched_out(struct perf_event_context *ctx)
2533 {
2534         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2535 
2536         if (!cpuctx->task_ctx)
2537                 return;
2538 
2539         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2540                 return;
2541 
2542         ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2543         cpuctx->task_ctx = NULL;
2544 }
2545 
2546 /*
2547  * Called with IRQs disabled
2548  */
2549 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
2550                               enum event_type_t event_type)
2551 {
2552         ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
2553 }
2554 
2555 static void
2556 ctx_pinned_sched_in(struct perf_event_context *ctx,
2557                     struct perf_cpu_context *cpuctx)
2558 {
2559         struct perf_event *event;
2560 
2561         list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2562                 if (event->state <= PERF_EVENT_STATE_OFF)
2563                         continue;
2564                 if (!event_filter_match(event))
2565                         continue;
2566 
2567                 /* may need to reset tstamp_enabled */
2568                 if (is_cgroup_event(event))
2569                         perf_cgroup_mark_enabled(event, ctx);
2570 
2571                 if (group_can_go_on(event, cpuctx, 1))
2572                         group_sched_in(event, cpuctx, ctx);
2573 
2574                 /*
2575                  * If this pinned group hasn't been scheduled,
2576                  * put it in error state.
2577                  */
2578                 if (event->state == PERF_EVENT_STATE_INACTIVE) {
2579                         update_group_times(event);
2580                         event->state = PERF_EVENT_STATE_ERROR;
2581                 }
2582         }
2583 }
2584 
2585 static void
2586 ctx_flexible_sched_in(struct perf_event_context *ctx,
2587                       struct perf_cpu_context *cpuctx)
2588 {
2589         struct perf_event *event;
2590         int can_add_hw = 1;
2591 
2592         list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2593                 /* Ignore events in OFF or ERROR state */
2594                 if (event->state <= PERF_EVENT_STATE_OFF)
2595                         continue;
2596                 /*
2597                  * Listen to the 'cpu' scheduling filter constraint
2598                  * of events:
2599                  */
2600                 if (!event_filter_match(event))
2601                         continue;
2602 
2603                 /* may need to reset tstamp_enabled */
2604                 if (is_cgroup_event(event))
2605                         perf_cgroup_mark_enabled(event, ctx);
2606 
2607                 if (group_can_go_on(event, cpuctx, can_add_hw)) {
2608                         if (group_sched_in(event, cpuctx, ctx))
2609                                 can_add_hw = 0;
2610                 }
2611         }
2612 }
2613 
2614 static void
2615 ctx_sched_in(struct perf_event_context *ctx,
2616              struct perf_cpu_context *cpuctx,
2617              enum event_type_t event_type,
2618              struct task_struct *task)
2619 {
2620         u64 now;
2621         int is_active = ctx->is_active;
2622 
2623         ctx->is_active |= event_type;
2624         if (likely(!ctx->nr_events))
2625                 return;
2626 
2627         now = perf_clock();
2628         ctx->timestamp = now;
2629         perf_cgroup_set_timestamp(task, ctx);
2630         /*
2631          * First go through the list and put on any pinned groups
2632          * in order to give them the best chance of going on.
2633          */
2634         if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
2635                 ctx_pinned_sched_in(ctx, cpuctx);
2636 
2637         /* Then walk through the lower prio flexible groups */
2638         if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
2639                 ctx_flexible_sched_in(ctx, cpuctx);
2640 }
2641 
2642 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
2643                              enum event_type_t event_type,
2644                              struct task_struct *task)
2645 {
2646         struct perf_event_context *ctx = &cpuctx->ctx;
2647 
2648         ctx_sched_in(ctx, cpuctx, event_type, task);
2649 }
2650 
2651 static void perf_event_context_sched_in(struct perf_event_context *ctx,
2652                                         struct task_struct *task)
2653 {
2654         struct perf_cpu_context *cpuctx;
2655 
2656         cpuctx = __get_cpu_context(ctx);
2657         if (cpuctx->task_ctx == ctx)
2658                 return;
2659 
2660         perf_ctx_lock(cpuctx, ctx);
2661         perf_pmu_disable(ctx->pmu);
2662         /*
2663          * We want to keep the following priority order:
2664          * cpu pinned (that don't need to move), task pinned,
2665          * cpu flexible, task flexible.
2666          */
2667         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2668 
2669         if (ctx->nr_events)
2670                 cpuctx->task_ctx = ctx;
2671 
2672         perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
2673 
2674         perf_pmu_enable(ctx->pmu);
2675         perf_ctx_unlock(cpuctx, ctx);
2676 
2677         /*
2678          * Since these rotations are per-cpu, we need to ensure the
2679          * cpu-context we got scheduled on is actually rotating.
2680          */
2681         perf_pmu_rotate_start(ctx->pmu);
2682 }
2683 
2684 /*
2685  * When sampling the branch stack in system-wide mode, it may be necessary
2686  * to flush the stack on context switch. This happens when the branch
2687  * stack does not tag its entries with the pid of the current task.
2688  * Otherwise it becomes impossible to associate a branch entry with a
2689  * task. This ambiguity is more likely to appear when the branch stack
2690  * supports priv level filtering and the user sets it to monitor only
2691  * at the user level (which could be a useful measurement in system-wide
2692  * mode). In that case, the risk is high of having a branch stack with
2693  * branches from multiple tasks. Flushing may mean dropping the existing
2694  * entries or stashing them somewhere in the PMU specific code layer.
2695  *
2696  * This function provides the context switch callback to the lower code
2697  * layer. It is invoked ONLY when there is at least one system-wide context
2698  * with at least one active event using taken branch sampling.
2699  */
2700 static void perf_branch_stack_sched_in(struct task_struct *prev,
2701                                        struct task_struct *task)
2702 {
2703         struct perf_cpu_context *cpuctx;
2704         struct pmu *pmu;
2705         unsigned long flags;
2706 
2707         /* no need to flush branch stack if not changing task */
2708         if (prev == task)
2709                 return;
2710 
2711         local_irq_save(flags);
2712 
2713         rcu_read_lock();
2714 
2715         list_for_each_entry_rcu(pmu, &pmus, entry) {
2716                 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2717 
2718                 /*
2719                  * check if the context has at least one
2720                  * event using PERF_SAMPLE_BRANCH_STACK
2721                  */
2722                 if (cpuctx->ctx.nr_branch_stack > 0
2723                     && pmu->flush_branch_stack) {
2724 
2725                         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2726 
2727                         perf_pmu_disable(pmu);
2728 
2729                         pmu->flush_branch_stack();
2730 
2731                         perf_pmu_enable(pmu);
2732 
2733                         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2734                 }
2735         }
2736 
2737         rcu_read_unlock();
2738 
2739         local_irq_restore(flags);
2740 }
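
The scenario the comment above describes (user-level-only branch filtering in system-wide mode) is configured from userspace roughly as follows; a sketch with arbitrary period and event choice:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

/* System-wide (per-CPU) branch sampling, user-level branches only. */
int open_branch_sampler(int cpu)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size               = sizeof(attr);
	attr.type               = PERF_TYPE_HARDWARE;
	attr.config             = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period      = 100000;
	attr.sample_type        = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
	attr.branch_sample_type = PERF_SAMPLE_BRANCH_USER |
				  PERF_SAMPLE_BRANCH_ANY;

	/* pid == -1, cpu >= 0: system-wide on that CPU */
	return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
}
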
2741 
2742 /*
2743  * Called from scheduler to add the events of the current task
2744  * with interrupts disabled.
2745  *
2746  * We restore the event value and then enable it.
2747  *
2748  * This does not protect us against NMI, but enable()
2749  * sets the enabled bit in the control field of event _before_
2750  * accessing the event control register. If an NMI hits, then it will
2751  * keep the event running.
2752  */
2753 void __perf_event_task_sched_in(struct task_struct *prev,
2754                                 struct task_struct *task)
2755 {
2756         struct perf_event_context *ctx;
2757         int ctxn;
2758 
2759         for_each_task_context_nr(ctxn) {
2760                 ctx = task->perf_event_ctxp[ctxn];
2761                 if (likely(!ctx))
2762                         continue;
2763 
2764                 perf_event_context_sched_in(ctx, task);
2765         }
2766         /*
2767          * if cgroup events exist on this CPU, then we need
2768          * to check if we have to switch in PMU state.
2769          * cgroup events are system-wide mode only
2770          */
2771         if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2772                 perf_cgroup_sched_in(prev, task);
2773 
2774         /* check for system-wide branch_stack events */
2775         if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
2776                 perf_branch_stack_sched_in(prev, task);
2777 }
2778 
2779 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
2780 {
2781         u64 frequency = event->attr.sample_freq;
2782         u64 sec = NSEC_PER_SEC;
2783         u64 divisor, dividend;
2784 
2785         int count_fls, nsec_fls, frequency_fls, sec_fls;
2786 
2787         count_fls = fls64(count);
2788         nsec_fls = fls64(nsec);
2789         frequency_fls = fls64(frequency);
2790         sec_fls = 30;
2791 
2792         /*
2793          * We got @count in @nsec, with a target of sample_freq HZ
2794          * the target period becomes:
2795          *
2796          *             @count * 10^9
2797          * period = -------------------
2798          *          @nsec * sample_freq
2799          *
2800          */
2801 
2802         /*
2803          * Reduce accuracy by one bit such that @a and @b converge
2804          * to a similar magnitude.
2805          */
2806 #define REDUCE_FLS(a, b)                \
2807 do {                                    \
2808         if (a##_fls > b##_fls) {        \
2809                 a >>= 1;                \
2810                 a##_fls--;              \
2811         } else {                        \
2812                 b >>= 1;                \
2813                 b##_fls--;              \
2814         }                               \
2815 } while (0)
2816 
2817         /*
2818          * Reduce accuracy until either term fits in a u64, then proceed with
2819          * the other, so that finally we can do a u64/u64 division.
2820          */
2821         while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
2822                 REDUCE_FLS(nsec, frequency);
2823                 REDUCE_FLS(sec, count);
2824         }
2825 
2826         if (count_fls + sec_fls > 64) {
2827                 divisor = nsec * frequency;
2828 
2829                 while (count_fls + sec_fls > 64) {
2830                         REDUCE_FLS(count, sec);
2831                         divisor >>= 1;
2832                 }
2833 
2834                 dividend = count * sec;
2835         } else {
2836                 dividend = count * sec;
2837 
2838                 while (nsec_fls + frequency_fls > 64) {
2839                         REDUCE_FLS(nsec, frequency);
2840                         dividend >>= 1;
2841                 }
2842 
2843                 divisor = nsec * frequency;
2844         }
2845 
2846         if (!divisor)
2847                 return dividend;
2848 
2849         return div64_u64(dividend, divisor);
2850 }
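
A worked instance of the formula in the comment above, with round numbers chosen purely for illustration:

/*
 * Suppose @count = 1,000,000 events were observed over @nsec = 10 ms
 * (10,000,000 ns) with sample_freq = 1000 Hz.  Then:
 *
 *              1,000,000 * 10^9
 *   period = -------------------- = 100,000 events per sample
 *            10,000,000 * 1,000
 *
 * i.e. at ~10^8 events/sec, a period of 100,000 yields roughly the
 * requested 1000 samples/sec.
 */
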
2851 
2852 static DEFINE_PER_CPU(int, perf_throttled_count);
2853 static DEFINE_PER_CPU(u64, perf_throttled_seq);
2854 
2855 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
2856 {
2857         struct hw_perf_event *hwc = &event->hw;
2858         s64 period, sample_period;
2859         s64 delta;
2860 
2861         period = perf_calculate_period(event, nsec, count);
2862 
2863         delta = (s64)(period - hwc->sample_period);
2864         delta = (delta + 7) / 8; /* low pass filter */
2865 
2866         sample_period = hwc->sample_period + delta;
2867 
2868         if (!sample_period)
2869                 sample_period = 1;
2870 
2871         hwc->sample_period = sample_period;
2872 
2873         if (local64_read(&hwc->period_left) > 8*sample_period) {
2874                 if (disable)
2875                         event->pmu->stop(event, PERF_EF_UPDATE);
2876 
2877                 local64_set(&hwc->period_left, 0);
2878 
2879                 if (disable)
2880                         event->pmu->start(event, PERF_EF_RELOAD);
2881         }
2882 }
2883 
2884 /*
2885  * combine freq adjustment with unthrottling to avoid two passes over the
2886  * events. At the same time, make sure that having freq events does not change
2887  * the rate of unthrottling as that would introduce bias.
2888  */
2889 static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2890                                            int needs_unthr)
2891 {
2892         struct perf_event *event;
2893         struct hw_perf_event *hwc;
2894         u64 now, period = TICK_NSEC;
2895         s64 delta;
2896 
2897         /*
2898          * we only need to iterate over all events if:
2899          * - the context has events in frequency mode (needs freq adjust), or
2900          * - there are events to unthrottle on this cpu
2901          */
2902         if (!(ctx->nr_freq || needs_unthr))
2903                 return;
2904 
2905         raw_spin_lock(&ctx->lock);
2906         perf_pmu_disable(ctx->pmu);
2907 
2908         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2909                 if (event->state != PERF_EVENT_STATE_ACTIVE)
2910                         continue;
2911 
2912                 if (!event_filter_match(event))
2913                         continue;
2914 
2915                 perf_pmu_disable(event->pmu);
2916 
2917                 hwc = &event->hw;
2918 
2919                 if (hwc->interrupts == MAX_INTERRUPTS) {
2920                         hwc->interrupts = 0;
2921                         perf_log_throttle(event, 1);
2922                         event->pmu->start(event, 0);
2923                 }
2924 
2925                 if (!event->attr.freq || !event->attr.sample_freq)
2926                         goto next;
2927 
2928                 /*
2929                  * stop the event and update event->count
2930                  */
2931                 event->pmu->stop(event, PERF_EF_UPDATE);
2932 
2933                 now = local64_read(&event->count);
2934                 delta = now - hwc->freq_count_stamp;
2935                 hwc->freq_count_stamp = now;
2936 
2937                 /*
2938                  * Restart the event and reload only if the
2939                  * value has changed. We have already stopped
2940                  * the event above, so tell perf_adjust_period()
2941                  * to avoid stopping it a second time (hence the
2942                  * 'false' disable argument).
2943                  */
2944                 if (delta > 0)
2945                         perf_adjust_period(event, period, delta, false);
2946 
2947                 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
2948         next:
2949                 perf_pmu_enable(event->pmu);
2950         }
2951 
2952         perf_pmu_enable(ctx->pmu);
2953         raw_spin_unlock(&ctx->lock);
2954 }
2955 
2956 /*
2957  * Round-robin a context's events:
2958  */
2959 static void rotate_ctx(struct perf_event_context *ctx)
2960 {
2961         /*
2962          * Rotate the first non-pinned group entry to the end. Rotation might be
2963          * disabled by the inheritance code.
2964          */
2965         if (!ctx->rotate_disable)
2966                 list_rotate_left(&ctx->flexible_groups);
2967 }
2968 
2969 /*
2970  * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
2971  * because they're strictly cpu affine and rotate_start is called with IRQs
2972  * disabled, while rotate_context is called from IRQ context.
2973  */
2974 static int perf_rotate_context(struct perf_cpu_context *cpuctx)
2975 {
2976         struct perf_event_context *ctx = NULL;
2977         int rotate = 0, remove = 1;
2978 
2979         if (cpuctx->ctx.nr_events) {
2980                 remove = 0;
2981                 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
2982                         rotate = 1;
2983         }
2984 
2985         ctx = cpuctx->task_ctx;
2986         if (ctx && ctx->nr_events) {
2987                 remove = 0;
2988                 if (ctx->nr_events != ctx->nr_active)
2989                         rotate = 1;
2990         }
2991 
2992         if (!rotate)
2993                 goto done;
2994 
2995         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2996         perf_pmu_disable(cpuctx->ctx.pmu);
2997 
2998         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2999         if (ctx)
3000                 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
3001 
3002         rotate_ctx(&cpuctx->ctx);
3003         if (ctx)
3004                 rotate_ctx(ctx);
3005 
3006         perf_event_sched_in(cpuctx, ctx, current);
3007 
3008         perf_pmu_enable(cpuctx->ctx.pmu);
3009         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3010 done:
3011         if (remove)
3012                 list_del_init(&cpuctx->rotation_list);
3013 
3014         return rotate;
3015 }
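
When rotation has to multiplex more groups than the PMU can hold, each event only runs for part of the time it is enabled; userspace commonly scales the raw count back up using the enabled/running times. A sketch of that read path, assuming the event was opened with the read_format shown:

#include <stdint.h>
#include <unistd.h>
#include <linux/perf_event.h>

/*
 * Matches read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
 *                       PERF_FORMAT_TOTAL_TIME_RUNNING
 */
struct read_value {
	uint64_t value;
	uint64_t time_enabled;
	uint64_t time_running;
};

/* Scale a multiplexed count up to an estimate over the enabled time. */
uint64_t read_scaled(int fd)
{
	struct read_value rv;

	if (read(fd, &rv, sizeof(rv)) != sizeof(rv) || !rv.time_running)
		return 0;

	return (uint64_t)((double)rv.value *
			  rv.time_enabled / rv.time_running);
}
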
3016 
3017 #ifdef CONFIG_NO_HZ_FULL
3018 bool perf_event_can_stop_tick(void)
3019 {
3020         if (atomic_read(&nr_freq_events) ||
3021             __this_cpu_read(perf_throttled_count))
3022                 return false;
3023         else
3024                 return true;
3025 }
3026 #endif
3027 
3028 void perf_event_task_tick(void)
3029 {
3030         struct list_head *head = &__get_cpu_var(rotation_list);
3031         struct perf_cpu_context *cpuctx, *tmp;
3032         struct perf_event_context *ctx;
3033         int throttled;
3034 
3035         WARN_ON(!irqs_disabled());
3036 
3037         __this_cpu_inc(perf_throttled_seq);
3038         throttled = __this_cpu_xchg(perf_throttled_count, 0);
3039 
3040         list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
3041                 ctx = &cpuctx->ctx;
3042                 perf_adjust_freq_unthr_context(ctx, throttled);
3043 
3044                 ctx = cpuctx->task_ctx;
3045                 if (ctx)
3046                         perf_adjust_freq_unthr_context(ctx, throttled);
3047         }
3048 }
3049 
3050 static int event_enable_on_exec(struct perf_event *event,
3051                                 struct perf_event_context *ctx)
3052 {
3053         if (!event->attr.enable_on_exec)
3054                 return 0;
3055 
3056         event->attr.enable_on_exec = 0;
3057         if (event->state >= PERF_EVENT_STATE_INACTIVE)
3058                 return 0;
3059 
3060         __perf_event_mark_enabled(event);
3061 
3062         return 1;
3063 }
3064 
3065 /*
3066  * Enable all of a task's events that have been marked enable-on-exec.
3067  * This expects task == current.
3068  */
3069 static void perf_event_enable_on_exec(struct perf_event_context *ctx)
3070 {
3071         struct perf_event_context *clone_ctx = NULL;
3072         struct perf_event *event;
3073         unsigned long flags;
3074         int enabled = 0;
3075         int ret;
3076 
3077         local_irq_save(flags);
3078         if (!ctx || !ctx->nr_events)
3079                 goto out;
3080 
3081         /*
3082          * We must ctxsw out cgroup events to avoid conflict
3083          * when invoking perf_task_event_sched_in() later on
3084          * in this function. Otherwise we end up trying to
3085          * ctxswin cgroup events which are already scheduled
3086          * in.
3087          */
3088         perf_cgroup_sched_out(current, NULL);
3089 
3090         raw_spin_lock(&ctx->lock);
3091         task_ctx_sched_out(ctx);
3092 
3093         list_for_each_entry(event, &ctx->event_list, event_entry) {
3094                 ret = event_enable_on_exec(event, ctx);
3095                 if (ret)
3096                         enabled = 1;
3097         }
3098 
3099         /*
3100          * Unclone this context if we enabled any event.
3101          */
3102         if (enabled)
3103                 clone_ctx = unclone_ctx(ctx);
3104 
3105         raw_spin_unlock(&ctx->lock);
3106 
3107         /*
3108          * Also calls ctxswin for cgroup events, if any:
3109          */
3110         perf_event_context_sched_in(ctx, ctx->task);
3111 out:
3112         local_irq_restore(flags);
3113 
3114         if (clone_ctx)
3115                 put_ctx(clone_ctx);
3116 }
3117 
3118 void perf_event_exec(void)
3119 {
3120         struct perf_event_context *ctx;
3121         int ctxn;
3122 
3123         rcu_read_lock();
3124         for_each_task_context_nr(ctxn) {
3125                 ctx = current->perf_event_ctxp[ctxn];
3126                 if (!ctx)
3127                         continue;
3128 
3129                 perf_event_enable_on_exec(ctx);
3130         }
3131         rcu_read_unlock();
3132 }
3133 
3134 /*
3135  * Cross CPU call to read the hardware event
3136  */
3137 static void __perf_event_read(void *info)
3138 {
3139         struct perf_event *event = info;
3140         struct perf_event_context *ctx = event->ctx;
3141         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3142 
3143         /*
3144          * If this is a task context, we need to check whether it is
3145          * the current task context of this cpu.  If not, it has been
3146          * scheduled out before the smp call arrived.  In that case
3147          * event->count would have been updated to a recent sample
3148          * when the event was scheduled out.
3149          */
3150         if (ctx->task && cpuctx->task_ctx != ctx)
3151                 return;
3152 
3153         raw_spin_lock(&ctx->lock);
3154         if (ctx->is_active) {
3155                 update_context_time(ctx);
3156                 update_cgrp_time_from_event(event);
3157         }
3158         update_event_times(event);
3159         if (event->state == PERF_EVENT_STATE_ACTIVE)
3160                 event->pmu->read(event);
3161         raw_spin_unlock(&ctx->lock);
3162 }
3163 
3164 static inline u64 perf_event_count(struct perf_event *event)
3165 {
3166         return local64_read(&event->count) + atomic64_read(&event->child_count);
3167 }
3168 
3169 static u64 perf_event_read(struct perf_event *event)
3170 {
3171         /*
3172          * If event is enabled and currently active on a CPU, update the
3173          * value in the event structure:
3174          */
3175         if (event->state == PERF_EVENT_STATE_ACTIVE) {
3176                 smp_call_function_single(event->oncpu,
3177                                          __perf_event_read, event, 1);
3178         } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
3179                 struct perf_event_context *ctx = event->ctx;
3180                 unsigned long flags;
3181 
3182                 raw_spin_lock_irqsave(&ctx->lock, flags);
3183                 /*
3184                  * We may read while the context is not active
3185                  * (e.g., the thread is blocked); in that case
3186                  * we cannot update the context time.
3187                  */
3188                 if (ctx->is_active) {
3189                         update_context_time(ctx);
3190                         update_cgrp_time_from_event(event);
3191                 }
3192                 update_event_times(event);
3193                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3194         }
3195 
3196         return perf_event_count(event);
3197 }
3198 
3199 /*
3200  * Initialize the perf_event context in a task_struct:
3201  */
3202 static void __perf_event_init_context(struct perf_event_context *ctx)
3203 {
3204         raw_spin_lock_init(&ctx->lock);
3205         mutex_init(&ctx->mutex);
3206         INIT_LIST_HEAD(&ctx->pinned_groups);
3207         INIT_LIST_HEAD(&ctx->flexible_groups);
3208         INIT_LIST_HEAD(&ctx->event_list);
3209         atomic_set(&ctx->refcount, 1);
3210 }
3211 
3212 static struct perf_event_context *
3213 alloc_perf_context(struct pmu *pmu, struct task_struct *task)
3214 {
3215         struct perf_event_context *ctx;
3216 
3217         ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
3218         if (!ctx)
3219                 return NULL;
3220 
3221         __perf_event_init_context(ctx);
3222         if (task) {
3223                 ctx->task = task;
3224                 get_task_struct(task);
3225         }
3226         ctx->pmu = pmu;
3227 
3228         return ctx;
3229 }
3230 
3231 static struct task_struct *
3232 find_lively_task_by_vpid(pid_t vpid)
3233 {
3234         struct task_struct *task;
3235 
3236         rcu_read_lock();
3237         if (!vpid)
3238                 task = current;
3239         else
3240                 task = find_task_by_vpid(vpid);
3241         if (task)
3242                 get_task_struct(task);
3243         rcu_read_unlock();
3244 
3245         if (!task)
3246                 return ERR_PTR(-ESRCH);
3247 
3248         return task;
3249 }
3250 
3251 /*
3252  * Returns a matching context with refcount and pincount.
3253  */
3254 static struct perf_event_context *
3255 find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
3256 {
3257         struct perf_event_context *ctx, *clone_ctx = NULL;
3258         struct perf_cpu_context *cpuctx;
3259         unsigned long flags;
3260         int ctxn, err;
3261 
3262         if (!task) {
3263                 /* Must be root to operate on a CPU event: */
3264                 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
3265                         return ERR_PTR(-EACCES);
3266 
3267                 /*
3268                  * We could be clever and allow attaching an event to an
3269                  * offline CPU and activate it when the CPU comes up, but
3270                  * that's for later.
3271                  */
3272                 if (!cpu_online(cpu))
3273                         return ERR_PTR(-ENODEV);
3274 
3275                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
3276                 ctx = &cpuctx->ctx;
3277                 get_ctx(ctx);
3278                 ++ctx->pin_count;
3279 
3280                 return ctx;
3281         }
3282 
3283         err = -EINVAL;
3284         ctxn = pmu->task_ctx_nr;
3285         if (ctxn < 0)
3286                 goto errout;
3287 
3288 retry:
3289         ctx = perf_lock_task_context(task, ctxn, &flags);
3290         if (ctx) {
3291                 clone_ctx = unclone_ctx(ctx);
3292                 ++ctx->pin_count;
3293                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3294 
3295                 if (clone_ctx)
3296                         put_ctx(clone_ctx);
3297         } else {
3298                 ctx = alloc_perf_context(pmu, task);
3299                 err = -ENOMEM;
3300                 if (!ctx)
3301                         goto errout;
3302 
3303                 err = 0;
3304                 mutex_lock(&task->perf_event_mutex);
3305                 /*
3306                  * If it has already passed perf_event_exit_task(),
3307                  * we must see PF_EXITING; that path takes this mutex too.
3308                  */
3309                 if (task->flags & PF_EXITING)
3310                         err = -ESRCH;
3311                 else if (task->perf_event_ctxp[ctxn])
3312                         err = -EAGAIN;
3313                 else {
3314                         get_ctx(ctx);
3315                         ++ctx->pin_count;
3316                         rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
3317                 }
3318                 mutex_unlock(&task->perf_event_mutex);
3319 
3320                 if (unlikely(err)) {
3321                         put_ctx(ctx);
3322 
3323                         if (err == -EAGAIN)
3324                                 goto retry;
3325                         goto errout;
3326                 }
3327         }
3328 
3329         return ctx;
3330 
3331 errout:
3332         return ERR_PTR(err);
3333 }
3334 
3335 static void perf_event_free_filter(struct perf_event *event);
3336 
3337 static void free_event_rcu(struct rcu_head *head)
3338 {
3339         struct perf_event *event;
3340 
3341         event = container_of(head, struct perf_event, rcu_head);
3342         if (event->ns)
3343                 put_pid_ns(event->ns);
3344         perf_event_free_filter(event);
3345         kfree(event);
3346 }
3347 
3348 static void ring_buffer_put(struct ring_buffer *rb);
3349 static void ring_buffer_attach(struct perf_event *event,
3350                                struct ring_buffer *rb);
3351 
3352 static void unaccount_event_cpu(struct perf_event *event, int cpu)
3353 {
3354         if (event->parent)
3355                 return;
3356 
3357         if (has_branch_stack(event)) {
3358                 if (!(event->attach_state & PERF_ATTACH_TASK))
3359                         atomic_dec(&per_cpu(perf_branch_stack_events, cpu));
3360         }
3361         if (is_cgroup_event(event))
3362                 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3363 }
3364 
3365 static void unaccount_event(struct perf_event *event)
3366 {
3367         if (event->parent)
3368                 return;
3369 
3370         if (event->attach_state & PERF_ATTACH_TASK)
3371                 static_key_slow_dec_deferred(&perf_sched_events);
3372         if (event->attr.mmap || event->attr.mmap_data)
3373                 atomic_dec(&nr_mmap_events);
3374         if (event->attr.comm)
3375                 atomic_dec(&nr_comm_events);
3376         if (event->attr.task)
3377                 atomic_dec(&nr_task_events);
3378         if (event->attr.freq)
3379                 atomic_dec(&nr_freq_events);
3380         if (is_cgroup_event(event))
3381                 static_key_slow_dec_deferred(&perf_sched_events);
3382         if (has_branch_stack(event))
3383                 static_key_slow_dec_deferred(&perf_sched_events);
3384 
3385         unaccount_event_cpu(event, event->cpu);
3386 }
3387 
3388 static void __free_event(struct perf_event *event)
3389 {
3390         if (!event->parent) {
3391                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
3392                         put_callchain_buffers();
3393         }
3394 
3395         if (event->destroy)
3396                 event->destroy(event);
3397 
3398         if (event->ctx)
3399                 put_ctx(event->ctx);
3400 
3401         if (event->pmu)
3402                 module_put(event->pmu->module);
3403 
3404         call_rcu(&event->rcu_head, free_event_rcu);
3405 }
3406 
3407 static void _free_event(struct perf_event *event)
3408 {
3409         irq_work_sync(&event->pending);
3410 
3411         unaccount_event(event);
3412 
3413         if (event->rb) {
3414                 /*
3415                  * Can happen when we close an event with redirected output.
3416                  *
3417                  * Since we have a 0 refcount, perf_mmap_close() will skip
3418                  * over us; possibly making our ring_buffer_put() the last.
3419                  */
3420                 mutex_lock(&event->mmap_mutex);
3421                 ring_buffer_attach(event, NULL);
3422                 mutex_unlock(&event->mmap_mutex);
3423         }
3424 
3425         if (is_cgroup_event(event))
3426                 perf_detach_cgroup(event);
3427 
3428         __free_event(event);
3429 }
3430 
3431 /*
3432  * Used to free events which have a known refcount of 1, such as in error paths
3433  * where the event isn't exposed yet, and for inherited events.
3434  */
3435 static void free_event(struct perf_event *event)
3436 {
3437         if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
3438                                 "unexpected event refcount: %ld; ptr=%p\n",
3439                                 atomic_long_read(&event->refcount), event)) {
3440                 /* leak to avoid use-after-free */
3441                 return;
3442         }
3443 
3444         _free_event(event);
3445 }
3446 
3447 /*
3448  * Called when the last reference to the file is gone.
3449  */
3450 static void put_event(struct perf_event *event)
3451 {
3452         struct perf_event_context *ctx = event->ctx;
3453         struct task_struct *owner;
3454 
3455         if (!atomic_long_dec_and_test(&event->refcount))
3456                 return;
3457 
3458         rcu_read_lock();
3459         owner = ACCESS_ONCE(event->owner);
3460         /*
3461          * Matches the smp_wmb() in perf_event_exit_task(). If we observe
3462          * !owner it means the list deletion is complete and we can indeed
3463          * free this event, otherwise we need to serialize on
3464          * owner->perf_event_mutex.
3465          */
3466         smp_read_barrier_depends();
3467         if (owner) {
3468                 /*
3469                  * Since delayed_put_task_struct() also drops the last
3470                  * task reference we can safely take a new reference
3471                  * while holding the rcu_read_lock().
3472                  */
3473                 get_task_struct(owner);
3474         }
3475         rcu_read_unlock();
3476 
3477         if (owner) {
3478                 /*
3479                  * If we're here through perf_event_exit_task() we're already
3480                  * holding ctx->mutex which would be an inversion wrt. the
3481                  * normal lock order.
3482                  *
3483                  * However, we can safely take this lock because it's the child
3484                  * ctx->mutex.
3485                  */
3486                 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
3487 
3488                 /*
3489                  * We have to re-check the event->owner field; if it is cleared
3490                  * we raced with perf_event_exit_task(). Acquiring the mutex
3491                  * ensures they're done, and we can proceed with freeing the
3492                  * event.
3493                  */
3494                 if (event->owner)
3495                         list_del_init(&event->owner_entry);
3496                 mutex_unlock(&owner->perf_event_mutex);
3497                 put_task_struct(owner);
3498         }
3499 
3500         WARN_ON_ONCE(ctx->parent_ctx);
3501         /*
3502          * There are two ways this annotation is useful:
3503          *
3504          *  1) there is a lock recursion from perf_event_exit_task();
3505          *     see the comment there.
3506          *
3507          *  2) there is a lock-inversion with mmap_sem through
3508          *     perf_event_read_group(), which takes faults while
3509          *     holding ctx->mutex; however, this is called after
3510          *     the last file descriptor died, so there is no possibility
3511          *     of triggering the AB-BA case.
3512          */
3513         mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
3514         perf_remove_from_context(event, true);
3515         mutex_unlock(&ctx->mutex);
3516 
3517         _free_event(event);
3518 }
3519 
3520 int perf_event_release_kernel(struct perf_event *event)
3521 {
3522         put_event(event);
3523         return 0;
3524 }
3525 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
3526 
3527 static int perf_release(struct inode *inode, struct file *file)
3528 {
3529         put_event(file->private_data);
3530         return 0;
3531 }
3532 
3533 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
3534 {
3535         struct perf_event *child;
3536         u64 total = 0;
3537 
3538         *enabled = 0;
3539         *running = 0;
3540 
3541         mutex_lock(&event->child_mutex);
3542         total += perf_event_read(event);
3543         *enabled += event->total_time_enabled +
3544                         atomic64_read(&event->child_total_time_enabled);
3545         *running += event->total_time_running +
3546                         atomic64_read(&event->child_total_time_running);
3547 
3548         list_for_each_entry(child, &event->child_list, child_list) {
3549                 total += perf_event_read(child);
3550                 *enabled += child->total_time_enabled;
3551                 *running += child->total_time_running;
3552         }
3553         mutex_unlock(&event->child_mutex);
3554 
3555         return total;
3556 }
3557 EXPORT_SYMBOL_GPL(perf_event_read_value);
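
/*
 * Illustrative user-space sketch (not part of core.c): the count/enabled/
 * running triple aggregated by perf_event_read_value() is what read()
 * reports when PERF_FORMAT_TOTAL_TIME_ENABLED/RUNNING are requested (and
 * PERF_FORMAT_GROUP/ID are not), and is commonly used to scale counts of
 * multiplexed events. The helper name is hypothetical; error handling is
 * minimal.
 */
#include <stdint.h>
#include <unistd.h>

struct single_read {
	uint64_t value;
	uint64_t time_enabled;
	uint64_t time_running;
};

static double read_scaled_count(int perf_fd)
{
	struct single_read r;

	if (read(perf_fd, &r, sizeof(r)) != (ssize_t)sizeof(r) ||
	    !r.time_running)
		return 0.0;

	/* Estimate what the count would have been without multiplexing. */
	return (double)r.value * r.time_enabled / r.time_running;
}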
3558 
3559 static void __perf_read_group_add(struct perf_event *leader,
3560                                         u64 read_format, u64 *values)
3561 {
3562         struct perf_event_context *ctx = leader->ctx;
3563         struct perf_event *sub;
3564         unsigned long flags;
3565         int n = 1; /* skip @nr */
3566         u64 count, enabled, running;
3567 
3568         count = perf_event_read_value(leader, &enabled, &running);
3569 
3570         /*
3571          * Since we co-schedule groups, {enabled,running} times of siblings
3572          * will be identical to those of the leader, so we only publish one
3573          * set.
3574          */
3575         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3576                 values[n++] = enabled;
3577         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3578                 values[n++] = running;
3579 
3580         /*
3581          * Write {count,id} tuples for every sibling.
3582          */
3583         values[n++] += count;
3584         if (read_format & PERF_FORMAT_ID)
3585                 values[n++] = primary_event_id(leader);
3586 
3587         raw_spin_lock_irqsave(&ctx->lock, flags);
3588 
3589         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3590                 values[n++] = perf_event_read_value(sub, &enabled, &running);
3591                 if (read_format & PERF_FORMAT_ID)
3592                         values[n++] = primary_event_id(sub);
3593         }
3594 
3595         raw_spin_unlock_irqrestore(&ctx->lock, flags);
3596 }
3597 
3598 static int perf_event_read_group(struct perf_event *event,
3599                                    u64 read_format, char __user *buf)
3600 {
3601         struct perf_event *leader = event->group_leader, *child;
3602         struct perf_event_context *ctx = leader->ctx;
3603         int ret = event->read_size;
3604         u64 *values;
3605 
3606         lockdep_assert_held(&ctx->mutex);
3607 
3608         values = kzalloc(event->read_size, GFP_KERNEL);
3609         if (!values)
3610                 return -ENOMEM;
3611 
3612         values[0] = 1 + leader->nr_siblings;
3613 
3614         /*
3615          * By locking the child_mutex of the leader we effectively
3616          * lock the child list of all siblings. XXX: explain how.
3617          */
3618         mutex_lock(&leader->child_mutex);
3619 
3620         __perf_read_group_add(leader, read_format, values);
3621         list_for_each_entry(child, &leader->child_list, child_list)
3622                 __perf_read_group_add(child, read_format, values);
3623 
3624         mutex_unlock(&leader->child_mutex);
3625 
3626         if (copy_to_user(buf, values, event->read_size))
3627                 ret = -EFAULT;
3628 
3629         kfree(values);
3630 
3631         return ret;
3632 }
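
/*
 * Illustrative user-space sketch (not part of core.c): one way a reader
 * might parse the buffer perf_event_read_group() fills, assuming the group
 * leader was opened with read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID |
 * PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING. The
 * buffer size and helper name are arbitrary.
 */
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

struct group_read {
	uint64_t nr;		/* values[0]: 1 + nr_siblings	*/
	uint64_t time_enabled;
	uint64_t time_running;
	struct {
		uint64_t value;
		uint64_t id;
	} cnt[];		/* one entry per group member	*/
};

static void print_group(int group_fd)
{
	uint64_t buf[512];	/* assumed >= event->read_size */
	struct group_read *rf = (struct group_read *)buf;
	uint64_t i;

	if (read(group_fd, buf, sizeof(buf)) <= 0)
		return;

	for (i = 0; i < rf->nr; i++)
		printf("id %llu: %llu\n",
		       (unsigned long long)rf->cnt[i].id,
		       (unsigned long long)rf->cnt[i].value);
}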
3633 
3634 static int perf_event_read_one(struct perf_event *event,
3635                                  u64 read_format, char __user *buf)
3636 {
3637         u64 enabled, running;
3638         u64 values[4];
3639         int n = 0;
3640 
3641         values[n++] = perf_event_read_value(event, &enabled, &running);
3642         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3643                 values[n++] = enabled;
3644         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3645                 values[n++] = running;
3646         if (read_format & PERF_FORMAT_ID)
3647                 values[n++] = primary_event_id(event);
3648 
3649         if (copy_to_user(buf, values, n * sizeof(u64)))
3650                 return -EFAULT;
3651 
3652         return n * sizeof(u64);
3653 }
3654 
3655 /*
3656  * Read the performance event - a simple non-blocking version for now
3657  */
3658 static ssize_t
3659 perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
3660 {
3661         u64 read_format = event->attr.read_format;
3662         int ret;
3663 
3664         /*
3665          * Return end-of-file for a read on an event that is in
3666          * error state (i.e. because it was pinned but it couldn't be
3667          * scheduled on to the CPU at some point).
3668          */
3669         if (event->state == PERF_EVENT_STATE_ERROR)
3670                 return 0;
3671 
3672         if (count < event->read_size)
3673                 return -ENOSPC;
3674 
3675         WARN_ON_ONCE(event->ctx->parent_ctx);
3676         if (read_format & PERF_FORMAT_GROUP)
3677                 ret = perf_event_read_group(event, read_format, buf);
3678         else
3679                 ret = perf_event_read_one(event, read_format, buf);
3680 
3681         return ret;
3682 }
3683 
3684 static ssize_t
3685 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3686 {
3687         struct perf_event *event = file->private_data;
3688         struct perf_event_context *ctx;
3689         int ret;
3690 
3691         ctx = perf_event_ctx_lock(event);
3692         ret = perf_read_hw(event, buf, count);
3693         perf_event_ctx_unlock(event, ctx);
3694 
3695         return ret;
3696 }
3697 
3698 static unsigned int perf_poll(struct file *file, poll_table *wait)
3699 {
3700         struct perf_event *event = file->private_data;
3701         struct ring_buffer *rb;
3702         unsigned int events = POLLHUP;
3703 
3704         /*
3705          * Pin the event->rb by taking event->mmap_mutex; otherwise
3706          * perf_event_set_output() can swizzle our rb and make us miss wakeups.
3707          */
3708         mutex_lock(&event->mmap_mutex);
3709         rb = event->rb;
3710         if (rb)
3711                 events = atomic_xchg(&rb->poll, 0);
3712         mutex_unlock(&event->mmap_mutex);
3713 
3714         poll_wait(file, &event->waitq, wait);
3715 
3716         return events;
3717 }
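
/*
 * Illustrative user-space sketch (not part of core.c): the mask perf_poll()
 * collects from rb->poll is what a poll()/select() caller sees once the
 * ring buffer has crossed its wakeup watermark. Helper name is
 * hypothetical.
 */
#include <poll.h>

static int wait_for_samples(int perf_fd, int timeout_ms)
{
	struct pollfd pfd = { .fd = perf_fd, .events = POLLIN };

	/* Returns > 0 when data (or a hangup) is pending on the event fd. */
	return poll(&pfd, 1, timeout_ms);
}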
3718 
3719 static void _perf_event_reset(struct perf_event *event)
3720 {
3721         (void)perf_event_read(event);
3722         local64_set(&event->count, 0);
3723         perf_event_update_userpage(event);
3724 }
3725 
3726 /*
3727  * Holding the top-level event's child_mutex means that any
3728  * descendant process that has inherited this event will block
3729  * in sync_child_event if it goes to exit, thus satisfying the
3730  * task existence requirements of perf_event_enable/disable.
3731  */
3732 static void perf_event_for_each_child(struct perf_event *event,
3733                                         void (*func)(struct perf_event *))
3734 {
3735         struct perf_event *child;
3736 
3737         WARN_ON_ONCE(event->ctx->parent_ctx);
3738 
3739         mutex_lock(&event->child_mutex);
3740         func(event);
3741         list_for_each_entry(child, &event->child_list, child_list)
3742                 func(child);
3743         mutex_unlock(&event->child_mutex);
3744 }
3745 
3746 static void perf_event_for_each(struct perf_event *event,
3747                                   void (*func)(struct perf_event *))
3748 {
3749         struct perf_event_context *ctx = event->ctx;
3750         struct perf_event *sibling;
3751 
3752         lockdep_assert_held(&ctx->mutex);
3753 
3754         event = event->group_leader;
3755 
3756         perf_event_for_each_child(event, func);
3757         list_for_each_entry(sibling, &event->sibling_list, group_entry)
3758                 perf_event_for_each_child(sibling, func);
3759 }
3760 
3761 struct period_event {
3762         struct perf_event *event;
3763         u64 value;
3764 };
3765 
3766 static int __perf_event_period(void *info)
3767 {
3768         struct period_event *pe = info;
3769         struct perf_event *event = pe->event;
3770         struct perf_event_context *ctx = event->ctx;
3771         u64 value = pe->value;
3772         bool active;
3773 
3774         raw_spin_lock(&ctx->lock);
3775         if (event->attr.freq) {
3776                 event->attr.sample_freq = value;
3777         } else {
3778                 event->attr.sample_period = value;
3779                 event->hw.sample_period = value;
3780         }
3781 
3782         active = (event->state == PERF_EVENT_STATE_ACTIVE);
3783         if (active) {
3784                 perf_pmu_disable(ctx->pmu);
3785                 event->pmu->stop(event, PERF_EF_UPDATE);
3786         }
3787 
3788         local64_set(&event->hw.period_left, 0);
3789 
3790         if (active) {
3791                 event->pmu->start(event, PERF_EF_RELOAD);
3792                 perf_pmu_enable(ctx->pmu);
3793         }
3794         raw_spin_unlock(&ctx->lock);
3795 
3796         return 0;
3797 }
3798 
3799 static int perf_event_check_period(struct perf_event *event, u64 value)
3800 {
3801         return event->pmu->check_period(event, value);
3802 }
3803 
3804 static int perf_event_period(struct perf_event *event, u64 __user *arg)
3805 {
3806         struct period_event pe = { .event = event, };
3807         struct perf_event_context *ctx = event->ctx;
3808         struct task_struct *task;
3809         u64 value;
3810 
3811         if (!is_sampling_event(event))
3812                 return -EINVAL;
3813 
3814         if (copy_from_user(&value, arg, sizeof(value)))
3815                 return -EFAULT;
3816 
3817         if (!value)
3818                 return -EINVAL;
3819 
3820         if (event->attr.freq && value > sysctl_perf_event_sample_rate)
3821                 return -EINVAL;
3822 
3823         if (perf_event_check_period(event, value))
3824                 return -EINVAL;
3825 
3826         if (!event->attr.freq && (value & (1ULL << 63)))
3827                 return -EINVAL;
3828 
3829         task = ctx->task;
3830         pe.value = value;
3831 
3832         if (!task) {
3833                 cpu_function_call(event->cpu, __perf_event_period, &pe);
3834                 return 0;
3835         }
3836 
3837 retry:
3838         if (!task_function_call(task, __perf_event_period, &pe))
3839                 return 0;
3840 
3841         raw_spin_lock_irq(&ctx->lock);
3842         if (ctx->is_active) {
3843                 raw_spin_unlock_irq(&ctx->lock);
3844                 task = ctx->task;
3845                 goto retry;
3846         }
3847 
3848         if (event->attr.freq) {
3849                 event->attr.sample_freq = value;
3850         } else {
3851                 event->attr.sample_period = value;
3852                 event->hw.sample_period = value;
3853         }
3854 
3855         local64_set(&event->hw.period_left, 0);
3856         raw_spin_unlock_irq(&ctx->lock);
3857 
3858         return 0;
3859 }
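
/*
 * Illustrative user-space sketch (not part of core.c): perf_event_period()
 * is reached through the PERF_EVENT_IOC_PERIOD ioctl, which takes a
 * pointer to the new period (or frequency, if attr.freq was set when the
 * event was created). Helper name is hypothetical.
 */
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>

static int set_sample_period(int perf_fd, uint64_t new_period)
{
	/* Fails with -EINVAL for non-sampling events or a zero period. */
	return ioctl(perf_fd, PERF_EVENT_IOC_PERIOD, &new_period);
}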
3860 
3861 static const struct file_operations perf_fops;
3862 
3863 static inline int perf_fget_light(int fd, struct fd *p)
3864 {
3865         struct fd f = fdget(fd);
3866         if (!f.file)
3867                 return -EBADF;
3868 
3869         if (f.file->f_op != &perf_fops) {
3870                 fdput(f);
3871                 return -EBADF;
3872         }
3873         *p = f;
3874         return 0;
3875 }
3876 
3877 static int perf_event_set_output(struct perf_event *event,
3878                                  struct perf_event *output_event);
3879 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
3880 
3881 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
3882 {
3883         void (*func)(struct perf_event *);
3884         u32 flags = arg;
3885 
3886         switch (cmd) {
3887         case PERF_EVENT_IOC_ENABLE:
3888                 func = _perf_event_enable;
3889                 break;
3890         case PERF_EVENT_IOC_DISABLE:
3891                 func = _perf_event_disable;
3892                 break;
3893         case PERF_EVENT_IOC_RESET:
3894                 func = _perf_event_reset;
3895                 break;
3896 
3897         case PERF_EVENT_IOC_REFRESH:
3898                 return _perf_event_refresh(event, arg);
3899 
3900         case PERF_EVENT_IOC_PERIOD:
3901                 return perf_event_period(event, (u64 __user *)arg);
3902 
3903         case PERF_EVENT_IOC_ID:
3904         {
3905                 u64 id = primary_event_id(event);
3906 
3907                 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
3908                         return -EFAULT;
3909                 return 0;
3910         }
3911 
3912         case PERF_EVENT_IOC_SET_OUTPUT:
3913         {
3914                 int ret;
3915                 if (arg != -1) {
3916                         struct perf_event *output_event;
3917                         struct fd output;
3918                         ret = perf_fget_light(arg, &output);
3919                         if (ret)
3920                                 return ret;
3921                         output_event = output.file->private_data;
3922                         ret = perf_event_set_output(event, output_event);
3923                         fdput(output);
3924                 } else {
3925                         ret = perf_event_set_output(event, NULL);
3926                 }
3927                 return ret;
3928         }
3929 
3930         case PERF_EVENT_IOC_SET_FILTER:
3931                 return perf_event_set_filter(event, (void __user *)arg);
3932 
3933         default:
3934                 return -ENOTTY;
3935         }
3936 
3937         if (flags & PERF_IOC_FLAG_GROUP)
3938                 perf_event_for_each(event, func);
3939         else
3940                 perf_event_for_each_child(event, func);
3941 
3942         return 0;
3943 }
3944 
3945 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3946 {
3947         struct perf_event *event = file->private_data;
3948         struct perf_event_context *ctx;
3949         long ret;
3950 
3951         ctx = perf_event_ctx_lock(event);
3952         ret = _perf_ioctl(event, cmd, arg);
3953         perf_event_ctx_unlock(event, ctx);
3954 
3955         return ret;
3956 }
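
/*
 * Illustrative user-space sketch (not part of core.c): the most common
 * _perf_ioctl() commands as seen from the fd returned by perf_event_open().
 * PERF_IOC_FLAG_GROUP applies the operation to the whole group rather than
 * a single event. The helper name and workload placeholder are
 * hypothetical.
 */
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>

static void count_region(int leader_fd)
{
	uint64_t id;

	/* PERF_EVENT_IOC_ID writes the stable event id through arg. */
	ioctl(leader_fd, PERF_EVENT_IOC_ID, &id);

	/* Zero and start every event in the leader's group. */
	ioctl(leader_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
	ioctl(leader_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);

	/* ... workload under measurement ... */

	ioctl(leader_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
}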
3957 
3958 #ifdef CONFIG_COMPAT
3959 static long perf_compat_ioctl(struct file *file, unsigned int cmd,
3960                                 unsigned long arg)
3961 {
3962         switch (_IOC_NR(cmd)) {
3963         case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
3964         case _IOC_NR(PERF_EVENT_IOC_ID):
3965                 /* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case) */
3966                 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
3967                         cmd &= ~IOCSIZE_MASK;
3968                         cmd |= sizeof(void *) << IOCSIZE_SHIFT;
3969                 }
3970                 break;
3971         }
3972         return perf_ioctl(file, cmd, arg);
3973 }
3974 #else
3975 # define perf_compat_ioctl NULL
3976 #endif
3977 
3978 int perf_event_task_enable(void)
3979 {
3980         struct perf_event_context *ctx;
3981         struct perf_event *event;
3982 
3983         mutex_lock(&current->perf_event_mutex);
3984         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
3985                 ctx = perf_event_ctx_lock(event);
3986                 perf_event_for_each_child(event, _perf_event_enable);
3987                 perf_event_ctx_unlock(event, ctx);
3988         }
3989         mutex_unlock(&current->perf_event_mutex);
3990 
3991         return 0;
3992 }
3993 
3994 int perf_event_task_disable(void)
3995 {
3996         struct perf_event_context *ctx;
3997         struct perf_event *event;
3998 
3999         mutex_lock(&current->perf_event_mutex);
4000         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4001                 ctx = perf_event_ctx_lock(event);
4002                 perf_event_for_each_child(event, _perf_event_disable);
4003                 perf_event_ctx_unlock(event, ctx);
4004         }
4005         mutex_unlock(&current->perf_event_mutex);
4006 
4007         return 0;
4008 }
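
/*
 * Illustrative user-space sketch (not part of core.c):
 * perf_event_task_enable()/perf_event_task_disable() back the
 * PR_TASK_PERF_EVENTS_ENABLE / PR_TASK_PERF_EVENTS_DISABLE prctl()s,
 * toggling every event owned by the calling task. Helper name is
 * hypothetical.
 */
#include <sys/prctl.h>

static void pause_own_events(void)
{
	prctl(PR_TASK_PERF_EVENTS_DISABLE, 0, 0, 0, 0);
	/* ... code that should not be measured ... */
	prctl(PR_TASK_PERF_EVENTS_ENABLE, 0, 0, 0, 0);
}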
4009 
4010 static int perf_event_index(struct perf_event *event)
4011 {
4012         if (event->hw.state & PERF_HES_STOPPED)
4013                 return 0;
4014 
4015         if (event->state != PERF_EVENT_STATE_ACTIVE)
4016                 return 0;
4017 
4018         return event->pmu->event_idx(event);
4019 }
4020 
4021 static void calc_timer_values(struct perf_event *event,
4022                                 u64 *now,
4023                                 u64 *enabled,
4024                                 u64 *running)
4025 {
4026         u64 ctx_time;
4027 
4028         *now = perf_clock();
4029         ctx_time = event->shadow_ctx_time + *now;
4030         *enabled = ctx_time - event->tstamp_enabled;
4031         *running = ctx_time - event->tstamp_running;
4032 }
4033 
4034 static void perf_event_init_userpage(struct perf_event *event)
4035 {
4036         struct perf_event_mmap_page *userpg;
4037         struct ring_buffer *rb;
4038 
4039         rcu_read_lock();
4040         rb = rcu_dereference(event->rb);
4041         if (!rb)
4042                 goto unlock;
4043 
4044         userpg = rb->user_page;
4045 
4046         /* Allow new userspace to detect that bit 0 is deprecated */
4047         userpg->cap_bit0_is_deprecated = 1;
4048         userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
4049 
4050 unlock:
4051         rcu_read_unlock();
4052 }
4053 
4054 void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
4055 {
4056 }
4057 
4058 /*
4059  * Callers need to ensure there can be no nesting of this function, otherwise
4060  * the seqlock logic goes bad. We cannot serialize this because the arch
4061  * code calls this from NMI context.
4062  */
4063 void perf_event_update_userpage(struct perf_event *event)
4064 {
4065         struct perf_event_mmap_page *userpg;
4066         struct ring_buffer *rb;
4067         u64 enabled, running, now;
4068 
4069         rcu_read_lock();
4070         rb = rcu_dereference(event->rb);
4071         if (!rb)
4072                 goto unlock;
4073 
4074         /*
4075          * Compute total_time_enabled, total_time_running
4076          * based on snapshot values taken when the event
4077          * was last scheduled in.
4078          *
4079          * We cannot simply call update_context_time()
4080          * because of locking issues, as we can be called in
4081          * NMI context.
4082          */
4083         calc_timer_values(event, &now, &enabled, &running);
4084 
4085         userpg = rb->user_page;
4086         /*
4087          * Disable preemption so as to not let the corresponding user-space
4088          * spin too long if we get preempted.
4089          */
4090         preempt_disable();
4091         ++userpg->lock;
4092         barrier();
4093         userpg->index = perf_event_index(event);
4094         userpg->offset = perf_event_count(event);
4095         if (userpg->index)
4096                 userpg->offset -= local64_read(&event->hw.prev_count);
4097 
4098         userpg->time_enabled = enabled +
4099                         atomic64_read(&event->child_total_time_enabled);
4100 
4101         userpg->time_running = running +
4102                         atomic64_read(&event->child_total_time_running);
4103 
4104         arch_perf_update_userpage(userpg, now);
4105 
4106         barrier();
4107         ++userpg->lock;
4108         preempt_enable();
4109 unlock:
4110         rcu_read_unlock();
4111 }
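
/*
 * Illustrative user-space sketch (not part of core.c): the ++lock/barrier
 * pairs above form a seqcount that a self-monitoring reader of the
 * mmap()ed perf_event_mmap_page is expected to retry on, roughly as below.
 * The hardware-counter delta (rdpmc on userpg->index) is left out for
 * brevity; the rmb() macro and helper name are assumptions.
 */
#include <stdint.h>
#include <linux/perf_event.h>

#define rmb()	__sync_synchronize()	/* assumed good-enough barrier */

static uint64_t read_userpage_count(volatile struct perf_event_mmap_page *pc,
				    uint64_t *enabled, uint64_t *running)
{
	uint32_t seq;
	uint64_t count;

	do {
		seq = pc->lock;
		rmb();
		count    = pc->offset;		/* kernel-side snapshot	*/
		*enabled = pc->time_enabled;
		*running = pc->time_running;
		rmb();
	} while (pc->lock != seq);	/* retry if the kernel updated it */

	return count;
}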
4112 
4113 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
4114 {
4115         struct perf_event *event = vma->vm_file->private_data;
4116         struct ring_buffer *rb;
4117         int ret = VM_FAULT_SIGBUS;
4118 
4119         if (vmf->flags & FAULT_FLAG_MKWRITE) {
4120                 if (vmf->pgoff == 0)
4121                         ret = 0;
4122                 return ret;
4123         }
4124 
4125         rcu_read_lock();
4126         rb = rcu_dereference(event->rb);
4127         if (!rb)
4128                 goto unlock;
4129 
4130         if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
4131                 goto unlock;
4132 
4133         vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
4134         if (!vmf->page)
4135                 goto unlock;
4136 
4137         get_page(vmf->page);
4138         vmf->page->mapping = vma->vm_file->f_mapping;
4139         vmf->page->index   = vmf->pgoff;
4140 
4141         ret = 0;
4142 unlock:
4143         rcu_read_unlock();
4144 
4145         return ret;
4146 }
4147 
4148 static void ring_buffer_attach(struct perf_event *event,
4149                                struct ring_buffer *rb)
4150 {
4151         struct ring_buffer *old_rb = NULL;
4152         unsigned long flags;
4153 
4154         if (event->rb) {
4155                 /*
4156                  * Should be impossible; we set this when removing
4157                  * event->rb_entry and wait/clear when adding event->rb_entry.
4158                  */
4159                 WARN_ON_ONCE(event->rcu_pending);
4160 
4161                 old_rb = event->rb;
4162                 spin_lock_irqsave(&old_rb->event_lock, flags);
4163                 list_del_rcu(&event->rb_entry);
4164                 spin_unlock_irqrestore(&old_rb->event_lock, flags);
4165 
4166                 event->rcu_batches = get_state_synchronize_rcu();
4167                 event->rcu_pending = 1;
4168         }
4169 
4170         if (rb) {
4171                 if (event->rcu_pending) {
4172                         cond_synchronize_rcu(event->rcu_batches);
4173                         event->rcu_pending = 0;
4174                 }
4175 
4176                 spin_lock_irqsave(&rb->event_lock, flags);
4177                 list_add_rcu(&event->rb_entry, &rb->event_list);
4178                 spin_unlock_irqrestore(&rb->event_lock, flags);
4179         }
4180 
4181         rcu_assign_pointer(event->rb, rb);
4182 
4183         if (old_rb) {
4184                 ring_buffer_put(old_rb);
4185                 /*
4186                  * Since we had to detach the old rb before we could
4187                  * attach the new one, we could have missed a wakeup.
4188                  * Provide it now.
4189                  */
4190                 wake_up_all(&event->waitq);
4191         }
4192 }
4193 
4194 static void ring_buffer_wakeup(struct perf_event *event)
4195 {
4196         struct ring_buffer *rb;
4197 
4198         rcu_read_lock();
4199         rb = rcu_dereference(event->rb);
4200         if (rb) {
4201                 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
4202                         wake_up_all(&event->waitq);
4203         }
4204         rcu_read_unlock();
4205 }
4206 
4207 static void rb_free_rcu(struct rcu_head *rcu_head)
4208 {
4209         struct ring_buffer *rb;
4210 
4211         rb = container_of(rcu_head, struct ring_buffer, rcu_head);
4212         rb_free(rb);
4213 }
4214 
4215 static struct ring_buffer *ring_buffer_get(struct perf_event *event)
4216 {
4217         struct ring_buffer *rb;
4218 
4219         rcu_read_lock();
4220         rb = rcu_dereference(event->rb);
4221         if (rb) {
4222                 if (!atomic_inc_not_zero(&rb->refcount))
4223                         rb = NULL;
4224         }
4225         rcu_read_unlock();
4226 
4227         return rb;
4228 }
4229 
4230 static void ring_buffer_put(struct ring_buffer *rb)
4231 {
4232         if (!atomic_dec_and_test(&rb->refcount))
4233                 return;
4234 
4235         WARN_ON_ONCE(!list_empty(&rb->event_list));
4236 
4237         call_rcu(&rb->rcu_head, rb_free_rcu);
4238 }
4239 
4240 static void perf_mmap_open(struct vm_area_struct *vma)
4241 {
4242         struct perf_event *event = vma->vm_file->private_data;
4243 
4244         atomic_inc(&event->mmap_count);
4245         atomic_inc(&event->rb->mmap_count);
4246 }
4247 
4248 /*
4249  * A buffer can be mmap()ed multiple times; either directly through the same
4250  * event, or through other events by use of perf_event_set_output().
4251  *
4252  * In order to undo the VM accounting done by perf_mmap() we need to destroy
4253  * the buffer here, where we still have a VM context. This means we need
4254  * to detach all events redirecting to us.
4255  */
4256 static void perf_mmap_close(struct vm_area_struct *vma)
4257 {
4258         struct perf_event *event = vma->vm_file->private_data;
4259 
4260         struct ring_buffer *rb = ring_buffer_get(event);
4261         struct user_struct *mmap_user = rb->mmap_user;
4262         int mmap_locked = rb->mmap_locked;
4263         unsigned long size = perf_data_size(rb);
4264 
4265         atomic_dec(&rb->mmap_count);
4266 
4267         if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
4268                 goto out_put;
4269 
4270         ring_buffer_attach(event, NULL);
4271         mutex_unlock(&event->mmap_mutex);
4272 
4273         /* If there's still other mmap()s of this buffer, we're done. */
4274         if (atomic_read(&rb->mmap_count))
4275                 goto out_put;
4276 
4277         /*
4278          * No other mmap()s, detach from all other events that might redirect
4279          * into the now unreachable buffer. Somewhat complicated by the
4280          * fact that rb::event_lock otherwise nests inside mmap_mutex.
4281          */
4282 again:
4283         rcu_read_lock();
4284         list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
4285                 if (!atomic_long_inc_not_zero(&event->refcount)) {
4286                         /*
4287                          * This event is en-route to free_event() which will
4288                          * detach it and remove it from the list.
4289                          */
4290                         continue;
4291                 }
4292                 rcu_read_unlock();
4293 
4294                 mutex_lock(&event->mmap_mutex);
4295                 /*
4296                  * Check we didn't race with perf_event_set_output() which can
4297                  * swizzle the rb from under us while we were waiting to
4298                  * acquire mmap_mutex.
4299                  *
4300                  * If we find a different rb, ignore this event; the next
4301                  * iteration will no longer find it on the list. We still
4302                  * have to restart the iteration to make sure we're not now
4303                  * iterating the wrong list.
4304                  */
4305                 if (event->rb == rb)
4306                         ring_buffer_attach(event, NULL);
4307 
4308                 mutex_unlock(&event->mmap_mutex);
4309                 put_event(event);
4310 
4311                 /*
4312                  * Restart the iteration; either we're on the wrong list or
4313                  * we have destroyed its integrity by doing a deletion.
4314                  */
4315                 goto again;
4316         }
4317         rcu_read_unlock();
4318 
4319         /*
4320          * It could be that there are still a few 0-ref events on the list; they'll
4321          * get cleaned up by free_event() -- they'll also still have their
4322          * ref on the rb and will free it whenever they are done with it.
4323          *
4324          * Aside from that, this buffer is 'fully' detached and unmapped,
4325          * undo the VM accounting.
4326          */
4327 
4328         atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
4329         vma->vm_mm->pinned_vm -= mmap_locked;
4330         free_uid(mmap_user);
4331 
4332 out_put:
4333         ring_buffer_put(rb); /* could be last */
4334 }
4335 
4336 static const struct vm_operations_struct perf_mmap_vmops = {
4337         .open           = perf_mmap_open,
4338         .close          = perf_mmap_close,
4339         .fault          = perf_mmap_fault,
4340         .page_mkwrite   = perf_mmap_fault,
4341 };
4342 
4343 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
4344 {
4345         struct perf_event *event = file->private_data;
4346         unsigned long user_locked, user_lock_limit;
4347         struct user_struct *user = current_user();
4348         unsigned long locked, lock_limit;
4349         struct ring_buffer *rb;
4350         unsigned long vma_size;
4351         unsigned long nr_pages;
4352         long user_extra, extra;
4353         int ret = 0, flags = 0;
4354 
4355         /*
4356          * Don't allow mmap() of inherited per-task counters. This would
4357          * create a performance issue due to all children writing to the
4358          * same rb.
4359          */
4360         if (event->cpu == -1 && event->attr.inherit)
4361                 return -EINVAL;
4362 
4363         if (!(vma->vm_flags & VM_SHARED))
4364                 return -EINVAL;
4365 
4366         vma_size = vma->vm_end - vma->vm_start;
4367         nr_pages = (vma_size / PAGE_SIZE) - 1;
4368 
4369         /*
4370          * If we have rb pages ensure they're a power-of-two number, so we
4371          * can do bitmasks instead of modulo.
4372          */
4373         if (nr_pages != 0 && !is_power_of_2(nr_pages))
4374                 return -EINVAL;
4375 
4376         if (vma_size != PAGE_SIZE * (1 + nr_pages))
4377                 return -EINVAL;
4378 
4379         if (vma->vm_pgoff != 0)
4380                 return -EINVAL;
4381 
4382         WARN_ON_ONCE(event->ctx->parent_ctx);
4383 again:
4384         mutex_lock(&event->mmap_mutex);
4385         if (event->rb) {
4386                 if (event->rb->nr_pages != nr_pages) {
4387                         ret = -EINVAL;
4388                         goto unlock;
4389                 }
4390 
4391                 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
4392                         /*
4393                          * Raced against perf_mmap_close() through
4394                          * perf_event_set_output(). Try again, hope for better
4395                          * luck.
4396                          */
4397                         mutex_unlock(&event->mmap_mutex);
4398                         goto again;
4399                 }
4400 
4401                 goto unlock;
4402         }
4403 
4404         user_extra = nr_pages + 1;
4405         user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
4406 
4407         /*
4408          * Increase the limit linearly with more CPUs:
4409          */
4410         user_lock_limit *= num_online_cpus();
4411 
4412         user_locked = atomic_long_read(&user->locked_vm) + user_extra;
4413 
4414         extra = 0;
4415         if (user_locked > user_lock_limit)
4416                 extra = user_locked - user_lock_limit;
4417 
4418         lock_limit = rlimit(RLIMIT_MEMLOCK);
4419         lock_limit >>= PAGE_SHIFT;
4420         locked = vma->vm_mm->pinned_vm + extra;
4421 
4422         if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
4423                 !capable(CAP_IPC_LOCK)) {
4424                 ret = -EPERM;
4425                 goto unlock;
4426         }
4427 
4428         WARN_ON(event->rb);
4429 
4430         if (vma->vm_flags & VM_WRITE)
4431                 flags |= RING_BUFFER_WRITABLE;
4432 
4433         rb = rb_alloc(nr_pages, 
4434                 event->attr.watermark ? event->attr.wakeup_watermark : 0,
4435                 event->cpu, flags);
4436 
4437         if (!rb) {
4438                 ret = -ENOMEM;
4439                 goto unlock;
4440         }
4441 
4442         atomic_set(&rb->mmap_count, 1);
4443         rb->mmap_locked = extra;
4444         rb->mmap_user = get_current_user();
4445 
4446         atomic_long_add(user_extra, &user->locked_vm);
4447         vma->vm_mm->pinned_vm += extra;
4448 
4449         ring_buffer_attach(event, rb);
4450 
4451         perf_event_init_userpage(event);
4452         perf_event_update_userpage(event);
4453 
4454 unlock:
4455         if (!ret)
4456                 atomic_inc(&event->mmap_count);
4457         mutex_unlock(&event->mmap_mutex);
4458 
4459         /*
4460          * Since pinned accounting is per vm we cannot allow fork() to copy our
4461          * vma.
4462          */
4463         vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
4464         vma->vm_ops = &perf_mmap_vmops;
4465 
4466         return ret;
4467 }
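
/*
 * Illustrative user-space sketch (not part of core.c): perf_mmap() enforces
 * the "one metadata page plus a power-of-two number of data pages" rule
 * checked above, so a typical mapping looks like this. Helper name is
 * hypothetical.
 */
#include <sys/mman.h>
#include <unistd.h>
#include <linux/perf_event.h>

static struct perf_event_mmap_page *map_ring(int perf_fd, int data_pages)
{
	size_t page = (size_t)sysconf(_SC_PAGESIZE);
	size_t len = (1 + data_pages) * page;	/* data_pages must be 2^n */
	void *base;

	base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, perf_fd, 0);
	if (base == MAP_FAILED)
		return NULL;

	/* The first page is the perf_event_mmap_page control structure. */
	return (struct perf_event_mmap_page *)base;
}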
4468 
4469 static int perf_fasync(int fd, struct file *filp, int on)
4470 {
4471         struct inode *inode = file_inode(filp);
4472         struct perf_event *event = filp->private_data;
4473         int retval;
4474 
4475         mutex_lock(&inode->i_mutex);
4476         retval = fasync_helper(fd, filp, on, &event->fasync);
4477         mutex_unlock(&inode->i_mutex);
4478 
4479         if (retval < 0)
4480                 return retval;
4481 
4482         return 0;
4483 }
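
/*
 * Illustrative user-space sketch (not part of core.c): perf_fasync() is
 * what makes O_ASYNC work on a perf fd, so a task can receive SIGIO on
 * ring-buffer wakeups instead of blocking in poll(). Helper name is
 * hypothetical.
 */
#include <fcntl.h>
#include <unistd.h>

static int request_sigio(int perf_fd)
{
	if (fcntl(perf_fd, F_SETOWN, getpid()) < 0)
		return -1;

	/* Enable async notification; SIGIO fires on ring-buffer wakeups. */
	return fcntl(perf_fd, F_SETFL, fcntl(perf_fd, F_GETFL) | O_ASYNC);
}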
4484 
4485 static const struct file_operations perf_fops = {
4486         .llseek                 = no_llseek,
4487         .release                = perf_release,
4488         .read                   = perf_read,
4489         .poll                   = perf_poll,
4490         .unlocked_ioctl         = perf_ioctl,
4491         .compat_ioctl           = perf_compat_ioctl,
4492         .mmap                   = perf_mmap,
4493         .fasync                 = perf_fasync,
4494 };
4495 
4496 /*
4497  * Perf event wakeup
4498  *
4499  * If there's data, ensure we set the poll() state and publish everything
4500  * to user-space before waking everybody up.
4501  */
4502 
4503 static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
4504 {
4505         /* only the parent has fasync state */
4506         if (event->parent)
4507                 event = event->parent;
4508         return &event->fasync;
4509 }
4510 
4511 void perf_event_wakeup(struct perf_event *event)
4512 {
4513         ring_buffer_wakeup(event);
4514 
4515         if (event->pending_kill) {
4516                 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
4517                 event->pending_kill = 0;
4518         }
4519 }
4520 
4521 static void perf_pending_event(struct irq_work *entry)
4522 {
4523         struct perf_event *event = container_of(entry,
4524                         struct perf_event, pending);
4525         int rctx;
4526 
4527         rctx = perf_swevent_get_recursion_context();
4528         /*
4529          * If we 'fail' here, that's OK, it means recursion is already disabled
4530          * and we won't recurse 'further'.
4531          */
4532 
4533         if (event->pending_disable) {
4534                 event->pending_disable = 0;
4535                 __perf_event_disable(event);
4536         }
4537 
4538         if (event->pending_wakeup) {
4539                 event->pending_wakeup = 0;
4540                 perf_event_wakeup(event);
4541         }
4542 
4543         if (rctx >= 0)
4544                 perf_swevent_put_recursion_context(rctx);
4545 }
4546 
4547 /*
4548  * We assume there is only KVM supporting the callbacks.
4549  * Later on, we might change it to a list if there is
4550  * another virtualization implementation supporting the callbacks.
4551  */
4552 struct perf_guest_info_callbacks *perf_guest_cbs;
4553 
4554 int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
4555 {
4556         perf_guest_cbs = cbs;
4557         return 0;
4558 }
4559 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
4560 
4561 int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
4562 {
4563         perf_guest_cbs = NULL;
4564         return 0;
4565 }
4566 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
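
/*
 * Illustrative kernel-side sketch (not part of core.c): a hypervisor module
 * would register callbacks roughly like this so that PMU interrupts can
 * attribute samples to guest context. The three callbacks are the ones
 * declared in struct perf_guest_info_callbacks (<linux/perf_event.h>); the
 * example_* helpers and their trivial bodies are hypothetical.
 */
static int example_is_in_guest(void)
{
	return 0;	/* hypothetical: query the hypervisor's state	*/
}

static int example_is_user_mode(void)
{
	return 0;	/* hypothetical: guest privilege-level check	*/
}

static unsigned long example_get_guest_ip(void)
{
	return 0;	/* hypothetical: guest instruction pointer	*/
}

static struct perf_guest_info_callbacks example_guest_cbs = {
	.is_in_guest	= example_is_in_guest,
	.is_user_mode	= example_is_user_mode,
	.get_guest_ip	= example_get_guest_ip,
};

/* A module would call perf_register_guest_info_callbacks(&example_guest_cbs)
 * on load and perf_unregister_guest_info_callbacks() on unload. */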
4567 
4568 static void
4569 perf_output_sample_regs(struct perf_output_handle *handle,
4570                         struct pt_regs *regs, u64 mask)
4571 {
4572         int bit;
4573 
4574         for_each_set_bit(bit, (const unsigned long *) &mask,
4575                          sizeof(mask) * BITS_PER_BYTE) {
4576                 u64 val;
4577 
4578                 val = perf_reg_value(regs, bit);
4579                 perf_output_put(handle, val);
4580         }
4581 }
4582 
4583 static void perf_sample_regs_user(struct perf_regs_user *regs_user,
4584                                   struct pt_regs *regs)
4585 {
4586         if (!user_mode(regs)) {
4587                 if (!(current->flags & PF_KTHREAD))
4588                         regs = task_pt_regs(current);
4589                 else
4590                         regs = NULL;
4591         }
4592 
4593         if (regs) {
4594                 regs_user->regs = regs;
4595                 regs_user->abi  = perf_reg_abi(current);
4596         }
4597 }
4598 
4599 /*
4600  * Get remaining task size from user stack pointer.
4601  *
4602  * It would be better to take the stack VMA map and limit this more
4603  * precisely, but there's no way to get it safely under interrupt,
4604  * so we use TASK_SIZE as the limit.
4605  */
4606 static u64 perf_ustack_task_size(struct pt_regs *regs)
4607 {
4608         unsigned long addr = perf_user_stack_pointer(regs);
4609 
4610         if (!addr || addr >= TASK_SIZE)
4611                 return 0;
4612 
4613         return TASK_SIZE - addr;
4614 }
4615 
4616 static u16
4617 perf_sample_ustack_size(u16 stack_size, u16 header_size,
4618                         struct pt_regs *regs)
4619 {
4620         u64 task_size;
4621 
4622         /* No regs, no stack pointer, no dump. */
4623         if (!regs)
4624                 return 0;
4625 
4626         /*
4627          * Check whether the requested stack size fits into:
4628          * - TASK_SIZE
4629          *   If it doesn't, we limit the size to TASK_SIZE.
4630          *
4631          * - the remaining sample size
4632          *   If it doesn't, we shrink the stack size to
4633          *   fit into the remaining sample size.
4634          */
4635 
4636         task_size  = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
4637         stack_size = min(stack_size, (u16) task_size);
4638 
4639         /* Current header size plus static size and dynamic size. */
4640         header_size += 2 * sizeof(u64);
4641 
4642         /* Do we fit in with the current stack dump size? */
4643         if ((u16) (header_size + stack_size) < header_size) {
4644                 /*
4645                  * If we overflow the maximum size for the sample,
4646                  * we customize the stack dump size to fit in.
4647                  */
4648                 stack_size = USHRT_MAX - header_size - sizeof(u64);
4649                 stack_size = round_up(stack_size, sizeof(u64));
4650         }
4651 
4652         return stack_size;
4653 }
4654 
4655 static void
4656 perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
4657                           struct pt_regs *regs)
4658 {
4659         /* Case of a kernel thread, nothing to dump */
4660         if (!regs) {
4661                 u64 size = 0;
4662                 perf_output_put(handle, size);
4663         } else {
4664                 unsigned long sp;
4665                 unsigned int rem;
4666                 u64 dyn_size;
4667 
4668                 /*
4669                  * We dump:
4670                  * static size
4671                  *   - the size requested by the user, or the best one we can
4672                  *     fit into the sample max size
4673                  * data
4674                  *   - user stack dump data
4675                  * dynamic size
4676                  *   - the actual dumped size
4677                  */
4678 
4679                 /* Static size. */
4680                 perf_output_put(handle, dump_size);
4681 
4682                 /* Data. */
4683                 sp = perf_user_stack_pointer(regs);
4684                 rem = __output_copy_user(handle, (void *) sp, dump_size);
4685                 dyn_size = dump_size - rem;
4686 
4687                 perf_output_skip(handle, rem);
4688 
4689                 /* Dynamic size. */
4690                 perf_output_put(handle, dyn_size);
4691         }
4692 }
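
/*
 * Illustrative sketch (not part of core.c): a reader-side view of the
 * record layout perf_output_sample_ustack() emits for PERF_SAMPLE_STACK_USER.
 * The struct name is made up for illustration; for kernel threads only the
 * leading size field (set to 0) is present.
 */
#include <stdint.h>

struct sample_stack_user_hdr {
	uint64_t size;		/* static dump size; 0 for kernel threads  */
	/* uint8_t data[size];	   raw user stack bytes follow		   */
	/* uint64_t dyn_size;	   actually-dumped bytes, only if size!=0  */
};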
4693 
4694 static void __perf_event_header__init_id(struct perf_event_header *header,
4695                                          struct perf_sample_data *data,
4696                                          struct perf_event *event)
4697 {
4698         u64 sample_type = event->attr.sample_type;
4699 
4700         data->type = sample_type;
4701         header->size += event->id_header_size;
4702 
4703         if (sample_type & PERF_SAMPLE_TID) {
4704                 /* namespace issues */
4705                 data->tid_entry.pid = perf_event_pid(event, current);
4706                 data->tid_entry.tid = perf_event_tid(event, current);
4707         }
4708 
4709         if (sample_type & PERF_SAMPLE_TIME)
4710                 data->time = perf_clock();
4711 
4712         if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
4713                 data->id = primary_event_id(event);
4714 
4715         if (sample_type & PERF_SAMPLE_STREAM_ID)
4716                 data->stream_id = event->id;
4717 
4718         if (sample_type & PERF_SAMPLE_CPU) {
4719                 data->cpu_entry.cpu      = raw_smp_processor_id();
4720                 data->cpu_entry.reserved = 0;
4721         }
4722 }
4723 
4724 void perf_event_header__init_id(struct perf_event_header *header,
4725                                 struct perf_sample_data *data,
4726                                 struct perf_event *event)
4727 {
4728         if (event->attr.sample_id_all)
4729                 __perf_event_header__init_id(header, data, event);
4730 }
4731 
4732 static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4733                                            struct perf_sample_data *data)
4734 {
4735         u64 sample_type = data->type;
4736 
4737         if (sample_type & PERF_SAMPLE_TID)
4738                 perf_output_put(handle, data->tid_entry);
4739 
4740         if (sample_type & PERF_SAMPLE_TIME)
4741                 perf_output_put(handle, data->time);
4742 
4743         if (sample_type & PERF_SAMPLE_ID)
4744                 perf_output_put(handle, data->id);
4745 
4746         if (sample_type & PERF_SAMPLE_STREAM_ID)
4747                 perf_output_put(handle, data->stream_id);
4748 
4749         if (sample_type & PERF_SAMPLE_CPU)
4750                 perf_output_put(handle, data->cpu_entry);
4751 
4752         if (sample_type & PERF_SAMPLE_IDENTIFIER)
4753                 perf_output_put(handle, data->id);
4754 }
4755 
4756 void perf_event__output_id_sample(struct perf_event *event,
4757                                   struct perf_output_handle *handle,
4758                                   struct perf_sample_data *sample)
4759 {
4760         if (event->attr.sample_id_all)
4761                 __perf_event__output_id_sample(handle, sample);
4762 }
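
/*
 * Illustrative sketch (not part of core.c): with attr.sample_id_all set,
 * __perf_event__output_id_sample() appends the fields below, in this order,
 * to non-sample records such as MMAP/COMM/FORK. Each field is present only
 * if the corresponding PERF_SAMPLE_* bit is set in attr.sample_type; the
 * struct name is made up for illustration.
 */
#include <stdint.h>

struct sample_id_trailer {
	uint32_t pid, tid;	/* PERF_SAMPLE_TID		*/
	uint64_t time;		/* PERF_SAMPLE_TIME		*/
	uint64_t id;		/* PERF_SAMPLE_ID		*/
	uint64_t stream_id;	/* PERF_SAMPLE_STREAM_ID	*/
	uint32_t cpu, res;	/* PERF_SAMPLE_CPU		*/
	uint64_t identifier;	/* PERF_SAMPLE_IDENTIFIER	*/
};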
4763 
4764 static void perf_output_read_one(struct perf_output_handle *handle,
4765                                  struct perf_event *event,
4766                                  u64 enabled, u64 running)
4767 {
4768         u64 read_format = event->attr.read_format;
4769         u64 values[4];
4770         int n = 0;
4771 
4772         values[n++] = perf_event_count(event);
4773         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
4774                 values[n++] = enabled +
4775                         atomic64_read(&event->child_total_time_enabled);
4776         }
4777         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
4778                 values[n++] = running +
4779                         atomic64_read(&event->child_total_time_running);
4780         }
4781         if (read_format & PERF_FORMAT_ID)
4782                 values[n++] = primary_event_id(event);
4783 
4784         __output_copy(handle, values, n * sizeof(u64));
4785 }
4786 
4787 static void perf_output_read_group(struct perf_output_handle *handle,
4788                             struct perf_event *event,
4789                             u64 enabled, u64 running)
4790 {
4791         struct perf_event *leader = event->group_leader, *sub;
4792         u64 read_format = event->attr.read_format;
4793         u64 values[5];
4794         int n = 0;
4795 
4796         values[n++] = 1 + leader->nr_siblings;
4797 
4798         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
4799                 values[n++] = enabled;
4800 
4801         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
4802                 values[n++] = running;
4803 
4804         if (leader != event)
4805                 leader->pmu->read(leader);
4806 
4807         values[n++] = perf_event_count(leader);
4808         if (read_format & PERF_FORMAT_ID)
4809                 values[n++] = primary_event_id(leader);
4810 
4811         __output_copy(handle, values, n * sizeof(u64));
4812 
4813         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4814                 n = 0;
4815 
4816                 if ((sub != event) &&
4817                     (sub->state == PERF_EVENT_STATE_ACTIVE))
4818                         sub->pmu->read(sub);
4819 
4820                 values[n++] = perf_event_count(sub);
4821                 if (read_format & PERF_FORMAT_ID)
4822                         values[n++] = primary_event_id(sub);
4823 
4824                 __output_copy(handle, values, n * sizeof(u64));
4825         }
4826 }
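
/*
 * Editorial sketch (not part of the original file): number of u64 words the
 * two helpers above emit for a given read_format, matching their field
 * order.  With PERF_FORMAT_GROUP the stream is { nr, [time_enabled],
 * [time_running], { value, [id] } * nr }, where nr counts the leader and all
 * siblings; otherwise it is { value, [time_enabled], [time_running], [id] }.
 * The helper name and its "nr" argument are assumptions made for this
 * example only.
 */
static inline int sketch_read_format_words(u64 read_format, int nr)
{
	int per_event = 1;			/* value */
	int words;

	if (read_format & PERF_FORMAT_ID)
		per_event++;			/* id */

	if (read_format & PERF_FORMAT_GROUP)
		words = 1 + nr * per_event;	/* leading nr */
	else
		words = per_event;

	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		words++;
	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		words++;

	return words;
}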
4827 
4828 #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
4829                                  PERF_FORMAT_TOTAL_TIME_RUNNING)
4830 
4831 /*
4832  * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
4833  *
4834  * The problem is that it's both hard and excessively expensive to iterate the
4835  * child list, not to mention that it's impossible to IPI the children running
4836  * on another CPU, from interrupt/NMI context.
4837  */
4838 static void perf_output_read(struct perf_output_handle *handle,
4839                              struct perf_event *event)
4840 {
4841         u64 enabled = 0, running = 0, now;
4842         u64 read_format = event->attr.read_format;
4843 
4844         /*
4845          * compute total_time_enabled, total_time_running
4846          * based on snapshot values taken when the event
4847          * was last scheduled in.
4848          *
4849          * we cannot simply call update_context_time()
4850          * because of locking issues, as we are called in
4851          * NMI context
4852          */
4853         if (read_format & PERF_FORMAT_TOTAL_TIMES)
4854                 calc_timer_values(event, &now, &enabled, &running);
4855 
4856         if (read_format & PERF_FORMAT_GROUP)
4857                 perf_output_read_group(handle, event, enabled, running);
4858         else
4859                 perf_output_read_one(handle, event, enabled, running);
4860 }
4861 
4862 void perf_output_sample(struct perf_output_handle *handle,
4863                         struct perf_event_header *header,
4864                         struct perf_sample_data *data,
4865                         struct perf_event *event)
4866 {
4867         u64 sample_type = data->type;
4868 
4869         perf_output_put(handle, *header);
4870 
4871         if (sample_type & PERF_SAMPLE_IDENTIFIER)
4872                 perf_output_put(handle, data->id);
4873 
4874         if (sample_type & PERF_SAMPLE_IP)
4875                 perf_output_put(handle, data->ip);
4876 
4877         if (sample_type & PERF_SAMPLE_TID)
4878                 perf_output_put(handle, data->tid_entry);
4879 
4880         if (sample_type & PERF_SAMPLE_TIME)
4881                 perf_output_put(handle, data->time);
4882 
4883         if (sample_type & PERF_SAMPLE_ADDR)
4884                 perf_output_put(handle, data->addr);
4885 
4886         if (sample_type & PERF_SAMPLE_ID)
4887                 perf_output_put(handle, data->id);
4888 
4889         if (sample_type & PERF_SAMPLE_STREAM_ID)
4890                 perf_output_put(handle, data->stream_id);
4891 
4892         if (sample_type & PERF_SAMPLE_CPU)
4893                 perf_output_put(handle, data->cpu_entry);
4894 
4895         if (sample_type & PERF_SAMPLE_PERIOD)
4896                 perf_output_put(handle, data->period);
4897 
4898         if (sample_type & PERF_SAMPLE_READ)
4899                 perf_output_read(handle, event);
4900 
4901         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
4902                 if (data->callchain) {
4903                         int size = 1;
4904 
4905                         if (data->callchain)
4906                                 size += data->callchain->nr;
4907 
4908                         size *= sizeof(u64);
4909 
4910                         __output_copy(handle, data->callchain, size);
4911                 } else {
4912                         u64 nr = 0;
4913                         perf_output_put(handle, nr);
4914                 }
4915         }
4916 
4917         if (sample_type & PERF_SAMPLE_RAW) {
4918                 if (data->raw) {
4919                         perf_output_put(handle, data->raw->size);
4920                         __output_copy(handle, data->raw->data,
4921                                            data->raw->size);
4922                 } else {
4923                         struct {
4924                                 u32     size;
4925                                 u32     data;
4926                         } raw = {
4927                                 .size = sizeof(u32),
4928                                 .data = 0,
4929                         };
4930                         perf_output_put(handle, raw);
4931                 }
4932         }
4933 
4934         if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
4935                 if (data->br_stack) {
4936                         size_t size;
4937 
4938                         size = data->br_stack->nr
4939                              * sizeof(struct perf_branch_entry);
4940 
4941                         perf_output_put(handle, data->br_stack->nr);
4942                         perf_output_copy(handle, data->br_stack->entries, size);
4943                 } else {
4944                         /*
4945                          * we always store at least the value of nr
4946                          */
4947                         u64 nr = 0;
4948                         perf_output_put(handle, nr);
4949                 }
4950         }
4951 
4952         if (sample_type & PERF_SAMPLE_REGS_USER) {
4953                 u64 abi = data->regs_user.abi;
4954 
4955                 /*
4956                  * If there are no regs to dump, notice it through
4957                  * the first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
4958                  */
4959                 perf_output_put(handle, abi);
4960 
4961                 if (abi) {
4962                         u64 mask = event->attr.sample_regs_user;
4963                         perf_output_sample_regs(handle,
4964                                                 data->regs_user.regs,
4965                                                 mask);
4966                 }
4967         }
4968 
4969         if (sample_type & PERF_SAMPLE_STACK_USER) {
4970                 perf_output_sample_ustack(handle,
4971                                           data->stack_user_size,
4972                                           data->regs_user.regs);
4973         }
4974 
4975         if (sample_type & PERF_SAMPLE_WEIGHT)
4976                 perf_output_put(handle, data->weight);
4977 
4978         if (sample_type & PERF_SAMPLE_DATA_SRC)
4979                 perf_output_put(handle, data->data_src.val);
4980 
4981         if (sample_type & PERF_SAMPLE_TRANSACTION)
4982                 perf_output_put(handle, data->txn);
4983 
4984         if (!event->attr.watermark) {
4985                 int wakeup_events = event->attr.wakeup_events;
4986 
4987                 if (wakeup_events) {
4988                         struct ring_buffer *rb = handle->rb;
4989                         int events = local_inc_return(&rb->events);
4990 
4991                         if (events >= wakeup_events) {
4992                                 local_sub(wakeup_events, &rb->events);
4993                                 local_inc(&rb->wakeup);
4994                         }
4995                 }
4996         }
4997 }
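
/*
 * Editorial sketch (not part of the original file): the wakeup_events
 * accounting at the end of perf_output_sample() (used only when
 * attr.watermark is not set) is what wakes a blocking reader every N
 * samples.  The fragment below is an assumed userspace-side example, kept
 * under #if 0 so it is not built here; the event choice, period, buffer
 * size and helper name are arbitrary illustrative values.
 */
#if 0
#include <poll.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int sketch_wait_for_samples(void)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_HARDWARE,
		.size		= sizeof(attr),
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.sample_period	= 100000,
		.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID,
		.wakeup_events	= 16,	/* wake the poller every 16 samples */
		.disabled	= 1,
	};
	struct pollfd pfd;
	void *rb;
	int fd;

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		return -1;

	/* 1 metadata page + 8 data pages */
	rb = mmap(NULL, 9 * 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (rb == MAP_FAILED)
		return -1;

	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);

	pfd.fd = fd;
	pfd.events = POLLIN;
	return poll(&pfd, 1, -1);	/* returns once the wakeup above fires */
}
#endif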
4998 
4999 void perf_prepare_sample(struct perf_event_header *header,
5000                          struct perf_sample_data *data,
5001                          struct perf_event *event,
5002                          struct pt_regs *regs)
5003 {
5004         u64 sample_type = event->attr.sample_type;
5005 
5006         header->type = PERF_RECORD_SAMPLE;
5007         header->size = sizeof(*header) + event->header_size;
5008 
5009         header->misc = 0;
5010         header->misc |= perf_misc_flags(regs);
5011 
5012         __perf_event_header__init_id(header, data, event);
5013 
5014         if (sample_type & PERF_SAMPLE_IP)
5015                 data->ip = perf_instruction_pointer(regs);
5016 
5017         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5018                 int size = 1;
5019 
5020                 data->callchain = perf_callchain(event, regs);
5021 
5022                 if (data->callchain)
5023                         size += data->callchain->nr;
5024 
5025                 header->size += size * sizeof(u64);
5026         }
5027 
5028         if (sample_type & PERF_SAMPLE_RAW) {
5029                 int size = sizeof(u32);
5030 
5031                 if (data->raw)
5032                         size += data->raw->size;
5033                 else
5034                         size += sizeof(u32);
5035 
5036                 WARN_ON_ONCE(size & (sizeof(u64)-1));
5037                 header->size += size;
5038         }
5039 
5040         if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5041                 int size = sizeof(u64); /* nr */
5042                 if (data->br_stack) {
5043                         size += data->br_stack->nr
5044                               * sizeof(struct perf_branch_entry);
5045                 }
5046                 header->size += size;
5047         }
5048 
5049         if (sample_type & PERF_SAMPLE_REGS_USER) {
5050                 /* regs dump ABI info */
5051                 int size = sizeof(u64);
5052 
5053                 perf_sample_regs_user(&data->regs_user, regs);
5054 
5055                 if (data->regs_user.regs) {
5056                         u64 mask = event->attr.sample_regs_user;
5057                         size += hweight64(mask) * sizeof(u64);
5058                 }
5059 
5060                 header->size += size;
5061         }
5062 
5063         if (sample_type & PERF_SAMPLE_STACK_USER) {
5064                 /*
5065                  * Either the PERF_SAMPLE_STACK_USER bit needs to always be
5066                  * processed as the last one, or an additional check must be
5067                  * added in case a new sample type is added, because we could
5068                  * eat up the rest of the sample size.
5069                  */
5070                 struct perf_regs_user *uregs = &data->regs_user;
5071                 u16 stack_size = event->attr.sample_stack_user;
5072                 u16 size = sizeof(u64);
5073 
5074                 if (!uregs->abi)
5075                         perf_sample_regs_user(uregs, regs);
5076 
5077                 stack_size = perf_sample_ustack_size(stack_size, header->size,
5078                                                      uregs->regs);
5079 
5080                 /*
5081                  * If there is something to dump, add space for the dump
5082                  * itself and for the field that tells the dynamic size,
5083                  * which is how many bytes were actually dumped.
5084                  */
5085                 if (stack_size)
5086                         size += sizeof(u64) + stack_size;
5087 
5088                 data->stack_user_size = stack_size;
5089                 header->size += size;
5090         }
5091 }
5092 
5093 static void perf_event_output(struct perf_event *event,
5094                                 struct perf_sample_data *data,
5095                                 struct pt_regs *regs)
5096 {
5097         struct perf_output_handle handle;
5098         struct perf_event_header header;
5099 
5100         /* protect the callchain buffers */
5101         rcu_read_lock();
5102 
5103         perf_prepare_sample(&header, data, event, regs);
5104 
5105         if (perf_output_begin(&handle, event, header.size))
5106                 goto exit;
5107 
5108         perf_output_sample(&handle, &header, data, event);
5109 
5110         perf_output_end(&handle);
5111 
5112 exit:
5113         rcu_read_unlock();
5114 }
5115 
5116 /*
5117  * read event_id
5118  */
5119 
5120 struct perf_read_event {
5121         struct perf_event_header        header;
5122 
5123         u32                             pid;
5124         u32                             tid;
5125 };
5126 
5127 static void
5128 perf_event_read_event(struct perf_event *event,
5129                         struct task_struct *task)
5130 {
5131         struct perf_output_handle handle;
5132         struct perf_sample_data sample;
5133         struct perf_read_event read_event = {
5134                 .header = {
5135                         .type = PERF_RECORD_READ,
5136                         .misc = 0,
5137                         .size = sizeof(read_event) + event->read_size,
5138                 },
5139                 .pid = perf_event_pid(event, task),
5140                 .tid = perf_event_tid(event, task),
5141         };
5142         int ret;
5143 
5144         perf_event_header__init_id(&read_event.header, &sample, event);
5145         ret = perf_output_begin(&handle, event, read_event.header.size);
5146         if (ret)
5147                 return;
5148 
5149         perf_output_put(&handle, read_event);
5150         perf_output_read(&handle, event);
5151         perf_event__output_id_sample(event, &handle, &sample);
5152 
5153         perf_output_end(&handle);
5154 }
5155 
5156 typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
5157 
5158 static void
5159 perf_event_aux_ctx(struct perf_event_context *ctx,
5160                    perf_event_aux_output_cb output,
5161                    void *data)
5162 {
5163         struct perf_event *event;
5164 
5165         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
5166                 if (event->state < PERF_EVENT_STATE_INACTIVE)
5167                         continue;
5168                 if (!event_filter_match(event))
5169                         continue;
5170                 output(event, data);
5171         }
5172 }
5173 
5174 static void
5175 perf_event_aux(perf_event_aux_output_cb output, void *data,
5176                struct perf_event_context *task_ctx)
5177 {
5178         struct perf_cpu_context *cpuctx;
5179         struct perf_event_context *ctx;
5180         struct pmu *pmu;
5181         int ctxn;
5182 
5183         rcu_read_lock();
5184         list_for_each_entry_rcu(pmu, &pmus, entry) {
5185                 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
5186                 if (cpuctx->unique_pmu != pmu)
5187                         goto next;
5188                 perf_event_aux_ctx(&cpuctx->ctx, output, data);
5189                 if (task_ctx)
5190                         goto next;
5191                 ctxn = pmu->task_ctx_nr;
5192                 if (ctxn < 0)
5193                         goto next;
5194                 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
5195                 if (ctx)
5196                         perf_event_aux_ctx(ctx, output, data);
5197 next:
5198                 put_cpu_ptr(pmu->pmu_cpu_context);
5199         }
5200 
5201         if (task_ctx) {
5202                 preempt_disable();
5203                 perf_event_aux_ctx(task_ctx, output, data);
5204                 preempt_enable();
5205         }
5206         rcu_read_unlock();
5207 }
5208 
5209 /*
5210  * task tracking -- fork/exit
5211  *
5212  * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
5213  */
5214 
5215 struct perf_task_event {
5216         struct task_struct              *task;
5217         struct perf_event_context       *task_ctx;
5218 
5219         struct {
5220                 struct perf_event_header        header;
5221 
5222                 u32                             pid;
5223                 u32                             ppid;
5224                 u32                             tid;
5225                 u32                             ptid;
5226                 u64                             time;
5227         } event_id;
5228 };
5229 
5230 static int perf_event_task_match(struct perf_event *event)
5231 {
5232         return event->attr.comm  || event->attr.mmap ||
5233                event->attr.mmap2 || event->attr.mmap_data ||
5234                event->attr.task;
5235 }
5236 
5237 static void perf_event_task_output(struct perf_event *event,
5238                                    void *data)
5239 {
5240         struct perf_task_event *task_event = data;
5241         struct perf_output_handle handle;
5242         struct perf_sample_data sample;
5243         struct task_struct *task = task_event->task;
5244         int ret, size = task_event->event_id.header.size;
5245 
5246         if (!perf_event_task_match(event))
5247                 return;
5248 
5249         perf_event_header__init_id(&task_event->event_id.header, &sample, event);
5250 
5251         ret = perf_output_begin(&handle, event,
5252                                 task_event->event_id.header.size);
5253         if (ret)
5254                 goto out;
5255 
5256         task_event->event_id.pid = perf_event_pid(event, task);
5257         task_event->event_id.ppid = perf_event_pid(event, current);
5258 
5259         task_event->event_id.tid = perf_event_tid(event, task);
5260         task_event->event_id.ptid = perf_event_tid(event, current);
5261 
5262         perf_output_put(&handle, task_event->event_id);
5263 
5264         perf_event__output_id_sample(event, &handle, &sample);
5265 
5266         perf_output_end(&handle);
5267 out:
5268         task_event->event_id.header.size = size;
5269 }
5270 
5271 static void perf_event_task(struct task_struct *task,
5272                               struct perf_event_context *task_ctx,
5273                               int new)
5274 {
5275         struct perf_task_event task_event;
5276 
5277         if (!atomic_read(&nr_comm_events) &&
5278             !atomic_read(&nr_mmap_events) &&
5279             !atomic_read(&nr_task_events))
5280                 return;
5281 
5282         task_event = (struct perf_task_event){
5283                 .task     = task,
5284                 .task_ctx = task_ctx,
5285                 .event_id    = {
5286                         .header = {
5287                                 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
5288                                 .misc = 0,
5289                                 .size = sizeof(task_event.event_id),
5290                         },
5291                         /* .pid  */
5292                         /* .ppid */
5293                         /* .tid  */
5294                         /* .ptid */
5295                         .time = perf_clock(),
5296                 },
5297         };
5298 
5299         perf_event_aux(perf_event_task_output,
5300                        &task_event,
5301                        task_ctx);
5302 }
5303 
5304 void perf_event_fork(struct task_struct *task)
5305 {
5306         perf_event_task(task, NULL, 1);
5307 }
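
/*
 * Editorial sketch (not part of the original file): as perf_event_task_match()
 * above shows, PERF_RECORD_FORK/PERF_RECORD_EXIT are delivered to any event
 * that has one of attr.comm, attr.mmap, attr.mmap2, attr.mmap_data or
 * attr.task set.  An assumed userspace-side attr for pure task tracking,
 * kept under #if 0 and with illustrative values only:
 */
#if 0
#include <linux/perf_event.h>

static const struct perf_event_attr sketch_task_attr = {
	.type		= PERF_TYPE_SOFTWARE,
	.size		= sizeof(struct perf_event_attr),
	.config		= PERF_COUNT_SW_DUMMY,
	.task		= 1,	/* request fork/exit records */
	.sample_id_all	= 1,	/* append the sample_id trailer */
	.disabled	= 1,
};
#endif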
5308 
5309 /*
5310  * comm tracking
5311  */
5312 
5313 struct perf_comm_event {
5314         struct task_struct      *task;
5315         char                    *comm;
5316         int                     comm_size;
5317 
5318         struct {
5319                 struct perf_event_header        header;
5320 
5321                 u32                             pid;
5322                 u32                             tid;
5323         } event_id;
5324 };
5325 
5326 static int perf_event_comm_match(struct perf_event *event)
5327 {
5328         return event->attr.comm;
5329 }
5330 
5331 static void perf_event_comm_output(struct perf_event *event,
5332                                    void *data)
5333 {
5334         struct perf_comm_event *comm_event = data;
5335         struct perf_output_handle handle;
5336         struct perf_sample_data sample;
5337         int size = comm_event->event_id.header.size;
5338         int ret;
5339 
5340         if (!perf_event_comm_match(event))
5341                 return;
5342 
5343         perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
5344         ret = perf_output_begin(&handle, event,
5345                                 comm_event->event_id.header.size);
5346 
5347         if (ret)
5348                 goto out;
5349 
5350         comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
5351         comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
5352 
5353         perf_output_put(&handle, comm_event->event_id);
5354         __output_copy(&handle, comm_event->comm,
5355                                    comm_event->comm_size);
5356 
5357         perf_event__output_id_sample(event, &handle, &sample);
5358 
5359         perf_output_end(&handle);
5360 out:
5361         comm_event->event_id.header.size = size;
5362 }
5363 
5364 static void perf_event_comm_event(struct perf_comm_event *comm_event)
5365 {
5366         char comm[TASK_COMM_LEN];
5367         unsigned int size;
5368 
5369         memset(comm, 0, sizeof(comm));
5370         strlcpy(comm, comm_event->task->comm, sizeof(comm));
5371         size = ALIGN(strlen(comm)+1, sizeof(u64));
5372 
5373         comm_event->comm = comm;
5374         comm_event->comm_size = size;
5375 
5376         comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
5377 
5378         perf_event_aux(perf_event_comm_output,
5379                        comm_event,
5380                        NULL);
5381 }
5382 
5383 void perf_event_comm(struct task_struct *task, bool exec)
5384 {
5385         struct perf_comm_event comm_event;
5386 
5387         if (!atomic_read(&nr_comm_events))
5388                 return;
5389 
5390         comm_event = (struct perf_comm_event){
5391                 .task   = task,
5392                 /* .comm      */
5393                 /* .comm_size */
5394                 .event_id  = {
5395                         .header = {
5396                                 .type = PERF_RECORD_COMM,
5397                                 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
5398                                 /* .size */
5399                         },
5400                         /* .pid */
5401                         /* .tid */
5402                 },
5403         };
5404 
5405         perf_event_comm_event(&comm_event);
5406 }
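
/*
 * Editorial sketch (not part of the original file): the PERF_RECORD_COMM
 * record built above is { header, u32 pid, u32 tid, comm string padded to a
 * multiple of 8 bytes, optional sample_id trailer }, with the header.misc
 * bit PERF_RECORD_MISC_COMM_EXEC set when the rename came from exec().  The
 * struct below is an assumed consumer-side view, kept under #if 0; its name
 * is made up for this example.
 */
#if 0
#include <linux/types.h>
#include <linux/perf_event.h>

struct sketch_comm_record {
	struct perf_event_header header;	/* PERF_RECORD_COMM */
	__u32	pid, tid;
	char	comm[];		/* NUL-terminated, padded to 8 bytes */
	/* sample_id trailer follows if attr.sample_id_all is set */
};
#endif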
5407 
5408 /*
5409  * mmap tracking
5410  */
5411 
5412 struct perf_mmap_event {
5413         struct vm_area_struct   *vma;
5414 
5415         const char              *file_name;
5416         int                     file_size;
5417         int                     maj, min;
5418         u64                     ino;
5419         u64                     ino_generation;
5420         u32                     prot, flags;
5421 
5422         struct {
5423                 struct perf_event_header        header;
5424 
5425                 u32                             pid;
5426                 u32                             tid;
5427                 u64                             start;
5428                 u64                             len;
5429                 u64                             pgoff;
5430         } event_id;
5431 };
5432 
5433 static int perf_event_mmap_match(struct perf_event *event,
5434                                  void *data)
5435 {
5436         struct perf_mmap_event *mmap_event = data;
5437         struct vm_area_struct *vma = mmap_event->vma;
5438         int executable = vma->vm_flags & VM_EXEC;
5439 
5440         return (!executable && event->attr.mmap_data) ||
5441                (executable && (event->attr.mmap || event->attr.mmap2));
5442 }
5443 
5444 static void perf_event_mmap_output(struct perf_event *event,
5445                                    void *data)
5446 {
5447         struct perf_mmap_event *mmap_event = data;
5448         struct perf_output_handle handle;
5449         struct perf_sample_data sample;
5450         int size = mmap_event->event_id.header.size;
5451         u32 type = mmap_event->event_id.header.type;
5452         int ret;
5453 
5454         if (!perf_event_mmap_match(event, data))
5455                 return;
5456 
5457         if (event->attr.mmap2) {
5458                 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
5459                 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
5460                 mmap_event->event_id.header.size += sizeof(mmap_event->min);
5461                 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
5462                 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
5463                 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
5464                 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
5465         }
5466 
5467         perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
5468         ret = perf_output_begin(&handle, event,
5469                                 mmap_event->event_id.header.size);
5470         if (ret)
5471                 goto out;
5472 
5473         mmap_event->event_id.pid = perf_event_pid(event, current);
5474         mmap_event->event_id.tid = perf_event_tid(event, current);
5475 
5476         perf_output_put(&handle, mmap_event->event_id);
5477 
5478         if (event->attr.mmap2) {
5479                 perf_output_put(&handle, mmap_event->maj);
5480                 perf_output_put(&handle, mmap_event->min);
5481                 perf_output_put(&handle, mmap_event->ino);
5482                 perf_output_put(&handle, mmap_event->ino_generation);
5483                 perf_output_put(&handle, mmap_event->prot);
5484                 perf_output_put(&handle, mmap_event->flags);
5485         }
5486 
5487         __output_copy(&handle, mmap_event->file_name,
5488                                    mmap_event->file_size);
5489 
5490         perf_event__output_id_sample(event, &handle, &sample);
5491 
5492         perf_output_end(&handle);
5493 out:
5494         mmap_event->event_id.header.size = size;
5495         mmap_event->event_id.header.type = type;
5496 }
5497 
5498 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5499 {
5500         struct vm_area_struct *vma = mmap_event->vma;
5501         struct file *file = vma->vm_file;
5502         int maj = 0, min = 0;
5503         u64 ino = 0, gen = 0;
5504         u32 prot = 0, flags = 0;
5505         unsigned int size;
5506         char tmp[16];
5507         char *buf = NULL;
5508         char *name;
5509 
5510         if (vma->vm_flags & VM_READ)
5511                 prot |= PROT_READ;
5512         if (vma->vm_flags & VM_WRITE)
5513                 prot |= PROT_WRITE;
5514         if (vma->vm_flags & VM_EXEC)
5515                 prot |= PROT_EXEC;
5516 
5517         if (vma->vm_flags & VM_MAYSHARE)
5518                 flags = MAP_SHARED;
5519         else
5520                 flags = MAP_PRIVATE;
5521 
5522         if (vma->vm_flags & VM_DENYWRITE)
5523                 flags |= MAP_DENYWRITE;
5524         if (vma->vm_flags & VM_MAYEXEC)
5525                 flags |= MAP_EXECUTABLE;
5526         if (vma->vm_flags & VM_LOCKED)
5527                 flags |= MAP_LOCKED;
5528         if (vma->vm_flags & VM_HUGETLB)
5529                 flags |= MAP_HUGETLB;
5530 
5531         if (file) {
5532                 struct inode *inode;
5533                 dev_t dev;
5534 
5535                 buf = kmalloc(PATH_MAX, GFP_KERNEL);
5536                 if (!buf) {
5537                         name = "//enomem";
5538                         goto cpy_name;
5539                 }
5540                 /*
5541                  * d_path() works from the end of the buffer backwards, so we
5542                  * need to add enough zero bytes after the string to handle
5543                  * the 64-bit alignment we do later.
5544                  */
5545                 name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
5546                 if (IS_ERR(name)) {
5547                         name = "//toolong";
5548                         goto cpy_name;
5549                 }
5550                 inode = file_inode(vma->vm_file);
5551                 dev = inode->i_sb->s_dev;
5552                 ino = inode->i_ino;
5553                 gen = inode->i_generation;
5554                 maj = MAJOR(dev);
5555                 min = MINOR(dev);
5556 
5557                 goto got_name;
5558         } else {
5559                 name = (char *)arch_vma_name(vma);
5560                 if (name)
5561                         goto cpy_name;
5562 
5563                 if (vma->vm_start <= vma->vm_mm->start_brk &&
5564                                 vma->vm_end >= vma->vm_mm->brk) {
5565                         name = "[heap]";
5566                         goto cpy_name;
5567                 }
5568                 if (vma->vm_start <= vma->vm_mm->start_stack &&
5569                                 vma->vm_end >= vma->vm_mm->start_stack) {
5570                         name = "[stack]";
5571                         goto cpy_name;
5572                 }
5573 
5574                 name = "//anon";
5575                 goto cpy_name;
5576         }
5577 
5578 cpy_name:
5579         strlcpy(tmp, name, sizeof(tmp));
5580         name = tmp;
5581 got_name:
5582         /*
5583          * Since our buffer works in 8-byte units, we need to align our string
5584          * size to a multiple of 8. However, we must guarantee the tail end is
5585          * zeroed out to avoid leaking random bits to userspace.
5586          */
5587         size = strlen(name)+1;
5588         while (!IS_ALIGNED(size, sizeof(u64)))
5589                 name[size++] = '\0';
5590 
5591         mmap_event->file_name = name;
5592         mmap_event->file_size = size;
5593         mmap_event->maj = maj;
5594         mmap_event->min = min;
5595         mmap_event->ino = ino;
5596         mmap_event->ino_generation = gen;
5597         mmap_event->prot = prot;
5598         mmap_event->flags = flags;
5599 
5600         if (!(vma->vm_flags & VM_EXEC))
5601                 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
5602 
5603         mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
5604 
5605         perf_event_aux(perf_event_mmap_output,
5606                        mmap_event,
5607                        NULL);
5608 
5609         kfree(buf);
5610 }
5611 
5612 void perf_event_mmap(struct vm_area_struct *vma)
5613 {
5614         struct perf_mmap_event mmap_event;
5615 
5616         if (!atomic_read(&nr_mmap_events))
5617                 return;
5618 
5619         mmap_event = (struct perf_mmap_event){
5620                 .vma    = vma,
5621                 /* .file_name */
5622                 /* .file_size */
5623                 .event_id  = {
5624                         .header = {
5625                                 .type = PERF_RECORD_MMAP,
5626                                 .misc = PERF_RECORD_MISC_USER,
5627                                 /* .size */
5628                         },
5629                         /* .pid */
5630                         /* .tid */
5631                         .start  = vma->vm_start,
5632                         .len    = vma->vm_end - vma->vm_start,
5633                         .pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
5634                 },
5635                 /* .maj (attr_mmap2 only) */
5636                 /* .min (attr_mmap2 only) */
5637                 /* .ino (attr_mmap2 only) */
5638                 /* .ino_generation (attr_mmap2 only) */
5639                 /* .prot (attr_mmap2 only) */
5640                 /* .flags (attr_mmap2 only) */
5641         };
5642 
5643         perf_event_mmap_event(&mmap_event);
5644 }
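
/*
 * Editorial sketch (not part of the original file): for consumers that set
 * attr.mmap2, perf_event_mmap_output() above upgrades the record to
 * PERF_RECORD_MMAP2 and inserts maj/min/ino/ino_generation/prot/flags
 * between pgoff and the file name.  An assumed consumer-side view, kept
 * under #if 0; the struct name is made up for this example.
 */
#if 0
#include <linux/types.h>
#include <linux/perf_event.h>

struct sketch_mmap2_record {
	struct perf_event_header header;	/* PERF_RECORD_MMAP2 */
	__u32	pid, tid;
	__u64	addr;			/* vma->vm_start */
	__u64	len;			/* vma->vm_end - vma->vm_start */
	__u64	pgoff;			/* file offset in bytes */
	__u32	maj, min;		/* backing device */
	__u64	ino, ino_generation;	/* backing inode */
	__u32	prot, flags;		/* PROT_ and MAP_ flags of the mapping */
	char	filename[];		/* NUL-terminated, padded to 8 bytes */
	/* sample_id trailer follows if attr.sample_id_all is set */
};
#endif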
5645 
5646 /*
5647  * IRQ throttle logging
5648  */
5649 
5650 static void perf_log_throttle(struct perf_event *event, int enable)
5651 {
5652         struct perf_output_handle handle;
5653         struct perf_sample_data sample;
5654         int ret;
5655 
5656         struct {
5657                 struct perf_event_header        header;
5658                 u64                             time;
5659                 u64                             id;
5660                 u64                             stream_id;
5661         } throttle_event = {
5662                 .header = {
5663                         .type = PERF_RECORD_THROTTLE,
5664                         .misc = 0,
5665                         .size = sizeof(throttle_event),
5666                 },
5667                 .time           = perf_clock(),
5668                 .id             = primary_event_id(event),
5669                 .stream_id      = event->id,
5670         };
5671 
5672         if (enable)
5673                 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
5674 
5675         perf_event_header__init_id(&throttle_event.header, &sample, event);
5676 
5677         ret = perf_output_begin(&handle, event,
5678                                 throttle_event.header.size);
5679         if (ret)
5680                 return;
5681 
5682         perf_output_put(&handle, throttle_event);
5683         perf_event__output_id_sample(event, &handle, &sample);
5684         perf_output_end(&handle);
5685 }
5686 
5687 /*
5688  * Generic event overflow handling, sampling.
5689  */
5690 
5691 static int __perf_event_overflow(struct perf_event *event,
5692                                    int throttle, struct perf_sample_data *data,
5693                                    struct pt_regs *regs)
5694 {
5695         int events = atomic_read(&event->event_limit);
5696         struct hw_perf_event *hwc = &event->hw;
5697         u64 seq;
5698         int ret = 0;
5699 
5700         /*
5701          * Non-sampling counters might still use the PMI to fold short
5702          * hardware counters; ignore those.
5703          */
5704         if (unlikely(!is_sampling_event(event)))
5705                 return 0;
5706 
5707         seq = __this_cpu_read(perf_throttled_seq);
5708         if (seq != hwc->interrupts_seq) {
5709                 hwc->interrupts_seq = seq;
5710                 hwc->interrupts = 1;
5711         } else {
5712                 hwc->interrupts++;
5713                 if (unlikely(throttle
5714                              && hwc->interrupts >= max_samples_per_tick)) {
5715                         __this_cpu_inc(perf_throttled_count);
5716                         hwc->interrupts = MAX_INTERRUPTS;
5717                         perf_log_throttle(event, 0);
5718                         tick_nohz_full_kick();
5719                         ret = 1;
5720                 }
5721         }
5722 
5723         if (event->attr.freq) {
5724                 u64 now = perf_clock();
5725                 s64 delta = now - hwc->freq_time_stamp;
5726 
5727                 hwc->freq_time_stamp = now;
5728 
5729                 if (delta > 0 && delta < 2*TICK_NSEC)
5730                         perf_adjust_period(event, delta, hwc->last_period, true);
5731         }
5732 
5733         /*
5734          * XXX event_limit might not quite work as expected on inherited
5735          * events
5736          */
5737 
5738         event->pending_kill = POLL_IN;
5739         if (events && atomic_dec_and_test(&event->event_limit)) {
5740                 ret = 1;
5741                 event->pending_kill = POLL_HUP;
5742                 event->pending_disable = 1;
5743                 irq_work_queue(&event->pending);
5744         }
5745 
5746         if (event->overflow_handler)
5747                 event->overflow_handler(event, data, regs);
5748         else
5749                 perf_event_output(event, data, regs);
5750 
5751         if (*perf_event_fasync(event) && event->pending_kill) {
5752                 event->pending_wakeup = 1;
5753                 irq_work_queue(&event->pending);
5754         }
5755 
5756         return ret;
5757 }
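
/*
 * Editorial sketch (not part of the original file): the event_limit counted
 * down above is armed from userspace with PERF_EVENT_IOC_REFRESH; each
 * overflow consumes one count, and when it reaches zero the event disables
 * itself and wakes/signals the owner (pending_kill is POLL_HUP).  An assumed
 * userspace-side fragment, kept under #if 0:
 */
#if 0
#include <sys/ioctl.h>
#include <linux/perf_event.h>

static int sketch_rearm_once(int fd)
{
	/* allow exactly one more overflow, then the event is auto-disabled */
	return ioctl(fd, PERF_EVENT_IOC_REFRESH, 1);
}
#endif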
5758 
5759 int perf_event_overflow(struct perf_event *event,
5760                           struct perf_sample_data *data,
5761                           struct pt_regs *regs)
5762 {
5763         return __perf_event_overflow(event, 1, data, regs);
5764 }
5765 
5766 /*
5767  * Generic software event infrastructure
5768  */
5769 
5770 struct swevent_htable {
5771         struct swevent_hlist            *swevent_hlist;
5772         struct mutex                    hlist_mutex;
5773         int                             hlist_refcount;
5774 
5775         /* Recursion avoidance in each context */
5776         int                             recursion[PERF_NR_CONTEXTS];
5777 };
5778 
5779 static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
5780 
5781 /*
5782  * We directly increment event->count and keep a second value in
5783  * event->hw.period_left to count intervals. This period counter
5784  * is kept in the range [-sample_period, 0] so that we can use the
5785  * sign as a trigger.
5786  */
5787 
5788 u64 perf_swevent_set_period(struct perf_event *event)
5789 {
5790         struct hw_perf_event *hwc = &event->hw;
5791         u64 period = hwc->last_period;
5792         u64 nr, offset;
5793         s64 old, val;
5794 
5795         hwc->last_period = hwc->sample_period;
5796 
5797 again:
5798         old = val = local64_read(&hwc->period_left);
5799         if (val < 0)