TOMOYO Linux Cross Reference
Linux/kernel/events/core.c

  1 /*
  2  * Performance events core code:
  3  *
  4  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
  5  *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
  6  *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
  7  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  8  *
  9  * For licensing details see kernel-base/COPYING
 10  */
 11 
 12 #include <linux/fs.h>
 13 #include <linux/mm.h>
 14 #include <linux/cpu.h>
 15 #include <linux/smp.h>
 16 #include <linux/idr.h>
 17 #include <linux/file.h>
 18 #include <linux/poll.h>
 19 #include <linux/slab.h>
 20 #include <linux/hash.h>
 21 #include <linux/tick.h>
 22 #include <linux/sysfs.h>
 23 #include <linux/dcache.h>
 24 #include <linux/percpu.h>
 25 #include <linux/ptrace.h>
 26 #include <linux/reboot.h>
 27 #include <linux/vmstat.h>
 28 #include <linux/device.h>
 29 #include <linux/export.h>
 30 #include <linux/vmalloc.h>
 31 #include <linux/hardirq.h>
 32 #include <linux/rculist.h>
 33 #include <linux/uaccess.h>
 34 #include <linux/syscalls.h>
 35 #include <linux/anon_inodes.h>
 36 #include <linux/kernel_stat.h>
 37 #include <linux/cgroup.h>
 38 #include <linux/perf_event.h>
 39 #include <linux/trace_events.h>
 40 #include <linux/hw_breakpoint.h>
 41 #include <linux/mm_types.h>
 42 #include <linux/module.h>
 43 #include <linux/mman.h>
 44 #include <linux/compat.h>
 45 #include <linux/bpf.h>
 46 #include <linux/filter.h>
 47 #include <linux/namei.h>
 48 #include <linux/parser.h>
 49 
 50 #include "internal.h"
 51 
 52 #include <asm/irq_regs.h>
 53 
 54 typedef int (*remote_function_f)(void *);
 55 
 56 struct remote_function_call {
 57         struct task_struct      *p;
 58         remote_function_f       func;
 59         void                    *info;
 60         int                     ret;
 61 };
 62 
 63 static void remote_function(void *data)
 64 {
 65         struct remote_function_call *tfc = data;
 66         struct task_struct *p = tfc->p;
 67 
 68         if (p) {
 69                 /* -EAGAIN */
 70                 if (task_cpu(p) != smp_processor_id())
 71                         return;
 72 
 73                 /*
  74                  * Now that we're on the right CPU with IRQs disabled, we can test
 75                  * if we hit the right task without races.
 76                  */
 77 
 78                 tfc->ret = -ESRCH; /* No such (running) process */
 79                 if (p != current)
 80                         return;
 81         }
 82 
 83         tfc->ret = tfc->func(tfc->info);
 84 }
 85 
 86 /**
 87  * task_function_call - call a function on the cpu on which a task runs
 88  * @p:          the task to evaluate
 89  * @func:       the function to be called
 90  * @info:       the function call argument
 91  *
  92  * Calls the function @func while the task is currently running. This might
  93  * be on the current CPU, in which case the function is called directly.
 94  *
 95  * returns: @func return value, or
 96  *          -ESRCH  - when the process isn't running
 97  *          -EAGAIN - when the process moved away
 98  */
 99 static int
100 task_function_call(struct task_struct *p, remote_function_f func, void *info)
101 {
102         struct remote_function_call data = {
103                 .p      = p,
104                 .func   = func,
105                 .info   = info,
106                 .ret    = -EAGAIN,
107         };
108         int ret;
109 
110         do {
111                 ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
112                 if (!ret)
113                         ret = data.ret;
114         } while (ret == -EAGAIN);
115 
116         return ret;
117 }
118 
119 /**
120  * cpu_function_call - call a function on the cpu
121  * @func:       the function to be called
122  * @info:       the function call argument
123  *
124  * Calls the function @func on the remote cpu.
125  *
126  * returns: @func return value or -ENXIO when the cpu is offline
127  */
128 static int cpu_function_call(int cpu, remote_function_f func, void *info)
129 {
130         struct remote_function_call data = {
131                 .p      = NULL,
132                 .func   = func,
133                 .info   = info,
134                 .ret    = -ENXIO, /* No such CPU */
135         };
136 
137         smp_call_function_single(cpu, remote_function, &data, 1);
138 
139         return data.ret;
140 }
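/*
 * Added commentary (not part of the upstream file): event_function_call()
 * below picks between these two helpers -- cpu_function_call() when
 * ctx->task is NULL (a CPU context) and task_function_call() otherwise --
 * and task_function_call() keeps retrying while the target task migrates
 * between CPUs (-EAGAIN).
 */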
141 
142 static inline struct perf_cpu_context *
143 __get_cpu_context(struct perf_event_context *ctx)
144 {
145         return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
146 }
147 
148 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
149                           struct perf_event_context *ctx)
150 {
151         raw_spin_lock(&cpuctx->ctx.lock);
152         if (ctx)
153                 raw_spin_lock(&ctx->lock);
154 }
155 
156 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
157                             struct perf_event_context *ctx)
158 {
159         if (ctx)
160                 raw_spin_unlock(&ctx->lock);
161         raw_spin_unlock(&cpuctx->ctx.lock);
162 }
163 
164 #define TASK_TOMBSTONE ((void *)-1L)
165 
166 static bool is_kernel_event(struct perf_event *event)
167 {
168         return READ_ONCE(event->owner) == TASK_TOMBSTONE;
169 }
170 
171 /*
172  * On task ctx scheduling...
173  *
174  * When !ctx->nr_events a task context will not be scheduled. This means
175  * we can disable the scheduler hooks (for performance) without leaving
176  * pending task ctx state.
177  *
178  * This however results in two special cases:
179  *
 180  *  - removing the last event from a task ctx; this is relatively
 181  *    straightforward and is done in __perf_remove_from_context().
182  *
183  *  - adding the first event to a task ctx; this is tricky because we cannot
184  *    rely on ctx->is_active and therefore cannot use event_function_call().
185  *    See perf_install_in_context().
186  *
187  * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
188  */
189 
190 typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
191                         struct perf_event_context *, void *);
192 
193 struct event_function_struct {
194         struct perf_event *event;
195         event_f func;
196         void *data;
197 };
198 
199 static int event_function(void *info)
200 {
201         struct event_function_struct *efs = info;
202         struct perf_event *event = efs->event;
203         struct perf_event_context *ctx = event->ctx;
204         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
205         struct perf_event_context *task_ctx = cpuctx->task_ctx;
206         int ret = 0;
207 
208         WARN_ON_ONCE(!irqs_disabled());
209 
210         perf_ctx_lock(cpuctx, task_ctx);
211         /*
 212          * Since we do the IPI call without holding ctx->lock, things can have
 213          * changed; double-check that we hit the task we set out to hit.
214          */
215         if (ctx->task) {
216                 if (ctx->task != current) {
217                         ret = -ESRCH;
218                         goto unlock;
219                 }
220 
221                 /*
222                  * We only use event_function_call() on established contexts,
223                  * and event_function() is only ever called when active (or
224                  * rather, we'll have bailed in task_function_call() or the
225                  * above ctx->task != current test), therefore we must have
226                  * ctx->is_active here.
227                  */
228                 WARN_ON_ONCE(!ctx->is_active);
229                 /*
230                  * And since we have ctx->is_active, cpuctx->task_ctx must
231                  * match.
232                  */
233                 WARN_ON_ONCE(task_ctx != ctx);
234         } else {
235                 WARN_ON_ONCE(&cpuctx->ctx != ctx);
236         }
237 
238         efs->func(event, cpuctx, ctx, efs->data);
239 unlock:
240         perf_ctx_unlock(cpuctx, task_ctx);
241 
242         return ret;
243 }
244 
245 static void event_function_call(struct perf_event *event, event_f func, void *data)
246 {
247         struct perf_event_context *ctx = event->ctx;
248         struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
249         struct event_function_struct efs = {
250                 .event = event,
251                 .func = func,
252                 .data = data,
253         };
254 
255         if (!event->parent) {
256                 /*
257                  * If this is a !child event, we must hold ctx::mutex to
 258          * stabilize the event->ctx relation. See
259                  * perf_event_ctx_lock().
260                  */
261                 lockdep_assert_held(&ctx->mutex);
262         }
263 
264         if (!task) {
265                 cpu_function_call(event->cpu, event_function, &efs);
266                 return;
267         }
268 
269         if (task == TASK_TOMBSTONE)
270                 return;
271 
272 again:
273         if (!task_function_call(task, event_function, &efs))
274                 return;
275 
276         raw_spin_lock_irq(&ctx->lock);
277         /*
278          * Reload the task pointer, it might have been changed by
279          * a concurrent perf_event_context_sched_out().
280          */
281         task = ctx->task;
282         if (task == TASK_TOMBSTONE) {
283                 raw_spin_unlock_irq(&ctx->lock);
284                 return;
285         }
286         if (ctx->is_active) {
287                 raw_spin_unlock_irq(&ctx->lock);
288                 goto again;
289         }
290         func(event, NULL, ctx, data);
291         raw_spin_unlock_irq(&ctx->lock);
292 }
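/*
 * Added commentary (not part of the upstream file): rough control flow of
 * event_function_call() above:
 *
 *   - CPU context (no task): IPI the event's CPU via cpu_function_call().
 *   - task context: try task_function_call(); if the IPI missed (task not
 *     running or it moved), re-check under ctx->lock and, once the context
 *     is inactive, invoke func directly with cpuctx == NULL.
 */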
293 
294 /*
295  * Similar to event_function_call() + event_function(), but hard assumes IRQs
296  * are already disabled and we're on the right CPU.
297  */
298 static void event_function_local(struct perf_event *event, event_f func, void *data)
299 {
300         struct perf_event_context *ctx = event->ctx;
301         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
302         struct task_struct *task = READ_ONCE(ctx->task);
303         struct perf_event_context *task_ctx = NULL;
304 
305         WARN_ON_ONCE(!irqs_disabled());
306 
307         if (task) {
308                 if (task == TASK_TOMBSTONE)
309                         return;
310 
311                 task_ctx = ctx;
312         }
313 
314         perf_ctx_lock(cpuctx, task_ctx);
315 
316         task = ctx->task;
317         if (task == TASK_TOMBSTONE)
318                 goto unlock;
319 
320         if (task) {
321                 /*
322                  * We must be either inactive or active and the right task,
323                  * otherwise we're screwed, since we cannot IPI to somewhere
324                  * else.
325                  */
326                 if (ctx->is_active) {
327                         if (WARN_ON_ONCE(task != current))
328                                 goto unlock;
329 
330                         if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
331                                 goto unlock;
332                 }
333         } else {
334                 WARN_ON_ONCE(&cpuctx->ctx != ctx);
335         }
336 
337         func(event, cpuctx, ctx, data);
338 unlock:
339         perf_ctx_unlock(cpuctx, task_ctx);
340 }
341 
342 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
343                        PERF_FLAG_FD_OUTPUT  |\
344                        PERF_FLAG_PID_CGROUP |\
345                        PERF_FLAG_FD_CLOEXEC)
346 
347 /*
348  * branch priv levels that need permission checks
349  */
350 #define PERF_SAMPLE_BRANCH_PERM_PLM \
351         (PERF_SAMPLE_BRANCH_KERNEL |\
352          PERF_SAMPLE_BRANCH_HV)
353 
354 enum event_type_t {
355         EVENT_FLEXIBLE = 0x1,
356         EVENT_PINNED = 0x2,
357         EVENT_TIME = 0x4,
358         EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
359 };
360 
361 /*
362  * perf_sched_events : >0 events exist
363  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
364  */
365 
366 static void perf_sched_delayed(struct work_struct *work);
367 DEFINE_STATIC_KEY_FALSE(perf_sched_events);
368 static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
369 static DEFINE_MUTEX(perf_sched_mutex);
370 static atomic_t perf_sched_count;
371 
372 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
373 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
374 
375 static atomic_t nr_mmap_events __read_mostly;
376 static atomic_t nr_comm_events __read_mostly;
377 static atomic_t nr_task_events __read_mostly;
378 static atomic_t nr_freq_events __read_mostly;
379 static atomic_t nr_switch_events __read_mostly;
380 
381 static LIST_HEAD(pmus);
382 static DEFINE_MUTEX(pmus_lock);
383 static struct srcu_struct pmus_srcu;
384 
385 /*
386  * perf event paranoia level:
387  *  -1 - not paranoid at all
388  *   0 - disallow raw tracepoint access for unpriv
389  *   1 - disallow cpu events for unpriv
390  *   2 - disallow kernel profiling for unpriv
391  */
392 int sysctl_perf_event_paranoid __read_mostly = 2;
393 
394 /* Minimum for 512 kiB + 1 user control page */
395 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
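/*
 * Added note (not from the upstream file): with 4 KiB pages this evaluates
 * to 512 + 4 = 516 "free" KiB per user.
 */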
396 
397 /*
398  * max perf event sample rate
399  */
400 #define DEFAULT_MAX_SAMPLE_RATE         100000
401 #define DEFAULT_SAMPLE_PERIOD_NS        (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
402 #define DEFAULT_CPU_TIME_MAX_PERCENT    25
403 
404 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
405 
406 static int max_samples_per_tick __read_mostly   = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
407 static int perf_sample_period_ns __read_mostly  = DEFAULT_SAMPLE_PERIOD_NS;
408 
409 static int perf_sample_allowed_ns __read_mostly =
410         DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
411 
412 static void update_perf_cpu_limits(void)
413 {
414         u64 tmp = perf_sample_period_ns;
415 
416         tmp *= sysctl_perf_cpu_time_max_percent;
417         tmp = div_u64(tmp, 100);
418         if (!tmp)
419                 tmp = 1;
420 
421         WRITE_ONCE(perf_sample_allowed_ns, tmp);
422 }
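/*
 * Worked example (added commentary, not from the upstream file): with the
 * defaults above, perf_sample_period_ns = NSEC_PER_SEC / 100000 = 10000 ns
 * and sysctl_perf_cpu_time_max_percent = 25, so update_perf_cpu_limits()
 * sets perf_sample_allowed_ns = 10000 * 25 / 100 = 2500 ns per sample.
 */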
423 
424 static int perf_rotate_context(struct perf_cpu_context *cpuctx);
425 
426 int perf_proc_update_handler(struct ctl_table *table, int write,
427                 void __user *buffer, size_t *lenp,
428                 loff_t *ppos)
429 {
430         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
431 
432         if (ret || !write)
433                 return ret;
434 
435         max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
436         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
437         update_perf_cpu_limits();
438 
439         return 0;
440 }
441 
442 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
443 
444 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
445                                 void __user *buffer, size_t *lenp,
446                                 loff_t *ppos)
447 {
448         int ret = proc_dointvec(table, write, buffer, lenp, ppos);
449 
450         if (ret || !write)
451                 return ret;
452 
453         if (sysctl_perf_cpu_time_max_percent == 100 ||
454             sysctl_perf_cpu_time_max_percent == 0) {
455                 printk(KERN_WARNING
456                        "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
457                 WRITE_ONCE(perf_sample_allowed_ns, 0);
458         } else {
459                 update_perf_cpu_limits();
460         }
461 
462         return 0;
463 }
464 
465 /*
466  * perf samples are done in some very critical code paths (NMIs).
467  * If they take too much CPU time, the system can lock up and not
468  * get any real work done.  This will drop the sample rate when
469  * we detect that events are taking too long.
470  */
471 #define NR_ACCUMULATED_SAMPLES 128
472 static DEFINE_PER_CPU(u64, running_sample_length);
473 
474 static u64 __report_avg;
475 static u64 __report_allowed;
476 
477 static void perf_duration_warn(struct irq_work *w)
478 {
479         printk_ratelimited(KERN_WARNING
480                 "perf: interrupt took too long (%lld > %lld), lowering "
481                 "kernel.perf_event_max_sample_rate to %d\n",
482                 __report_avg, __report_allowed,
483                 sysctl_perf_event_sample_rate);
484 }
485 
486 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
487 
488 void perf_sample_event_took(u64 sample_len_ns)
489 {
490         u64 max_len = READ_ONCE(perf_sample_allowed_ns);
491         u64 running_len;
492         u64 avg_len;
493         u32 max;
494 
495         if (max_len == 0)
496                 return;
497 
498         /* Decay the counter by 1 average sample. */
499         running_len = __this_cpu_read(running_sample_length);
500         running_len -= running_len/NR_ACCUMULATED_SAMPLES;
501         running_len += sample_len_ns;
502         __this_cpu_write(running_sample_length, running_len);
503 
504         /*
 505          * Note: this will be biased artificially low until we have
506          * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
507          * from having to maintain a count.
508          */
509         avg_len = running_len/NR_ACCUMULATED_SAMPLES;
510         if (avg_len <= max_len)
511                 return;
512 
513         __report_avg = avg_len;
514         __report_allowed = max_len;
515 
516         /*
 517          * Compute a throttle threshold 25% above the current average duration.
518          */
519         avg_len += avg_len / 4;
520         max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
521         if (avg_len < max)
522                 max /= (u32)avg_len;
523         else
524                 max = 1;
525 
526         WRITE_ONCE(perf_sample_allowed_ns, avg_len);
527         WRITE_ONCE(max_samples_per_tick, max);
528 
529         sysctl_perf_event_sample_rate = max * HZ;
530         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
531 
532         if (!irq_work_queue(&perf_duration_work)) {
533                 early_printk("perf: interrupt took too long (%lld > %lld), lowering "
534                              "kernel.perf_event_max_sample_rate to %d\n",
535                              __report_avg, __report_allowed,
536                              sysctl_perf_event_sample_rate);
537         }
538 }
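/*
 * Worked example (added commentary, not from the upstream file), assuming
 * HZ = 1000 and the default 25% CPU budget: TICK_NSEC = 1000000 ns, so the
 * per-tick budget is (1000000 / 100) * 25 = 250000 ns.  If the decayed
 * average sample length reaches 4000 ns (> perf_sample_allowed_ns), it is
 * inflated to 5000 ns, max becomes 250000 / 5000 = 50 samples per tick,
 * and kernel.perf_event_max_sample_rate drops to 50 * HZ = 50000.
 */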
539 
540 static atomic64_t perf_event_id;
541 
542 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
543                               enum event_type_t event_type);
544 
545 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
546                              enum event_type_t event_type,
547                              struct task_struct *task);
548 
549 static void update_context_time(struct perf_event_context *ctx);
550 static u64 perf_event_time(struct perf_event *event);
551 
552 void __weak perf_event_print_debug(void)        { }
553 
554 extern __weak const char *perf_pmu_name(void)
555 {
556         return "pmu";
557 }
558 
559 static inline u64 perf_clock(void)
560 {
561         return local_clock();
562 }
563 
564 static inline u64 perf_event_clock(struct perf_event *event)
565 {
566         return event->clock();
567 }
568 
569 #ifdef CONFIG_CGROUP_PERF
570 
571 static inline bool
572 perf_cgroup_match(struct perf_event *event)
573 {
574         struct perf_event_context *ctx = event->ctx;
575         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
576 
577         /* @event doesn't care about cgroup */
578         if (!event->cgrp)
579                 return true;
580 
581         /* wants specific cgroup scope but @cpuctx isn't associated with any */
582         if (!cpuctx->cgrp)
583                 return false;
584 
585         /*
586          * Cgroup scoping is recursive.  An event enabled for a cgroup is
587          * also enabled for all its descendant cgroups.  If @cpuctx's
588          * cgroup is a descendant of @event's (the test covers identity
589          * case), it's a match.
590          */
591         return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
592                                     event->cgrp->css.cgroup);
593 }
594 
595 static inline void perf_detach_cgroup(struct perf_event *event)
596 {
597         css_put(&event->cgrp->css);
598         event->cgrp = NULL;
599 }
600 
601 static inline int is_cgroup_event(struct perf_event *event)
602 {
603         return event->cgrp != NULL;
604 }
605 
606 static inline u64 perf_cgroup_event_time(struct perf_event *event)
607 {
608         struct perf_cgroup_info *t;
609 
610         t = per_cpu_ptr(event->cgrp->info, event->cpu);
611         return t->time;
612 }
613 
614 static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
615 {
616         struct perf_cgroup_info *info;
617         u64 now;
618 
619         now = perf_clock();
620 
621         info = this_cpu_ptr(cgrp->info);
622 
623         info->time += now - info->timestamp;
624         info->timestamp = now;
625 }
626 
627 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
628 {
629         struct perf_cgroup *cgrp_out = cpuctx->cgrp;
630         if (cgrp_out)
631                 __update_cgrp_time(cgrp_out);
632 }
633 
634 static inline void update_cgrp_time_from_event(struct perf_event *event)
635 {
636         struct perf_cgroup *cgrp;
637 
638         /*
639          * ensure we access cgroup data only when needed and
640          * when we know the cgroup is pinned (css_get)
641          */
642         if (!is_cgroup_event(event))
643                 return;
644 
645         cgrp = perf_cgroup_from_task(current, event->ctx);
646         /*
647          * Do not update time when cgroup is not active
648          */
649         if (cgrp == event->cgrp)
650                 __update_cgrp_time(event->cgrp);
651 }
652 
653 static inline void
654 perf_cgroup_set_timestamp(struct task_struct *task,
655                           struct perf_event_context *ctx)
656 {
657         struct perf_cgroup *cgrp;
658         struct perf_cgroup_info *info;
659 
660         /*
661          * ctx->lock held by caller
662          * ensure we do not access cgroup data
663          * unless we have the cgroup pinned (css_get)
664          */
665         if (!task || !ctx->nr_cgroups)
666                 return;
667 
668         cgrp = perf_cgroup_from_task(task, ctx);
669         info = this_cpu_ptr(cgrp->info);
670         info->timestamp = ctx->timestamp;
671 }
672 
673 #define PERF_CGROUP_SWOUT       0x1 /* cgroup switch out every event */
674 #define PERF_CGROUP_SWIN        0x2 /* cgroup switch in events based on task */
675 
676 /*
677  * reschedule events based on the cgroup constraint of task.
678  *
679  * mode SWOUT : schedule out everything
680  * mode SWIN : schedule in based on cgroup for next
681  */
682 static void perf_cgroup_switch(struct task_struct *task, int mode)
683 {
684         struct perf_cpu_context *cpuctx;
685         struct pmu *pmu;
686         unsigned long flags;
687 
688         /*
 689          * disable interrupts to avoid racing with nr_cgroups
 690          * changes via __perf_event_disable(). This also
 691          * avoids preemption.
692          */
693         local_irq_save(flags);
694 
695         /*
696          * we reschedule only in the presence of cgroup
697          * constrained events.
698          */
699 
700         list_for_each_entry_rcu(pmu, &pmus, entry) {
701                 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
702                 if (cpuctx->unique_pmu != pmu)
703                         continue; /* ensure we process each cpuctx once */
704 
705                 /*
706                  * perf_cgroup_events says at least one
707                  * context on this CPU has cgroup events.
708                  *
709                  * ctx->nr_cgroups reports the number of cgroup
710                  * events for a context.
711                  */
712                 if (cpuctx->ctx.nr_cgroups > 0) {
713                         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
714                         perf_pmu_disable(cpuctx->ctx.pmu);
715 
716                         if (mode & PERF_CGROUP_SWOUT) {
717                                 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
718                                 /*
719                                  * must not be done before ctxswout due
720                                  * to event_filter_match() in event_sched_out()
721                                  */
722                                 cpuctx->cgrp = NULL;
723                         }
724 
725                         if (mode & PERF_CGROUP_SWIN) {
726                                 WARN_ON_ONCE(cpuctx->cgrp);
727                                 /*
728                                  * set cgrp before ctxsw in to allow
729                                  * event_filter_match() to not have to pass
730                                  * task around
731                                  * we pass the cpuctx->ctx to perf_cgroup_from_task()
 732                  * because cgroup events are only per-cpu
733                                  */
734                                 cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx);
735                                 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
736                         }
737                         perf_pmu_enable(cpuctx->ctx.pmu);
738                         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
739                 }
740         }
741 
742         local_irq_restore(flags);
743 }
744 
745 static inline void perf_cgroup_sched_out(struct task_struct *task,
746                                          struct task_struct *next)
747 {
748         struct perf_cgroup *cgrp1;
749         struct perf_cgroup *cgrp2 = NULL;
750 
751         rcu_read_lock();
752         /*
753          * we come here when we know perf_cgroup_events > 0
754          * we do not need to pass the ctx here because we know
755          * we are holding the rcu lock
756          */
757         cgrp1 = perf_cgroup_from_task(task, NULL);
758         cgrp2 = perf_cgroup_from_task(next, NULL);
759 
760         /*
761          * only schedule out current cgroup events if we know
762          * that we are switching to a different cgroup. Otherwise,
 763          * do not touch the cgroup events.
764          */
765         if (cgrp1 != cgrp2)
766                 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
767 
768         rcu_read_unlock();
769 }
770 
771 static inline void perf_cgroup_sched_in(struct task_struct *prev,
772                                         struct task_struct *task)
773 {
774         struct perf_cgroup *cgrp1;
775         struct perf_cgroup *cgrp2 = NULL;
776 
777         rcu_read_lock();
778         /*
779          * we come here when we know perf_cgroup_events > 0
780          * we do not need to pass the ctx here because we know
781          * we are holding the rcu lock
782          */
783         cgrp1 = perf_cgroup_from_task(task, NULL);
784         cgrp2 = perf_cgroup_from_task(prev, NULL);
785 
786         /*
787          * only need to schedule in cgroup events if we are changing
788          * cgroup during ctxsw. Cgroup events were not scheduled
 789          * out during ctxsw if that was not the case.
790          */
791         if (cgrp1 != cgrp2)
792                 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
793 
794         rcu_read_unlock();
795 }
796 
797 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
798                                       struct perf_event_attr *attr,
799                                       struct perf_event *group_leader)
800 {
801         struct perf_cgroup *cgrp;
802         struct cgroup_subsys_state *css;
803         struct fd f = fdget(fd);
804         int ret = 0;
805 
806         if (!f.file)
807                 return -EBADF;
808 
809         css = css_tryget_online_from_dir(f.file->f_path.dentry,
810                                          &perf_event_cgrp_subsys);
811         if (IS_ERR(css)) {
812                 ret = PTR_ERR(css);
813                 goto out;
814         }
815 
816         cgrp = container_of(css, struct perf_cgroup, css);
817         event->cgrp = cgrp;
818 
819         /*
820          * all events in a group must monitor
821          * the same cgroup because a task belongs
822          * to only one perf cgroup at a time
823          */
824         if (group_leader && group_leader->cgrp != cgrp) {
825                 perf_detach_cgroup(event);
826                 ret = -EINVAL;
827         }
828 out:
829         fdput(f);
830         return ret;
831 }
832 
833 static inline void
834 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
835 {
836         struct perf_cgroup_info *t;
837         t = per_cpu_ptr(event->cgrp->info, event->cpu);
838         event->shadow_ctx_time = now - t->timestamp;
839 }
840 
841 static inline void
842 perf_cgroup_defer_enabled(struct perf_event *event)
843 {
844         /*
845          * when the current task's perf cgroup does not match
846          * the event's, we need to remember to call the
 847          * perf_cgroup_mark_enabled() function the first time a task with
848          * a matching perf cgroup is scheduled in.
849          */
850         if (is_cgroup_event(event) && !perf_cgroup_match(event))
851                 event->cgrp_defer_enabled = 1;
852 }
853 
854 static inline void
855 perf_cgroup_mark_enabled(struct perf_event *event,
856                          struct perf_event_context *ctx)
857 {
858         struct perf_event *sub;
859         u64 tstamp = perf_event_time(event);
860 
861         if (!event->cgrp_defer_enabled)
862                 return;
863 
864         event->cgrp_defer_enabled = 0;
865 
866         event->tstamp_enabled = tstamp - event->total_time_enabled;
867         list_for_each_entry(sub, &event->sibling_list, group_entry) {
868                 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
869                         sub->tstamp_enabled = tstamp - sub->total_time_enabled;
870                         sub->cgrp_defer_enabled = 0;
871                 }
872         }
873 }
874 #else /* !CONFIG_CGROUP_PERF */
875 
876 static inline bool
877 perf_cgroup_match(struct perf_event *event)
878 {
879         return true;
880 }
881 
882 static inline void perf_detach_cgroup(struct perf_event *event)
883 {}
884 
885 static inline int is_cgroup_event(struct perf_event *event)
886 {
887         return 0;
888 }
889 
890 static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
891 {
892         return 0;
893 }
894 
895 static inline void update_cgrp_time_from_event(struct perf_event *event)
896 {
897 }
898 
899 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
900 {
901 }
902 
903 static inline void perf_cgroup_sched_out(struct task_struct *task,
904                                          struct task_struct *next)
905 {
906 }
907 
908 static inline void perf_cgroup_sched_in(struct task_struct *prev,
909                                         struct task_struct *task)
910 {
911 }
912 
913 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
914                                       struct perf_event_attr *attr,
915                                       struct perf_event *group_leader)
916 {
917         return -EINVAL;
918 }
919 
920 static inline void
921 perf_cgroup_set_timestamp(struct task_struct *task,
922                           struct perf_event_context *ctx)
923 {
924 }
925 
926 void
927 perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
928 {
929 }
930 
931 static inline void
932 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
933 {
934 }
935 
936 static inline u64 perf_cgroup_event_time(struct perf_event *event)
937 {
938         return 0;
939 }
940 
941 static inline void
942 perf_cgroup_defer_enabled(struct perf_event *event)
943 {
944 }
945 
946 static inline void
947 perf_cgroup_mark_enabled(struct perf_event *event,
948                          struct perf_event_context *ctx)
949 {
950 }
951 #endif
952 
953 /*
 954  * set the default to be dependent on the timer tick, just
 955  * like the original code
956  */
957 #define PERF_CPU_HRTIMER (1000 / HZ)
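/*
 * Added note (not from the upstream file): PERF_CPU_HRTIMER is in
 * milliseconds, so e.g. HZ = 250 gives a 4 ms default multiplexing
 * interval; __perf_mux_hrtimer_init() below falls back to it whenever
 * pmu->hrtimer_interval_ms is unset.
 */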
958 /*
 959  * function must be called with interrupts disabled
960  */
961 static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
962 {
963         struct perf_cpu_context *cpuctx;
964         int rotations = 0;
965 
966         WARN_ON(!irqs_disabled());
967 
968         cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
969         rotations = perf_rotate_context(cpuctx);
970 
971         raw_spin_lock(&cpuctx->hrtimer_lock);
972         if (rotations)
973                 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
974         else
975                 cpuctx->hrtimer_active = 0;
976         raw_spin_unlock(&cpuctx->hrtimer_lock);
977 
978         return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
979 }
980 
981 static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
982 {
983         struct hrtimer *timer = &cpuctx->hrtimer;
984         struct pmu *pmu = cpuctx->ctx.pmu;
985         u64 interval;
986 
987         /* no multiplexing needed for SW PMU */
988         if (pmu->task_ctx_nr == perf_sw_context)
989                 return;
990 
991         /*
 992          * check that the default is sane; if not set, fall back to the
 993          * default interval (1/tick)
994          */
995         interval = pmu->hrtimer_interval_ms;
996         if (interval < 1)
997                 interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
998 
999         cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
1000 
1001         raw_spin_lock_init(&cpuctx->hrtimer_lock);
1002         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1003         timer->function = perf_mux_hrtimer_handler;
1004 }
1005 
1006 static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
1007 {
1008         struct hrtimer *timer = &cpuctx->hrtimer;
1009         struct pmu *pmu = cpuctx->ctx.pmu;
1010         unsigned long flags;
1011 
1012         /* not for SW PMU */
1013         if (pmu->task_ctx_nr == perf_sw_context)
1014                 return 0;
1015 
1016         raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
1017         if (!cpuctx->hrtimer_active) {
1018                 cpuctx->hrtimer_active = 1;
1019                 hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1020                 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
1021         }
1022         raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
1023 
1024         return 0;
1025 }
1026 
1027 void perf_pmu_disable(struct pmu *pmu)
1028 {
1029         int *count = this_cpu_ptr(pmu->pmu_disable_count);
1030         if (!(*count)++)
1031                 pmu->pmu_disable(pmu);
1032 }
1033 
1034 void perf_pmu_enable(struct pmu *pmu)
1035 {
1036         int *count = this_cpu_ptr(pmu->pmu_disable_count);
1037         if (!--(*count))
1038                 pmu->pmu_enable(pmu);
1039 }
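/*
 * Illustrative usage sketch (added commentary, not from the upstream file):
 * the per-cpu pmu_disable_count makes disable/enable nestable; only the
 * outermost pair reaches the PMU callbacks:
 *
 *   perf_pmu_disable(pmu);     // count 0 -> 1: pmu->pmu_disable() runs
 *     perf_pmu_disable(pmu);   // count 1 -> 2: no callback
 *     perf_pmu_enable(pmu);    // count 2 -> 1: no callback
 *   perf_pmu_enable(pmu);      // count 1 -> 0: pmu->pmu_enable() runs
 */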
1040 
1041 static DEFINE_PER_CPU(struct list_head, active_ctx_list);
1042 
1043 /*
1044  * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
1045  * perf_event_task_tick() are fully serialized because they're strictly cpu
1046  * affine and perf_event_ctx{activate,deactivate} are called with IRQs
1047  * disabled, while perf_event_task_tick is called from IRQ context.
1048  */
1049 static void perf_event_ctx_activate(struct perf_event_context *ctx)
1050 {
1051         struct list_head *head = this_cpu_ptr(&active_ctx_list);
1052 
1053         WARN_ON(!irqs_disabled());
1054 
1055         WARN_ON(!list_empty(&ctx->active_ctx_list));
1056 
1057         list_add(&ctx->active_ctx_list, head);
1058 }
1059 
1060 static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
1061 {
1062         WARN_ON(!irqs_disabled());
1063 
1064         WARN_ON(list_empty(&ctx->active_ctx_list));
1065 
1066         list_del_init(&ctx->active_ctx_list);
1067 }
1068 
1069 static void get_ctx(struct perf_event_context *ctx)
1070 {
1071         WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
1072 }
1073 
1074 static void free_ctx(struct rcu_head *head)
1075 {
1076         struct perf_event_context *ctx;
1077 
1078         ctx = container_of(head, struct perf_event_context, rcu_head);
1079         kfree(ctx->task_ctx_data);
1080         kfree(ctx);
1081 }
1082 
1083 static void put_ctx(struct perf_event_context *ctx)
1084 {
1085         if (atomic_dec_and_test(&ctx->refcount)) {
1086                 if (ctx->parent_ctx)
1087                         put_ctx(ctx->parent_ctx);
1088                 if (ctx->task && ctx->task != TASK_TOMBSTONE)
1089                         put_task_struct(ctx->task);
1090                 call_rcu(&ctx->rcu_head, free_ctx);
1091         }
1092 }
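/*
 * Added commentary (not from the upstream file): get_ctx() uses
 * atomic_inc_not_zero() so a context whose refcount has already hit zero
 * (and is on its way to free_ctx() via RCU) can never be resurrected;
 * put_ctx() drops the parent and task references before deferring the
 * actual kfree() to an RCU grace period.
 */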
1093 
1094 /*
1095  * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
1096  * perf_pmu_migrate_context() we need some magic.
1097  *
1098  * Those places that change perf_event::ctx will hold both
1099  * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
1100  *
1101  * Lock ordering is by mutex address. There are two other sites where
1102  * perf_event_context::mutex nests and those are:
1103  *
1104  *  - perf_event_exit_task_context()    [ child , 0 ]
1105  *      perf_event_exit_event()
1106  *        put_event()                   [ parent, 1 ]
1107  *
1108  *  - perf_event_init_context()         [ parent, 0 ]
1109  *      inherit_task_group()
1110  *        inherit_group()
1111  *          inherit_event()
1112  *            perf_event_alloc()
1113  *              perf_init_event()
1114  *                perf_try_init_event() [ child , 1 ]
1115  *
 1116  * While it appears there is an obvious deadlock here -- the parent and child
 1117  * nesting levels are inverted between the two -- this is in fact safe because
 1118  * life-time rules separate them. That is, an exiting task cannot fork, and a
 1119  * spawning task cannot (yet) exit.
1120  *
 1121  * But remember that these are parent<->child context relations, and
1122  * migration does not affect children, therefore these two orderings should not
1123  * interact.
1124  *
1125  * The change in perf_event::ctx does not affect children (as claimed above)
1126  * because the sys_perf_event_open() case will install a new event and break
1127  * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
1128  * concerned with cpuctx and that doesn't have children.
1129  *
1130  * The places that change perf_event::ctx will issue:
1131  *
1132  *   perf_remove_from_context();
1133  *   synchronize_rcu();
1134  *   perf_install_in_context();
1135  *
 1136  * to effect the change. The remove_from_context() + synchronize_rcu() should
1137  * quiesce the event, after which we can install it in the new location. This
1138  * means that only external vectors (perf_fops, prctl) can perturb the event
1139  * while in transit. Therefore all such accessors should also acquire
1140  * perf_event_context::mutex to serialize against this.
1141  *
1142  * However; because event->ctx can change while we're waiting to acquire
1143  * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
1144  * function.
1145  *
1146  * Lock order:
1147  *    cred_guard_mutex
1148  *      task_struct::perf_event_mutex
1149  *        perf_event_context::mutex
1150  *          perf_event::child_mutex;
1151  *            perf_event_context::lock
1152  *          perf_event::mmap_mutex
1153  *          mmap_sem
1154  */
1155 static struct perf_event_context *
1156 perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
1157 {
1158         struct perf_event_context *ctx;
1159 
1160 again:
1161         rcu_read_lock();
1162         ctx = ACCESS_ONCE(event->ctx);
1163         if (!atomic_inc_not_zero(&ctx->refcount)) {
1164                 rcu_read_unlock();
1165                 goto again;
1166         }
1167         rcu_read_unlock();
1168 
1169         mutex_lock_nested(&ctx->mutex, nesting);
1170         if (event->ctx != ctx) {
1171                 mutex_unlock(&ctx->mutex);
1172                 put_ctx(ctx);
1173                 goto again;
1174         }
1175 
1176         return ctx;
1177 }
1178 
1179 static inline struct perf_event_context *
1180 perf_event_ctx_lock(struct perf_event *event)
1181 {
1182         return perf_event_ctx_lock_nested(event, 0);
1183 }
1184 
1185 static void perf_event_ctx_unlock(struct perf_event *event,
1186                                   struct perf_event_context *ctx)
1187 {
1188         mutex_unlock(&ctx->mutex);
1189         put_ctx(ctx);
1190 }
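/*
 * Illustrative usage sketch (added commentary, not from the upstream file):
 *
 *   struct perf_event_context *ctx = perf_event_ctx_lock(event);
 *   ... event->ctx is now stable and ctx->mutex is held ...
 *   perf_event_ctx_unlock(event, ctx);
 *
 * The retry loop in perf_event_ctx_lock_nested() copes with event->ctx
 * changing between the refcount grab and the mutex acquisition.
 */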
1191 
1192 /*
 1193  * This must be done under the ctx->lock, so as to serialize against
1194  * context_equiv(), therefore we cannot call put_ctx() since that might end up
1195  * calling scheduler related locks and ctx->lock nests inside those.
1196  */
1197 static __must_check struct perf_event_context *
1198 unclone_ctx(struct perf_event_context *ctx)
1199 {
1200         struct perf_event_context *parent_ctx = ctx->parent_ctx;
1201 
1202         lockdep_assert_held(&ctx->lock);
1203 
1204         if (parent_ctx)
1205                 ctx->parent_ctx = NULL;
1206         ctx->generation++;
1207 
1208         return parent_ctx;
1209 }
1210 
1211 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1212 {
1213         /*
1214          * only top level events have the pid namespace they were created in
1215          */
1216         if (event->parent)
1217                 event = event->parent;
1218 
1219         return task_tgid_nr_ns(p, event->ns);
1220 }
1221 
1222 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1223 {
1224         /*
1225          * only top level events have the pid namespace they were created in
1226          */
1227         if (event->parent)
1228                 event = event->parent;
1229 
1230         return task_pid_nr_ns(p, event->ns);
1231 }
1232 
1233 /*
1234  * If we inherit events we want to return the parent event id
1235  * to userspace.
1236  */
1237 static u64 primary_event_id(struct perf_event *event)
1238 {
1239         u64 id = event->id;
1240 
1241         if (event->parent)
1242                 id = event->parent->id;
1243 
1244         return id;
1245 }
1246 
1247 /*
1248  * Get the perf_event_context for a task and lock it.
1249  *
 1250  * This has to cope with the fact that until it is locked,
1251  * the context could get moved to another task.
1252  */
1253 static struct perf_event_context *
1254 perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1255 {
1256         struct perf_event_context *ctx;
1257 
1258 retry:
1259         /*
1260          * One of the few rules of preemptible RCU is that one cannot do
1261          * rcu_read_unlock() while holding a scheduler (or nested) lock when
1262          * part of the read side critical section was irqs-enabled -- see
1263          * rcu_read_unlock_special().
1264          *
1265          * Since ctx->lock nests under rq->lock we must ensure the entire read
1266          * side critical section has interrupts disabled.
1267          */
1268         local_irq_save(*flags);
1269         rcu_read_lock();
1270         ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1271         if (ctx) {
1272                 /*
1273                  * If this context is a clone of another, it might
1274                  * get swapped for another underneath us by
1275                  * perf_event_task_sched_out, though the
1276                  * rcu_read_lock() protects us from any context
1277                  * getting freed.  Lock the context and check if it
1278                  * got swapped before we could get the lock, and retry
1279                  * if so.  If we locked the right context, then it
1280                  * can't get swapped on us any more.
1281                  */
1282                 raw_spin_lock(&ctx->lock);
1283                 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1284                         raw_spin_unlock(&ctx->lock);
1285                         rcu_read_unlock();
1286                         local_irq_restore(*flags);
1287                         goto retry;
1288                 }
1289 
1290                 if (ctx->task == TASK_TOMBSTONE ||
1291                     !atomic_inc_not_zero(&ctx->refcount)) {
1292                         raw_spin_unlock(&ctx->lock);
1293                         ctx = NULL;
1294                 } else {
1295                         WARN_ON_ONCE(ctx->task != task);
1296                 }
1297         }
1298         rcu_read_unlock();
1299         if (!ctx)
1300                 local_irq_restore(*flags);
1301         return ctx;
1302 }
1303 
1304 /*
1305  * Get the context for a task and increment its pin_count so it
1306  * can't get swapped to another task.  This also increments its
1307  * reference count so that the context can't get freed.
1308  */
1309 static struct perf_event_context *
1310 perf_pin_task_context(struct task_struct *task, int ctxn)
1311 {
1312         struct perf_event_context *ctx;
1313         unsigned long flags;
1314 
1315         ctx = perf_lock_task_context(task, ctxn, &flags);
1316         if (ctx) {
1317                 ++ctx->pin_count;
1318                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1319         }
1320         return ctx;
1321 }
1322 
1323 static void perf_unpin_context(struct perf_event_context *ctx)
1324 {
1325         unsigned long flags;
1326 
1327         raw_spin_lock_irqsave(&ctx->lock, flags);
1328         --ctx->pin_count;
1329         raw_spin_unlock_irqrestore(&ctx->lock, flags);
1330 }
1331 
1332 /*
1333  * Update the record of the current time in a context.
1334  */
1335 static void update_context_time(struct perf_event_context *ctx)
1336 {
1337         u64 now = perf_clock();
1338 
1339         ctx->time += now - ctx->timestamp;
1340         ctx->timestamp = now;
1341 }
1342 
1343 static u64 perf_event_time(struct perf_event *event)
1344 {
1345         struct perf_event_context *ctx = event->ctx;
1346 
1347         if (is_cgroup_event(event))
1348                 return perf_cgroup_event_time(event);
1349 
1350         return ctx ? ctx->time : 0;
1351 }
1352 
1353 /*
 1354  * Update the total_time_enabled and total_time_running fields for an event.
1355  */
1356 static void update_event_times(struct perf_event *event)
1357 {
1358         struct perf_event_context *ctx = event->ctx;
1359         u64 run_end;
1360 
1361         lockdep_assert_held(&ctx->lock);
1362 
1363         if (event->state < PERF_EVENT_STATE_INACTIVE ||
1364             event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
1365                 return;
1366 
1367         /*
1368          * in cgroup mode, time_enabled represents
1369          * the time the event was enabled AND active
1370          * tasks were in the monitored cgroup. This is
1371          * independent of the activity of the context as
1372          * there may be a mix of cgroup and non-cgroup events.
1373          *
1374          * That is why we treat cgroup events differently
1375          * here.
1376          */
1377         if (is_cgroup_event(event))
1378                 run_end = perf_cgroup_event_time(event);
1379         else if (ctx->is_active)
1380                 run_end = ctx->time;
1381         else
1382                 run_end = event->tstamp_stopped;
1383 
1384         event->total_time_enabled = run_end - event->tstamp_enabled;
1385 
1386         if (event->state == PERF_EVENT_STATE_INACTIVE)
1387                 run_end = event->tstamp_stopped;
1388         else
1389                 run_end = perf_event_time(event);
1390 
1391         event->total_time_running = run_end - event->tstamp_running;
1392 
1393 }
1394 
1395 /*
1396  * Update total_time_enabled and total_time_running for all events in a group.
1397  */
1398 static void update_group_times(struct perf_event *leader)
1399 {
1400         struct perf_event *event;
1401 
1402         update_event_times(leader);
1403         list_for_each_entry(event, &leader->sibling_list, group_entry)
1404                 update_event_times(event);
1405 }
1406 
1407 static struct list_head *
1408 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
1409 {
1410         if (event->attr.pinned)
1411                 return &ctx->pinned_groups;
1412         else
1413                 return &ctx->flexible_groups;
1414 }
1415 
1416 /*
 1417  * Add an event to the lists for its context.
1418  * Must be called with ctx->mutex and ctx->lock held.
1419  */
1420 static void
1421 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1422 {
1423         lockdep_assert_held(&ctx->lock);
1424 
1425         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1426         event->attach_state |= PERF_ATTACH_CONTEXT;
1427 
1428         /*
 1429          * If we're a standalone event or group leader, we go to the context
 1430          * list. Group events are kept attached to the group so that
 1431          * perf_group_detach() can, at all times, locate all siblings.
1432          */
1433         if (event->group_leader == event) {
1434                 struct list_head *list;
1435 
1436                 if (is_software_event(event))
1437                         event->group_flags |= PERF_GROUP_SOFTWARE;
1438 
1439                 list = ctx_group_list(event, ctx);
1440                 list_add_tail(&event->group_entry, list);
1441         }
1442 
1443         if (is_cgroup_event(event))
1444                 ctx->nr_cgroups++;
1445 
1446         list_add_rcu(&event->event_entry, &ctx->event_list);
1447         ctx->nr_events++;
1448         if (event->attr.inherit_stat)
1449                 ctx->nr_stat++;
1450 
1451         ctx->generation++;
1452 }
1453 
1454 /*
1455  * Initialize event state based on the perf_event_attr::disabled.
1456  */
1457 static inline void perf_event__state_init(struct perf_event *event)
1458 {
1459         event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1460                                               PERF_EVENT_STATE_INACTIVE;
1461 }
1462 
1463 static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1464 {
1465         int entry = sizeof(u64); /* value */
1466         int size = 0;
1467         int nr = 1;
1468 
1469         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1470                 size += sizeof(u64);
1471 
1472         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1473                 size += sizeof(u64);
1474 
1475         if (event->attr.read_format & PERF_FORMAT_ID)
1476                 entry += sizeof(u64);
1477 
1478         if (event->attr.read_format & PERF_FORMAT_GROUP) {
1479                 nr += nr_siblings;
1480                 size += sizeof(u64);
1481         }
1482 
1483         size += entry * nr;
1484         event->read_size = size;
1485 }
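/*
 * Worked example (added commentary, not from the upstream file): for
 * read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_ID and no
 * siblings: entry = 16 (value + id), size = 8 (time_enabled), nr = 1,
 * so event->read_size = 8 + 16 * 1 = 24 bytes.
 */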
1486 
1487 static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1488 {
1489         struct perf_sample_data *data;
1490         u16 size = 0;
1491 
1492         if (sample_type & PERF_SAMPLE_IP)
1493                 size += sizeof(data->ip);
1494 
1495         if (sample_type & PERF_SAMPLE_ADDR)
1496                 size += sizeof(data->addr);
1497 
1498         if (sample_type & PERF_SAMPLE_PERIOD)
1499                 size += sizeof(data->period);
1500 
1501         if (sample_type & PERF_SAMPLE_WEIGHT)
1502                 size += sizeof(data->weight);
1503 
1504         if (sample_type & PERF_SAMPLE_READ)
1505                 size += event->read_size;
1506 
1507         if (sample_type & PERF_SAMPLE_DATA_SRC)
1508                 size += sizeof(data->data_src.val);
1509 
1510         if (sample_type & PERF_SAMPLE_TRANSACTION)
1511                 size += sizeof(data->txn);
1512 
1513         event->header_size = size;
1514 }
1515 
1516 /*
1517  * Called at perf_event creation and when events are attached/detached from a
1518  * group.
1519  */
1520 static void perf_event__header_size(struct perf_event *event)
1521 {
1522         __perf_event_read_size(event,
1523                                event->group_leader->nr_siblings);
1524         __perf_event_header_size(event, event->attr.sample_type);
1525 }
1526 
1527 static void perf_event__id_header_size(struct perf_event *event)
1528 {
1529         struct perf_sample_data *data;
1530         u64 sample_type = event->attr.sample_type;
1531         u16 size = 0;
1532 
1533         if (sample_type & PERF_SAMPLE_TID)
1534                 size += sizeof(data->tid_entry);
1535 
1536         if (sample_type & PERF_SAMPLE_TIME)
1537                 size += sizeof(data->time);
1538 
1539         if (sample_type & PERF_SAMPLE_IDENTIFIER)
1540                 size += sizeof(data->id);
1541 
1542         if (sample_type & PERF_SAMPLE_ID)
1543                 size += sizeof(data->id);
1544 
1545         if (sample_type & PERF_SAMPLE_STREAM_ID)
1546                 size += sizeof(data->stream_id);
1547 
1548         if (sample_type & PERF_SAMPLE_CPU)
1549                 size += sizeof(data->cpu_entry);
1550 
1551         event->id_header_size = size;
1552 }
1553 
1554 static bool perf_event_validate_size(struct perf_event *event)
1555 {
1556         /*
1557          * The values computed here will be over-written when we actually
1558          * attach the event.
1559          */
1560         __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1561         __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1562         perf_event__id_header_size(event);
1563 
1564         /*
1565          * Sum the lot; should not exceed the 64k limit we have on records.
1566          * Conservative limit to allow for callchains and other variable fields.
1567          */
1568         if (event->read_size + event->header_size +
1569             event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1570                 return false;
1571 
1572         return true;
1573 }
1574 
1575 static void perf_group_attach(struct perf_event *event)
1576 {
1577         struct perf_event *group_leader = event->group_leader, *pos;
1578 
1579         /*
1580          * We can have double attach due to group movement in perf_event_open.
1581          */
1582         if (event->attach_state & PERF_ATTACH_GROUP)
1583                 return;
1584 
1585         event->attach_state |= PERF_ATTACH_GROUP;
1586 
1587         if (group_leader == event)
1588                 return;
1589 
1590         WARN_ON_ONCE(group_leader->ctx != event->ctx);
1591 
1592         if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
1593                         !is_software_event(event))
1594                 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
1595 
1596         list_add_tail(&event->group_entry, &group_leader->sibling_list);
1597         group_leader->nr_siblings++;
1598 
1599         perf_event__header_size(group_leader);
1600 
1601         list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
1602                 perf_event__header_size(pos);
1603 }
1604 
1605 /*
1606  * Remove an event from the lists for its context.
1607  * Must be called with ctx->mutex and ctx->lock held.
1608  */
1609 static void
1610 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1611 {
1612         struct perf_cpu_context *cpuctx;
1613 
1614         WARN_ON_ONCE(event->ctx != ctx);
1615         lockdep_assert_held(&ctx->lock);
1616 
1617         /*
1618          * We can have double detach due to exit/hot-unplug + close.
1619          */
1620         if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1621                 return;
1622 
1623         event->attach_state &= ~PERF_ATTACH_CONTEXT;
1624 
1625         if (is_cgroup_event(event)) {
1626                 ctx->nr_cgroups--;
1627                 /*
1628                  * Because cgroup events are always per-cpu events, this will
1629                  * always be called from the right CPU.
1630                  */
1631                 cpuctx = __get_cpu_context(ctx);
1632                 /*
1633                  * If there are no more cgroup events then clear cgrp to avoid
1634                  * stale pointer in update_cgrp_time_from_cpuctx().
1635                  */
1636                 if (!ctx->nr_cgroups)
1637                         cpuctx->cgrp = NULL;
1638         }
1639 
1640         ctx->nr_events--;
1641         if (event->attr.inherit_stat)
1642                 ctx->nr_stat--;
1643 
1644         list_del_rcu(&event->event_entry);
1645 
1646         if (event->group_leader == event)
1647                 list_del_init(&event->group_entry);
1648 
1649         update_group_times(event);
1650 
1651         /*
1652          * If the event was in error state, keep it
1653          * that way; otherwise bogus counts will be
1654          * returned on read(). The only way to get out
1655          * of error state is by explicitly re-enabling
1656          * the event.
1657          */
1658         if (event->state > PERF_EVENT_STATE_OFF)
1659                 event->state = PERF_EVENT_STATE_OFF;
1660 
1661         ctx->generation++;
1662 }
1663 
1664 static void perf_group_detach(struct perf_event *event)
1665 {
1666         struct perf_event *sibling, *tmp;
1667         struct list_head *list = NULL;
1668 
1669         /*
1670          * We can have double detach due to exit/hot-unplug + close.
1671          */
1672         if (!(event->attach_state & PERF_ATTACH_GROUP))
1673                 return;
1674 
1675         event->attach_state &= ~PERF_ATTACH_GROUP;
1676 
1677         /*
1678          * If this is a sibling, remove it from its group.
1679          */
1680         if (event->group_leader != event) {
1681                 list_del_init(&event->group_entry);
1682                 event->group_leader->nr_siblings--;
1683                 goto out;
1684         }
1685 
1686         if (!list_empty(&event->group_entry))
1687                 list = &event->group_entry;
1688 
1689         /*
1690          * If this was a group event with sibling events then
1691          * upgrade the siblings to singleton events by adding them
1692          * to whatever list we are on.
1693          */
1694         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
1695                 if (list)
1696                         list_move_tail(&sibling->group_entry, list);
1697                 sibling->group_leader = sibling;
1698 
1699                 /* Inherit group flags from the previous leader */
1700                 sibling->group_flags = event->group_flags;
1701 
1702                 WARN_ON_ONCE(sibling->ctx != event->ctx);
1703         }
1704 
1705 out:
1706         perf_event__header_size(event->group_leader);
1707 
1708         list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1709                 perf_event__header_size(tmp);
1710 }
1711 
1712 static bool is_orphaned_event(struct perf_event *event)
1713 {
1714         return event->state == PERF_EVENT_STATE_DEAD;
1715 }
1716 
1717 static inline int __pmu_filter_match(struct perf_event *event)
1718 {
1719         struct pmu *pmu = event->pmu;
1720         return pmu->filter_match ? pmu->filter_match(event) : 1;
1721 }
1722 
1723 /*
1724  * Check whether we should attempt to schedule an event group based on
1725  * PMU-specific filtering. An event group can consist of HW and SW events,
1726  * potentially with a SW leader, so we must check all the filters to
1727  * determine whether a group is schedulable:
1728  */
1729 static inline int pmu_filter_match(struct perf_event *event)
1730 {
1731         struct perf_event *child;
1732 
1733         if (!__pmu_filter_match(event))
1734                 return 0;
1735 
1736         list_for_each_entry(child, &event->sibling_list, group_entry) {
1737                 if (!__pmu_filter_match(child))
1738                         return 0;
1739         }
1740 
1741         return 1;
1742 }
1743 
1744 static inline int
1745 event_filter_match(struct perf_event *event)
1746 {
1747         return (event->cpu == -1 || event->cpu == smp_processor_id())
1748             && perf_cgroup_match(event) && pmu_filter_match(event);
1749 }
1750 
1751 static void
1752 event_sched_out(struct perf_event *event,
1753                   struct perf_cpu_context *cpuctx,
1754                   struct perf_event_context *ctx)
1755 {
1756         u64 tstamp = perf_event_time(event);
1757         u64 delta;
1758 
1759         WARN_ON_ONCE(event->ctx != ctx);
1760         lockdep_assert_held(&ctx->lock);
1761 
1762         /*
1763          * An event which could not be activated because of
1764          * filter mismatch still needs to have its timings
1765          * maintained, otherwise bogus information is returned
1766          * via read() for time_enabled and time_running.
1767          */
1768         if (event->state == PERF_EVENT_STATE_INACTIVE
1769             && !event_filter_match(event)) {
1770                 delta = tstamp - event->tstamp_stopped;
1771                 event->tstamp_running += delta;
1772                 event->tstamp_stopped = tstamp;
1773         }
1774 
1775         if (event->state != PERF_EVENT_STATE_ACTIVE)
1776                 return;
1777 
1778         perf_pmu_disable(event->pmu);
1779 
1780         event->tstamp_stopped = tstamp;
1781         event->pmu->del(event, 0);
1782         event->oncpu = -1;
1783         event->state = PERF_EVENT_STATE_INACTIVE;
1784         if (event->pending_disable) {
1785                 event->pending_disable = 0;
1786                 event->state = PERF_EVENT_STATE_OFF;
1787         }
1788 
1789         if (!is_software_event(event))
1790                 cpuctx->active_oncpu--;
1791         if (!--ctx->nr_active)
1792                 perf_event_ctx_deactivate(ctx);
1793         if (event->attr.freq && event->attr.sample_freq)
1794                 ctx->nr_freq--;
1795         if (event->attr.exclusive || !cpuctx->active_oncpu)
1796                 cpuctx->exclusive = 0;
1797 
1798         perf_pmu_enable(event->pmu);
1799 }
1800 
1801 static void
1802 group_sched_out(struct perf_event *group_event,
1803                 struct perf_cpu_context *cpuctx,
1804                 struct perf_event_context *ctx)
1805 {
1806         struct perf_event *event;
1807         int state = group_event->state;
1808 
1809         event_sched_out(group_event, cpuctx, ctx);
1810 
1811         /*
1812          * Schedule out siblings (if any):
1813          */
1814         list_for_each_entry(event, &group_event->sibling_list, group_entry)
1815                 event_sched_out(event, cpuctx, ctx);
1816 
1817         if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
1818                 cpuctx->exclusive = 0;
1819 }
1820 
1821 #define DETACH_GROUP    0x01UL
1822 
1823 /*
1824  * Cross CPU call to remove a performance event
1825  *
1826  * We disable the event on the hardware level first. After that we
1827  * remove it from the context list.
1828  */
1829 static void
1830 __perf_remove_from_context(struct perf_event *event,
1831                            struct perf_cpu_context *cpuctx,
1832                            struct perf_event_context *ctx,
1833                            void *info)
1834 {
1835         unsigned long flags = (unsigned long)info;
1836 
1837         event_sched_out(event, cpuctx, ctx);
1838         if (flags & DETACH_GROUP)
1839                 perf_group_detach(event);
1840         list_del_event(event, ctx);
1841 
1842         if (!ctx->nr_events && ctx->is_active) {
1843                 ctx->is_active = 0;
1844                 if (ctx->task) {
1845                         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
1846                         cpuctx->task_ctx = NULL;
1847                 }
1848         }
1849 }
1850 
1851 /*
1852  * Remove the event from a task's (or a CPU's) list of events.
1853  *
1854  * If event->ctx is a cloned context, callers must make sure that
1855  * every task struct that event->ctx->task could possibly point to
1856  * remains valid.  This is OK when called from perf_release since
1857  * that only calls us on the top-level context, which can't be a clone.
1858  * When called from perf_event_exit_task, it's OK because the
1859  * context has been detached from its task.
1860  */
1861 static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
1862 {
1863         lockdep_assert_held(&event->ctx->mutex);
1864 
1865         event_function_call(event, __perf_remove_from_context, (void *)flags);
1866 }
1867 
1868 /*
1869  * Cross CPU call to disable a performance event
1870  */
1871 static void __perf_event_disable(struct perf_event *event,
1872                                  struct perf_cpu_context *cpuctx,
1873                                  struct perf_event_context *ctx,
1874                                  void *info)
1875 {
1876         if (event->state < PERF_EVENT_STATE_INACTIVE)
1877                 return;
1878 
1879         update_context_time(ctx);
1880         update_cgrp_time_from_event(event);
1881         update_group_times(event);
1882         if (event == event->group_leader)
1883                 group_sched_out(event, cpuctx, ctx);
1884         else
1885                 event_sched_out(event, cpuctx, ctx);
1886         event->state = PERF_EVENT_STATE_OFF;
1887 }
1888 
1889 /*
1890  * Disable an event.
1891  *
1892  * If event->ctx is a cloned context, callers must make sure that
1893  * every task struct that event->ctx->task could possibly point to
1894  * remains valid.  This condition is satisfied when called through
1895  * perf_event_for_each_child or perf_event_for_each because they
1896  * hold the top-level event's child_mutex, so any descendant that
1897  * goes to exit will block in perf_event_exit_event().
1898  *
1899  * When called from perf_pending_event it's OK because event->ctx
1900  * is the current context on this CPU and preemption is disabled,
1901  * hence we can't get into perf_event_task_sched_out for this context.
1902  */
1903 static void _perf_event_disable(struct perf_event *event)
1904 {
1905         struct perf_event_context *ctx = event->ctx;
1906 
1907         raw_spin_lock_irq(&ctx->lock);
1908         if (event->state <= PERF_EVENT_STATE_OFF) {
1909                 raw_spin_unlock_irq(&ctx->lock);
1910                 return;
1911         }
1912         raw_spin_unlock_irq(&ctx->lock);
1913 
1914         event_function_call(event, __perf_event_disable, NULL);
1915 }
1916 
1917 void perf_event_disable_local(struct perf_event *event)
1918 {
1919         event_function_local(event, __perf_event_disable, NULL);
1920 }
1921 
1922 /*
1923  * Strictly speaking, kernel users cannot create groups and therefore this
1924  * interface does not need the perf_event_ctx_lock() magic.
1925  */
1926 void perf_event_disable(struct perf_event *event)
1927 {
1928         struct perf_event_context *ctx;
1929 
1930         ctx = perf_event_ctx_lock(event);
1931         _perf_event_disable(event);
1932         perf_event_ctx_unlock(event, ctx);
1933 }
1934 EXPORT_SYMBOL_GPL(perf_event_disable);
1935 
1936 static void perf_set_shadow_time(struct perf_event *event,
1937                                  struct perf_event_context *ctx,
1938                                  u64 tstamp)
1939 {
1940         /*
1941          * use the correct time source for the time snapshot
1942          *
1943          * We could get by without this by leveraging the
1944          * fact that to get to this function, the caller
1945          * has most likely already called update_context_time()
1946          * and update_cgrp_time_xx() and thus both timestamps
1947          * are identical (or very close). Given that tstamp is
1948          * already adjusted for cgroup, we could say that:
1949          *    tstamp - ctx->timestamp
1950          * is equivalent to
1951          *    tstamp - cgrp->timestamp.
1952          *
1953          * Then, in perf_output_read(), the calculation would
1954          * work with no changes because:
1955          * - event is guaranteed scheduled in
1956          * - it is not scheduled out in between
1957          * - thus the timestamp would be the same
1958          *
1959          * But this is a bit hairy.
1960          *
1961          * So instead, we have an explicit cgroup call to remain
1962          * within the same time source all along. We believe it
1963          * is cleaner and simpler to understand.
1964          */
1965         if (is_cgroup_event(event))
1966                 perf_cgroup_set_shadow_time(event, tstamp);
1967         else
1968                 event->shadow_ctx_time = tstamp - ctx->timestamp;
1969 }
1970 
1971 #define MAX_INTERRUPTS (~0ULL)
1972 
1973 static void perf_log_throttle(struct perf_event *event, int enable);
1974 static void perf_log_itrace_start(struct perf_event *event);
1975 
1976 static int
1977 event_sched_in(struct perf_event *event,
1978                  struct perf_cpu_context *cpuctx,
1979                  struct perf_event_context *ctx)
1980 {
1981         u64 tstamp = perf_event_time(event);
1982         int ret = 0;
1983 
1984         lockdep_assert_held(&ctx->lock);
1985 
1986         if (event->state <= PERF_EVENT_STATE_OFF)
1987                 return 0;
1988 
1989         WRITE_ONCE(event->oncpu, smp_processor_id());
1990         /*
1991          * Order event::oncpu write to happen before the ACTIVE state
1992          * is visible.
1993          */
1994         smp_wmb();
1995         WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
1996 
1997         /*
1998          * Unthrottle events; since we just scheduled in, we might have missed
1999          * several ticks already, and for a heavily scheduling task there is
2000          * little guarantee it'll get a tick in a timely manner.
2001          */
2002         if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
2003                 perf_log_throttle(event, 1);
2004                 event->hw.interrupts = 0;
2005         }
2006 
2007         /*
2008          * The new state must be visible before we turn it on in the hardware:
2009          */
2010         smp_wmb();
2011 
2012         perf_pmu_disable(event->pmu);
2013 
2014         perf_set_shadow_time(event, ctx, tstamp);
2015 
2016         perf_log_itrace_start(event);
2017 
2018         if (event->pmu->add(event, PERF_EF_START)) {
2019                 event->state = PERF_EVENT_STATE_INACTIVE;
2020                 event->oncpu = -1;
2021                 ret = -EAGAIN;
2022                 goto out;
2023         }
2024 
2025         event->tstamp_running += tstamp - event->tstamp_stopped;
2026 
2027         if (!is_software_event(event))
2028                 cpuctx->active_oncpu++;
2029         if (!ctx->nr_active++)
2030                 perf_event_ctx_activate(ctx);
2031         if (event->attr.freq && event->attr.sample_freq)
2032                 ctx->nr_freq++;
2033 
2034         if (event->attr.exclusive)
2035                 cpuctx->exclusive = 1;
2036 
2037 out:
2038         perf_pmu_enable(event->pmu);
2039 
2040         return ret;
2041 }
2042 
2043 static int
2044 group_sched_in(struct perf_event *group_event,
2045                struct perf_cpu_context *cpuctx,
2046                struct perf_event_context *ctx)
2047 {
2048         struct perf_event *event, *partial_group = NULL;
2049         struct pmu *pmu = ctx->pmu;
2050         u64 now = ctx->time;
2051         bool simulate = false;
2052 
2053         if (group_event->state == PERF_EVENT_STATE_OFF)
2054                 return 0;
2055 
2056         pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2057 
2058         if (event_sched_in(group_event, cpuctx, ctx)) {
2059                 pmu->cancel_txn(pmu);
2060                 perf_mux_hrtimer_restart(cpuctx);
2061                 return -EAGAIN;
2062         }
2063 
2064         /*
2065          * Schedule in siblings as one group (if any):
2066          */
2067         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
2068                 if (event_sched_in(event, cpuctx, ctx)) {
2069                         partial_group = event;
2070                         goto group_error;
2071                 }
2072         }
2073 
2074         if (!pmu->commit_txn(pmu))
2075                 return 0;
2076 
2077 group_error:
2078         /*
2079          * Groups can be scheduled in as one unit only, so undo any
2080          * partial group before returning:
2081          * The events up to the failed event are scheduled out normally,
2082          * tstamp_stopped will be updated.
2083          *
2084          * The failed events and the remaining siblings need to have
2085          * their timings updated as if they had gone thru event_sched_in()
2086          * and event_sched_out(). This is required to get consistent timings
2087          * across the group. This also takes care of the case where the group
2088          * could never be scheduled by ensuring tstamp_stopped is set to mark
2089          * the time the event was actually stopped, such that time delta
2090          * calculation in update_event_times() is correct.
2091          */
2092         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
2093                 if (event == partial_group)
2094                         simulate = true;
2095 
2096                 if (simulate) {
2097                         event->tstamp_running += now - event->tstamp_stopped;
2098                         event->tstamp_stopped = now;
2099                 } else {
2100                         event_sched_out(event, cpuctx, ctx);
2101                 }
2102         }
2103         event_sched_out(group_event, cpuctx, ctx);
2104 
2105         pmu->cancel_txn(pmu);
2106 
2107         perf_mux_hrtimer_restart(cpuctx);
2108 
2109         return -EAGAIN;
2110 }
2111 
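/*
 * Illustrative sketch, not part of this file: group_sched_in() above relies
 * on the pmu::start_txn()/commit_txn()/cancel_txn() protocol so that a
 * driver can tentatively accept every member of the group and validate its
 * counter constraints only once, at commit time. A minimal driver-side
 * shape could look like the following; the "toy_*" names and
 * TOY_MAX_COUNTERS are invented for the example, and the driver's ->add()
 * callback (not shown) would bump n_tentative while a PERF_PMU_TXN_ADD
 * transaction is open.
 */
#define TOY_MAX_COUNTERS	4	/* invented hardware limit */

struct toy_pmu_cpu {
	unsigned int	txn_flags;	/* flags from start_txn(), 0 otherwise */
	int		n_tentative;	/* events accepted since start_txn()   */
};

static DEFINE_PER_CPU(struct toy_pmu_cpu, toy_pmu_cpu);

static void toy_start_txn(struct pmu *pmu, unsigned int txn_flags)
{
	struct toy_pmu_cpu *cpuc = this_cpu_ptr(&toy_pmu_cpu);

	cpuc->txn_flags = txn_flags;
	if (txn_flags & PERF_PMU_TXN_ADD)
		cpuc->n_tentative = 0;
}

static int toy_commit_txn(struct pmu *pmu)
{
	struct toy_pmu_cpu *cpuc = this_cpu_ptr(&toy_pmu_cpu);

	cpuc->txn_flags = 0;
	/* Reject the whole group if it cannot fit; the core then unwinds. */
	return cpuc->n_tentative <= TOY_MAX_COUNTERS ? 0 : -EAGAIN;
}

static void toy_cancel_txn(struct pmu *pmu)
{
	this_cpu_ptr(&toy_pmu_cpu)->txn_flags = 0;
}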
2112 /*
2113  * Work out whether we can put this event group on the CPU now.
2114  */
2115 static int group_can_go_on(struct perf_event *event,
2116                            struct perf_cpu_context *cpuctx,
2117                            int can_add_hw)
2118 {
2119         /*
2120          * Groups consisting entirely of software events can always go on.
2121          */
2122         if (event->group_flags & PERF_GROUP_SOFTWARE)
2123                 return 1;
2124         /*
2125          * If an exclusive group is already on, no other hardware
2126          * events can go on.
2127          */
2128         if (cpuctx->exclusive)
2129                 return 0;
2130         /*
2131          * If this group is exclusive and there are already
2132          * events on the CPU, it can't go on.
2133          */
2134         if (event->attr.exclusive && cpuctx->active_oncpu)
2135                 return 0;
2136         /*
2137          * Otherwise, try to add it if all previous groups were able
2138          * to go on.
2139          */
2140         return can_add_hw;
2141 }
2142 
2143 static void add_event_to_ctx(struct perf_event *event,
2144                                struct perf_event_context *ctx)
2145 {
2146         u64 tstamp = perf_event_time(event);
2147 
2148         list_add_event(event, ctx);
2149         perf_group_attach(event);
2150         event->tstamp_enabled = tstamp;
2151         event->tstamp_running = tstamp;
2152         event->tstamp_stopped = tstamp;
2153 }
2154 
2155 static void ctx_sched_out(struct perf_event_context *ctx,
2156                           struct perf_cpu_context *cpuctx,
2157                           enum event_type_t event_type);
2158 static void
2159 ctx_sched_in(struct perf_event_context *ctx,
2160              struct perf_cpu_context *cpuctx,
2161              enum event_type_t event_type,
2162              struct task_struct *task);
2163 
2164 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2165                                struct perf_event_context *ctx)
2166 {
2167         if (!cpuctx->task_ctx)
2168                 return;
2169 
2170         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2171                 return;
2172 
2173         ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2174 }
2175 
2176 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2177                                 struct perf_event_context *ctx,
2178                                 struct task_struct *task)
2179 {
2180         cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2181         if (ctx)
2182                 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2183         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2184         if (ctx)
2185                 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2186 }
2187 
2188 static void ctx_resched(struct perf_cpu_context *cpuctx,
2189                         struct perf_event_context *task_ctx)
2190 {
2191         perf_pmu_disable(cpuctx->ctx.pmu);
2192         if (task_ctx)
2193                 task_ctx_sched_out(cpuctx, task_ctx);
2194         cpu_ctx_sched_out(cpuctx, EVENT_ALL);
2195         perf_event_sched_in(cpuctx, task_ctx, current);
2196         perf_pmu_enable(cpuctx->ctx.pmu);
2197 }
2198 
2199 /*
2200  * Cross CPU call to install and enable a performance event
2201  *
2202  * Very similar to remote_function() + event_function() but cannot assume that
2203  * things like ctx->is_active and cpuctx->task_ctx are set.
2204  */
2205 static int  __perf_install_in_context(void *info)
2206 {
2207         struct perf_event *event = info;
2208         struct perf_event_context *ctx = event->ctx;
2209         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2210         struct perf_event_context *task_ctx = cpuctx->task_ctx;
2211         bool activate = true;
2212         int ret = 0;
2213 
2214         raw_spin_lock(&cpuctx->ctx.lock);
2215         if (ctx->task) {
2216                 raw_spin_lock(&ctx->lock);
2217                 task_ctx = ctx;
2218 
2219                 /* If we're on the wrong CPU, try again */
2220                 if (task_cpu(ctx->task) != smp_processor_id()) {
2221                         ret = -ESRCH;
2222                         goto unlock;
2223                 }
2224 
2225                 /*
2226                  * If we're on the right CPU, see if the task we target is
2227                  * current; if not, we don't have to activate the ctx, a future
2228                  * context switch will do that for us.
2229                  */
2230                 if (ctx->task != current)
2231                         activate = false;
2232                 else
2233                         WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2234 
2235         } else if (task_ctx) {
2236                 raw_spin_lock(&task_ctx->lock);
2237         }
2238 
2239         if (activate) {
2240                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2241                 add_event_to_ctx(event, ctx);
2242                 ctx_resched(cpuctx, task_ctx);
2243         } else {
2244                 add_event_to_ctx(event, ctx);
2245         }
2246 
2247 unlock:
2248         perf_ctx_unlock(cpuctx, task_ctx);
2249 
2250         return ret;
2251 }
2252 
2253 /*
2254  * Attach a performance event to a context.
2255  *
2256  * Very similar to event_function_call, see comment there.
2257  */
2258 static void
2259 perf_install_in_context(struct perf_event_context *ctx,
2260                         struct perf_event *event,
2261                         int cpu)
2262 {
2263         struct task_struct *task = READ_ONCE(ctx->task);
2264 
2265         lockdep_assert_held(&ctx->mutex);
2266 
2267         event->ctx = ctx;
2268         if (event->cpu != -1)
2269                 event->cpu = cpu;
2270 
2271         if (!task) {
2272                 cpu_function_call(cpu, __perf_install_in_context, event);
2273                 return;
2274         }
2275 
2276         /*
2277          * Should not happen, we validate the ctx is still alive before calling.
2278          */
2279         if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2280                 return;
2281 
2282         /*
2283          * Installing events is tricky because we cannot rely on ctx->is_active
2284          * to be set in case this is the nr_events 0 -> 1 transition.
2285          */
2286 again:
2287         /*
2288          * Cannot use task_function_call() because we need to run on the task's
2289          * CPU regardless of whether it's current or not.
2290          */
2291         if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event))
2292                 return;
2293 
2294         raw_spin_lock_irq(&ctx->lock);
2295         task = ctx->task;
2296         if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
2297                 /*
2298                  * Cannot happen because we already checked above (which also
2299                  * cannot happen), and we hold ctx->mutex, which serializes us
2300                  * against perf_event_exit_task_context().
2301                  */
2302                 raw_spin_unlock_irq(&ctx->lock);
2303                 return;
2304         }
2305         raw_spin_unlock_irq(&ctx->lock);
2306         /*
2307          * Since !ctx->is_active doesn't mean anything, we must IPI
2308          * unconditionally.
2309          */
2310         goto again;
2311 }
2312 
2313 /*
2314  * Put an event into inactive state and update time fields.
2315  * Enabling the leader of a group effectively enables all
2316  * the group members that aren't explicitly disabled, so we
2317  * have to update their ->tstamp_enabled also.
2318  * Note: this works for group members as well as group leaders
2319  * since the non-leader members' sibling_lists will be empty.
2320  */
2321 static void __perf_event_mark_enabled(struct perf_event *event)
2322 {
2323         struct perf_event *sub;
2324         u64 tstamp = perf_event_time(event);
2325 
2326         event->state = PERF_EVENT_STATE_INACTIVE;
2327         event->tstamp_enabled = tstamp - event->total_time_enabled;
2328         list_for_each_entry(sub, &event->sibling_list, group_entry) {
2329                 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
2330                         sub->tstamp_enabled = tstamp - sub->total_time_enabled;
2331         }
2332 }
2333 
2334 /*
2335  * Cross CPU call to enable a performance event
2336  */
2337 static void __perf_event_enable(struct perf_event *event,
2338                                 struct perf_cpu_context *cpuctx,
2339                                 struct perf_event_context *ctx,
2340                                 void *info)
2341 {
2342         struct perf_event *leader = event->group_leader;
2343         struct perf_event_context *task_ctx;
2344 
2345         if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2346             event->state <= PERF_EVENT_STATE_ERROR)
2347                 return;
2348 
2349         if (ctx->is_active)
2350                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2351 
2352         __perf_event_mark_enabled(event);
2353 
2354         if (!ctx->is_active)
2355                 return;
2356 
2357         if (!event_filter_match(event)) {
2358                 if (is_cgroup_event(event))
2359                         perf_cgroup_defer_enabled(event);
2360                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2361                 return;
2362         }
2363 
2364         /*
2365          * If the event is in a group and isn't the group leader,
2366          * then don't put it on unless the group is on.
2367          */
2368         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
2369                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2370                 return;
2371         }
2372 
2373         task_ctx = cpuctx->task_ctx;
2374         if (ctx->task)
2375                 WARN_ON_ONCE(task_ctx != ctx);
2376 
2377         ctx_resched(cpuctx, task_ctx);
2378 }
2379 
2380 /*
2381  * Enable an event.
2382  *
2383  * If event->ctx is a cloned context, callers must make sure that
2384  * every task struct that event->ctx->task could possibly point to
2385  * remains valid.  This condition is satisfied when called through
2386  * perf_event_for_each_child or perf_event_for_each as described
2387  * for perf_event_disable.
2388  */
2389 static void _perf_event_enable(struct perf_event *event)
2390 {
2391         struct perf_event_context *ctx = event->ctx;
2392 
2393         raw_spin_lock_irq(&ctx->lock);
2394         if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2395             event->state <  PERF_EVENT_STATE_ERROR) {
2396                 raw_spin_unlock_irq(&ctx->lock);
2397                 return;
2398         }
2399 
2400         /*
2401          * If the event is in error state, clear that first.
2402          *
2403          * That way, if we see the event in error state below, we know that it
2404          * has gone back into error state, as distinct from the task having
2405          * been scheduled away before the cross-call arrived.
2406          */
2407         if (event->state == PERF_EVENT_STATE_ERROR)
2408                 event->state = PERF_EVENT_STATE_OFF;
2409         raw_spin_unlock_irq(&ctx->lock);
2410 
2411         event_function_call(event, __perf_event_enable, NULL);
2412 }
2413 
2414 /*
2415  * See perf_event_disable();
2416  */
2417 void perf_event_enable(struct perf_event *event)
2418 {
2419         struct perf_event_context *ctx;
2420 
2421         ctx = perf_event_ctx_lock(event);
2422         _perf_event_enable(event);
2423         perf_event_ctx_unlock(event, ctx);
2424 }
2425 EXPORT_SYMBOL_GPL(perf_event_enable);
2426 
2427 struct stop_event_data {
2428         struct perf_event       *event;
2429         unsigned int            restart;
2430 };
2431 
2432 static int __perf_event_stop(void *info)
2433 {
2434         struct stop_event_data *sd = info;
2435         struct perf_event *event = sd->event;
2436 
2437         /* if it's already INACTIVE, do nothing */
2438         if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2439                 return 0;
2440 
2441         /* matches smp_wmb() in event_sched_in() */
2442         smp_rmb();
2443 
2444         /*
2445          * There is a window with interrupts enabled before we get here,
2446          * so we need to check again lest we try to stop another CPU's event.
2447          */
2448         if (READ_ONCE(event->oncpu) != smp_processor_id())
2449                 return -EAGAIN;
2450 
2451         event->pmu->stop(event, PERF_EF_UPDATE);
2452 
2453         /*
2454          * May race with the actual stop (through perf_pmu_output_stop()),
2455          * but it is only used for events with AUX ring buffer, and such
2456          * events will refuse to restart because of rb::aux_mmap_count==0,
2457          * see comments in perf_aux_output_begin().
2458          *
2459          * Since this is happening on an event-local CPU, no trace is lost
2460          * while restarting.
2461          */
2462         if (sd->restart)
2463                 event->pmu->start(event, PERF_EF_START);
2464 
2465         return 0;
2466 }
2467 
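/*
 * Illustrative sketch, not part of this file: the smp_rmb() above pairs
 * with the smp_wmb() in event_sched_in(), which publishes event->oncpu
 * before PERF_EVENT_STATE_ACTIVE becomes visible. Reduced to its core, the
 * pattern looks like this (function names invented for the example):
 */

/* writer side, as in event_sched_in() */
static void publish_active(struct perf_event *event, int cpu)
{
	WRITE_ONCE(event->oncpu, cpu);
	smp_wmb();	/* order the oncpu store before the state store */
	WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
}

/* reader side, as in __perf_event_stop() and perf_event_restart() */
static int active_event_cpu(struct perf_event *event)
{
	if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
		return -1;
	smp_rmb();	/* pairs with the smp_wmb() in the writer */
	return READ_ONCE(event->oncpu);
}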
2468 static int perf_event_restart(struct perf_event *event)
2469 {
2470         struct stop_event_data sd = {
2471                 .event          = event,
2472                 .restart        = 1,
2473         };
2474         int ret = 0;
2475 
2476         do {
2477                 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2478                         return 0;
2479 
2480                 /* matches smp_wmb() in event_sched_in() */
2481                 smp_rmb();
2482 
2483                 /*
2484                  * We only want to restart ACTIVE events, so if the event goes
2485                  * inactive here (event->oncpu==-1), there's nothing more to do;
2486                  * fall through with ret==-ENXIO.
2487                  */
2488                 ret = cpu_function_call(READ_ONCE(event->oncpu),
2489                                         __perf_event_stop, &sd);
2490         } while (ret == -EAGAIN);
2491 
2492         return ret;
2493 }
2494 
2495 /*
2496  * In order to contain the amount of raciness and trickiness in the address
2497  * filter configuration management, it is a two part process:
2498  *
2499  * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
2500  *      we update the addresses of corresponding vmas in
2501  *      event::addr_filters_offs array and bump the event::addr_filters_gen;
2502  * (p2) when an event is scheduled in (pmu::add), it calls
2503  *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
2504  *      if the generation has changed since the previous call.
2505  *
2506  * If (p1) happens while the event is active, we restart it to force (p2).
2507  *
2508  * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
2509  *     pre-existing mappings, called once when new filters arrive via SET_FILTER
2510  *     ioctl;
2511  * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
2512  *     registered mapping, called for every new mmap(), with mm::mmap_sem down
2513  *     for reading;
2514  * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
2515  *     of exec.
2516  */
2517 void perf_event_addr_filters_sync(struct perf_event *event)
2518 {
2519         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
2520 
2521         if (!has_addr_filter(event))
2522                 return;
2523 
2524         raw_spin_lock(&ifh->lock);
2525         if (event->addr_filters_gen != event->hw.addr_filters_gen) {
2526                 event->pmu->addr_filters_sync(event);
2527                 event->hw.addr_filters_gen = event->addr_filters_gen;
2528         }
2529         raw_spin_unlock(&ifh->lock);
2530 }
2531 EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
2532 
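/*
 * Illustrative sketch, not part of this file: the function above is an
 * instance of a generation-counter pattern, where writers bump a counter
 * after changing shared state and a consumer re-synchronizes only when its
 * cached generation is stale. A generic shape, with invented "foo_*" names:
 */
struct foo_state {
	raw_spinlock_t	lock;
	unsigned long	gen;		/* bumped by writers under lock */
};

struct foo_consumer {
	unsigned long	synced_gen;	/* last generation acted upon */
	void		(*sync)(struct foo_consumer *fc);
};

static void foo_maybe_sync(struct foo_state *s, struct foo_consumer *fc)
{
	raw_spin_lock(&s->lock);
	if (fc->synced_gen != s->gen) {
		fc->sync(fc);		/* analogous to pmu->addr_filters_sync() */
		fc->synced_gen = s->gen;
	}
	raw_spin_unlock(&s->lock);
}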
2533 static int _perf_event_refresh(struct perf_event *event, int refresh)
2534 {
2535         /*
2536          * not supported on inherited events
2537          */
2538         if (event->attr.inherit || !is_sampling_event(event))
2539                 return -EINVAL;
2540 
2541         atomic_add(refresh, &event->event_limit);
2542         _perf_event_enable(event);
2543 
2544         return 0;
2545 }
2546 
2547 /*
2548  * See perf_event_disable()
2549  */
2550 int perf_event_refresh(struct perf_event *event, int refresh)
2551 {
2552         struct perf_event_context *ctx;
2553         int ret;
2554 
2555         ctx = perf_event_ctx_lock(event);
2556         ret = _perf_event_refresh(event, refresh);
2557         perf_event_ctx_unlock(event, ctx);
2558 
2559         return ret;
2560 }
2561 EXPORT_SYMBOL_GPL(perf_event_refresh);
2562 
2563 static void ctx_sched_out(struct perf_event_context *ctx,
2564                           struct perf_cpu_context *cpuctx,
2565                           enum event_type_t event_type)
2566 {
2567         int is_active = ctx->is_active;
2568         struct perf_event *event;
2569 
2570         lockdep_assert_held(&ctx->lock);
2571 
2572         if (likely(!ctx->nr_events)) {
2573                 /*
2574                  * See __perf_remove_from_context().
2575                  */
2576                 WARN_ON_ONCE(ctx->is_active);
2577                 if (ctx->task)
2578                         WARN_ON_ONCE(cpuctx->task_ctx);
2579                 return;
2580         }
2581 
2582         ctx->is_active &= ~event_type;
2583         if (!(ctx->is_active & EVENT_ALL))
2584                 ctx->is_active = 0;
2585 
2586         if (ctx->task) {
2587                 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2588                 if (!ctx->is_active)
2589                         cpuctx->task_ctx = NULL;
2590         }
2591 
2592         /*
2593          * Always update time if it was set; not only when it changes.
2594          * Otherwise we can 'forget' to update time for any but the last
2595          * context we sched out. For example:
2596          *
2597          *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
2598          *   ctx_sched_out(.event_type = EVENT_PINNED)
2599          *
2600          * would only update time for the pinned events.
2601          */
2602         if (is_active & EVENT_TIME) {
2603                 /* update (and stop) ctx time */
2604                 update_context_time(ctx);
2605                 update_cgrp_time_from_cpuctx(cpuctx);
2606         }
2607 
2608         is_active ^= ctx->is_active; /* changed bits */
2609 
2610         if (!ctx->nr_active || !(is_active & EVENT_ALL))
2611                 return;
2612 
2613         perf_pmu_disable(ctx->pmu);
2614         if (is_active & EVENT_PINNED) {
2615                 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
2616                         group_sched_out(event, cpuctx, ctx);
2617         }
2618 
2619         if (is_active & EVENT_FLEXIBLE) {
2620                 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
2621                         group_sched_out(event, cpuctx, ctx);
2622         }
2623         perf_pmu_enable(ctx->pmu);
2624 }
2625 
2626 /*
2627  * Test whether two contexts are equivalent, i.e. whether they have both been
2628  * cloned from the same version of the same context.
2629  *
2630  * Equivalence is measured using a generation number in the context that is
2631  * incremented on each modification to it; see unclone_ctx(), list_add_event()
2632  * and list_del_event().
2633  */
2634 static int context_equiv(struct perf_event_context *ctx1,
2635                          struct perf_event_context *ctx2)
2636 {
2637         lockdep_assert_held(&ctx1->lock);
2638         lockdep_assert_held(&ctx2->lock);
2639 
2640         /* Pinning disables the swap optimization */
2641         if (ctx1->pin_count || ctx2->pin_count)
2642                 return 0;
2643 
2644         /* If ctx1 is the parent of ctx2 */
2645         if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2646                 return 1;
2647 
2648         /* If ctx2 is the parent of ctx1 */
2649         if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2650                 return 1;
2651 
2652         /*
2653          * If ctx1 and ctx2 have the same parent: we flatten the parent
2654          * hierarchy, see perf_event_init_context().
2655          */
2656         if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2657                         ctx1->parent_gen == ctx2->parent_gen)
2658                 return 1;
2659 
2660         /* Unmatched */
2661         return 0;
2662 }
2663 
2664 static void __perf_event_sync_stat(struct perf_event *event,
2665                                      struct perf_event *next_event)
2666 {
2667         u64 value;
2668 
2669         if (!event->attr.inherit_stat)
2670                 return;
2671 
2672         /*
2673          * Update the event value; we cannot use perf_event_read()
2674          * because we're in the middle of a context switch and have IRQs
2675          * disabled, which upsets smp_call_function_single(), however
2676          * we know the event must be on the current CPU, therefore we
2677          * don't need to use it.
2678          */
2679         switch (event->state) {
2680         case PERF_EVENT_STATE_ACTIVE:
2681                 event->pmu->read(event);
2682                 /* fall-through */
2683 
2684         case PERF_EVENT_STATE_INACTIVE:
2685                 update_event_times(event);
2686                 break;
2687 
2688         default:
2689                 break;
2690         }
2691 
2692         /*
2693          * In order to keep per-task stats reliable we need to flip the event
2694          * values when we flip the contexts.
2695          */
2696         value = local64_read(&next_event->count);
2697         value = local64_xchg(&event->count, value);
2698         local64_set(&next_event->count, value);
2699 
2700         swap(event->total_time_enabled, next_event->total_time_enabled);
2701         swap(event->total_time_running, next_event->total_time_running);
2702 
2703         /*
2704          * Since we swizzled the values, update the user visible data too.
2705          */
2706         perf_event_update_userpage(event);
2707         perf_event_update_userpage(next_event);
2708 }
2709 
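/*
 * Illustrative sketch, not part of this file: the flip above reads one
 * counter, atomically exchanges that value into the other counter, and
 * stores the returned old value back into the first, so the two values are
 * swapped without a window in which a concurrent update to the exchanged
 * counter could be lost. A user-space analogue using C11 atomics, with
 * invented names:
 */
#include <stdatomic.h>
#include <stdint.h>

static void swap_counts(atomic_uint_fast64_t *a, atomic_uint_fast64_t *b)
{
	uint_fast64_t value;

	value = atomic_load(b);			/* read b's current value	*/
	value = atomic_exchange(a, value);	/* put it in a, fetch a's old	*/
	atomic_store(b, value);			/* complete the swap		*/
}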
2710 static void perf_event_sync_stat(struct perf_event_context *ctx,
2711                                    struct perf_event_context *next_ctx)
2712 {
2713         struct perf_event *event, *next_event;
2714 
2715         if (!ctx->nr_stat)
2716                 return;
2717 
2718         update_context_time(ctx);
2719 
2720         event = list_first_entry(&ctx->event_list,
2721                                    struct perf_event, event_entry);
2722 
2723         next_event = list_first_entry(&next_ctx->event_list,
2724                                         struct perf_event, event_entry);
2725 
2726         while (&event->event_entry != &ctx->event_list &&
2727                &next_event->event_entry != &next_ctx->event_list) {
2728 
2729                 __perf_event_sync_stat(event, next_event);
2730 
2731                 event = list_next_entry(event, event_entry);
2732                 next_event = list_next_entry(next_event, event_entry);
2733         }
2734 }
2735 
2736 static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2737                                          struct task_struct *next)
2738 {
2739         struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
2740         struct perf_event_context *next_ctx;
2741         struct perf_event_context *parent, *next_parent;
2742         struct perf_cpu_context *cpuctx;
2743         int do_switch = 1;
2744 
2745         if (likely(!ctx))
2746                 return;
2747 
2748         cpuctx = __get_cpu_context(ctx);
2749         if (!cpuctx->task_ctx)
2750                 return;
2751 
2752         rcu_read_lock();
2753         next_ctx = next->perf_event_ctxp[ctxn];
2754         if (!next_ctx)
2755                 goto unlock;
2756 
2757         parent = rcu_dereference(ctx->parent_ctx);
2758         next_parent = rcu_dereference(next_ctx->parent_ctx);
2759 
2760         /* If neither context has a parent context, they cannot be clones. */
2761         if (!parent && !next_parent)
2762                 goto unlock;
2763 
2764         if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
2765                 /*
2766                  * Looks like the two contexts are clones, so we might be
2767                  * able to optimize the context switch.  We lock both
2768                  * contexts and check that they are clones under the
2769                  * lock (including re-checking that neither has been
2770                  * uncloned in the meantime).  It doesn't matter which
2771                  * order we take the locks because no other cpu could
2772                  * be trying to lock both of these tasks.
2773                  */
2774                 raw_spin_lock(&ctx->lock);
2775                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
2776                 if (context_equiv(ctx, next_ctx)) {
2777                         WRITE_ONCE(ctx->task, next);
2778                         WRITE_ONCE(next_ctx->task, task);
2779 
2780                         swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
2781 
2782                         /*
2783                          * RCU_INIT_POINTER here is safe because we've not
2784          * modified the ctx and the above modifications of
2785                          * ctx->task and ctx->task_ctx_data are immaterial
2786                          * since those values are always verified under
2787                          * ctx->lock which we're now holding.
2788                          */
2789                         RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
2790                         RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
2791 
2792                         do_switch = 0;
2793 
2794                         perf_event_sync_stat(ctx, next_ctx);
2795                 }
2796                 raw_spin_unlock(&next_ctx->lock);
2797                 raw_spin_unlock(&ctx->lock);
2798         }
2799 unlock:
2800         rcu_read_unlock();
2801 
2802         if (do_switch) {
2803                 raw_spin_lock(&ctx->lock);
2804                 task_ctx_sched_out(cpuctx, ctx);
2805                 raw_spin_unlock(&ctx->lock);
2806         }
2807 }
2808 
2809 void perf_sched_cb_dec(struct pmu *pmu)
2810 {
2811         this_cpu_dec(perf_sched_cb_usages);
2812 }
2813 
2814 void perf_sched_cb_inc(struct pmu *pmu)
2815 {
2816         this_cpu_inc(perf_sched_cb_usages);
2817 }
2818 
2819 /*
2820  * This function provides the context switch callback to the lower code
2821  * layer. It is invoked ONLY when the context switch callback is enabled.
2822  */
2823 static void perf_pmu_sched_task(struct task_struct *prev,
2824                                 struct task_struct *next,
2825                                 bool sched_in)
2826 {
2827         struct perf_cpu_context *cpuctx;
2828         struct pmu *pmu;
2829         unsigned long flags;
2830 
2831         if (prev == next)
2832                 return;
2833 
2834         local_irq_save(flags);
2835 
2836         rcu_read_lock();
2837 
2838         list_for_each_entry_rcu(pmu, &pmus, entry) {
2839                 if (pmu->sched_task) {
2840                         cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2841 
2842                         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2843 
2844                         perf_pmu_disable(pmu);
2845 
2846                         pmu->sched_task(cpuctx->task_ctx, sched_in);
2847 
2848                         perf_pmu_enable(pmu);
2849 
2850                         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2851                 }
2852         }
2853 
2854         rcu_read_unlock();
2855 
2856         local_irq_restore(flags);
2857 }
2858 
2859 static void perf_event_switch(struct task_struct *task,
2860                               struct task_struct *next_prev, bool sched_in);
2861 
2862 #define for_each_task_context_nr(ctxn)                                  \
2863         for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2864 
2865 /*
2866  * Called from the scheduler to remove the events of the current task,
2867  * with interrupts disabled.
2868  *
2869  * We stop each event and update the event value in event->count.
2870  *
2871  * This does not protect us against NMI, but disable()
2872  * sets the disabled bit in the control field of event _before_
2873  * accessing the event control register. If an NMI hits, then it will
2874  * not restart the event.
2875  */
2876 void __perf_event_task_sched_out(struct task_struct *task,
2877                                  struct task_struct *next)
2878 {
2879         int ctxn;
2880 
2881         if (__this_cpu_read(perf_sched_cb_usages))
2882                 perf_pmu_sched_task(task, next, false);
2883 
2884         if (atomic_read(&nr_switch_events))
2885                 perf_event_switch(task, next, false);
2886 
2887         for_each_task_context_nr(ctxn)
2888                 perf_event_context_sched_out(task, ctxn, next);
2889 
2890         /*
2891          * If cgroup events exist on this CPU, then we need
2892          * to check if we have to switch out PMU state;
2893          * cgroup events are system-wide mode only.
2894          */
2895         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2896                 perf_cgroup_sched_out(task, next);
2897 }
2898 
2899 /*
2900  * Called with IRQs disabled
2901  */
2902 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
2903                               enum event_type_t event_type)
2904 {
2905         ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
2906 }
2907 
2908 static void
2909 ctx_pinned_sched_in(struct perf_event_context *ctx,
2910                     struct perf_cpu_context *cpuctx)
2911 {
2912         struct perf_event *event;
2913 
2914         list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2915                 if (event->state <= PERF_EVENT_STATE_OFF)
2916                         continue;
2917                 if (!event_filter_match(event))
2918                         continue;
2919 
2920                 /* may need to reset tstamp_enabled */
2921                 if (is_cgroup_event(event))
2922                         perf_cgroup_mark_enabled(event, ctx);
2923 
2924                 if (group_can_go_on(event, cpuctx, 1))
2925                         group_sched_in(event, cpuctx, ctx);
2926 
2927                 /*
2928                  * If this pinned group hasn't been scheduled,
2929                  * put it in error state.
2930                  */
2931                 if (event->state == PERF_EVENT_STATE_INACTIVE) {
2932                         update_group_times(event);
2933                         event->state = PERF_EVENT_STATE_ERROR;
2934                 }
2935         }
2936 }
2937 
2938 static void
2939 ctx_flexible_sched_in(struct perf_event_context *ctx,
2940                       struct perf_cpu_context *cpuctx)
2941 {
2942         struct perf_event *event;
2943         int can_add_hw = 1;
2944 
2945         list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2946                 /* Ignore events in OFF or ERROR state */
2947                 if (event->state <= PERF_EVENT_STATE_OFF)
2948                         continue;
2949                 /*
2950                  * Listen to the 'cpu' scheduling filter constraint
2951                  * of events:
2952                  */
2953                 if (!event_filter_match(event))
2954                         continue;
2955 
2956                 /* may need to reset tstamp_enabled */
2957                 if (is_cgroup_event(event))
2958                         perf_cgroup_mark_enabled(event, ctx);
2959 
2960                 if (group_can_go_on(event, cpuctx, can_add_hw)) {
2961                         if (group_sched_in(event, cpuctx, ctx))
2962                                 can_add_hw = 0;
2963                 }
2964         }
2965 }
2966 
2967 static void
2968 ctx_sched_in(struct perf_event_context *ctx,
2969              struct perf_cpu_context *cpuctx,
2970              enum event_type_t event_type,
2971              struct task_struct *task)
2972 {
2973         int is_active = ctx->is_active;
2974         u64 now;
2975 
2976         lockdep_assert_held(&ctx->lock);
2977 
2978         if (likely(!ctx->nr_events))
2979                 return;
2980 
2981         ctx->is_active |= (event_type | EVENT_TIME);
2982         if (ctx->task) {
2983                 if (!is_active)
2984                         cpuctx->task_ctx = ctx;
2985                 else
2986                         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2987         }
2988 
2989         is_active ^= ctx->is_active; /* changed bits */
2990 
2991         if (is_active & EVENT_TIME) {
2992                 /* start ctx time */
2993                 now = perf_clock();
2994                 ctx->timestamp = now;
2995                 perf_cgroup_set_timestamp(task, ctx);
2996         }
2997 
2998         /*
2999          * First go through the list and put on any pinned groups
3000          * in order to give them the best chance of going on.
3001          */
3002         if (is_active & EVENT_PINNED)
3003                 ctx_pinned_sched_in(ctx, cpuctx);
3004 
3005         /* Then walk through the lower prio flexible groups */
3006         if (is_active & EVENT_FLEXIBLE)
3007                 ctx_flexible_sched_in(ctx, cpuctx);
3008 }
3009 
3010 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
3011                              enum event_type_t event_type,
3012                              struct task_struct *task)
3013 {
3014         struct perf_event_context *ctx = &cpuctx->ctx;
3015 
3016         ctx_sched_in(ctx, cpuctx, event_type, task);
3017 }
3018 
3019 static void perf_event_context_sched_in(struct perf_event_context *ctx,
3020                                         struct task_struct *task)
3021 {
3022         struct perf_cpu_context *cpuctx;
3023 
3024         cpuctx = __get_cpu_context(ctx);
3025         if (cpuctx->task_ctx == ctx)
3026                 return;
3027 
3028         perf_ctx_lock(cpuctx, ctx);
3029         perf_pmu_disable(ctx->pmu);
3030         /*
3031          * We want to keep the following priority order:
3032          * cpu pinned (that don't need to move), task pinned,
3033          * cpu flexible, task flexible.
3034          */
3035         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3036         perf_event_sched_in(cpuctx, ctx, task);
3037         perf_pmu_enable(ctx->pmu);
3038         perf_ctx_unlock(cpuctx, ctx);
3039 }
3040 
3041 /*
3042  * Called from the scheduler to add the events of the current task
3043  * with interrupts disabled.
3044  *
3045  * We restore the event value and then enable it.
3046  *
3047  * This does not protect us against NMI, but enable()
3048  * sets the enabled bit in the control field of event _before_
3049  * accessing the event control register. If an NMI hits, then it will
3050  * keep the event running.
3051  */
3052 void __perf_event_task_sched_in(struct task_struct *prev,
3053                                 struct task_struct *task)
3054 {
3055         struct perf_event_context *ctx;
3056         int ctxn;
3057 
3058         /*
3059          * If cgroup events exist on this CPU, then we need to check if we have
3060          * to switch in PMU state; cgroup events are system-wide mode only.
3061          *
3062          * Since cgroup events are CPU events, we must schedule these in before
3063          * we schedule in the task events.
3064          */
3065         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3066                 perf_cgroup_sched_in(prev, task);
3067 
3068         for_each_task_context_nr(ctxn) {
3069                 ctx = task->perf_event_ctxp[ctxn];
3070                 if (likely(!ctx))
3071                         continue;
3072 
3073                 perf_event_context_sched_in(ctx, task);
3074         }
3075 
3076         if (atomic_read(&nr_switch_events))
3077                 perf_event_switch(task, prev, true);
3078 
3079         if (__this_cpu_read(perf_sched_cb_usages))
3080                 perf_pmu_sched_task(prev, task, true);
3081 }
3082 
3083 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
3084 {
3085         u64 frequency = event->attr.sample_freq;
3086         u64 sec = NSEC_PER_SEC;
3087         u64 divisor, dividend;
3088 
3089         int count_fls, nsec_fls, frequency_fls, sec_fls;
3090 
3091         count_fls = fls64(count);
3092         nsec_fls = fls64(nsec);
3093         frequency_fls = fls64(frequency);
3094         sec_fls = 30;
3095 
3096         /*
3097          * We got @count in @nsec, with a target of sample_freq HZ
3098          * the target period becomes:
3099          *
3100          *             @count * 10^9
3101          * period = -------------------
3102          *          @nsec * sample_freq
3103          *
3104          */
3105 
3106         /*
3107          * Reduce accuracy by one bit such that @a and @b converge
3108          * to a similar magnitude.
3109          */
3110 #define REDUCE_FLS(a, b)                \
3111 do {                                    \
3112         if (a##_fls > b##_fls) {        \
3113                 a >>= 1;                \
3114                 a##_fls--;              \
3115         } else {                        \
3116                 b >>= 1;                \
3117                 b##_fls--;              \
3118         }                               \
3119 } while (0)
3120 
3121         /*
3122          * Reduce accuracy until either term fits in a u64, then proceed with
3123          * the other, so that finally we can do a u64/u64 division.
3124          */
3125         while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
3126                 REDUCE_FLS(nsec, frequency);
3127                 REDUCE_FLS(sec, count);
3128         }
3129 
3130         if (count_fls + sec_fls > 64) {
3131                 divisor = nsec * frequency;
3132 
3133                 while (count_fls + sec_fls > 64) {
3134                         REDUCE_FLS(count, sec);
3135                         divisor >>= 1;
3136                 }
3137 
3138                 dividend = count * sec;
3139         } else {
3140                 dividend = count * sec;
3141 
3142                 while (nsec_fls + frequency_fls > 64) {
3143                         REDUCE_FLS(nsec, frequency);
3144                         dividend >>= 1;
3145                 }
3146 
3147                 divisor = nsec * frequency;
3148         }
3149 
3150         if (!divisor)
3151                 return dividend;
3152 
3153         return div64_u64(dividend, divisor);
3154 }
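
/*
 * Illustrative sketch (not part of this file): the function above computes
 * period = count * 10^9 / (nsec * sample_freq), shifting bits off the larger
 * terms (REDUCE_FLS) until a plain u64/u64 division is possible. Below is a
 * hypothetical stand-alone check of the same formula, using a 128-bit
 * intermediate instead of the bit-reduction trick:
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t count = 1234567;        /* events counted ...           */
        uint64_t nsec = 4000000ULL;      /* ... in this many nanoseconds */
        uint64_t sample_freq = 4000;     /* requested samples per second */

        unsigned __int128 dividend = (unsigned __int128)count * 1000000000ULL;
        unsigned __int128 divisor = (unsigned __int128)nsec * sample_freq;
        uint64_t period = divisor ? (uint64_t)(dividend / divisor)
                                  : (uint64_t)dividend;

        printf("target sample_period: %llu events per sample\n",
               (unsigned long long)period);
        return 0;
}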
3155 
3156 static DEFINE_PER_CPU(int, perf_throttled_count);
3157 static DEFINE_PER_CPU(u64, perf_throttled_seq);
3158 
3159 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
3160 {
3161         struct hw_perf_event *hwc = &event->hw;
3162         s64 period, sample_period;
3163         s64 delta;
3164 
3165         period = perf_calculate_period(event, nsec, count);
3166 
3167         delta = (s64)(period - hwc->sample_period);
3168         delta = (delta + 7) / 8; /* low pass filter */
3169 
3170         sample_period = hwc->sample_period + delta;
3171 
3172         if (!sample_period)
3173                 sample_period = 1;
3174 
3175         hwc->sample_period = sample_period;
3176 
3177         if (local64_read(&hwc->period_left) > 8*sample_period) {
3178                 if (disable)
3179                         event->pmu->stop(event, PERF_EF_UPDATE);
3180 
3181                 local64_set(&hwc->period_left, 0);
3182 
3183                 if (disable)
3184                         event->pmu->start(event, PERF_EF_RELOAD);
3185         }
3186 }
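
/*
 * Illustrative sketch (not part of this file): the (delta + 7) / 8 step in
 * perf_adjust_period() is a low-pass filter; each call only moves
 * sample_period 1/8th of the way toward the newly computed target, so a
 * single noisy measurement cannot swing the period wildly. A hypothetical
 * stand-alone loop showing the convergence:
 */
#include <stdio.h>

int main(void)
{
        long long sample_period = 100000;  /* current period               */
        long long target = 20000;          /* period the formula asks for  */
        int tick;

        for (tick = 0; tick < 10; tick++) {
                long long delta = (target - sample_period + 7) / 8;

                sample_period += delta;
                if (!sample_period)
                        sample_period = 1;
                printf("tick %2d: sample_period = %lld\n", tick, sample_period);
        }
        return 0;
}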
3187 
3188 /*
3189  * combine freq adjustment with unthrottling to avoid two passes over the
3190  * events. At the same time, make sure that having freq events does not change
3191  * the rate of unthrottling as that would introduce bias.
3192  */
3193 static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
3194                                            int needs_unthr)
3195 {
3196         struct perf_event *event;
3197         struct hw_perf_event *hwc;
3198         u64 now, period = TICK_NSEC;
3199         s64 delta;
3200 
3201         /*
3202          * only need to iterate over all events iff:
3203          * - the context has events in frequency mode (needs freq adjust)
3204          * - there are events to unthrottle on this cpu
3205          */
3206         if (!(ctx->nr_freq || needs_unthr))
3207                 return;
3208 
3209         raw_spin_lock(&ctx->lock);
3210         perf_pmu_disable(ctx->pmu);
3211 
3212         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3213                 if (event->state != PERF_EVENT_STATE_ACTIVE)
3214                         continue;
3215 
3216                 if (!event_filter_match(event))
3217                         continue;
3218 
3219                 perf_pmu_disable(event->pmu);
3220 
3221                 hwc = &event->hw;
3222 
3223                 if (hwc->interrupts == MAX_INTERRUPTS) {
3224                         hwc->interrupts = 0;
3225                         perf_log_throttle(event, 1);
3226                         event->pmu->start(event, 0);
3227                 }
3228 
3229                 if (!event->attr.freq || !event->attr.sample_freq)
3230                         goto next;
3231 
3232                 /*
3233                  * stop the event and update event->count
3234                  */
3235                 event->pmu->stop(event, PERF_EF_UPDATE);
3236 
3237                 now = local64_read(&event->count);
3238                 delta = now - hwc->freq_count_stamp;
3239                 hwc->freq_count_stamp = now;
3240 
3241                 /*
3242                  * Restart the event; reload only if the value has
3243                  * changed since we stopped it.
3244                  * We have already stopped the event above, so pass
3245                  * disable == false to perf_adjust_period() to avoid
3246                  * stopping it twice.
3247                  */
3248                 if (delta > 0)
3249                         perf_adjust_period(event, period, delta, false);
3250 
3251                 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
3252         next:
3253                 perf_pmu_enable(event->pmu);
3254         }
3255 
3256         perf_pmu_enable(ctx->pmu);
3257         raw_spin_unlock(&ctx->lock);
3258 }
3259 
3260 /*
3261  * Round-robin a context's events:
3262  */
3263 static void rotate_ctx(struct perf_event_context *ctx)
3264 {
3265         /*
3266          * Rotate the first entry of the non-pinned groups to the end. Rotation might be
3267          * disabled by the inheritance code.
3268          */
3269         if (!ctx->rotate_disable)
3270                 list_rotate_left(&ctx->flexible_groups);
3271 }
3272 
3273 static int perf_rotate_context(struct perf_cpu_context *cpuctx)
3274 {
3275         struct perf_event_context *ctx = NULL;
3276         int rotate = 0;
3277 
3278         if (cpuctx->ctx.nr_events) {
3279                 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3280                         rotate = 1;
3281         }
3282 
3283         ctx = cpuctx->task_ctx;
3284         if (ctx && ctx->nr_events) {
3285                 if (ctx->nr_events != ctx->nr_active)
3286                         rotate = 1;
3287         }
3288 
3289         if (!rotate)
3290                 goto done;
3291 
3292         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3293         perf_pmu_disable(cpuctx->ctx.pmu);
3294 
3295         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3296         if (ctx)
3297                 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
3298 
3299         rotate_ctx(&cpuctx->ctx);
3300         if (ctx)
3301                 rotate_ctx(ctx);
3302 
3303         perf_event_sched_in(cpuctx, ctx, current);
3304 
3305         perf_pmu_enable(cpuctx->ctx.pmu);
3306         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3307 done:
3308 
3309         return rotate;
3310 }
3311 
3312 void perf_event_task_tick(void)
3313 {
3314         struct list_head *head = this_cpu_ptr(&active_ctx_list);
3315         struct perf_event_context *ctx, *tmp;
3316         int throttled;
3317 
3318         WARN_ON(!irqs_disabled());
3319 
3320         __this_cpu_inc(perf_throttled_seq);
3321         throttled = __this_cpu_xchg(perf_throttled_count, 0);
3322         tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
3323 
3324         list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
3325                 perf_adjust_freq_unthr_context(ctx, throttled);
3326 }
3327 
3328 static int event_enable_on_exec(struct perf_event *event,
3329                                 struct perf_event_context *ctx)
3330 {
3331         if (!event->attr.enable_on_exec)
3332                 return 0;
3333 
3334         event->attr.enable_on_exec = 0;
3335         if (event->state >= PERF_EVENT_STATE_INACTIVE)
3336                 return 0;
3337 
3338         __perf_event_mark_enabled(event);
3339 
3340         return 1;
3341 }
3342 
3343 /*
3344  * Enable all of a task's events that have been marked enable-on-exec.
3345  * This expects task == current.
3346  */
3347 static void perf_event_enable_on_exec(int ctxn)
3348 {
3349         struct perf_event_context *ctx, *clone_ctx = NULL;
3350         struct perf_cpu_context *cpuctx;
3351         struct perf_event *event;
3352         unsigned long flags;
3353         int enabled = 0;
3354 
3355         local_irq_save(flags);
3356         ctx = current->perf_event_ctxp[ctxn];
3357         if (!ctx || !ctx->nr_events)
3358                 goto out;
3359 
3360         cpuctx = __get_cpu_context(ctx);
3361         perf_ctx_lock(cpuctx, ctx);
3362         ctx_sched_out(ctx, cpuctx, EVENT_TIME);
3363         list_for_each_entry(event, &ctx->event_list, event_entry)
3364                 enabled |= event_enable_on_exec(event, ctx);
3365 
3366         /*
3367          * Unclone and reschedule this context if we enabled any event.
3368          */
3369         if (enabled) {
3370                 clone_ctx = unclone_ctx(ctx);
3371                 ctx_resched(cpuctx, ctx);
3372         }
3373         perf_ctx_unlock(cpuctx, ctx);
3374 
3375 out:
3376         local_irq_restore(flags);
3377 
3378         if (clone_ctx)
3379                 put_ctx(clone_ctx);
3380 }
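
/*
 * Illustrative userspace sketch (not part of this file): enable_on_exec is
 * normally paired with a disabled event attached to a child that is about
 * to exec, so counting starts exactly at exec time; the exec path is what
 * ends up calling perf_event_enable_on_exec() above. The helper below is
 * hypothetical.
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <string.h>
#include <unistd.h>

static int open_counter_armed_for_exec(pid_t child)
{
        struct perf_event_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_HARDWARE;
        attr.config = PERF_COUNT_HW_INSTRUCTIONS;
        attr.disabled = 1;           /* stay off until ...          */
        attr.enable_on_exec = 1;     /* ... the child calls exec()  */
        attr.exclude_kernel = 1;

        /* perf_event_open(&attr, pid, cpu, group_fd, flags) */
        return syscall(__NR_perf_event_open, &attr, child, -1, -1, 0);
}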
3381 
3382 struct perf_read_data {
3383         struct perf_event *event;
3384         bool group;
3385         int ret;
3386 };
3387 
3388 /*
3389  * Cross CPU call to read the hardware event
3390  */
3391 static void __perf_event_read(void *info)
3392 {
3393         struct perf_read_data *data = info;
3394         struct perf_event *sub, *event = data->event;
3395         struct perf_event_context *ctx = event->ctx;
3396         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3397         struct pmu *pmu = event->pmu;
3398 
3399         /*
3400          * If this is a task context, we need to check whether it is
3401          * the current task context of this cpu.  If not, it has been
3402          * scheduled out before the smp call arrived.  In that case
3403          * event->count would have been updated to a recent sample
3404          * when the event was scheduled out.
3405          */
3406         if (ctx->task && cpuctx->task_ctx != ctx)
3407                 return;
3408 
3409         raw_spin_lock(&ctx->lock);
3410         if (ctx->is_active) {
3411                 update_context_time(ctx);
3412                 update_cgrp_time_from_event(event);
3413         }
3414 
3415         update_event_times(event);
3416         if (event->state != PERF_EVENT_STATE_ACTIVE)
3417                 goto unlock;
3418 
3419         if (!data->group) {
3420                 pmu->read(event);
3421                 data->ret = 0;
3422                 goto unlock;
3423         }
3424 
3425         pmu->start_txn(pmu, PERF_PMU_TXN_READ);
3426 
3427         pmu->read(event);
3428 
3429         list_for_each_entry(sub, &event->sibling_list, group_entry) {
3430                 update_event_times(sub);
3431                 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
3432                         /*
3433                          * Use sibling's PMU rather than @event's since
3434                          * sibling could be on a different (e.g. software) PMU.
3435                          */
3436                         sub->pmu->read(sub);
3437                 }
3438         }
3439 
3440         data->ret = pmu->commit_txn(pmu);
3441 
3442 unlock:
3443         raw_spin_unlock(&ctx->lock);
3444 }
3445 
3446 static inline u64 perf_event_count(struct perf_event *event)
3447 {
3448         if (event->pmu->count)
3449                 return event->pmu->count(event);
3450 
3451         return __perf_event_count(event);
3452 }
3453 
3454 /*
3455  * NMI-safe method to read a local event; that is, an event that
3456  * is:
3457  *   - either for the current task, or for this CPU
3458  *   - does not have inherit set, because inherited task events
3459  *     will not be local and we cannot read them atomically
3460  *   - must not have a pmu::count method
3461  */
3462 u64 perf_event_read_local(struct perf_event *event)
3463 {
3464         unsigned long flags;
3465         u64 val;
3466 
3467         /*
3468          * Disabling interrupts avoids all counter scheduling (context
3469          * switches, timer based rotation and IPIs).
3470          */
3471         local_irq_save(flags);
3472 
3473         /* If this is a per-task event, it must be for current */
3474         WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
3475                      event->hw.target != current);
3476 
3477         /* If this is a per-CPU event, it must be for this CPU */
3478         WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
3479                      event->cpu != smp_processor_id());
3480 
3481         /*
3482          * It must not be an event with inherit set; we cannot read
3483          * all child counters from atomic context.
3484          */
3485         WARN_ON_ONCE(event->attr.inherit);
3486 
3487         /*
3488          * It must not have a pmu::count method, those are not
3489          * NMI safe.
3490          */
3491         WARN_ON_ONCE(event->pmu->count);
3492 
3493         /*
3494          * If the event is currently on this CPU, it's either a per-task event
3495          * or local to this CPU. Furthermore, it means it's ACTIVE (otherwise
3496          * oncpu == -1).
3497          */
3498         if (event->oncpu == smp_processor_id())
3499                 event->pmu->read(event);
3500 
3501         val = local64_read(&event->count);
3502         local_irq_restore(flags);
3503 
3504         return val;
3505 }
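
/*
 * Illustrative sketch (not part of this file): a hypothetical in-kernel
 * caller measuring a delta with perf_event_read_local(). The event is
 * assumed to satisfy the constraints checked above: owned by current or by
 * this CPU, attr.inherit not set, and no pmu::count method.
 */
static u64 example_local_counter_delta(struct perf_event *event, u64 *prev)
{
        u64 now = perf_event_read_local(event);
        u64 delta = now - *prev;

        *prev = now;
        return delta;
}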
3506 
3507 static int perf_event_read(struct perf_event *event, bool group)
3508 {
3509         int ret = 0;
3510 
3511         /*
3512          * If the event is enabled and currently active on a CPU, update the
3513          * value in the event structure:
3514          */
3515         if (event->state == PERF_EVENT_STATE_ACTIVE) {
3516                 struct perf_read_data data = {
3517                         .event = event,
3518                         .group = group,
3519                         .ret = 0,
3520                 };
3521                 smp_call_function_single(event->oncpu,
3522                                          __perf_event_read, &data, 1);
3523                 ret = data.ret;
3524         } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
3525                 struct perf_event_context *ctx = event->ctx;
3526                 unsigned long flags;
3527 
3528                 raw_spin_lock_irqsave(&ctx->lock, flags);
3529                 /*
3530                  * May read while the context is not active (e.g., the
3531                  * thread is blocked); in that case we cannot update
3532                  * the context time.
3533                  */
3534                 if (ctx->is_active) {
3535                         update_context_time(ctx);
3536                         update_cgrp_time_from_event(event);
3537                 }
3538                 if (group)
3539                         update_group_times(event);
3540                 else
3541                         update_event_times(event);
3542                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3543         }
3544 
3545         return ret;
3546 }
3547 
3548 /*
3549  * Initialize the perf_event context in a task_struct:
3550  */
3551 static void __perf_event_init_context(struct perf_event_context *ctx)
3552 {
3553         raw_spin_lock_init(&ctx->lock);
3554         mutex_init(&ctx->mutex);
3555         INIT_LIST_HEAD(&ctx->active_ctx_list);
3556         INIT_LIST_HEAD(&ctx->pinned_groups);
3557         INIT_LIST_HEAD(&ctx->flexible_groups);
3558         INIT_LIST_HEAD(&ctx->event_list);
3559         atomic_set(&ctx->refcount, 1);
3560 }
3561 
3562 static struct perf_event_context *
3563 alloc_perf_context(struct pmu *pmu, struct task_struct *task)
3564 {
3565         struct perf_event_context *ctx;
3566 
3567         ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
3568         if (!ctx)
3569                 return NULL;
3570 
3571         __perf_event_init_context(ctx);
3572         if (task) {
3573                 ctx->task = task;
3574                 get_task_struct(task);
3575         }
3576         ctx->pmu = pmu;
3577 
3578         return ctx;
3579 }
3580 
3581 static struct task_struct *
3582 find_lively_task_by_vpid(pid_t vpid)
3583 {
3584         struct task_struct *task;
3585 
3586         rcu_read_lock();
3587         if (!vpid)
3588                 task = current;
3589         else
3590                 task = find_task_by_vpid(vpid);
3591         if (task)
3592                 get_task_struct(task);
3593         rcu_read_unlock();
3594 
3595         if (!task)
3596                 return ERR_PTR(-ESRCH);
3597 
3598         return task;
3599 }
3600 
3601 /*
3602  * Returns a matching context with refcount and pincount.
3603  */
3604 static struct perf_event_context *
3605 find_get_context(struct pmu *pmu, struct task_struct *task,
3606                 struct perf_event *event)
3607 {
3608         struct perf_event_context *ctx, *clone_ctx = NULL;
3609         struct perf_cpu_context *cpuctx;
3610         void *task_ctx_data = NULL;
3611         unsigned long flags;
3612         int ctxn, err;
3613         int cpu = event->cpu;
3614 
3615         if (!task) {
3616                 /* Must be root to operate on a CPU event: */
3617                 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
3618                         return ERR_PTR(-EACCES);
3619 
3620                 /*
3621                  * We could be clever and allow attaching an event to an
3622                  * offline CPU and activate it when the CPU comes up, but
3623                  * that's for later.
3624                  */
3625                 if (!cpu_online(cpu))
3626                         return ERR_PTR(-ENODEV);
3627 
3628                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
3629                 ctx = &cpuctx->ctx;
3630                 get_ctx(ctx);
3631                 ++ctx->pin_count;
3632 
3633                 return ctx;
3634         }
3635 
3636         err = -EINVAL;
3637         ctxn = pmu->task_ctx_nr;
3638         if (ctxn < 0)
3639                 goto errout;
3640 
3641         if (event->attach_state & PERF_ATTACH_TASK_DATA) {
3642                 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
3643                 if (!task_ctx_data) {
3644                         err = -ENOMEM;
3645                         goto errout;
3646                 }
3647         }
3648 
3649 retry:
3650         ctx = perf_lock_task_context(task, ctxn, &flags);
3651         if (ctx) {
3652                 clone_ctx = unclone_ctx(ctx);
3653                 ++ctx->pin_count;
3654 
3655                 if (task_ctx_data && !ctx->task_ctx_data) {
3656                         ctx->task_ctx_data = task_ctx_data;
3657                         task_ctx_data = NULL;
3658                 }
3659                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3660 
3661                 if (clone_ctx)
3662                         put_ctx(clone_ctx);
3663         } else {
3664                 ctx = alloc_perf_context(pmu, task);
3665                 err = -ENOMEM;
3666                 if (!ctx)
3667                         goto errout;
3668 
3669                 if (task_ctx_data) {
3670                         ctx->task_ctx_data = task_ctx_data;
3671                         task_ctx_data = NULL;
3672                 }
3673 
3674                 err = 0;
3675                 mutex_lock(&task->perf_event_mutex);
3676                 /*
3677                  * If it has already passed perf_event_exit_task(),
3678                  * we must see PF_EXITING; it takes this mutex too.
3679                  */
3680                 if (task->flags & PF_EXITING)
3681                         err = -ESRCH;
3682                 else if (task->perf_event_ctxp[ctxn])
3683                         err = -EAGAIN;
3684                 else {
3685                         get_ctx(ctx);
3686                         ++ctx->pin_count;
3687                         rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
3688                 }
3689                 mutex_unlock(&task->perf_event_mutex);
3690 
3691                 if (unlikely(err)) {
3692                         put_ctx(ctx);
3693 
3694                         if (err == -EAGAIN)
3695                                 goto retry;
3696                         goto errout;
3697                 }
3698         }
3699 
3700         kfree(task_ctx_data);
3701         return ctx;
3702 
3703 errout:
3704         kfree(task_ctx_data);
3705         return ERR_PTR(err);
3706 }
3707 
3708 static void perf_event_free_filter(struct perf_event *event);
3709 static void perf_event_free_bpf_prog(struct perf_event *event);
3710 
3711 static void free_event_rcu(struct rcu_head *head)
3712 {
3713         struct perf_event *event;
3714 
3715         event = container_of(head, struct perf_event, rcu_head);
3716         if (event->ns)
3717                 put_pid_ns(event->ns);
3718         perf_event_free_filter(event);
3719         kfree(event);
3720 }
3721 
3722 static void ring_buffer_attach(struct perf_event *event,
3723                                struct ring_buffer *rb);
3724 
3725 static void unaccount_event_cpu(struct perf_event *event, int cpu)
3726 {
3727         if (event->parent)
3728                 return;
3729 
3730         if (is_cgroup_event(event))
3731                 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3732 }
3733 
3734 #ifdef CONFIG_NO_HZ_FULL
3735 static DEFINE_SPINLOCK(nr_freq_lock);
3736 #endif
3737 
3738 static void unaccount_freq_event_nohz(void)
3739 {
3740 #ifdef CONFIG_NO_HZ_FULL
3741         spin_lock(&nr_freq_lock);
3742         if (atomic_dec_and_test(&nr_freq_events))
3743                 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
3744         spin_unlock(&nr_freq_lock);
3745 #endif
3746 }
3747 
3748 static void unaccount_freq_event(void)
3749 {
3750         if (tick_nohz_full_enabled())
3751                 unaccount_freq_event_nohz();
3752         else
3753                 atomic_dec(&nr_freq_events);
3754 }
3755 
3756 static void unaccount_event(struct perf_event *event)
3757 {
3758         bool dec = false;
3759 
3760         if (event->parent)
3761                 return;
3762 
3763         if (event->attach_state & PERF_ATTACH_TASK)
3764                 dec = true;
3765         if (event->attr.mmap || event->attr.mmap_data)
3766                 atomic_dec(&nr_mmap_events);
3767         if (event->attr.comm)
3768                 atomic_dec(&nr_comm_events);
3769         if (event->attr.task)
3770                 atomic_dec(&nr_task_events);
3771         if (event->attr.freq)
3772                 unaccount_freq_event();
3773         if (event->attr.context_switch) {
3774                 dec = true;
3775                 atomic_dec(&nr_switch_events);
3776         }
3777         if (is_cgroup_event(event))
3778                 dec = true;
3779         if (has_branch_stack(event))
3780                 dec = true;
3781 
3782         if (dec) {
3783                 if (!atomic_add_unless(&perf_sched_count, -1, 1))
3784                         schedule_delayed_work(&perf_sched_work, HZ);
3785         }
3786 
3787         unaccount_event_cpu(event, event->cpu);
3788 }
3789 
3790 static void perf_sched_delayed(struct work_struct *work)
3791 {
3792         mutex_lock(&perf_sched_mutex);
3793         if (atomic_dec_and_test(&perf_sched_count))
3794                 static_branch_disable(&perf_sched_events);
3795         mutex_unlock(&perf_sched_mutex);
3796 }
3797 
3798 /*
3799  * The following implement mutual exclusion of events on "exclusive" pmus
3800  * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
3801  * at a time, so we disallow creating events that might conflict, namely:
3802  *
3803  *  1) cpu-wide events in the presence of per-task events,
3804  *  2) per-task events in the presence of cpu-wide events,
3805  *  3) two matching events on the same context.
3806  *
3807  * The former two cases are handled in the allocation path (perf_event_alloc(),
3808  * _free_event()), the latter -- before the first perf_install_in_context().
3809  */
3810 static int exclusive_event_init(struct perf_event *event)
3811 {
3812         struct pmu *pmu = event->pmu;
3813 
3814         if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3815                 return 0;
3816 
3817         /*
3818          * Prevent co-existence of per-task and cpu-wide events on the
3819          * same exclusive pmu.
3820          *
3821          * Negative pmu::exclusive_cnt means there are cpu-wide
3822          * events on this "exclusive" pmu, positive means there are
3823          * per-task events.
3824          *
3825          * Since this is called in perf_event_alloc() path, event::ctx
3826          * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
3827          * to mean "per-task event", because unlike other attach states it
3828          * never gets cleared.
3829          */
3830         if (event->attach_state & PERF_ATTACH_TASK) {
3831                 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
3832                         return -EBUSY;
3833         } else {
3834                 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
3835                         return -EBUSY;
3836         }
3837 
3838         return 0;
3839 }
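
/*
 * Illustrative sketch (not part of this file): pmu::exclusive_cnt encodes
 * ownership in its sign, positive for per-task events and negative for
 * cpu-wide events, so the two classes can never be mixed on an exclusive
 * pmu. A hypothetical userspace model of the same counting rule, using C11
 * atomics in place of the kernel's atomic_inc_unless_negative() and
 * atomic_dec_unless_positive() seen above:
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int exclusive_cnt;   /* 0: free, >0: per-task, <0: cpu-wide */

static bool inc_unless_negative(atomic_int *v)
{
        int old = atomic_load(v);

        do {
                if (old < 0)
                        return false;
        } while (!atomic_compare_exchange_weak(v, &old, old + 1));
        return true;
}

static bool dec_unless_positive(atomic_int *v)
{
        int old = atomic_load(v);

        do {
                if (old > 0)
                        return false;
        } while (!atomic_compare_exchange_weak(v, &old, old - 1));
        return true;
}

static bool claim_for_task_event(void) { return inc_unless_negative(&exclusive_cnt); }
static bool claim_for_cpu_event(void)  { return dec_unless_positive(&exclusive_cnt); }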
3840 
3841 static void exclusive_event_destroy(struct perf_event *event)
3842 {
3843         struct pmu *pmu = event->pmu;
3844 
3845         if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3846                 return;
3847 
3848         /* see comment in exclusive_event_init() */
3849         if (event->attach_state & PERF_ATTACH_TASK)
3850                 atomic_dec(&pmu->exclusive_cnt);
3851         else
3852                 atomic_inc(&pmu->exclusive_cnt);
3853 }
3854 
3855 static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
3856 {
3857         if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
3858             (e1->cpu == e2->cpu ||
3859              e1->cpu == -1 ||
3860              e2->cpu == -1))
3861                 return true;
3862         return false;
3863 }
3864 
3865 /* Called under the same ctx::mutex as perf_install_in_context() */
3866 static bool exclusive_event_installable(struct perf_event *event,
3867                                         struct perf_event_context *ctx)
3868 {
3869         struct perf_event *iter_event;
3870         struct pmu *pmu = event->pmu;
3871 
3872         if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3873                 return true;
3874 
3875         list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
3876                 if (exclusive_event_match(iter_event, event))
3877                         return false;
3878         }
3879 
3880         return true;
3881 }
3882 
3883 static void perf_addr_filters_splice(struct perf_event *event,
3884                                        struct list_head *head);
3885 
3886 static void _free_event(struct perf_event *event)
3887 {
3888         irq_work_sync(&event->pending);
3889 
3890         unaccount_event(event);
3891 
3892         if (event->rb) {
3893                 /*
3894                  * Can happen when we close an event with re-directed output.
3895                  *
3896                  * Since we have a 0 refcount, perf_mmap_close() will skip
3897                  * over us; possibly making our ring_buffer_put() the last.
3898                  */
3899                 mutex_lock(&event->mmap_mutex);
3900                 ring_buffer_attach(event, NULL);
3901                 mutex_unlock(&event->mmap_mutex);
3902         }
3903 
3904         if (is_cgroup_event(event))
3905                 perf_detach_cgroup(event);
3906 
3907         if (!event->parent) {
3908                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
3909                         put_callchain_buffers();
3910         }
3911 
3912         perf_event_free_bpf_prog(event);
3913         perf_addr_filters_splice(event, NULL);
3914         kfree(event->addr_filters_offs);
3915 
3916         if (event->destroy)
3917                 event->destroy(event);
3918 
3919         if (event->ctx)
3920                 put_ctx(event->ctx);
3921 
3922         exclusive_event_destroy(event);
3923         module_put(event->pmu->module);
3924 
3925         call_rcu(&event->rcu_head, free_event_rcu);
3926 }
3927 
3928 /*
3929  * Used to free events which have a known refcount of 1, such as events in
3930  * error paths that are not exposed yet, and inherited events.
3931  */
3932 static void free_event(struct perf_event *event)
3933 {
3934         if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
3935                                 "unexpected event refcount: %ld; ptr=%p\n",
3936                                 atomic_long_read(&event->refcount), event)) {
3937                 /* leak to avoid use-after-free */
3938                 return;
3939         }
3940 
3941         _free_event(event);
3942 }
3943 
3944 /*
3945  * Remove user event from the owner task.
3946  */
3947 static void perf_remove_from_owner(struct perf_event *event)
3948 {
3949         struct task_struct *owner;
3950 
3951         rcu_read_lock();
3952         /*
3953          * Matches the smp_store_release() in perf_event_exit_task(). If we
3954          * observe !owner, it means the list deletion is complete and we can
3955          * indeed free this event; otherwise we need to serialize on
3956          * owner->perf_event_mutex.
3957          */
3958         owner = lockless_dereference(event->owner);
3959         if (owner) {
3960                 /*
3961                  * Since delayed_put_task_struct() also drops the last
3962                  * task reference we can safely take a new reference
3963                  * while holding the rcu_read_lock().
3964                  */
3965                 get_task_struct(owner);
3966         }
3967         rcu_read_unlock();
3968 
3969         if (owner) {
3970                 /*
3971                  * If we're here through perf_event_exit_task() we're already
3972                  * holding ctx->mutex which would be an inversion wrt. the
3973                  * normal lock order.
3974                  *
3975                  * However, we can safely take this lock because it's the child
3976                  * ctx->mutex.
3977                  */
3978                 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
3979 
3980                 /*
3981                  * We have to re-check the event->owner field, if it is cleared
3982                  * we raced with perf_event_exit_task(), acquiring the mutex
3983                  * ensured they're done, and we can proceed with freeing the
3984                  * event.
3985                  */
3986                 if (event->owner) {
3987                         list_del_init(&event->owner_entry);
3988                         smp_store_release(&event->owner, NULL);
3989                 }
3990                 mutex_unlock(&owner->perf_event_mutex);
3991                 put_task_struct(owner);
3992         }
3993 }
3994 
3995 static void put_event(struct perf_event *event)
3996 {
3997         if (!atomic_long_dec_and_test(&event->refcount))
3998                 return;
3999 
4000         _free_event(event);
4001 }
4002 
4003 /*
4004  * Kill an event dead; while event::refcount will preserve the event
4005  * object, it will not preserve its functionality. Once the last 'user'
4006  * gives up the object, we'll destroy the thing.
4007  */
4008 int perf_event_release_kernel(struct perf_event *event)
4009 {
4010         struct perf_event_context *ctx = event->ctx;
4011         struct perf_event *child, *tmp;
4012 
4013         /*
4014          * If we got here through err_file: fput(event_file); we will not have
4015          * attached to a context yet.
4016          */
4017         if (!ctx) {
4018                 WARN_ON_ONCE(event->attach_state &
4019                                 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
4020                 goto no_ctx;
4021         }
4022 
4023         if (!is_kernel_event(event))
4024                 perf_remove_from_owner(event);
4025 
4026         ctx = perf_event_ctx_lock(event);
4027         WARN_ON_ONCE(ctx->parent_ctx);
4028         perf_remove_from_context(event, DETACH_GROUP);
4029 
4030         raw_spin_lock_irq(&ctx->lock);
4031         /*
4032          * Mark this event as STATE_DEAD; there is no external reference to it
4033          * anymore.
4034          *
4035          * Anybody acquiring event->child_mutex after the below loop _must_
4036          * also see this, most importantly inherit_event() which will avoid
4037          * placing more children on the list.
4038          *
4039          * Thus this guarantees that we will in fact observe and kill _ALL_
4040          * child events.
4041          */
4042         event->state = PERF_EVENT_STATE_DEAD;
4043         raw_spin_unlock_irq(&ctx->lock);
4044 
4045         perf_event_ctx_unlock(event, ctx);
4046 
4047 again:
4048         mutex_lock(&event->child_mutex);
4049         list_for_each_entry(child, &event->child_list, child_list) {
4050 
4051                 /*
4052                  * Cannot change, child events are not migrated, see the
4053                  * comment with perf_event_ctx_lock_nested().
4054                  */
4055                 ctx = lockless_dereference(child->ctx);
4056                 /*
4057                  * Since child_mutex nests inside ctx::mutex, we must jump
4058                  * through hoops. We start by grabbing a reference on the ctx.
4059                  *
4060                  * Since the event cannot get freed while we hold the
4061                  * child_mutex, the context must also exist and have a !0
4062                  * reference count.
4063                  */
4064                 get_ctx(ctx);
4065 
4066                 /*
4067                  * Now that we have a ctx ref, we can drop child_mutex, and
4068                  * acquire ctx::mutex without fear of it going away. Then we
4069                  * can re-acquire child_mutex.
4070                  */
4071                 mutex_unlock(&event->child_mutex);
4072                 mutex_lock(&ctx->mutex);
4073                 mutex_lock(&event->child_mutex);
4074 
4075                 /*
4076                  * Now that we hold ctx::mutex and child_mutex, revalidate our
4077                  * state; if child is still the first entry, it didn't get freed
4078                  * and we can go on to free it.
4079                  */
4080                 tmp = list_first_entry_or_null(&event->child_list,
4081                                                struct perf_event, child_list);
4082                 if (tmp == child) {
4083                         perf_remove_from_context(child, DETACH_GROUP);
4084                         list_del(&child->child_list);
4085                         free_event(child);
4086                         /*
4087                          * This matches the refcount bump in inherit_event();
4088                          * this can't be the last reference.
4089                          */
4090                         put_event(event);
4091                 }
4092 
4093                 mutex_unlock(&event->child_mutex);
4094                 mutex_unlock(&ctx->mutex);
4095                 put_ctx(ctx);
4096                 goto again;
4097         }
4098         mutex_unlock(&event->child_mutex);
4099 
4100 no_ctx:
4101         put_event(event); /* Must be the 'last' reference */
4102         return 0;
4103 }
4104 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
4105 
4106 /*
4107  * Called when the last reference to the file is gone.
4108  */
4109 static int perf_release(struct inode *inode, struct file *file)
4110 {
4111         perf_event_release_kernel(file->private_data);
4112         return 0;
4113 }
4114 
4115 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
4116 {
4117         struct perf_event *child;
4118         u64 total = 0;
4119 
4120         *enabled = 0;
4121         *running = 0;
4122 
4123         mutex_lock(&event->child_mutex);
4124 
4125         (void)perf_event_read(event, false);
4126         total += perf_event_count(event);
4127 
4128         *enabled += event->total_time_enabled +
4129                         atomic64_read(&event->child_total_time_enabled);
4130         *running += event->total_time_running +
4131                         atomic64_read(&event->child_total_time_running);
4132 
4133         list_for_each_entry(child, &event->child_list, child_list) {
4134                 (void)perf_event_read(child, false);
4135                 total += perf_event_count(child);
4136                 *enabled += child->total_time_enabled;
4137                 *running += child->total_time_running;
4138         }
4139         mutex_unlock(&event->child_mutex);
4140 
4141         return total;
4142 }
4143 EXPORT_SYMBOL_GPL(perf_event_read_value);
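
/*
 * Illustrative sketch (not part of this file): perf_event_read_value() is
 * the read side used by in-kernel counter owners, i.e. events created with
 * perf_event_create_kernel_counter(). The helpers and attribute values
 * below are a hypothetical example of such a user.
 */
static struct perf_event *example_cycles_event;

static int example_start_cycle_counter(int cpu)
{
        struct perf_event_attr attr = {
                .type   = PERF_TYPE_HARDWARE,
                .config = PERF_COUNT_HW_CPU_CYCLES,
                .size   = sizeof(attr),
                .pinned = 1,
        };

        example_cycles_event = perf_event_create_kernel_counter(&attr, cpu,
                                                        NULL, NULL, NULL);
        return PTR_ERR_OR_ZERO(example_cycles_event);
}

static u64 example_read_cycle_counter(void)
{
        u64 enabled, running;

        return perf_event_read_value(example_cycles_event, &enabled, &running);
}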
4144 
4145 static int __perf_read_group_add(struct perf_event *leader,
4146                                         u64 read_format, u64 *values)
4147 {
4148         struct perf_event *sub;
4149         int n = 1; /* skip @nr */
4150         int ret;
4151 
4152         ret = perf_event_read(leader, true);
4153         if (ret)
4154                 return ret;
4155 
4156         /*
4157          * Since we co-schedule groups, {enabled,running} times of siblings
4158          * will be identical to those of the leader, so we only publish one
4159          * set.
4160          */
4161         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
4162                 values[n++] += leader->total_time_enabled +
4163                         atomic64_read(&leader->child_total_time_enabled);
4164         }
4165 
4166         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
4167                 values[n++] += leader->total_time_running +
4168                         atomic64_read(&leader->child_total_time_running);
4169         }
4170 
4171         /*
4172          * Write {count,id} tuples for every sibling.
4173          */
4174         values[n++] += perf_event_count(leader);
4175         if (read_format & PERF_FORMAT_ID)
4176                 values[n++] = primary_event_id(leader);
4177 
4178         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4179                 values[n++] += perf_event_count(sub);
4180                 if (read_format & PERF_FORMAT_ID)
4181                         values[n++] = primary_event_id(sub);
4182         }
4183 
4184         return 0;
4185 }
4186 
4187 static int perf_read_group(struct perf_event *event,
4188                                    u64 read_format, char __user *buf)
4189 {
4190         struct perf_event *leader = event->group_leader, *child;
4191         struct perf_event_context *ctx = leader->ctx;
4192         int ret;
4193         u64 *values;
4194 
4195         lockdep_assert_held(&ctx->mutex);
4196 
4197         values = kzalloc(event->read_size, GFP_KERNEL);
4198         if (!values)
4199                 return -ENOMEM;
4200 
4201         values[0] = 1 + leader->nr_siblings;
4202 
4203         /*
4204          * By locking the child_mutex of the leader we effectively
4205          * lock the child list of all siblings. XXX: explain how.
4206          */
4207         mutex_lock(&leader->child_mutex);
4208 
4209         ret = __perf_read_group_add(leader, read_format, values);
4210         if (ret)
4211                 goto unlock;
4212 
4213         list_for_each_entry(child, &leader->child_list, child_list) {
4214                 ret = __perf_read_group_add(child, read_format, values);
4215                 if (ret)
4216                         goto unlock;
4217         }
4218 
4219         mutex_unlock(&leader->child_mutex);
4220 
4221         ret = event->read_size;
4222         if (copy_to_user(buf, values, event->read_size))
4223                 ret = -EFAULT;
4224         goto out;
4225 
4226 unlock:
4227         mutex_unlock(&leader->child_mutex);
4228 out:
4229         kfree(values);
4230         return ret;
4231 }
4232 
4233 static int perf_read_one(struct perf_event *event,
4234                                  u64 read_format, char __user *buf)
4235 {
4236         u64 enabled, running;
4237         u64 values[4];
4238         int n = 0;
4239 
4240         values[n++] = perf_event_read_value(event, &enabled, &running);
4241         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
4242                 values[n++] = enabled;
4243         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
4244                 values[n++] = running;
4245         if (read_format & PERF_FORMAT_ID)
4246                 values[n++] = primary_event_id(event);
4247 
4248         if (copy_to_user(buf, values, n * sizeof(u64)))
4249                 return -EFAULT;
4250 
4251         return n * sizeof(u64);
4252 }
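
/*
 * Illustrative userspace sketch (not part of this file): the buffer filled
 * in by perf_read_group() above is what read(2) on a group leader returns
 * when PERF_FORMAT_GROUP is set. Below is a hypothetical parser for
 * PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED |
 * PERF_FORMAT_TOTAL_TIME_RUNNING:
 */
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

struct group_read_buf {
        uint64_t nr;                    /* number of {value, id} entries */
        uint64_t time_enabled;
        uint64_t time_running;
        struct { uint64_t value, id; } cnt[16];  /* enough for this sketch */
};

static int dump_group(int group_fd)
{
        struct group_read_buf buf;
        uint64_t i;

        if (read(group_fd, &buf, sizeof(buf)) < 0)
                return -1;

        for (i = 0; i < buf.nr; i++)
                printf("id %llu: %llu (enabled %llu ns, running %llu ns)\n",
                       (unsigned long long)buf.cnt[i].id,
                       (unsigned long long)buf.cnt[i].value,
                       (unsigned long long)buf.time_enabled,
                       (unsigned long long)buf.time_running);
        return 0;
}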
4253 
4254 static bool is_event_hup(struct perf_event *event)
4255 {
4256         bool no_children;
4257 
4258         if (event->state > PERF_EVENT_STATE_EXIT)
4259                 return false;
4260 
4261         mutex_lock(&event->child_mutex);
4262         no_children = list_empty(&event->child_list);
4263         mutex_unlock(&event->child_mutex);
4264         return no_children;
4265 }
4266 
4267 /*
4268  * Read the performance event - simple non-blocking version for now
4269  */
4270 static ssize_t
4271 __perf_read(struct perf_event *event, char __user *buf, size_t count)
4272 {
4273         u64 read_format = event->attr.read_format;
4274         int ret;
4275 
4276         /*
4277          * Return end-of-file for a read on an event that is in
4278          * error state (i.e. because it was pinned but it couldn't be
4279          * scheduled on to the CPU at some point).
4280          */
4281         if (event->state == PERF_EVENT_STATE_ERROR)
4282                 return 0;
4283 
4284         if (count < event->read_size)
4285                 return -ENOSPC;
4286 
4287         WARN_ON_ONCE(event->ctx->parent_ctx);
4288         if (read_format & PERF_FORMAT_GROUP)
4289                 ret = perf_read_group(event, read_format, buf);
4290         else
4291                 ret = perf_read_one(event, read_format, buf);
4292 
4293         return ret;
4294 }
4295 
4296 static ssize_t
4297 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
4298 {
4299         struct perf_event *event = file->private_data;
4300         struct perf_event_context *ctx;
4301         int ret;
4302 
4303         ctx = perf_event_ctx_lock(event);
4304         ret = __perf_read(event, buf, count);
4305         perf_event_ctx_unlock(event, ctx);
4306 
4307         return ret;
4308 }
4309 
4310 static unsigned int perf_poll(struct file *file, poll_table *wait)
4311 {
4312         struct perf_event *event = file->private_data;
4313         struct ring_buffer *rb;
4314         unsigned int events = POLLHUP;
4315 
4316         poll_wait(file, &event->waitq, wait);
4317 
4318         if (is_event_hup(event))
4319                 return events;
4320 
4321         /*
4322          * Pin the event->rb by taking event->mmap_mutex; otherwise
4323          * perf_event_set_output() can swizzle our rb and make us miss wakeups.
4324          */
4325         mutex_lock(&event->mmap_mutex);
4326         rb = event->rb;
4327         if (rb)
4328                 events = atomic_xchg(&rb->poll, 0);
4329         mutex_unlock(&event->mmap_mutex);
4330         return events;
4331 }
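
/*
 * Illustrative userspace sketch (not part of this file): perf_poll() above
 * reports readiness (POLLIN) when the ring buffer has woken up a waiter,
 * and POLLHUP once the event has exited and has no children left. The fd
 * below is assumed to be a sampling event that has already been mmap()ed.
 */
#include <poll.h>

static int wait_for_samples(int perf_fd, int timeout_ms)
{
        struct pollfd pfd = { .fd = perf_fd, .events = POLLIN };
        int ret = poll(&pfd, 1, timeout_ms);

        if (ret > 0 && (pfd.revents & POLLHUP))
                return 0;       /* event is dead, stop reading         */
        return ret;             /* >0: data ready, 0: timeout, <0: error */
}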
4332 
4333 static void _perf_event_reset(struct perf_event *event)
4334 {
4335         (void)perf_event_read(event, false);
4336         local64_set(&event->count, 0);
4337         perf_event_update_userpage(event);
4338 }
4339 
4340 /*
4341  * Holding the top-level event's child_mutex means that any
4342  * descendant process that has inherited this event will block
4343  * in perf_event_exit_event() if it goes to exit, thus satisfying the
4344  * task existence requirements of perf_event_enable/disable.
4345  */
4346 static void perf_event_for_each_child(struct perf_event *event,
4347                                         void (*func)(struct perf_event *))
4348 {
4349         struct perf_event *child;
4350 
4351         WARN_ON_ONCE(event->ctx->parent_ctx);
4352 
4353         mutex_lock(&event->child_mutex);
4354         func(event);
4355         list_for_each_entry(child, &event->child_list, child_list)
4356                 func(child);
4357         mutex_unlock(&event->child_mutex);
4358 }
4359 
4360 static void perf_event_for_each(struct perf_event *event,
4361                                   void (*func)(struct perf_event *))
4362 {
4363         struct perf_event_context *ctx = event->ctx;
4364         struct perf_event *sibling;
4365 
4366         lockdep_assert_held(&ctx->mutex);
4367 
4368         event = event->group_leader;
4369 
4370         perf_event_for_each_child(event, func);
4371         list_for_each_entry(sibling, &event->sibling_list, group_entry)
4372                 perf_event_for_each_child(sibling, func);
4373 }
4374 
4375 static void __perf_event_period(struct perf_event *event,
4376                                 struct perf_cpu_context *cpuctx,
4377                                 struct perf_event_context *ctx,
4378                                 void *info)
4379 {
4380         u64 value = *((u64 *)info);
4381         bool active;
4382 
4383         if (event->attr.freq) {
4384                 event->attr.sample_freq = value;
4385         } else {
4386                 event->attr.sample_period = value;
4387                 event->hw.sample_period = value;
4388         }
4389 
4390         active = (event->state == PERF_EVENT_STATE_ACTIVE);
4391         if (active) {
4392                 perf_pmu_disable(ctx->pmu);
4393                 /*
4394                  * We could be throttled; unthrottle now to avoid the tick
4395                  * trying to unthrottle while we already re-started the event.
4396                  */
4397                 if (event->hw.interrupts == MAX_INTERRUPTS) {
4398                         event->hw.interrupts = 0;
4399                         perf_log_throttle(event, 1);
4400                 }
4401                 event->pmu->stop(event, PERF_EF_UPDATE);
4402         }
4403 
4404         local64_set(&event->hw.period_left, 0);
4405 
4406         if (active) {
4407                 event->pmu->start(event, PERF_EF_RELOAD);
4408                 perf_pmu_enable(ctx->pmu);
4409         }
4410 }
4411 
4412 static int perf_event_period(struct perf_event *event, u64 __user *arg)
4413 {
4414         u64 value;
4415 
4416         if (!is_sampling_event(event))
4417                 return -EINVAL;
4418 
4419         if (copy_from_user(&value, arg, sizeof(value)))
4420                 return -EFAULT;
4421 
4422         if (!value)
4423                 return -EINVAL;
4424 
4425         if (event->attr.freq && value > sysctl_perf_event_sample_rate)
4426                 return -EINVAL;
4427 
4428         event_function_call(event, __perf_event_period, &value);
4429 
4430         return 0;
4431 }
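
/*
 * Illustrative userspace sketch (not part of this file): the handler above
 * is reached through the PERF_EVENT_IOC_PERIOD ioctl, which takes a pointer
 * to the new u64 period (or frequency, if attr.freq was set). The fd below
 * is assumed to come from an earlier perf_event_open() of a sampling event.
 */
#include <linux/perf_event.h>
#include <stdint.h>
#include <sys/ioctl.h>

static int set_sample_period(int perf_fd, uint64_t new_period)
{
        /* Rejected with -EINVAL for non-sampling events or a zero period. */
        return ioctl(perf_fd, PERF_EVENT_IOC_PERIOD, &new_period);
}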
4432 
4433 static const struct file_operations perf_fops;
4434 
4435 static inline int perf_fget_light(int fd, struct fd *p)
4436 {
4437         struct fd f = fdget(fd);
4438         if (!f.file)
4439                 return -EBADF;
4440 
4441         if (f.file->f_op != &perf_fops) {
4442                 fdput(f);
4443                 return -EBADF;
4444         }
4445         *p = f;
4446         return 0;
4447 }
4448 
4449 static int perf_event_set_output(struct perf_event *event,
4450                                  struct perf_event *output_event);
4451 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
4452 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
4453 
4454 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
4455 {
4456         void (*func)(struct perf_event *);
4457         u32 flags = arg;
4458 
4459         switch (cmd) {
4460         case PERF_EVENT_IOC_ENABLE:
4461                 func = _perf_event_enable;
4462                 break;
4463         case PERF_EVENT_IOC_DISABLE:
4464                 func = _perf_event_disable;
4465                 break;
4466         case PERF_EVENT_IOC_RESET:
4467                 func = _perf_event_reset;
4468                 break;
4469 
4470         case PERF_EVENT_IOC_REFRESH:
4471                 return _perf_event_refresh(event, arg);
4472 
4473         case PERF_EVENT_IOC_PERIOD:
4474                 return perf_event_period(event, (u64 __user *)arg);
4475 
4476         case PERF_EVENT_IOC_ID:
4477         {
4478                 u64 id = primary_event_id(event);
4479 
4480                 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
4481                         return -EFAULT;
4482                 return 0;
4483         }
4484 
4485         case PERF_EVENT_IOC_SET_OUTPUT:
4486         {
4487                 int ret;
4488                 if (arg != -1) {
4489                         struct perf_event *output_event;
4490                         struct fd output;
4491                         ret = perf_fget_light(arg, &output);
4492                         if (ret)
4493                                 return ret;
4494                         output_event = output.file->private_data;
4495                         ret = perf_event_set_output(event, output_event);
4496                         fdput(output);
4497                 } else {
4498                         ret = perf_event_set_output(event, NULL);
4499                 }
4500                 return ret;
4501         }
4502 
4503         case PERF_EVENT_IOC_SET_FILTER:
4504                 return perf_event_set_filter(event, (void __user *)arg);
4505 
4506         case PERF_EVENT_IOC_SET_BPF:
4507                 return perf_event_set_bpf_prog(event, arg);
4508 
4509         case PERF_EVENT_IOC_PAUSE_OUTPUT: {
4510                 struct ring_buffer *rb;
4511 
4512                 rcu_read_lock();
4513                 rb = rcu_dereference(event->rb);
4514                 if (!rb || !rb->nr_pages) {
4515                         rcu_read_unlock();
4516                         return -EINVAL;
4517                 }
4518                 rb_toggle_paused(rb, !!arg);
4519                 rcu_read_unlock();
4520                 return 0;
4521         }
4522         default:
4523                 return -ENOTTY;
4524         }
4525 
4526         if (flags & PERF_IOC_FLAG_GROUP)
4527                 perf_event_for_each(event, func);
4528         else
4529                 perf_event_for_each_child(event, func);
4530 
4531         return 0;
4532 }
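
/*
 * Illustrative userspace sketch (not part of this file): two of the ioctls
 * handled above. PERF_EVENT_IOC_SET_OUTPUT takes another perf fd (or -1 to
 * detach) and redirects this event's output into that event's ring buffer;
 * PERF_IOC_FLAG_GROUP applies ENABLE/DISABLE/RESET to the whole group. The
 * fds are assumed to come from earlier perf_event_open() calls.
 */
#include <linux/perf_event.h>
#include <sys/ioctl.h>

static int route_into_buffer(int event_fd, int buffer_owner_fd)
{
        return ioctl(event_fd, PERF_EVENT_IOC_SET_OUTPUT, buffer_owner_fd);
}

static int enable_whole_group(int leader_fd)
{
        return ioctl(leader_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
}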
4533 
4534 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
4535 {
4536         struct perf_event *event = file->private_data;
4537         struct perf_event_context *ctx;
4538         long ret;
4539 
4540         ctx = perf_event_ctx_lock(event);
4541         ret = _perf_ioctl(event, cmd, arg);
4542         perf_event_ctx_unlock(event, ctx);
4543 
4544         return ret;
4545 }
4546 
4547 #ifdef CONFIG_COMPAT
4548 static long perf_compat_ioctl(struct file *file, unsigned int cmd,
4549                                 unsigned long arg)
4550 {
4551         switch (_IOC_NR(cmd)) {
4552         case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
4553         case _IOC_NR(PERF_EVENT_IOC_ID):
4554                 /* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case) */
4555                 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
4556                         cmd &= ~IOCSIZE_MASK;
4557                         cmd |= sizeof(void *) << IOCSIZE_SHIFT;
4558                 }
4559                 break;
4560         }
4561         return perf_ioctl(file, cmd, arg);
4562 }
4563 #else
4564 # define perf_compat_ioctl NULL
4565 #endif
4566 
4567 int perf_event_task_enable(void)
4568 {
4569         struct perf_event_context *ctx;
4570         struct perf_event *event;
4571 
4572         mutex_lock(&current->perf_event_mutex);
4573         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4574                 ctx = perf_event_ctx_lock(event);
4575                 perf_event_for_each_child(event, _perf_event_enable);
4576                 perf_event_ctx_unlock(event, ctx);
4577         }
4578         mutex_unlock(&current->perf_event_mutex);
4579 
4580         return 0;
4581 }
4582 
4583 int perf_event_task_disable(void)
4584 {
4585         struct perf_event_context *ctx;
4586         struct perf_event *event;
4587 
4588         mutex_lock(&current->perf_event_mutex);
4589         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4590                 ctx = perf_event_ctx_lock(event);
4591                 perf_event_for_each_child(event, _perf_event_disable);
4592                 perf_event_ctx_unlock(event, ctx);
4593         }
4594         mutex_unlock(&current->perf_event_mutex);
4595 
4596         return 0;
4597 }
4598 
4599 static int perf_event_index(struct perf_event *event)
4600 {
4601         if (event->hw.state & PERF_HES_STOPPED)
4602                 return 0;
4603 
4604         if (event->state != PERF_EVENT_STATE_ACTIVE)
4605                 return 0;
4606 
4607         return event->pmu->event_idx(event);
4608 }
4609 
4610 static void calc_timer_values(struct perf_event *event,
4611                                 u64 *now,
4612                                 u64 *enabled,
4613                                 u64 *running)
4614 {
4615         u64 ctx_time;
4616 
4617         *now = perf_clock();
4618         ctx_time = event->shadow_ctx_time + *now;
4619         *enabled = ctx_time - event->tstamp_enabled;
4620         *running = ctx_time - event->tstamp_running;
4621 }
4622 
4623 static void perf_event_init_userpage(struct perf_event *event)
4624 {
4625         struct perf_event_mmap_page *userpg;
4626         struct ring_buffer *rb;
4627 
4628         rcu_read_lock();
4629         rb = rcu_dereference(event->rb);
4630         if (!rb)
4631                 goto unlock;
4632 
4633         userpg = rb->user_page;
4634 
4635         /* Allow new userspace to detect that bit 0 is deprecated */
4636         userpg->cap_bit0_is_deprecated = 1;
4637         userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
4638         userpg->data_offset = PAGE_SIZE;
4639         userpg->data_size = perf_data_size(rb);
4640 
4641 unlock:
4642         rcu_read_unlock();
4643 }
4644 
4645 void __weak arch_perf_update_userpage(
4646         struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
4647 {
4648 }
4649 
4650 /*
4651  * Callers need to ensure there can be no nesting of this function; otherwise
4652  * the seqlock logic goes bad. We cannot serialize this because the arch
4653  * code calls this from NMI context.
4654  */
4655 void perf_event_update_userpage(struct perf_event *event)
4656 {
4657         struct perf_event_mmap_page *userpg;
4658         struct ring_buffer *rb;
4659         u64 enabled, running, now;
4660 
4661         rcu_read_lock();
4662         rb = rcu_dereference(event->rb);
4663         if (!rb)
4664                 goto unlock;
4665 
4666         /*
4667          * compute total_time_enabled, total_time_running
4668          * based on snapshot values taken when the event
4669          * was last scheduled in.
4670          *
4671          * we cannot simply call update_context_time()
4672          * because of locking issues, as we can be called in
4673          * NMI context.
4674          */
4675         calc_timer_values(event, &now, &enabled, &running);
4676 
4677         userpg = rb->user_page;
4678         /*
4679          * Disable preemption so as not to let the corresponding user-space
4680          * spin too long if we get preempted.
4681          */
4682         preempt_disable();
4683         ++userpg->lock;
4684         barrier();
4685         userpg->index = perf_event_index(event);
4686         userpg->offset = perf_event_count(event);
4687         if (userpg->index)
4688                 userpg->offset -= local64_read(&event->hw.prev_count);
4689 
4690         userpg->time_enabled = enabled +
4691                         atomic64_read(&event->child_total_time_enabled);
4692 
4693         userpg->time_running = running +
4694                         atomic64_read(&event->child_total_time_running);
4695 
4696         arch_perf_update_userpage(event, userpg, now);
4697 
4698         barrier();
4699         ++userpg->lock;
4700         preempt_enable();
4701 unlock:
4702         rcu_read_unlock();
4703 }
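
/*
 * Illustrative userspace sketch (not part of this file): the ->lock field
 * bumped before and after the update above acts as a sequence counter, so a
 * reader of the mmap()ed first page must retry whenever it observes a
 * change. The mapping below is assumed to come from mmap() on the event fd.
 */
#include <linux/perf_event.h>
#include <stdint.h>

static int64_t read_userpage_offset(volatile struct perf_event_mmap_page *pc)
{
        uint32_t seq;
        int64_t offset;

        do {
                seq = pc->lock;
                __sync_synchronize();   /* pairs with the barrier()s above */

                /*
                 * When pc->index is non-zero and cap_user_rdpmc is set, a
                 * reader would add the live hardware counter (rdpmc on x86)
                 * to this offset; this sketch only takes the snapshot.
                 */
                offset = pc->offset;

                __sync_synchronize();
        } while (pc->lock != seq);

        return offset;
}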
4704 
4705 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
4706 {
4707         struct perf_event *event = vma->vm_file->private_data;
4708         struct ring_buffer *rb;
4709         int ret = VM_FAULT_SIGBUS;
4710 
4711         if (vmf->flags & FAULT_FLAG_MKWRITE) {
4712                 if (vmf->pgoff == 0)
4713                         ret = 0;
4714                 return ret;
4715         }
4716 
4717         rcu_read_lock();
4718         rb = rcu_dereference(event->rb);
4719         if (!rb)
4720                 goto unlock;
4721 
4722         if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
4723                 goto unlock;
4724 
4725         vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
4726         if (!vmf->page)
4727                 goto unlock;
4728 
4729         get_page(vmf->page);
4730         vmf->page->mapping = vma->vm_file->f_mapping;
4731         vmf->page->index   = vmf->pgoff;
4732 
4733         ret = 0;
4734 unlock:
4735         rcu_read_unlock();
4736 
4737         return ret;
4738 }
4739 
4740 static void ring_buffer_attach(struct perf_event *event,
4741                                struct ring_buffer *rb)
4742 {
4743         struct ring_buffer *old_rb = NULL;
4744         unsigned long flags;
4745 
4746         if (event->rb) {
4747                 /*
4748                  * This should be impossible: we set rcu_pending when removing
4749                  * event->rb_entry and wait for/clear it when adding event->rb_entry.
4750                  */
4751                 WARN_ON_ONCE(event->rcu_pending);
4752 
4753                 old_rb = event->rb;
4754                 spin_lock_irqsave(&old_rb->event_lock, flags);
4755                 list_del_rcu(&event->rb_entry);
4756                 spin_unlock_irqrestore(&old_rb->event_lock, flags);
4757 
4758                 event->rcu_batches = get_state_synchronize_rcu();
4759                 event->rcu_pending = 1;
4760         }
4761 
4762         if (rb) {
4763                 if (event->rcu_pending) {
4764                         cond_synchronize_rcu(event->rcu_batches);
4765                         event->rcu_pending = 0;
4766                 }
4767 
4768                 spin_lock_irqsave(&rb->event_lock, flags);
4769                 list_add_rcu(&event->rb_entry, &rb->event_list);
4770                 spin_unlock_irqrestore(&rb->event_lock, flags);
4771         }
4772 
4773         rcu_assign_pointer(event->rb, rb);
4774 
4775         if (old_rb) {
4776                 ring_buffer_put(old_rb);
4777                 /*
4778                  * Since we detached the old rb before attaching the new one,
4779                  * we could have missed a wakeup in between.
4780                  * Provide it now.
4781                  */
4782                 wake_up_all(&event->waitq);
4783         }
4784 }
4785 
4786 static void ring_buffer_wakeup(struct perf_event *event)
4787 {
4788         struct ring_buffer *rb;
4789 
4790         rcu_read_lock();
4791         rb = rcu_dereference(event->rb);
4792         if (rb) {
4793                 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
4794                         wake_up_all(&event->waitq);
4795         }
4796         rcu_read_unlock();
4797 }
4798 
4799 struct ring_buffer *ring_buffer_get(struct perf_event *event)
4800 {
4801         struct ring_buffer *rb;
4802 
4803         rcu_read_lock();
4804         rb = rcu_dereference(event->rb);
4805         if (rb) {
4806                 if (!atomic_inc_not_zero(&rb->refcount))
4807                         rb = NULL;
4808         }
4809         rcu_read_unlock();
4810 
4811         return rb;
4812 }
4813 
4814 void ring_buffer_put(struct ring_buffer *rb)
4815 {
4816         if (!atomic_dec_and_test(&rb->refcount))
4817                 return;
4818 
4819         WARN_ON_ONCE(!list_empty(&rb->event_list));
4820 
4821         call_rcu(&rb->rcu_head, rb_free_rcu);
4822 }
4823 
4824 static void perf_mmap_open(struct vm_area_struct *vma)
4825 {
4826         struct perf_event *event = vma->vm_file->private_data;
4827 
4828         atomic_inc(&event->mmap_count);
4829         atomic_inc(&event->rb->mmap_count);
4830 
4831         if (vma->vm_pgoff)
4832                 atomic_inc(&event->rb->aux_mmap_count);
4833 
4834         if (event->pmu->event_mapped)
4835                 event->pmu->event_mapped(event);
4836 }
4837 
4838 static void perf_pmu_output_stop(struct perf_event *event);
4839 
4840 /*
4841  * A buffer can be mmap()ed multiple times; either directly through the same
4842  * event, or through other events by use of perf_event_set_output().
4843  *
4844  * In order to undo the VM accounting done by perf_mmap() we need to destroy
4845  * the buffer here, where we still have a VM context. This means we need
4846  * to detach all events redirecting to us.
4847  */
4848 static void perf_mmap_close(struct vm_area_struct *vma)
4849 {
4850         struct perf_event *event = vma->vm_file->private_data;
4851 
4852         struct ring_buffer *rb = ring_buffer_get(event);
4853         struct user_struct *mmap_user = rb->mmap_user;
4854         int mmap_locked = rb->mmap_locked;
4855         unsigned long size = perf_data_size(rb);
4856 
4857         if (event->pmu->event_unmapped)
4858                 event->pmu->event_unmapped(event);
4859 
4860         /*
4861          * rb->aux_mmap_count will always drop before rb->mmap_count and
4862          * event->mmap_count, so it is ok to use event->mmap_mutex to
4863          * serialize with perf_mmap here.
4864          */
4865         if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
4866             atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
4867                 /*
4868                  * Stop all AUX events that are writing to this buffer,
4869                  * so that we can free its AUX pages and corresponding PMU
4870                  * data. Note that once rb::aux_mmap_count has dropped to zero,
4871                  * they won't start any more (see perf_aux_output_begin()).
4872                  */
4873                 perf_pmu_output_stop(event);
4874 
4875                 /* now it's safe to free the pages */
4876                 atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
4877                 vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
4878 
4879                 /* this has to be the last one */
4880                 rb_free_aux(rb);
4881                 WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
4882 
4883                 mutex_unlock(&event->mmap_mutex);
4884         }
4885 
4886         atomic_dec(&rb->mmap_count);
4887 
4888         if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
4889                 goto out_put;
4890 
4891         ring_buffer_attach(event, NULL);
4892         mutex_unlock(&event->mmap_mutex);
4893 
4894         /* If there's still other mmap()s of this buffer, we're done. */
4895         if (atomic_read(&rb->mmap_count))
4896                 goto out_put;
4897 
4898         /*
4899          * No other mmap()s; detach from all other events that might redirect
4900          * into the now unreachable buffer. Somewhat complicated by the
4901          * fact that rb::event_lock otherwise nests inside mmap_mutex.
4902          */
4903 again:
4904         rcu_read_lock();
4905         list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
4906                 if (!atomic_long_inc_not_zero(&event->refcount)) {
4907                         /*
4908                          * This event is en-route to free_event() which will
4909                          * detach it and remove it from the list.
4910                          */
4911                         continue;
4912                 }
4913                 rcu_read_unlock();
4914 
4915                 mutex_lock(&event->mmap_mutex);
4916                 /*
4917                  * Check we didn't race with perf_event_set_output() which can
4918                  * swizzle the rb from under us while we were waiting to
4919                  * acquire mmap_mutex.
4920                  *
4921                  * If we find a different rb, ignore this event; a later
4922                  * iteration will no longer find it on the list. We still have
4923                  * to restart the iteration to make sure we're not now
4924                  * iterating the wrong list.
4925                  */
4926                 if (event->rb == rb)
4927                         ring_buffer_attach(event, NULL);
4928 
4929                 mutex_unlock(&event->mmap_mutex);
4930                 put_event(event);
4931 
4932                 /*
4933                  * Restart the iteration; either we're on the wrong list or
4934                  * we have destroyed its integrity by doing a deletion.
4935                  */
4936                 goto again;
4937         }
4938         rcu_read_unlock();
4939 
4940         /*
4941          * There could still be a few 0-ref events on the list; they'll
4942          * get cleaned up by free_event() -- they'll also still have their
4943          * ref on the rb and will free it whenever they are done with it.
4944          *
4945          * Aside from that, this buffer is 'fully' detached and unmapped,
4946          * undo the VM accounting.
4947          */
4948 
4949         atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
4950         vma->vm_mm->pinned_vm -= mmap_locked;
4951         free_uid(mmap_user);
4952 
4953 out_put:
4954         ring_buffer_put(rb); /* could be last */
4955 }
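
/*
 * The "other events" detached in the loop above had their output redirected
 * into this buffer via perf_event_set_output(), which user space reaches
 * through an ioctl.  Hypothetical sketch, routing fd_b's records into the
 * ring buffer already mmap()ed on fd_a:
 *
 *	ioctl(fd_b, PERF_EVENT_IOC_SET_OUTPUT, fd_a);
 */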
4956 
4957 static const struct vm_operations_struct perf_mmap_vmops = {
4958         .open           = perf_mmap_open,
4959         .close          = perf_mmap_close, /* non-mergeable */
4960         .fault          = perf_mmap_fault,
4961         .page_mkwrite   = perf_mmap_fault,
4962 };
4963 
4964 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
4965 {
4966         struct perf_event *event = file->private_data;
4967         unsigned long user_locked, user_lock_limit;
4968         struct user_struct *user = current_user();
4969         unsigned long locked, lock_limit;
4970         struct ring_buffer *rb = NULL;
4971         unsigned long vma_size;
4972         unsigned long nr_pages;
4973         long user_extra = 0, extra = 0;
4974         int ret = 0, flags = 0;
4975 
4976         /*
4977          * Don't allow mmap() of inherited per-task counters. This would
4978          * create a performance issue due to all children writing to the
4979          * same rb.
4980          */
4981         if (event->cpu == -1 && event->attr.inherit)
4982                 return -EINVAL;
4983 
4984         if (!(vma->vm_flags & VM_SHARED))
4985                 return -EINVAL;
4986 
4987         vma_size = vma->vm_end - vma->vm_start;
4988 
4989         if (vma->vm_pgoff == 0) {
4990                 nr_pages = (vma_size / PAGE_SIZE) - 1;
4991         } else {
4992                 /*
4993                  * AUX area mapping: if rb->aux_nr_pages != 0, it's already
4994                  * mapped; all subsequent mappings must have the same size
4995                  * and offset, and must lie above the normal perf buffer.
4996                  */
4997                 u64 aux_offset, aux_size;
4998 
4999                 if (!event->rb)
5000                         return -EINVAL;
5001 
5002                 nr_pages = vma_size / PAGE_SIZE;
5003 
5004                 mutex_lock(&event->mmap_mutex);
5005                 ret = -EINVAL;
5006 
5007                 rb = event->rb;
5008                 if (!rb)
5009                         goto aux_unlock;
5010 
5011                 aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
5012                 aux_size = ACCESS_ONCE(rb->user_page->aux_size);
5013 
5014                 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
5015                         goto aux_unlock;
5016 
5017                 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
5018                         goto aux_unlock;
5019 
5020                 /* already mapped with a different offset */
5021                 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
5022                         goto aux_unlock;
5023 
5024                 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
5025                         goto aux_unlock;
5026 
5027                 /* already mapped with a different size */
5028                 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
5029                         goto aux_unlock;
5030 
5031                 if (!is_power_of_2(nr_pages))
5032                         goto aux_unlock;
5033 
5034                 if (!atomic_inc_not_zero(&rb->mmap_count))
5035                         goto aux_unlock;
5036 
5037                 if (rb_has_aux(rb)) {
5038                         atomic_inc(&rb->aux_mmap_count);
5039                         ret = 0;
5040                         goto unlock;
5041                 }
5042 
5043                 atomic_set(&rb->aux_mmap_count, 1);
5044                 user_extra = nr_pages;
5045 
5046                 goto accounting;
5047         }
5048 
5049         /*
5050          * If we have rb pages, ensure their number is a power of two so we
5051          * can use bitmasks instead of modulo.
5052          */
5053         if (nr_pages != 0 && !is_power_of_2(nr_pages))
5054                 return -EINVAL;
5055 
5056         if (vma_size != PAGE_SIZE * (1 + nr_pages))
5057                 return -EINVAL;
5058 
5059         WARN_ON_ONCE(event->ctx->parent_ctx);
5060 again:
5061         mutex_lock(&event->mmap_mutex);
5062         if (event->rb) {
5063                 if (event->rb->nr_pages != nr_pages) {
5064                         ret = -EINVAL;
5065                         goto unlock;
5066                 }
5067 
5068                 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
5069                         /*
5070                          * Raced against perf_mmap_close() through
5071                          * perf_event_set_output(). Try again, hope for better
5072                          * luck.
5073                          */
5074                         mutex_unlock(&event->mmap_mutex);
5075                         goto again;
5076                 }
5077 
5078                 goto unlock;
5079         }
5080 
5081         user_extra = nr_pages + 1;
5082 
5083 accounting:
5084         user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
5085 
5086         /*
5087          * Increase the limit linearly with more CPUs:
5088          */
5089         user_lock_limit *= num_online_cpus();
5090 
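        /*
         * Worked example, assuming the common default of
         * perf_event_mlock_kb = 516 and 4 KiB pages: 516 KiB >> (12 - 10)
         * gives 129 pages, so on a 4-CPU machine user_lock_limit ends up at
         * 516 pages per user.  Pinned pages beyond that ('extra' below) are
         * charged against the RLIMIT_MEMLOCK check that follows.
         */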
5091         user_locked = atomic_long_read(&user->locked_vm) + user_extra;
5092 
5093         if (user_locked > user_lock_limit)
5094                 extra = user_locked - user_lock_limit;
5095 
5096         lock_limit = rlimit(RLIMIT_MEMLOCK);
5097         lock_limit >>= PAGE_SHIFT;
5098         locked = vma->vm_mm->pinned_vm + extra;
5099 
5100         if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
5101                 !capable(CAP_IPC_LOCK)) {
5102                 ret = -EPERM;
5103                 goto unlock;
5104         }
5105 
5106         WARN_ON(!rb && event->rb);
5107 
5108         if (vma->vm_flags & VM_WRITE)
5109                 flags |= RING_BUFFER_WRITABLE;
5110 
5111         if (!rb) {
5112                 rb = rb_alloc(nr_pages,
5113                               event->attr.watermark ? event->attr.wakeup_watermark : 0,
5114                               event->cpu, flags);
5115 
5116                 if (!rb) {
5117                         ret = -ENOMEM;
5118                         goto unlock;
5119                 }
5120 
5121                 atomic_set(&rb->mmap_count, 1);
5122                 rb->mmap_user = get_current_user();
5123                 rb->mmap_locked = extra;
5124 
5125                 ring_buffer_attach(event, rb);
5126 
5127                 perf_event_init_userpage(event);
5128                 perf_event_update_userpage(event);
5129         } else {
5130                 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
5131                                    event->attr.aux_watermark, flags);
5132                 if (!ret)
5133                         rb->aux_mmap_locked = extra;
5134         }
5135 
5136 unlock:
5137         if (!ret) {
5138                 atomic_long_add(user_extra, &user->locked_vm);
5139                 vma->vm_mm->pinned_vm += extra;
5140 
5141                 atomic_inc(&event->mmap_count);
5142         } else if (rb) {
5143                 atomic_dec(&rb->mmap_count);
5144         }
5145 aux_unlock:
5146         mutex_unlock(&event->mmap_mutex);
5147 
5148         /*
5149          * Since pinned accounting is per-vm, we cannot allow fork() to copy
5150          * our vma.
5151          */
5152         vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
5153         vma->vm_ops = &perf_mmap_vmops;
5154 
5155         if (event->pmu->event_mapped)
5156                 event->pmu->event_mapped(event);
5157 
5158         return ret;
5159 }
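
/*
 * The AUX branch above expects user space to map the regular buffer first,
 * publish the AUX geometry in the user page, and then mmap a second,
 * power-of-two sized region at exactly that file offset.  Rough sketch
 * (hypothetical sizes; 'pc' is the first mapping, 'n' its data page count,
 * 'm' a power of two, 'page' the page size):
 *
 *	pc->aux_offset = (1 + n) * page;	/* just past the data area */
 *	pc->aux_size   = m * page;
 *	void *aux = mmap(NULL, pc->aux_size, PROT_READ | PROT_WRITE,
 *			 MAP_SHARED, fd, pc->aux_offset);
 */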
5160 
5161 static int perf_fasync(int fd, struct file *filp, int on)
5162 {
5163         struct inode *inode = file_inode(filp);
5164         struct perf_event *event = filp->private_data;
5165         int retval;
5166 
5167         inode_lock(inode);
5168         retval = fasync_helper(fd, filp, on, &event->fasync);
5169         inode_unlock(inode);
5170 
5171         if (retval < 0)
5172                 return retval;
5173 
5174         return 0;
5175 }
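
/*
 * This is the standard fasync hook, so SIGIO delivery on counter wakeups is
 * requested with the usual fcntl() dance.  Sketch (error handling omitted):
 *
 *	fcntl(fd, F_SETOWN, getpid());
 *	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC);
 */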
5176 
5177 static const struct file_operations perf_fops = {
5178         .llseek                 = no_llseek,
5179         .release                = perf_release,
5180         .read                   = perf_read,
5181         .poll                   = perf_poll,
5182         .unlocked_ioctl         = perf_ioctl,
5183         .compat_ioctl           = perf_compat_ioctl,
5184         .mmap                   = perf_mmap,
5185         .fasync                 = perf_fasync,
5186 };
5187 
5188 /*
5189  * Perf event wakeup
5190  *
5191  * If there's data, ensure we set the poll() state and publish everything
5192  * to user-space before waking everybody up.
5193  */
5194 
5195 static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
5196 {
5197         /* only the parent has fasync state */
5198         if (event->parent)
5199                 event = event->parent;
5200         return &event->fasync;
5201 }
5202 
5203 void perf_event_wakeup(struct perf_event *event)
5204 {
5205         ring_buffer_wakeup(event);
5206 
5207         if (event->pending_kill) {
5208                 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
5209                 event->pending_kill = 0;
5210         }
5211 }
5212 
5213 static void perf_pending_event(struct irq_work *entry)
5214 {
5215         struct perf_event *event = container_of(entry,
5216                         struct perf_event, pending);
5217         int rctx;
5218 
5219         rctx = perf_swevent_get_recursion_context();
5220         /*
5221          * If we 'fail' here, that's OK: it means recursion is already disabled
5222          * and we won't recurse 'further'.
5223          */
5224 
5225         if (event->pending_disable) {
5226                 event->pending_disable = 0;
5227                 perf_event_disable_local(event);
5228         }
5229 
5230         if (event->pending_wakeup) {
5231                 event->pending_wakeup = 0;
5232                 perf_event_wakeup(event);
5233         }
5234 
5235         if (rctx >= 0)
5236                 perf_swevent_put_recursion_context(rctx);
5237 }
5238 
5239 /*
5240  * We assume that only KVM supports these callbacks.
5241  * Later on, we might change this to a list if another
5242  * virtualization implementation also supports them.
5243  */
5244 struct perf_guest_info_callbacks *perf_guest_cbs;
5245 
5246 int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5247 {
5248         perf_guest_cbs = cbs;
5249         return 0;
5250 }
5251 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
5252 
5253 int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5254 {
5255         perf_guest_cbs = NULL;
5256         return 0;
5257 }
5258 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
5259 
5260 static void
5261 perf_output_sample_regs(struct perf_output_handle *handle,
5262                         struct pt_regs *regs, u64 mask)
5263 {
5264         int bit;
5265 
5266         for_each_set_bit(bit, (const unsigned long *) &mask,
5267                          sizeof(mask) * BITS_PER_BYTE) {
5268                 u64 val;
5269 
5270                 val = perf_reg_value(regs, bit);
5271                 perf_output_put(handle, val);
5272         }
5273 }
5274 
5275 static void perf_sample_regs_user(struct perf_regs *regs_user,
5276                                   struct pt_regs *regs,
5277                                   struct pt_regs *regs_user_copy)
5278 {
5279         if (user_mode(regs)) {
5280                 regs_user->abi = perf_reg_abi(current);
5281                 regs_user->regs = regs;
5282         } else if (current->mm) {
5283                 perf_get_regs_user(regs_user, regs, regs_user_copy);
5284         } else {
5285                 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
5286                 regs_user->regs = NULL;
5287         }
5288 }
5289 
5290 static void perf_sample_regs_intr(struct perf_regs *regs_intr,
5291                                   struct pt_regs *regs)
5292 {
5293         regs_intr->regs = regs;
5294         regs_intr->abi  = perf_reg_abi(current);
5295 }
5296 
5297 
5298 /*
5299  * Get remaining task size from user stack pointer.
5300  *
5301  * It'd be better to take the stack VMA map and limit this more
5302  * precisely, but there's no way to get it safely under interrupt,
5303  * so we use TASK_SIZE as the limit.
5304  */
5305 static u64 perf_ustack_task_size(struct pt_regs *regs)
5306 {
5307         unsigned long addr = perf_user_stack_pointer(regs);
5308 
5309         if (!addr || addr >= TASK_SIZE)
5310                 return 0;
5311 
5312         return TASK_SIZE - addr;
5313 }
5314 
5315 static u16
5316 perf_sample_ustack_size(u16 stack_size, u16 header_size,
5317                         struct pt_regs *regs)
5318 {
5319         u64 task_size;
5320 
5321         /* No regs, no stack pointer, no dump. */
5322         if (!regs)
5323                 return 0;
5324 
5325         /*
5326          * Check whether the requested stack size fits into:
5327          * - TASK_SIZE
5328          *   If it doesn't, we limit the size to TASK_SIZE.
5329          *
5330          * - the remaining sample size
5331          *   If it doesn't, we trim the stack size so that it
5332          *   fits into the remaining sample size.
5333          */
5334 
5335         task_size  = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
5336         stack_size = min(stack_size, (u16) task_size);
5337 
5338         /* Current header size plus static size and dynamic size. */
5339         header_size += 2 * sizeof(u64);
5340 
5341         /* Do we fit in with the current stack dump size? */
5342         if ((u16) (header_size + stack_size) < header_size) {
5343                 /*
5344                  * If we would overflow the maximum size for the sample,
5345                  * we trim the stack dump size so that it fits.
5346                  */
5347                 stack_size = USHRT_MAX - header_size - sizeof(u64);
5348                 stack_size = round_up(stack_size, sizeof(u64));
5349         }
5350 
5351         return stack_size;
5352 }
5353 
5354 static void
5355 perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
5356                           struct pt_regs *regs)
5357 {
5358         /* Case of a kernel thread, nothing to dump */
5359         if (!regs) {
5360                 u64 size = 0;
5361                 perf_output_put(handle, size);
5362         } else {
5363                 unsigned long sp;
5364                 unsigned int rem;
5365                 u64 dyn_size;
5366 
5367                 /*
5368                  * We dump:
5369                  * static size
5370                  *   - the size requested by user or the best one we can fit
5371                  *     in to the sample max size
5372                  * data
5373                  *   - user stack dump data
5374                  * dynamic size
5375                  *   - the actual dumped size
5376                  */
5377 
5378                 /* Static size. */
5379                 perf_output_put(handle, dump_size);
5380 
5381                 /* Data. */
5382                 sp = perf_user_stack_pointer(regs);
5383                 rem = __output_copy_user(handle, (void *) sp, dump_size);
5384                 dyn_size = dump_size - rem;
5385 
5386                 perf_output_skip(handle, rem);
5387 
5388                 /* Dynamic size. */
5389                 perf_output_put(handle, dyn_size);
5390         }
5391 }
5392 
5393 static void __perf_event_header__init_id(struct perf_event_header *header,
5394                                          struct perf_sample_data *data,
5395                                          struct perf_event *event)
5396 {
5397         u64 sample_type = event->attr.sample_type;
5398 
5399         data->type = sample_type;
5400         header->size += event->id_header_size;
5401 
5402         if (sample_type & PERF_SAMPLE_TID) {
5403                 /* namespace issues */
5404                 data->tid_entry.pid = perf_event_pid(event, current);
5405                 data->tid_entry.tid = perf_event_tid(event, current);
5406         }
5407 
5408         if (sample_type & PERF_SAMPLE_TIME)
5409                 data->time = perf_event_clock(event);
5410 
5411         if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
5412                 data->id = primary_event_id(event);
5413 
5414         if (sample_type & PERF_SAMPLE_STREAM_ID)
5415                 data->stream_id = event->id;
5416 
5417         if (sample_type & PERF_SAMPLE_CPU) {
5418                 data->cpu_entry.cpu      = raw_smp_processor_id();
5419                 data->cpu_entry.reserved = 0;
5420         }
5421 }
5422 
5423 void perf_event_header__init_id(struct perf_event_header *header,
5424                                 struct perf_sample_data *data,
5425                                 struct perf_event *event)
5426 {
5427         if (event->attr.sample_id_all)
5428                 __perf_event_header__init_id(header, data, event);
5429 }
5430 
5431 static void __perf_event__output_id_sample(struct perf_output_handle *handle,
5432                                            struct perf_sample_data *data)
5433 {
5434         u64 sample_type = data->type;
5435 
5436         if (sample_type & PERF_SAMPLE_TID)
5437                 perf_output_put(handle, data->tid_entry);
5438 
5439         if (sample_type & PERF_SAMPLE_TIME)
5440                 perf_output_put(handle, data->time);
5441 
5442         if (sample_type & PERF_SAMPLE_ID)
5443                 perf_output_put(handle, data->id);
5444 
5445         if (sample_type & PERF_SAMPLE_STREAM_ID)
5446                 perf_output_put(handle, data->stream_id);
5447 
5448         if (sample_type & PERF_SAMPLE_CPU)
5449                 perf_output_put(handle, data->cpu_entry);
5450 
5451         if (sample_type & PERF_SAMPLE_IDENTIFIER)
5452                 perf_output_put(handle, data->id);
5453 }
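
/*
 * When attr.sample_id_all is set, the fields written above form the
 * sample_id trailer appended to non-SAMPLE records, in exactly this order
 * (sketch; each entry is present only if the corresponding bit is set in
 * attr.sample_type):
 *
 *	struct sample_id {
 *		{ u32 pid, tid;  }	/* PERF_SAMPLE_TID        */
 *		{ u64 time;      }	/* PERF_SAMPLE_TIME       */
 *		{ u64 id;        }	/* PERF_SAMPLE_ID         */
 *		{ u64 stream_id; }	/* PERF_SAMPLE_STREAM_ID  */
 *		{ u32 cpu, res;  }	/* PERF_SAMPLE_CPU        */
 *		{ u64 id;        }	/* PERF_SAMPLE_IDENTIFIER */
 *	};
 */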
5454 
5455 void perf_event__output_id_sample(struct perf_event *event,
5456                                   struct perf_output_handle *handle,
5457                                   struct perf_sample_data *sample)
5458 {
5459         if (event->attr.sample_id_all)
5460                 __perf_event__output_id_sample(handle, sample);
5461 }
5462 
5463 static void perf_output_read_one(struct perf_output_handle *handle,
5464                                  struct perf_event *event,
5465                                  u64 enabled, u64 running)
5466 {
5467         u64 read_format = event->attr.read_format;
5468         u64 values[4];
5469         int n = 0;
5470 
5471         values[n++] = perf_event_count(event);
5472         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5473                 values[n++] = enabled +
5474                         atomic64_read(&event->child_total_time_enabled);
5475         }
5476         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5477                 values[n++] = running +
5478                         atomic64_read(&event->child_total_time_running);
5479         }
5480         if (read_format & PERF_FORMAT_ID)
5481                 values[n++] = primary_event_id(event);
5482 
5483         __output_copy(handle, values, n * sizeof(u64));
5484 }
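
/*
 * The values[] array above is emitted in order, so for a non-group event a
 * read() (or PERF_SAMPLE_READ payload) looks roughly like this, with the
 * optional fields present only when selected in attr.read_format:
 *
 *	struct read_format {
 *		u64 value;		/* perf_event_count()             */
 *		{ u64 time_enabled; }	/* PERF_FORMAT_TOTAL_TIME_ENABLED */
 *		{ u64 time_running; }	/* PERF_FORMAT_TOTAL_TIME_RUNNING */
 *		{ u64 id;           }	/* PERF_FORMAT_ID                 */
 *	};
 */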
5485 
5486 /*
5487  * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
5488  */
5489 static void perf_output_read_group(struct perf_output_handle *handle,
5490                             struct perf_event *event,
5491                             u64 enabled, u64 running)
5492 {
5493         struct perf_event *leader = event->group_leader, *sub;
5494         u64 read_format = event->attr.read_format;
5495         u64 values[5];
5496         int n = 0;
5497 
5498         values[n++] = 1 + leader->nr_siblings;
5499 
5500         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5501                 values[n++] = enabled;
5502 
5503         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5504                 values[n++] = running;
5505 
5506         if (leader != event)
5507                 leader->pmu->read(leader);
5508 
5509         values[n++] = perf_event_count(leader);
5510         if (read_format & PERF_FORMAT_ID)
5511                 values[n++] = primary_event_id(leader);
5512 
5513         __output_copy(handle, values, n * sizeof(u64));
5514 
5515         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
5516                 n = 0;
5517 
5518                 if ((sub != event) &&
5519                     (sub->state == PERF_EVENT_STATE_ACTIVE))
5520                         sub->pmu->read(sub);
5521 
5522                 values[n++] = perf_event_count(sub);
5523                 if (read_format & PERF_FORMAT_ID)
5524                         values[n++] = primary_event_id(sub);
5525 
5526                 __output_copy(handle, values, n * sizeof(u64));
5527         }
5528 }
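
/*
 * For PERF_FORMAT_GROUP the layout emitted above becomes, roughly (leader
 * first, then each sibling; optional fields as selected in attr.read_format):
 *
 *	struct read_format {
 *		u64 nr;			/* 1 + leader->nr_siblings        */
 *		{ u64 time_enabled; }	/* PERF_FORMAT_TOTAL_TIME_ENABLED */
 *		{ u64 time_running; }	/* PERF_FORMAT_TOTAL_TIME_RUNNING */
 *		struct {
 *			u64 value;
 *			{ u64 id; }	/* PERF_FORMAT_ID                 */
 *		} cntr[nr];
 *	};
 */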
5529 
5530 #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
5531                                  PERF_FORMAT_TOTAL_TIME_RUNNING)
5532 
5533 static void perf_output_read(struct perf_output_handle *handle,
5534                              struct perf_event *event)
5535 {
5536         u64 enabled = 0, running = 0, now;
5537         u64 read_format = event->attr.read_format;
5538 
5539         /*
5540          * compute total_time_enabled, total_time_running
5541          * based on snapshot values taken when the event
5542          * was last scheduled in.
5543          *
5544          * we cannot simply call update_context_time()
5545          * because of locking issues, as we are called in
5546          * NMI context
5547          */
5548         if (read_format & PERF_FORMAT_TOTAL_TIMES)
5549                 calc_timer_values(event, &now, &enabled, &running);
5550 
5551         if (event->attr.read_format & PERF_FORMAT_GROUP)
5552                 perf_output_read_group(handle, event, enabled, running);
5553         else
5554                 perf_output_read_one(handle, event, enabled, running);
5555 }
5556 
5557 void perf_output_sample(struct perf_output_handle *handle,
5558                         struct perf_event_header *header,
5559                         struct perf_sample_data *data,
5560                         struct perf_event *event)
5561 {
5562         u64 sample_type = data->type;
5563 
5564         perf_output_put(handle, *header);
5565 
5566         if (sample_type & PERF_SAMPLE_IDENTIFIER)
5567                 perf_output_put(handle, data->id);
5568 
5569         if (sample_type & PERF_SAMPLE_IP)
5570                 perf_output_put(handle, data->ip);
5571 
5572         if (sample_type & PERF_SAMPLE_TID)
5573                 perf_output_put(handle, data->tid_entry);
5574 
5575         if (sample_type & PERF_SAMPLE_TIME)
5576                 perf_output_put(handle, data->time);
5577 
5578         if (sample_type & PERF_SAMPLE_ADDR)
5579                 perf_output_put(handle, data->addr);
5580 
5581         if (sample_type & PERF_SAMPLE_ID)
5582                 perf_output_put(handle, data->id);
5583 
5584         if (sample_type & PERF_SAMPLE_STREAM_ID)
5585                 perf_output_put(handle, data->stream_id);
5586 
5587         if (sample_type & PERF_SAMPLE_CPU)
5588                 perf_output_put(handle, data->cpu_entry);
5589 
5590         if (sample_type & PERF_SAMPLE_PERIOD)
5591                 perf_output_put(handle, data->period);
5592 
5593         if (sample_type & PERF_SAMPLE_READ)
5594                 perf_output_read(handle, event);
5595 
5596         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5597                 if (data->callchain) {
5598                         int size = 1;
5599 
5600                         if (data->callchain)
5601                                 size += data->callchain->nr;
5602 
5603                         size *= sizeof(u64);
5604 
5605                         __output_copy(handle, data->callchain, size);
5606                 } else {
5607                         u64 nr = 0;
5608                         perf_output_put(handle, nr);
5609                 }
5610         }
5611 
5612         if (sample_type & PERF_SAMPLE_RAW) {
5613                 if (data->raw) {
5614                         u32 raw_size = data->raw->size;
5615                         u32 real_size = round_up(raw_size + sizeof(u32),
5616                                                  sizeof(u64)) - sizeof(u32);
5617                         u64 zero = 0;
5618 
5619                         perf_output_put(handle, real_size);
5620                         __output_copy(handle, data->raw->data, raw_size);
5621                         if (real_size - raw_size)
5622                                 __output_copy(handle, &zero, real_size - raw_size);
5623                 } else {
5624                         struct {
5625                                 u32     size;
5626                                 u32     data;
5627                         } raw = {
5628                                 .size = sizeof(u32),
5629                                 .data = 0,
5630                         };
5631                         perf_output_put(handle, raw);
5632                 }
5633         }
5634 
5635         if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5636                 if (data->br_stack) {
5637                         size_t size;
5638 
5639                         size = data->br_stack->nr
5640                              * sizeof(struct perf_branch_entry);
5641 
5642                         perf_output_put(handle, data->br_stack->nr);
5643                         perf_output_copy(handle, data->br_stack->entries, size);
5644                 } else {
5645                         /*
5646                          * we always store at least the value of nr
5647                          */
5648                         u64 nr = 0;
5649                         perf_output_put(handle, nr);
5650                 }
5651         }
5652 
5653         if (sample_type & PERF_SAMPLE_REGS_USER) {
5654                 u64 abi = data->regs_user.abi;
5655 
5656                 /*
5657                  * If there are no regs to dump, notice it through
5658                  * the first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
5659                  */
5660                 perf_output_put(handle, abi);
5661 
5662                 if (abi) {
5663                         u64 mask = event->attr.sample_regs_user;
5664                         perf_output_sample_regs(handle,
5665                                                 data->regs_user.regs,
5666                                                 mask);
5667                 }
5668         }
5669 
5670         if (sample_type & PERF_SAMPLE_STACK_USER) {
5671                 perf_output_sample_ustack(handle,
5672                                           data->stack_user_size,
5673                                           data->regs_user.regs);
5674         }
5675 
5676         if (sample_type & PERF_SAMPLE_WEIGHT)
5677                 perf_output_put(handle, data->weight);
5678 
5679         if (sample_type & PERF_SAMPLE_DATA_SRC)
5680                 perf_output_put(handle, data->data_src.val);
5681 
5682         if (sample_type & PERF_SAMPLE_TRANSACTION)
5683                 perf_output_put(handle, data->txn);
5684 
5685         if (sample_type & PERF_SAMPLE_REGS_INTR) {
5686                 u64 abi = data->regs_intr.abi;
5687                 /*
5688                  * If there are no regs to dump, notice it through
5689                  * the first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
5690                  */
5691                 perf_output_put(handle, abi);
5692 
5693                 if (abi) {
5694                         u64 mask = event->attr.sample_regs_intr;
5695 
5696                         perf_output_sample_regs(handle,
5697                                                 data->regs_intr.regs,
5698                                                 mask);
5699                 }
5700         }
5701 
5702         if (!event->attr.watermark) {
5703                 int wakeup_events = event->attr.wakeup_events;
5704 
5705                 if (wakeup_events) {
5706                         struct ring_buffer *rb = handle->rb;
5707                         int events = local_inc_return(&rb->events);
5708 
5709                         if (events >= wakeup_events) {
5710                                 local_sub(wakeup_events, &rb->events);
5711                                 local_inc(&rb->wakeup);
5712                         }
5713                 }
5714         }
5715 }
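
/*
 * Fields are appended strictly in the order tested above, so a common
 * sample_type such as IP | TID | TIME | CPU | PERIOD yields a record body
 * of (sketch):
 *
 *	struct perf_event_header header;	/* PERF_RECORD_SAMPLE */
 *	u64 ip;					/* PERF_SAMPLE_IP     */
 *	u32 pid, tid;				/* PERF_SAMPLE_TID    */
 *	u64 time;				/* PERF_SAMPLE_TIME   */
 *	u32 cpu, res;				/* PERF_SAMPLE_CPU    */
 *	u64 period;				/* PERF_SAMPLE_PERIOD */
 */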
5716 
5717 void perf_prepare_sample(struct perf_event_header *header,
5718                          struct perf_sample_data *data,
5719                          struct perf_event *event,
5720                          struct pt_regs *regs)
5721 {
5722         u64 sample_type = event->attr.sample_type;
5723 
5724         header->type = PERF_RECORD_SAMPLE;
5725         header->size = sizeof(*header) + event->header_size;
5726 
5727         header->misc = 0;
5728         header->misc |= perf_misc_flags(regs);
5729 
5730         __perf_event_header__init_id(header, data, event);
5731 
5732         if (sample_type & PERF_SAMPLE_IP)
5733                 data->ip = perf_instruction_pointer(regs);
5734 
5735         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5736                 int size = 1;
5737 
5738                 data->callchain = perf_callchain(event, regs);
5739 
5740                 if (data->callchain)
5741                         size += data->callchain->nr;
5742 
5743                 header->size += size * sizeof(u64);
5744         }
5745 
5746         if (sample_type & PERF_SAMPLE_RAW) {
5747                 int size = sizeof(u32);
5748 
5749                 if (data->raw)
5750                         size += data->raw->size;
5751                 else
5752                         size += sizeof(u32);
5753 
5754                 header->size += round_up(size, sizeof(u64));
5755         }
5756 
5757         if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5758                 int size = sizeof(u64); /* nr */
5759                 if (data->br_stack) {
5760                         size += data->br_stack->nr
5761                               * sizeof(struct perf_branch_entry);
5762                 }
5763                 header->size += size;
5764         }
5765 
5766         if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
5767                 perf_sample_regs_user(&data->regs_user, regs,
5768                                       &data->regs_user_copy);
5769 
5770         if (sample_type & PERF_SAMPLE_REGS_USER) {
5771                 /* regs dump ABI info */
5772                 int size = sizeof(u64);
5773 
5774                 if (data->regs_user.regs) {
5775                         u64 mask = event->attr.sample_regs_user;
5776                         size += hweight64(mask) * sizeof(u64);
5777                 }
5778 
5779                 header->size += size;
5780         }
5781 
5782         if (sample_type & PERF_SAMPLE_STACK_USER) {
5783                 /*
5784                  * Either the PERF_SAMPLE_STACK_USER bit needs to always be
5785                  * processed as the last one, or an additional check must be
5786                  * added in case a new sample type is introduced, because we
5787                  * could eat up the rest of the sample size.
5788                  */
5789                 u16 stack_size = event->attr.sample_stack_user;
5790                 u16 size = sizeof(u64);
5791 
5792                 stack_size = perf_sample_ustack_size(stack_size, header->size,
5793                                                      data->regs_user.regs);
5794 
5795                 /*
5796                  * If there is something to dump, add space for the dump
5797                  * itself and for the field that tells the dynamic size,
5798                  * which is how many bytes were actually dumped.
5799                  */
5800                 if (stack_size)
5801                         size += sizeof(u64) + stack_size;
5802 
5803                 data->stack_user_size = stack_size;
5804                 header->size += size;
5805         }
5806 
5807         if (sample_type & PERF_SAMPLE_REGS_INTR) {
5808                 /* regs dump ABI info */
5809                 int size = sizeof(u64);
5810 
5811                 perf_sample_regs_intr(&data->regs_intr, regs);
5812 
5813                 if (data->regs_intr.regs) {
5814                         u64 mask = event->attr.sample_regs_intr;
5815 
5816                         size += hweight64(mask) * sizeof(u64);
5817                 }
5818 
5819                 header->size += size;
5820         }
5821 }
5822 
5823 static void __always_inline
5824 __perf_event_output(struct perf_event *event,
5825                     struct perf_sample_data *data,
5826                     struct pt_regs *regs,
5827                     int (*output_begin)(struct perf_output_handle *,
5828                                         struct perf_event *,
5829                                         unsigned int))
5830 {
5831         struct perf_output_handle handle;
5832         struct perf_event_header header;
5833 
5834         /* protect the callchain buffers */
5835         rcu_read_lock();
5836 
5837         perf_prepare_sample(&header, data, event, regs);
5838 
5839         if (output_begin(&handle, event, header.size))
5840                 goto exit;
5841 
5842         perf_output_sample(&handle, &header, data, event);
5843 
5844         perf_output_end(&handle);
5845 
5846 exit:
5847         rcu_read_unlock();
5848 }
5849 
5850 void
5851 perf_event_output_forward(struct perf_event *event,
5852                          struct perf_sample_data *data,
5853                          struct pt_regs *regs)
5854 {
5855         __perf_event_output(event, data, regs, perf_output_begin_forward);
5856 }
5857 
5858 void
5859 perf_event_output_backward(struct perf_event *event,
5860                            struct perf_sample_data *data,
5861                            struct pt_regs *regs)
5862 {
5863         __perf_event_output(event, data, regs, perf_output_begin_backward);
5864 }
5865 
5866 void
5867 perf_event_output(struct perf_event *event,
5868                   struct perf_sample_data *data,
5869                   struct pt_regs *regs)
5870 {
5871         __perf_event_output(event, data, regs, perf_output_begin);
5872 }
5873 
5874 /*
5875  * read event_id
5876  */
5877 
5878 struct perf_read_event {
5879         struct perf_event_header        header;
5880 
5881         u32                             pid;
5882         u32                             tid;
5883 };
5884 
5885 static void
5886 perf_event_read_event(struct perf_event *event,
5887                         struct task_struct *task)
5888 {
5889         struct perf_output_handle handle;
5890         struct perf_sample_data sample;
5891         struct perf_read_event read_event = {
5892                 .header = {
5893                         .type = PERF_RECORD_READ,
5894                         .misc = 0,
5895                         .size = sizeof(read_event) + event->read_size,
5896                 },
5897                 .pid = perf_event_pid(event, task),
5898                 .tid = perf_event_tid(event, task),
5899         };
5900         int ret;
5901 
5902         perf_event_header__init_id(&read_event.header, &sample, event);
5903         ret = perf_output_begin(&handle, event, read_event.header.size);
5904         if (ret)
5905                 return;
5906 
5907         perf_output_put(&handle, read_event);
5908         perf_output_read(&handle, event);
5909         perf_event__output_id_sample(event, &handle, &sample);
5910 
5911         perf_output_end(&handle);
5912 }
5913 
5914 typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
5915 
5916 static void
5917 perf_event_aux_ctx(struct perf_event_context *ctx,
5918                    perf_event_aux_output_cb output,
5919                    void *data, bool all)
5920 {
5921         struct perf_event *event;
5922 
5923         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
5924                 if (!all) {
5925                         if (event->state < PERF_EVENT_STATE_INACTIVE)
5926                                 continue;
5927                         if (!event_filter_match(event))
5928                                 continue;
5929                 }
5930 
5931                 output(event, data);
5932         }
5933 }
5934 
5935 static void
5936 perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data,
5937                         struct perf_event_context *task_ctx)
5938 {
5939         rcu_read_lock();
5940         preempt_disable();
5941         perf_event_aux_ctx(task_ctx, output, data, false);
5942         preempt_enable();
5943         rcu_read_unlock();
5944 }
5945 
5946 static void
5947 perf_event_aux(perf_event_aux_output_cb output, void *data,
5948                struct perf_event_context *task_ctx)
5949 {
5950         struct perf_cpu_context *cpuctx;
5951         struct perf_event_context *ctx;
5952         struct pmu *pmu;
5953         int ctxn;
5954 
5955         /*
5956          * If task_ctx != NULL, we only notify
5957          * the task context itself. The task_ctx is set
5958          * only for EXIT events before releasing task
5959          * context.
5960          */
5961         if (task_ctx) {
5962                 perf_event_aux_task_ctx(output, data, task_ctx);
5963                 return;
5964         }
5965 
5966         rcu_read_lock();
5967         list_for_each_entry_rcu(pmu, &pmus, entry) {
5968                 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
5969                 if (cpuctx->unique_pmu != pmu)
5970                         goto next;
5971                 perf_event_aux_ctx(&cpuctx->ctx, output, data, false);
5972                 ctxn = pmu->task_ctx_nr;
5973                 if (ctxn < 0)
5974                         goto next;
5975                 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
5976                 if (ctx)
5977                         perf_event_aux_ctx(ctx, output, data, false);
5978 next:
5979                 put_cpu_ptr(pmu->pmu_cpu_context);
5980         }
5981         rcu_read_unlock();
5982 }
5983 
5984 /*
5985  * Clear all file-based filters at exec; they'll have to be
5986  * reinstated when/if these objects are mmapped again.
5987  */
5988 static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
5989 {
5990         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
5991         struct perf_addr_filter *filter;
5992         unsigned int restart = 0, count = 0;
5993         unsigned long flags;
5994 
5995         if (!has_addr_filter(event))
5996                 return;
5997 
5998         raw_spin_lock_irqsave(&ifh->lock, flags);
5999         list_for_each_entry(filter, &ifh->list, entry) {
6000                 if (filter->inode) {
6001                         event->addr_filters_offs[count] = 0;
6002                         restart++;
6003                 }
6004 
6005                 count++;
6006         }
6007 
6008         if (restart)
6009                 event->addr_filters_gen++;
6010         raw_spin_unlock_irqrestore(&ifh->lock, flags);
6011 
6012         if (restart)
6013                 perf_event_restart(event);
6014 }
6015 
6016 void perf_event_exec(void)
6017 {
6018         struct perf_event_context *ctx;
6019         int ctxn;
6020 
6021         rcu_read_lock();
6022         for_each_task_context_nr(ctxn) {
6023                 ctx = current->perf_event_ctxp[ctxn];
6024                 if (!ctx)
6025                         continue;
6026 
6027                 perf_event_enable_on_exec(ctxn);
6028 
6029                 perf_event_aux_ctx(ctx, perf_event_addr_filters_exec, NULL,
6030                                    true);
6031         }
6032         rcu_read_unlock();
6033 }
6034 
6035 struct remote_output {
6036         struct ring_buffer      *rb;
6037         int                     err;
6038 };
6039 
6040 static void __perf_event_output_stop(struct perf_event *event, void *data)
6041 {
6042         struct perf_event *parent = event->parent;
6043         struct remote_output *ro = data;
6044         struct ring_buffer *rb = ro->rb;
6045         struct stop_event_data sd = {
6046                 .event  = event,
6047         };
6048 
6049         if (!has_aux