
TOMOYO Linux Cross Reference
Linux/kernel/events/core.c


  1 // SPDX-License-Identifier: GPL-2.0
  2 /*
  3  * Performance events core code:
  4  *
  5  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
  6  *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
  7  *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
  8  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  9  */
 10 
 11 #include <linux/fs.h>
 12 #include <linux/mm.h>
 13 #include <linux/cpu.h>
 14 #include <linux/smp.h>
 15 #include <linux/idr.h>
 16 #include <linux/file.h>
 17 #include <linux/poll.h>
 18 #include <linux/slab.h>
 19 #include <linux/hash.h>
 20 #include <linux/tick.h>
 21 #include <linux/sysfs.h>
 22 #include <linux/dcache.h>
 23 #include <linux/percpu.h>
 24 #include <linux/ptrace.h>
 25 #include <linux/reboot.h>
 26 #include <linux/vmstat.h>
 27 #include <linux/device.h>
 28 #include <linux/export.h>
 29 #include <linux/vmalloc.h>
 30 #include <linux/hardirq.h>
 31 #include <linux/hugetlb.h>
 32 #include <linux/rculist.h>
 33 #include <linux/uaccess.h>
 34 #include <linux/syscalls.h>
 35 #include <linux/anon_inodes.h>
 36 #include <linux/kernel_stat.h>
 37 #include <linux/cgroup.h>
 38 #include <linux/perf_event.h>
 39 #include <linux/trace_events.h>
 40 #include <linux/hw_breakpoint.h>
 41 #include <linux/mm_types.h>
 42 #include <linux/module.h>
 43 #include <linux/mman.h>
 44 #include <linux/compat.h>
 45 #include <linux/bpf.h>
 46 #include <linux/filter.h>
 47 #include <linux/namei.h>
 48 #include <linux/parser.h>
 49 #include <linux/sched/clock.h>
 50 #include <linux/sched/mm.h>
 51 #include <linux/proc_ns.h>
 52 #include <linux/mount.h>
 53 #include <linux/min_heap.h>
 54 
 55 #include "internal.h"
 56 
 57 #include <asm/irq_regs.h>
 58 
 59 typedef int (*remote_function_f)(void *);
 60 
 61 struct remote_function_call {
 62         struct task_struct      *p;
 63         remote_function_f       func;
 64         void                    *info;
 65         int                     ret;
 66 };
 67 
 68 static void remote_function(void *data)
 69 {
 70         struct remote_function_call *tfc = data;
 71         struct task_struct *p = tfc->p;
 72 
 73         if (p) {
 74                 /* -EAGAIN */
 75                 if (task_cpu(p) != smp_processor_id())
 76                         return;
 77 
 78                 /*
 79                  * Now that we're on the right CPU with IRQs disabled, we can test
 80                  * if we hit the right task without races.
 81                  */
 82 
 83                 tfc->ret = -ESRCH; /* No such (running) process */
 84                 if (p != current)
 85                         return;
 86         }
 87 
 88         tfc->ret = tfc->func(tfc->info);
 89 }
 90 
 91 /**
 92  * task_function_call - call a function on the cpu on which a task runs
 93  * @p:          the task to evaluate
 94  * @func:       the function to be called
 95  * @info:       the function call argument
 96  *
 97  * Calls the function @func when the task is currently running. This might
 98  * be on the current CPU, which just calls the function directly.  This will
 99  * retry due to any failures in smp_call_function_single(), such as if the
100  * task_cpu() goes offline concurrently.
101  *
102  * returns @func return value or -ESRCH or -ENXIO when the process isn't running
103  */
104 static int
105 task_function_call(struct task_struct *p, remote_function_f func, void *info)
106 {
107         struct remote_function_call data = {
108                 .p      = p,
109                 .func   = func,
110                 .info   = info,
111                 .ret    = -EAGAIN,
112         };
113         int ret;
114 
115         for (;;) {
116                 ret = smp_call_function_single(task_cpu(p), remote_function,
117                                                &data, 1);
118                 if (!ret)
119                         ret = data.ret;
120 
121                 if (ret != -EAGAIN)
122                         break;
123 
124                 cond_resched();
125         }
126 
127         return ret;
128 }
129 
130 /**
131  * cpu_function_call - call a function on the cpu
132  * @func:       the function to be called
133  * @info:       the function call argument
134  *
135  * Calls the function @func on the remote cpu.
136  *
137  * returns: @func return value or -ENXIO when the cpu is offline
138  */
139 static int cpu_function_call(int cpu, remote_function_f func, void *info)
140 {
141         struct remote_function_call data = {
142                 .p      = NULL,
143                 .func   = func,
144                 .info   = info,
145                 .ret    = -ENXIO, /* No such CPU */
146         };
147 
148         smp_call_function_single(cpu, remote_function, &data, 1);
149 
150         return data.ret;
151 }
152 
153 static inline struct perf_cpu_context *
154 __get_cpu_context(struct perf_event_context *ctx)
155 {
156         return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
157 }
158 
159 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
160                           struct perf_event_context *ctx)
161 {
162         raw_spin_lock(&cpuctx->ctx.lock);
163         if (ctx)
164                 raw_spin_lock(&ctx->lock);
165 }
166 
167 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
168                             struct perf_event_context *ctx)
169 {
170         if (ctx)
171                 raw_spin_unlock(&ctx->lock);
172         raw_spin_unlock(&cpuctx->ctx.lock);
173 }
174 
175 #define TASK_TOMBSTONE ((void *)-1L)
176 
177 static bool is_kernel_event(struct perf_event *event)
178 {
179         return READ_ONCE(event->owner) == TASK_TOMBSTONE;
180 }
181 
182 /*
183  * On task ctx scheduling...
184  *
185  * When !ctx->nr_events a task context will not be scheduled. This means
186  * we can disable the scheduler hooks (for performance) without leaving
187  * pending task ctx state.
188  *
189  * This however results in two special cases:
190  *
 191  *  - removing the last event from a task ctx; this is relatively
 192  *    straightforward and is done in __perf_remove_from_context().
193  *
194  *  - adding the first event to a task ctx; this is tricky because we cannot
195  *    rely on ctx->is_active and therefore cannot use event_function_call().
196  *    See perf_install_in_context().
197  *
198  * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
199  */
200 
201 typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
202                         struct perf_event_context *, void *);
203 
204 struct event_function_struct {
205         struct perf_event *event;
206         event_f func;
207         void *data;
208 };
209 
210 static int event_function(void *info)
211 {
212         struct event_function_struct *efs = info;
213         struct perf_event *event = efs->event;
214         struct perf_event_context *ctx = event->ctx;
215         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
216         struct perf_event_context *task_ctx = cpuctx->task_ctx;
217         int ret = 0;
218 
219         lockdep_assert_irqs_disabled();
220 
221         perf_ctx_lock(cpuctx, task_ctx);
222         /*
223          * Since we do the IPI call without holding ctx->lock things can have
224          * changed, double check we hit the task we set out to hit.
225          */
226         if (ctx->task) {
227                 if (ctx->task != current) {
228                         ret = -ESRCH;
229                         goto unlock;
230                 }
231 
232                 /*
233                  * We only use event_function_call() on established contexts,
234                  * and event_function() is only ever called when active (or
235                  * rather, we'll have bailed in task_function_call() or the
236                  * above ctx->task != current test), therefore we must have
237                  * ctx->is_active here.
238                  */
239                 WARN_ON_ONCE(!ctx->is_active);
240                 /*
241                  * And since we have ctx->is_active, cpuctx->task_ctx must
242                  * match.
243                  */
244                 WARN_ON_ONCE(task_ctx != ctx);
245         } else {
246                 WARN_ON_ONCE(&cpuctx->ctx != ctx);
247         }
248 
249         efs->func(event, cpuctx, ctx, efs->data);
250 unlock:
251         perf_ctx_unlock(cpuctx, task_ctx);
252 
253         return ret;
254 }
255 
256 static void event_function_call(struct perf_event *event, event_f func, void *data)
257 {
258         struct perf_event_context *ctx = event->ctx;
259         struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
260         struct event_function_struct efs = {
261                 .event = event,
262                 .func = func,
263                 .data = data,
264         };
265 
266         if (!event->parent) {
267                 /*
268                  * If this is a !child event, we must hold ctx::mutex to
 269  * stabilize the event->ctx relation. See
270                  * perf_event_ctx_lock().
271                  */
272                 lockdep_assert_held(&ctx->mutex);
273         }
274 
275         if (!task) {
276                 cpu_function_call(event->cpu, event_function, &efs);
277                 return;
278         }
279 
280         if (task == TASK_TOMBSTONE)
281                 return;
282 
283 again:
284         if (!task_function_call(task, event_function, &efs))
285                 return;
286 
287         raw_spin_lock_irq(&ctx->lock);
288         /*
289          * Reload the task pointer, it might have been changed by
290          * a concurrent perf_event_context_sched_out().
291          */
292         task = ctx->task;
293         if (task == TASK_TOMBSTONE) {
294                 raw_spin_unlock_irq(&ctx->lock);
295                 return;
296         }
297         if (ctx->is_active) {
298                 raw_spin_unlock_irq(&ctx->lock);
299                 goto again;
300         }
301         func(event, NULL, ctx, data);
302         raw_spin_unlock_irq(&ctx->lock);
303 }
304 
305 /*
306  * Similar to event_function_call() + event_function(), but hard assumes IRQs
307  * are already disabled and we're on the right CPU.
308  */
309 static void event_function_local(struct perf_event *event, event_f func, void *data)
310 {
311         struct perf_event_context *ctx = event->ctx;
312         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
313         struct task_struct *task = READ_ONCE(ctx->task);
314         struct perf_event_context *task_ctx = NULL;
315 
316         lockdep_assert_irqs_disabled();
317 
318         if (task) {
319                 if (task == TASK_TOMBSTONE)
320                         return;
321 
322                 task_ctx = ctx;
323         }
324 
325         perf_ctx_lock(cpuctx, task_ctx);
326 
327         task = ctx->task;
328         if (task == TASK_TOMBSTONE)
329                 goto unlock;
330 
331         if (task) {
332                 /*
333                  * We must be either inactive or active and the right task,
334                  * otherwise we're screwed, since we cannot IPI to somewhere
335                  * else.
336                  */
337                 if (ctx->is_active) {
338                         if (WARN_ON_ONCE(task != current))
339                                 goto unlock;
340 
341                         if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
342                                 goto unlock;
343                 }
344         } else {
345                 WARN_ON_ONCE(&cpuctx->ctx != ctx);
346         }
347 
348         func(event, cpuctx, ctx, data);
349 unlock:
350         perf_ctx_unlock(cpuctx, task_ctx);
351 }
352 
353 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
354                        PERF_FLAG_FD_OUTPUT  |\
355                        PERF_FLAG_PID_CGROUP |\
356                        PERF_FLAG_FD_CLOEXEC)
357 
358 /*
359  * branch priv levels that need permission checks
360  */
361 #define PERF_SAMPLE_BRANCH_PERM_PLM \
362         (PERF_SAMPLE_BRANCH_KERNEL |\
363          PERF_SAMPLE_BRANCH_HV)
364 
365 enum event_type_t {
366         EVENT_FLEXIBLE = 0x1,
367         EVENT_PINNED = 0x2,
368         EVENT_TIME = 0x4,
369         /* see ctx_resched() for details */
370         EVENT_CPU = 0x8,
371         EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
372 };
373 
374 /*
375  * perf_sched_events : >0 events exist
376  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
377  */
378 
379 static void perf_sched_delayed(struct work_struct *work);
380 DEFINE_STATIC_KEY_FALSE(perf_sched_events);
381 static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
382 static DEFINE_MUTEX(perf_sched_mutex);
383 static atomic_t perf_sched_count;
384 
385 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
386 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
387 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
388 
389 static atomic_t nr_mmap_events __read_mostly;
390 static atomic_t nr_comm_events __read_mostly;
391 static atomic_t nr_namespaces_events __read_mostly;
392 static atomic_t nr_task_events __read_mostly;
393 static atomic_t nr_freq_events __read_mostly;
394 static atomic_t nr_switch_events __read_mostly;
395 static atomic_t nr_ksymbol_events __read_mostly;
396 static atomic_t nr_bpf_events __read_mostly;
397 static atomic_t nr_cgroup_events __read_mostly;
398 static atomic_t nr_text_poke_events __read_mostly;
399 
400 static LIST_HEAD(pmus);
401 static DEFINE_MUTEX(pmus_lock);
402 static struct srcu_struct pmus_srcu;
403 static cpumask_var_t perf_online_mask;
404 
405 /*
406  * perf event paranoia level:
407  *  -1 - not paranoid at all
408  *   0 - disallow raw tracepoint access for unpriv
409  *   1 - disallow cpu events for unpriv
410  *   2 - disallow kernel profiling for unpriv
411  */
412 int sysctl_perf_event_paranoid __read_mostly = 2;
413 
414 /* Minimum for 512 kiB + 1 user control page */
415 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
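A minimal userspace sketch (not part of core.c) of how the paranoid level above is felt by callers: the program below counts its own retired instructions with exclude_kernel and exclude_hv set, which the default level 2 permits for unprivileged users; at stricter settings perf_event_open() fails with EACCES. The wrapper name sys_perf_event_open() is this sketch's own.

#include <linux/perf_event.h>
#include <asm/unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static long sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
                                int cpu, int group_fd, unsigned long flags)
{
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
        struct perf_event_attr attr;
        uint64_t count;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_HARDWARE;
        attr.config = PERF_COUNT_HW_INSTRUCTIONS;
        attr.disabled = 1;
        attr.exclude_kernel = 1;        /* stay within paranoid level 2 */
        attr.exclude_hv = 1;

        fd = sys_perf_event_open(&attr, 0 /* self */, -1 /* any cpu */, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");      /* e.g. EACCES when the paranoid level forbids it */
                return 1;
        }

        ioctl(fd, PERF_EVENT_IOC_RESET, 0);
        ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
        /* ... workload under measurement ... */
        ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

        if (read(fd, &count, sizeof(count)) == sizeof(count))
                printf("instructions: %llu\n", (unsigned long long)count);
        close(fd);
        return 0;
}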
416 
417 /*
418  * max perf event sample rate
419  */
420 #define DEFAULT_MAX_SAMPLE_RATE         100000
421 #define DEFAULT_SAMPLE_PERIOD_NS        (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
422 #define DEFAULT_CPU_TIME_MAX_PERCENT    25
423 
424 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
425 
426 static int max_samples_per_tick __read_mostly   = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
427 static int perf_sample_period_ns __read_mostly  = DEFAULT_SAMPLE_PERIOD_NS;
428 
429 static int perf_sample_allowed_ns __read_mostly =
430         DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
431 
432 static void update_perf_cpu_limits(void)
433 {
434         u64 tmp = perf_sample_period_ns;
435 
436         tmp *= sysctl_perf_cpu_time_max_percent;
437         tmp = div_u64(tmp, 100);
438         if (!tmp)
439                 tmp = 1;
440 
441         WRITE_ONCE(perf_sample_allowed_ns, tmp);
442 }
443 
444 static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
445 
446 int perf_proc_update_handler(struct ctl_table *table, int write,
447                 void *buffer, size_t *lenp, loff_t *ppos)
448 {
449         int ret;
450         int perf_cpu = sysctl_perf_cpu_time_max_percent;
451         /*
452          * If throttling is disabled don't allow the write:
453          */
454         if (write && (perf_cpu == 100 || perf_cpu == 0))
455                 return -EINVAL;
456 
457         ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
458         if (ret || !write)
459                 return ret;
460 
461         max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
462         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
463         update_perf_cpu_limits();
464 
465         return 0;
466 }
467 
468 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
469 
470 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
471                 void *buffer, size_t *lenp, loff_t *ppos)
472 {
473         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
474 
475         if (ret || !write)
476                 return ret;
477 
478         if (sysctl_perf_cpu_time_max_percent == 100 ||
479             sysctl_perf_cpu_time_max_percent == 0) {
480                 printk(KERN_WARNING
481                        "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
482                 WRITE_ONCE(perf_sample_allowed_ns, 0);
483         } else {
484                 update_perf_cpu_limits();
485         }
486 
487         return 0;
488 }
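/*
 * Note (not from this file): these two handlers back the procfs knobs
 * /proc/sys/kernel/perf_event_max_sample_rate and
 * /proc/sys/kernel/perf_cpu_time_max_percent; the sysctl table entries
 * themselves are registered outside this file.
 */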
489 
490 /*
491  * perf samples are done in some very critical code paths (NMIs).
492  * If they take too much CPU time, the system can lock up and not
493  * get any real work done.  This will drop the sample rate when
494  * we detect that events are taking too long.
495  */
496 #define NR_ACCUMULATED_SAMPLES 128
497 static DEFINE_PER_CPU(u64, running_sample_length);
498 
499 static u64 __report_avg;
500 static u64 __report_allowed;
501 
502 static void perf_duration_warn(struct irq_work *w)
503 {
504         printk_ratelimited(KERN_INFO
505                 "perf: interrupt took too long (%lld > %lld), lowering "
506                 "kernel.perf_event_max_sample_rate to %d\n",
507                 __report_avg, __report_allowed,
508                 sysctl_perf_event_sample_rate);
509 }
510 
511 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
512 
513 void perf_sample_event_took(u64 sample_len_ns)
514 {
515         u64 max_len = READ_ONCE(perf_sample_allowed_ns);
516         u64 running_len;
517         u64 avg_len;
518         u32 max;
519 
520         if (max_len == 0)
521                 return;
522 
523         /* Decay the counter by 1 average sample. */
524         running_len = __this_cpu_read(running_sample_length);
525         running_len -= running_len/NR_ACCUMULATED_SAMPLES;
526         running_len += sample_len_ns;
527         __this_cpu_write(running_sample_length, running_len);
528 
529         /*
 530          * Note: this will be biased artificially low until we have
531          * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
532          * from having to maintain a count.
533          */
534         avg_len = running_len/NR_ACCUMULATED_SAMPLES;
535         if (avg_len <= max_len)
536                 return;
537 
538         __report_avg = avg_len;
539         __report_allowed = max_len;
540 
541         /*
 542          * Add 25% headroom so the throttled rate lands safely below the trigger point.
543          */
544         avg_len += avg_len / 4;
545         max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
546         if (avg_len < max)
547                 max /= (u32)avg_len;
548         else
549                 max = 1;
550 
551         WRITE_ONCE(perf_sample_allowed_ns, avg_len);
552         WRITE_ONCE(max_samples_per_tick, max);
553 
554         sysctl_perf_event_sample_rate = max * HZ;
555         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
556 
557         if (!irq_work_queue(&perf_duration_work)) {
558                 early_printk("perf: interrupt took too long (%lld > %lld), lowering "
559                              "kernel.perf_event_max_sample_rate to %d\n",
560                              __report_avg, __report_allowed,
561                              sysctl_perf_event_sample_rate);
562         }
563 }
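/*
 * Worked example of the arithmetic above, assuming HZ = 1000 (so
 * TICK_NSEC = 1,000,000 ns) and the default perf_cpu_time_max_percent of 25:
 * the per-tick budget is (1,000,000 / 100) * 25 = 250,000 ns.  If the decayed
 * average sample cost is 10,000 ns it is inflated by 25% to 12,500 ns, giving
 * max = 250,000 / 12,500 = 20 samples per tick, so the sample rate drops to
 * 20 * HZ = 20,000 Hz and perf_sample_allowed_ns becomes 12,500 ns.
 */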
564 
565 static atomic64_t perf_event_id;
566 
567 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
568                               enum event_type_t event_type);
569 
570 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
571                              enum event_type_t event_type,
572                              struct task_struct *task);
573 
574 static void update_context_time(struct perf_event_context *ctx);
575 static u64 perf_event_time(struct perf_event *event);
576 
577 void __weak perf_event_print_debug(void)        { }
578 
579 extern __weak const char *perf_pmu_name(void)
580 {
581         return "pmu";
582 }
583 
584 static inline u64 perf_clock(void)
585 {
586         return local_clock();
587 }
588 
589 static inline u64 perf_event_clock(struct perf_event *event)
590 {
591         return event->clock();
592 }
593 
594 /*
595  * State based event timekeeping...
596  *
597  * The basic idea is to use event->state to determine which (if any) time
598  * fields to increment with the current delta. This means we only need to
599  * update timestamps when we change state or when they are explicitly requested
600  * (read).
601  *
602  * Event groups make things a little more complicated, but not terribly so. The
603  * rules for a group are that if the group leader is OFF the entire group is
 604  * OFF, irrespective of what the group member states are. This results in
605  * __perf_effective_state().
606  *
 607  * A further ramification is that when a group leader flips between OFF and
608  * !OFF, we need to update all group member times.
609  *
610  *
611  * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
612  * need to make sure the relevant context time is updated before we try and
613  * update our timestamps.
614  */
615 
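The enabled/running split maintained here is what userspace observes through PERF_FORMAT_TOTAL_TIME_ENABLED and PERF_FORMAT_TOTAL_TIME_RUNNING. A minimal sketch of the usual scaling step follows; the struct and helper names are this sketch's own, and it assumes the fd was opened with exactly those two read_format bits.

#include <linux/perf_event.h>
#include <stdint.h>
#include <unistd.h>

struct scaled_read {                    /* matches the read_format layout assumed below */
        uint64_t value;
        uint64_t time_enabled;          /* mirrors total_time_enabled */
        uint64_t time_running;          /* mirrors total_time_running */
};

/* @fd must have been opened with attr.read_format =
 * PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING. */
static int read_scaled_count(int fd, uint64_t *count)
{
        struct scaled_read r;

        if (read(fd, &r, sizeof(r)) != sizeof(r))
                return -1;

        /* Extrapolate for the time the event was multiplexed out. */
        if (r.time_running)
                *count = (uint64_t)((double)r.value * r.time_enabled / r.time_running);
        else
                *count = 0;
        return 0;
}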
616 static __always_inline enum perf_event_state
617 __perf_effective_state(struct perf_event *event)
618 {
619         struct perf_event *leader = event->group_leader;
620 
621         if (leader->state <= PERF_EVENT_STATE_OFF)
622                 return leader->state;
623 
624         return event->state;
625 }
626 
627 static __always_inline void
628 __perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
629 {
630         enum perf_event_state state = __perf_effective_state(event);
631         u64 delta = now - event->tstamp;
632 
633         *enabled = event->total_time_enabled;
634         if (state >= PERF_EVENT_STATE_INACTIVE)
635                 *enabled += delta;
636 
637         *running = event->total_time_running;
638         if (state >= PERF_EVENT_STATE_ACTIVE)
639                 *running += delta;
640 }
641 
642 static void perf_event_update_time(struct perf_event *event)
643 {
644         u64 now = perf_event_time(event);
645 
646         __perf_update_times(event, now, &event->total_time_enabled,
647                                         &event->total_time_running);
648         event->tstamp = now;
649 }
650 
651 static void perf_event_update_sibling_time(struct perf_event *leader)
652 {
653         struct perf_event *sibling;
654 
655         for_each_sibling_event(sibling, leader)
656                 perf_event_update_time(sibling);
657 }
658 
659 static void
660 perf_event_set_state(struct perf_event *event, enum perf_event_state state)
661 {
662         if (event->state == state)
663                 return;
664 
665         perf_event_update_time(event);
666         /*
667          * If a group leader gets enabled/disabled all its siblings
668          * are affected too.
669          */
670         if ((event->state < 0) ^ (state < 0))
671                 perf_event_update_sibling_time(event);
672 
673         WRITE_ONCE(event->state, state);
674 }
675 
676 #ifdef CONFIG_CGROUP_PERF
677 
678 static inline bool
679 perf_cgroup_match(struct perf_event *event)
680 {
681         struct perf_event_context *ctx = event->ctx;
682         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
683 
684         /* @event doesn't care about cgroup */
685         if (!event->cgrp)
686                 return true;
687 
688         /* wants specific cgroup scope but @cpuctx isn't associated with any */
689         if (!cpuctx->cgrp)
690                 return false;
691 
692         /*
693          * Cgroup scoping is recursive.  An event enabled for a cgroup is
694          * also enabled for all its descendant cgroups.  If @cpuctx's
 695  * cgroup is a descendant of @event's (the test covers the identity
696          * case), it's a match.
697          */
698         return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
699                                     event->cgrp->css.cgroup);
700 }
701 
702 static inline void perf_detach_cgroup(struct perf_event *event)
703 {
704         css_put(&event->cgrp->css);
705         event->cgrp = NULL;
706 }
707 
708 static inline int is_cgroup_event(struct perf_event *event)
709 {
710         return event->cgrp != NULL;
711 }
712 
713 static inline u64 perf_cgroup_event_time(struct perf_event *event)
714 {
715         struct perf_cgroup_info *t;
716 
717         t = per_cpu_ptr(event->cgrp->info, event->cpu);
718         return t->time;
719 }
720 
721 static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
722 {
723         struct perf_cgroup_info *info;
724         u64 now;
725 
726         now = perf_clock();
727 
728         info = this_cpu_ptr(cgrp->info);
729 
730         info->time += now - info->timestamp;
731         info->timestamp = now;
732 }
733 
734 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
735 {
736         struct perf_cgroup *cgrp = cpuctx->cgrp;
737         struct cgroup_subsys_state *css;
738 
739         if (cgrp) {
740                 for (css = &cgrp->css; css; css = css->parent) {
741                         cgrp = container_of(css, struct perf_cgroup, css);
742                         __update_cgrp_time(cgrp);
743                 }
744         }
745 }
746 
747 static inline void update_cgrp_time_from_event(struct perf_event *event)
748 {
749         struct perf_cgroup *cgrp;
750 
751         /*
752          * ensure we access cgroup data only when needed and
753          * when we know the cgroup is pinned (css_get)
754          */
755         if (!is_cgroup_event(event))
756                 return;
757 
758         cgrp = perf_cgroup_from_task(current, event->ctx);
759         /*
760          * Do not update time when cgroup is not active
761          */
762         if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
763                 __update_cgrp_time(event->cgrp);
764 }
765 
766 static inline void
767 perf_cgroup_set_timestamp(struct task_struct *task,
768                           struct perf_event_context *ctx)
769 {
770         struct perf_cgroup *cgrp;
771         struct perf_cgroup_info *info;
772         struct cgroup_subsys_state *css;
773 
774         /*
775          * ctx->lock held by caller
776          * ensure we do not access cgroup data
777          * unless we have the cgroup pinned (css_get)
778          */
779         if (!task || !ctx->nr_cgroups)
780                 return;
781 
782         cgrp = perf_cgroup_from_task(task, ctx);
783 
784         for (css = &cgrp->css; css; css = css->parent) {
785                 cgrp = container_of(css, struct perf_cgroup, css);
786                 info = this_cpu_ptr(cgrp->info);
787                 info->timestamp = ctx->timestamp;
788         }
789 }
790 
791 static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
792 
793 #define PERF_CGROUP_SWOUT       0x1 /* cgroup switch out every event */
794 #define PERF_CGROUP_SWIN        0x2 /* cgroup switch in events based on task */
795 
796 /*
797  * reschedule events based on the cgroup constraint of task.
 798  * Reschedule events based on the cgroup constraint of the task.
799  * mode SWOUT : schedule out everything
800  * mode SWIN : schedule in based on cgroup for next
801  */
802 static void perf_cgroup_switch(struct task_struct *task, int mode)
803 {
804         struct perf_cpu_context *cpuctx;
805         struct list_head *list;
806         unsigned long flags;
807 
808         /*
 809          * Disable interrupts and preemption to keep this CPU's
 810          * cgrp_cpuctx_entry from changing under us.
811          */
812         local_irq_save(flags);
813 
814         list = this_cpu_ptr(&cgrp_cpuctx_list);
815         list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
816                 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
817 
818                 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
819                 perf_pmu_disable(cpuctx->ctx.pmu);
820 
821                 if (mode & PERF_CGROUP_SWOUT) {
822                         cpu_ctx_sched_out(cpuctx, EVENT_ALL);
823                         /*
824                          * must not be done before ctxswout due
825                          * to event_filter_match() in event_sched_out()
826                          */
827                         cpuctx->cgrp = NULL;
828                 }
829 
830                 if (mode & PERF_CGROUP_SWIN) {
831                         WARN_ON_ONCE(cpuctx->cgrp);
832                         /*
833                          * set cgrp before ctxsw in to allow
834                          * event_filter_match() to not have to pass
 835                          * task around.
 836                          * We pass cpuctx->ctx to perf_cgroup_from_task()
 837                          * because cgroup events are only per-cpu.
838                          */
839                         cpuctx->cgrp = perf_cgroup_from_task(task,
840                                                              &cpuctx->ctx);
841                         cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
842                 }
843                 perf_pmu_enable(cpuctx->ctx.pmu);
844                 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
845         }
846 
847         local_irq_restore(flags);
848 }
849 
850 static inline void perf_cgroup_sched_out(struct task_struct *task,
851                                          struct task_struct *next)
852 {
853         struct perf_cgroup *cgrp1;
854         struct perf_cgroup *cgrp2 = NULL;
855 
856         rcu_read_lock();
857         /*
 858          * We only come here when we know perf_cgroup_events > 0;
 859          * we do not need to pass the ctx here because we know
 860          * we are holding the rcu lock.
861          */
862         cgrp1 = perf_cgroup_from_task(task, NULL);
863         cgrp2 = perf_cgroup_from_task(next, NULL);
864 
865         /*
866          * only schedule out current cgroup events if we know
867          * that we are switching to a different cgroup. Otherwise,
868          * do no touch the cgroup events.
 869          * do not touch the cgroup events.
870         if (cgrp1 != cgrp2)
871                 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
872 
873         rcu_read_unlock();
874 }
875 
876 static inline void perf_cgroup_sched_in(struct task_struct *prev,
877                                         struct task_struct *task)
878 {
879         struct perf_cgroup *cgrp1;
880         struct perf_cgroup *cgrp2 = NULL;
881 
882         rcu_read_lock();
883         /*
 884          * We only come here when we know perf_cgroup_events > 0;
 885          * we do not need to pass the ctx here because we know
 886          * we are holding the rcu lock.
887          */
888         cgrp1 = perf_cgroup_from_task(task, NULL);
889         cgrp2 = perf_cgroup_from_task(prev, NULL);
890 
891         /*
892          * only need to schedule in cgroup events if we are changing
 893          * cgroup during ctxsw. Cgroup events were not scheduled
 894          * out during ctxsw if that was not the case.
895          */
896         if (cgrp1 != cgrp2)
897                 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
898 
899         rcu_read_unlock();
900 }
901 
902 static int perf_cgroup_ensure_storage(struct perf_event *event,
903                                 struct cgroup_subsys_state *css)
904 {
905         struct perf_cpu_context *cpuctx;
906         struct perf_event **storage;
907         int cpu, heap_size, ret = 0;
908 
909         /*
 910          * Allow storage to have sufficient space for an iterator for each
911          * possibly nested cgroup plus an iterator for events with no cgroup.
912          */
913         for (heap_size = 1; css; css = css->parent)
914                 heap_size++;
915 
916         for_each_possible_cpu(cpu) {
917                 cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
918                 if (heap_size <= cpuctx->heap_size)
919                         continue;
920 
921                 storage = kmalloc_node(heap_size * sizeof(struct perf_event *),
922                                        GFP_KERNEL, cpu_to_node(cpu));
923                 if (!storage) {
924                         ret = -ENOMEM;
925                         break;
926                 }
927 
928                 raw_spin_lock_irq(&cpuctx->ctx.lock);
929                 if (cpuctx->heap_size < heap_size) {
930                         swap(cpuctx->heap, storage);
931                         if (storage == cpuctx->heap_default)
932                                 storage = NULL;
933                         cpuctx->heap_size = heap_size;
934                 }
935                 raw_spin_unlock_irq(&cpuctx->ctx.lock);
936 
937                 kfree(storage);
938         }
939 
940         return ret;
941 }
942 
943 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
944                                       struct perf_event_attr *attr,
945                                       struct perf_event *group_leader)
946 {
947         struct perf_cgroup *cgrp;
948         struct cgroup_subsys_state *css;
949         struct fd f = fdget(fd);
950         int ret = 0;
951 
952         if (!f.file)
953                 return -EBADF;
954 
955         css = css_tryget_online_from_dir(f.file->f_path.dentry,
956                                          &perf_event_cgrp_subsys);
957         if (IS_ERR(css)) {
958                 ret = PTR_ERR(css);
959                 goto out;
960         }
961 
962         ret = perf_cgroup_ensure_storage(event, css);
963         if (ret)
964                 goto out;
965 
966         cgrp = container_of(css, struct perf_cgroup, css);
967         event->cgrp = cgrp;
968 
969         /*
970          * all events in a group must monitor
971          * the same cgroup because a task belongs
972          * to only one perf cgroup at a time
973          */
974         if (group_leader && group_leader->cgrp != cgrp) {
975                 perf_detach_cgroup(event);
976                 ret = -EINVAL;
977         }
978 out:
979         fdput(f);
980         return ret;
981 }
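For reference, a hedged userspace sketch of how perf_cgroup_connect() gets exercised: the caller passes an open cgroup directory fd in the pid slot together with PERF_FLAG_PID_CGROUP, and must name a concrete CPU because cgroup events are per-cpu. The path and helper names below are illustrative only.

#include <linux/perf_event.h>
#include <asm/unistd.h>
#include <sys/syscall.h>
#include <fcntl.h>
#include <unistd.h>

static long sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
                                int cpu, int group_fd, unsigned long flags)
{
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

/* Open a cgroup-scoped cycle counter on @cpu. @cgroup_path is a hypothetical
 * directory in the perf_event cgroup hierarchy. The kernel takes its own
 * reference on the cgroup, so the directory fd can be closed afterwards. */
static int open_cgroup_cycles(const char *cgroup_path, int cpu)
{
        struct perf_event_attr attr = {
                .size   = sizeof(attr),
                .type   = PERF_TYPE_HARDWARE,
                .config = PERF_COUNT_HW_CPU_CYCLES,
        };
        int cgrp_fd, fd;

        cgrp_fd = open(cgroup_path, O_RDONLY);
        if (cgrp_fd < 0)
                return -1;

        fd = sys_perf_event_open(&attr, cgrp_fd, cpu, -1, PERF_FLAG_PID_CGROUP);
        close(cgrp_fd);
        return fd;
}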
982 
983 static inline void
984 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
985 {
986         struct perf_cgroup_info *t;
987         t = per_cpu_ptr(event->cgrp->info, event->cpu);
988         event->shadow_ctx_time = now - t->timestamp;
989 }
990 
991 static inline void
992 perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
993 {
994         struct perf_cpu_context *cpuctx;
995 
996         if (!is_cgroup_event(event))
997                 return;
998 
999         /*
1000          * Because cgroup events are always per-cpu events,
1001          * @ctx == &cpuctx->ctx.
1002          */
1003         cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
1004 
1005         /*
1006          * Since setting cpuctx->cgrp is conditional on the current @cgrp
1007          * matching the event's cgroup, we must do this for every new event,
1008          * because if the first would mismatch, the second would not try again
1009          * and we would leave cpuctx->cgrp unset.
1010          */
1011         if (ctx->is_active && !cpuctx->cgrp) {
1012                 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
1013 
1014                 if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
1015                         cpuctx->cgrp = cgrp;
1016         }
1017 
1018         if (ctx->nr_cgroups++)
1019                 return;
1020 
1021         list_add(&cpuctx->cgrp_cpuctx_entry,
1022                         per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
1023 }
1024 
1025 static inline void
1026 perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1027 {
1028         struct perf_cpu_context *cpuctx;
1029 
1030         if (!is_cgroup_event(event))
1031                 return;
1032 
1033         /*
1034          * Because cgroup events are always per-cpu events,
1035          * @ctx == &cpuctx->ctx.
1036          */
1037         cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
1038 
1039         if (--ctx->nr_cgroups)
1040                 return;
1041 
1042         if (ctx->is_active && cpuctx->cgrp)
1043                 cpuctx->cgrp = NULL;
1044 
1045         list_del(&cpuctx->cgrp_cpuctx_entry);
1046 }
1047 
1048 #else /* !CONFIG_CGROUP_PERF */
1049 
1050 static inline bool
1051 perf_cgroup_match(struct perf_event *event)
1052 {
1053         return true;
1054 }
1055 
1056 static inline void perf_detach_cgroup(struct perf_event *event)
1057 {}
1058 
1059 static inline int is_cgroup_event(struct perf_event *event)
1060 {
1061         return 0;
1062 }
1063 
1064 static inline void update_cgrp_time_from_event(struct perf_event *event)
1065 {
1066 }
1067 
1068 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
1069 {
1070 }
1071 
1072 static inline void perf_cgroup_sched_out(struct task_struct *task,
1073                                          struct task_struct *next)
1074 {
1075 }
1076 
1077 static inline void perf_cgroup_sched_in(struct task_struct *prev,
1078                                         struct task_struct *task)
1079 {
1080 }
1081 
1082 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
1083                                       struct perf_event_attr *attr,
1084                                       struct perf_event *group_leader)
1085 {
1086         return -EINVAL;
1087 }
1088 
1089 static inline void
1090 perf_cgroup_set_timestamp(struct task_struct *task,
1091                           struct perf_event_context *ctx)
1092 {
1093 }
1094 
1095 static inline void
1096 perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
1097 {
1098 }
1099 
1100 static inline void
1101 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
1102 {
1103 }
1104 
1105 static inline u64 perf_cgroup_event_time(struct perf_event *event)
1106 {
1107         return 0;
1108 }
1109 
1110 static inline void
1111 perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
1112 {
1113 }
1114 
1115 static inline void
1116 perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1117 {
1118 }
1119 #endif
1120 
1121 /*
1122  * Set the default to be dependent on the timer tick,
1123  * just like the original code.
1124  */
1125 #define PERF_CPU_HRTIMER (1000 / HZ)
1126 /*
1127  * function must be called with interrupts disabled
1128  */
1129 static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
1130 {
1131         struct perf_cpu_context *cpuctx;
1132         bool rotations;
1133 
1134         lockdep_assert_irqs_disabled();
1135 
1136         cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
1137         rotations = perf_rotate_context(cpuctx);
1138 
1139         raw_spin_lock(&cpuctx->hrtimer_lock);
1140         if (rotations)
1141                 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
1142         else
1143                 cpuctx->hrtimer_active = 0;
1144         raw_spin_unlock(&cpuctx->hrtimer_lock);
1145 
1146         return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
1147 }
1148 
1149 static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
1150 {
1151         struct hrtimer *timer = &cpuctx->hrtimer;
1152         struct pmu *pmu = cpuctx->ctx.pmu;
1153         u64 interval;
1154 
1155         /* no multiplexing needed for SW PMU */
1156         if (pmu->task_ctx_nr == perf_sw_context)
1157                 return;
1158 
1159         /*
1160          * Check that the default is sane; if not set, force the
1161          * default interval (1/tick).
1162          */
1163         interval = pmu->hrtimer_interval_ms;
1164         if (interval < 1)
1165                 interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
1166 
1167         cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
1168 
1169         raw_spin_lock_init(&cpuctx->hrtimer_lock);
1170         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
1171         timer->function = perf_mux_hrtimer_handler;
1172 }
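/*
 * Note (not from this function): pmu->hrtimer_interval_ms used above is also
 * exposed, later in this file, as the writable sysfs attribute
 * perf_event_mux_interval_ms of each PMU's event_source device, so the
 * multiplexing period can be tuned per PMU at runtime.
 */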
1173 
1174 static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
1175 {
1176         struct hrtimer *timer = &cpuctx->hrtimer;
1177         struct pmu *pmu = cpuctx->ctx.pmu;
1178         unsigned long flags;
1179 
1180         /* not for SW PMU */
1181         if (pmu->task_ctx_nr == perf_sw_context)
1182                 return 0;
1183 
1184         raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
1185         if (!cpuctx->hrtimer_active) {
1186                 cpuctx->hrtimer_active = 1;
1187                 hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1188                 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
1189         }
1190         raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
1191 
1192         return 0;
1193 }
1194 
1195 void perf_pmu_disable(struct pmu *pmu)
1196 {
1197         int *count = this_cpu_ptr(pmu->pmu_disable_count);
1198         if (!(*count)++)
1199                 pmu->pmu_disable(pmu);
1200 }
1201 
1202 void perf_pmu_enable(struct pmu *pmu)
1203 {
1204         int *count = this_cpu_ptr(pmu->pmu_disable_count);
1205         if (!--(*count))
1206                 pmu->pmu_enable(pmu);
1207 }
1208 
1209 static DEFINE_PER_CPU(struct list_head, active_ctx_list);
1210 
1211 /*
1212  * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
1213  * perf_event_task_tick() are fully serialized because they're strictly cpu
1214  * affine and perf_event_ctx{activate,deactivate} are called with IRQs
1215  * disabled, while perf_event_task_tick is called from IRQ context.
1216  */
1217 static void perf_event_ctx_activate(struct perf_event_context *ctx)
1218 {
1219         struct list_head *head = this_cpu_ptr(&active_ctx_list);
1220 
1221         lockdep_assert_irqs_disabled();
1222 
1223         WARN_ON(!list_empty(&ctx->active_ctx_list));
1224 
1225         list_add(&ctx->active_ctx_list, head);
1226 }
1227 
1228 static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
1229 {
1230         lockdep_assert_irqs_disabled();
1231 
1232         WARN_ON(list_empty(&ctx->active_ctx_list));
1233 
1234         list_del_init(&ctx->active_ctx_list);
1235 }
1236 
1237 static void get_ctx(struct perf_event_context *ctx)
1238 {
1239         refcount_inc(&ctx->refcount);
1240 }
1241 
1242 static void *alloc_task_ctx_data(struct pmu *pmu)
1243 {
1244         if (pmu->task_ctx_cache)
1245                 return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
1246 
1247         return NULL;
1248 }
1249 
1250 static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
1251 {
1252         if (pmu->task_ctx_cache && task_ctx_data)
1253                 kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
1254 }
1255 
1256 static void free_ctx(struct rcu_head *head)
1257 {
1258         struct perf_event_context *ctx;
1259 
1260         ctx = container_of(head, struct perf_event_context, rcu_head);
1261         free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
1262         kfree(ctx);
1263 }
1264 
1265 static void put_ctx(struct perf_event_context *ctx)
1266 {
1267         if (refcount_dec_and_test(&ctx->refcount)) {
1268                 if (ctx->parent_ctx)
1269                         put_ctx(ctx->parent_ctx);
1270                 if (ctx->task && ctx->task != TASK_TOMBSTONE)
1271                         put_task_struct(ctx->task);
1272                 call_rcu(&ctx->rcu_head, free_ctx);
1273         }
1274 }
1275 
1276 /*
1277  * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
1278  * perf_pmu_migrate_context() we need some magic.
1279  *
1280  * Those places that change perf_event::ctx will hold both
1281  * perf_event_context::mutex of the 'old' and 'new' ctx value.
1282  *
1283  * Lock ordering is by mutex address. There are two other sites where
1284  * perf_event_context::mutex nests and those are:
1285  *
1286  *  - perf_event_exit_task_context()    [ child , 0 ]
1287  *      perf_event_exit_event()
1288  *        put_event()                   [ parent, 1 ]
1289  *
1290  *  - perf_event_init_context()         [ parent, 0 ]
1291  *      inherit_task_group()
1292  *        inherit_group()
1293  *          inherit_event()
1294  *            perf_event_alloc()
1295  *              perf_init_event()
1296  *                perf_try_init_event() [ child , 1 ]
1297  *
1298  * While it appears there is an obvious deadlock here -- the parent and child
1299  * nesting levels are inverted between the two -- this is in fact safe because
1300  * life-time rules separate them. That is, an exiting task cannot fork, and a
1301  * spawning task cannot (yet) exit.
1302  *
1303  * But remember that these are parent<->child context relations, and
1304  * migration does not affect children, therefore these two orderings should not
1305  * interact.
1306  *
1307  * The change in perf_event::ctx does not affect children (as claimed above)
1308  * because the sys_perf_event_open() case will install a new event and break
1309  * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
1310  * concerned with cpuctx and that doesn't have children.
1311  *
1312  * The places that change perf_event::ctx will issue:
1313  *
1314  *   perf_remove_from_context();
1315  *   synchronize_rcu();
1316  *   perf_install_in_context();
1317  *
1318  * to effect the change. The remove_from_context() + synchronize_rcu() should
1319  * quiesce the event, after which we can install it in the new location. This
1320  * means that only external vectors (perf_fops, prctl) can perturb the event
1321  * while in transit. Therefore all such accessors should also acquire
1322  * perf_event_context::mutex to serialize against this.
1323  *
1324  * However, because event->ctx can change while we're waiting to acquire
1325  * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
1326  * function.
1327  *
1328  * Lock order:
1329  *    exec_update_mutex
1330  *      task_struct::perf_event_mutex
1331  *        perf_event_context::mutex
1332  *          perf_event::child_mutex;
1333  *            perf_event_context::lock
1334  *          perf_event::mmap_mutex
1335  *          mmap_lock
1336  *            perf_addr_filters_head::lock
1337  *
1338  *    cpu_hotplug_lock
1339  *      pmus_lock
1340  *        cpuctx->mutex / perf_event_context::mutex
1341  */
1342 static struct perf_event_context *
1343 perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
1344 {
1345         struct perf_event_context *ctx;
1346 
1347 again:
1348         rcu_read_lock();
1349         ctx = READ_ONCE(event->ctx);
1350         if (!refcount_inc_not_zero(&ctx->refcount)) {
1351                 rcu_read_unlock();
1352                 goto again;
1353         }
1354         rcu_read_unlock();
1355 
1356         mutex_lock_nested(&ctx->mutex, nesting);
1357         if (event->ctx != ctx) {
1358                 mutex_unlock(&ctx->mutex);
1359                 put_ctx(ctx);
1360                 goto again;
1361         }
1362 
1363         return ctx;
1364 }
1365 
1366 static inline struct perf_event_context *
1367 perf_event_ctx_lock(struct perf_event *event)
1368 {
1369         return perf_event_ctx_lock_nested(event, 0);
1370 }
1371 
1372 static void perf_event_ctx_unlock(struct perf_event *event,
1373                                   struct perf_event_context *ctx)
1374 {
1375         mutex_unlock(&ctx->mutex);
1376         put_ctx(ctx);
1377 }
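/*
 * Typical caller pattern, sketched from how the syscall-facing code later in
 * this file uses the pair above:
 *
 *	ctx = perf_event_ctx_lock(event);
 *	... event->ctx is now stable and equal to ctx ...
 *	perf_event_ctx_unlock(event, ctx);
 */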
1378 
1379 /*
1380  * This must be done under ctx->lock so as to serialize against
1381  * context_equiv(); therefore we cannot call put_ctx(), since that might end up
1382  * taking scheduler-related locks, and ctx->lock nests inside those.
1383  */
1384 static __must_check struct perf_event_context *
1385 unclone_ctx(struct perf_event_context *ctx)
1386 {
1387         struct perf_event_context *parent_ctx = ctx->parent_ctx;
1388 
1389         lockdep_assert_held(&ctx->lock);
1390 
1391         if (parent_ctx)
1392                 ctx->parent_ctx = NULL;
1393         ctx->generation++;
1394 
1395         return parent_ctx;
1396 }
1397 
1398 static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
1399                                 enum pid_type type)
1400 {
1401         u32 nr;
1402         /*
1403          * only top level events have the pid namespace they were created in
1404          */
1405         if (event->parent)
1406                 event = event->parent;
1407 
1408         nr = __task_pid_nr_ns(p, type, event->ns);
1409         /* avoid -1 if it is the idle thread or runs in another ns */
1410         if (!nr && !pid_alive(p))
1411                 nr = -1;
1412         return nr;
1413 }
1414 
1415 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1416 {
1417         return perf_event_pid_type(event, p, PIDTYPE_TGID);
1418 }
1419 
1420 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1421 {
1422         return perf_event_pid_type(event, p, PIDTYPE_PID);
1423 }
1424 
1425 /*
1426  * If we inherit events we want to return the parent event id
1427  * to userspace.
1428  */
1429 static u64 primary_event_id(struct perf_event *event)
1430 {
1431         u64 id = event->id;
1432 
1433         if (event->parent)
1434                 id = event->parent->id;
1435 
1436         return id;
1437 }
1438 
1439 /*
1440  * Get the perf_event_context for a task and lock it.
1441  *
1442  * This has to cope with the fact that until it is locked,
1443  * the context could get moved to another task.
1444  */
1445 static struct perf_event_context *
1446 perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1447 {
1448         struct perf_event_context *ctx;
1449 
1450 retry:
1451         /*
1452          * One of the few rules of preemptible RCU is that one cannot do
1453          * rcu_read_unlock() while holding a scheduler (or nested) lock when
1454          * part of the read side critical section was irqs-enabled -- see
1455          * rcu_read_unlock_special().
1456          *
1457          * Since ctx->lock nests under rq->lock we must ensure the entire read
1458          * side critical section has interrupts disabled.
1459          */
1460         local_irq_save(*flags);
1461         rcu_read_lock();
1462         ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1463         if (ctx) {
1464                 /*
1465                  * If this context is a clone of another, it might
1466                  * get swapped for another underneath us by
1467                  * perf_event_task_sched_out, though the
1468                  * rcu_read_lock() protects us from any context
1469                  * getting freed.  Lock the context and check if it
1470                  * got swapped before we could get the lock, and retry
1471                  * if so.  If we locked the right context, then it
1472                  * can't get swapped on us any more.
1473                  */
1474                 raw_spin_lock(&ctx->lock);
1475                 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1476                         raw_spin_unlock(&ctx->lock);
1477                         rcu_read_unlock();
1478                         local_irq_restore(*flags);
1479                         goto retry;
1480                 }
1481 
1482                 if (ctx->task == TASK_TOMBSTONE ||
1483                     !refcount_inc_not_zero(&ctx->refcount)) {
1484                         raw_spin_unlock(&ctx->lock);
1485                         ctx = NULL;
1486                 } else {
1487                         WARN_ON_ONCE(ctx->task != task);
1488                 }
1489         }
1490         rcu_read_unlock();
1491         if (!ctx)
1492                 local_irq_restore(*flags);
1493         return ctx;
1494 }
1495 
1496 /*
1497  * Get the context for a task and increment its pin_count so it
1498  * can't get swapped to another task.  This also increments its
1499  * reference count so that the context can't get freed.
1500  */
1501 static struct perf_event_context *
1502 perf_pin_task_context(struct task_struct *task, int ctxn)
1503 {
1504         struct perf_event_context *ctx;
1505         unsigned long flags;
1506 
1507         ctx = perf_lock_task_context(task, ctxn, &flags);
1508         if (ctx) {
1509                 ++ctx->pin_count;
1510                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1511         }
1512         return ctx;
1513 }
1514 
1515 static void perf_unpin_context(struct perf_event_context *ctx)
1516 {
1517         unsigned long flags;
1518 
1519         raw_spin_lock_irqsave(&ctx->lock, flags);
1520         --ctx->pin_count;
1521         raw_spin_unlock_irqrestore(&ctx->lock, flags);
1522 }
1523 
1524 /*
1525  * Update the record of the current time in a context.
1526  */
1527 static void update_context_time(struct perf_event_context *ctx)
1528 {
1529         u64 now = perf_clock();
1530 
1531         ctx->time += now - ctx->timestamp;
1532         ctx->timestamp = now;
1533 }
1534 
1535 static u64 perf_event_time(struct perf_event *event)
1536 {
1537         struct perf_event_context *ctx = event->ctx;
1538 
1539         if (is_cgroup_event(event))
1540                 return perf_cgroup_event_time(event);
1541 
1542         return ctx ? ctx->time : 0;
1543 }
1544 
1545 static enum event_type_t get_event_type(struct perf_event *event)
1546 {
1547         struct perf_event_context *ctx = event->ctx;
1548         enum event_type_t event_type;
1549 
1550         lockdep_assert_held(&ctx->lock);
1551 
1552         /*
1553          * It's 'group type', really, because if our group leader is
1554          * pinned, so are we.
1555          */
1556         if (event->group_leader != event)
1557                 event = event->group_leader;
1558 
1559         event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
1560         if (!ctx->task)
1561                 event_type |= EVENT_CPU;
1562 
1563         return event_type;
1564 }
1565 
1566 /*
1567  * Helper function to initialize event group nodes.
1568  */
1569 static void init_event_group(struct perf_event *event)
1570 {
1571         RB_CLEAR_NODE(&event->group_node);
1572         event->group_index = 0;
1573 }
1574 
1575 /*
1576  * Extract pinned or flexible groups from the context
1577  * based on event attrs bits.
1578  */
1579 static struct perf_event_groups *
1580 get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
1581 {
1582         if (event->attr.pinned)
1583                 return &ctx->pinned_groups;
1584         else
1585                 return &ctx->flexible_groups;
1586 }
1587 
1588 /*
1589  * Helper function to initialize perf_event_group trees.
1590  */
1591 static void perf_event_groups_init(struct perf_event_groups *groups)
1592 {
1593         groups->tree = RB_ROOT;
1594         groups->index = 0;
1595 }
1596 
1597 /*
1598  * Compare function for event groups.
1599  *
1600  * Implements a complex key that first sorts by CPU and then by a virtual
1601  * index, which provides ordering when rotating groups for the same CPU.
1602  */
1603 static bool
1604 perf_event_groups_less(struct perf_event *left, struct perf_event *right)
1605 {
1606         if (left->cpu < right->cpu)
1607                 return true;
1608         if (left->cpu > right->cpu)
1609                 return false;
1610 
1611 #ifdef CONFIG_CGROUP_PERF
1612         if (left->cgrp != right->cgrp) {
1613                 if (!left->cgrp || !left->cgrp->css.cgroup) {
1614                         /*
1615                          * Left has no cgroup but right does; events without
1616                          * a cgroup come first.
1617                          */
1618                         return true;
1619                 }
1620                 if (!right->cgrp || !right->cgrp->css.cgroup) {
1621                         /*
1622                          * Right has no cgroup but left does; events without
1623                          * a cgroup come first.
1624                          */
1625                         return false;
1626                 }
1627                 /* Two dissimilar cgroups, order by id. */
1628                 if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id)
1629                         return true;
1630 
1631                 return false;
1632         }
1633 #endif
1634 
1635         if (left->group_index < right->group_index)
1636                 return true;
1637         if (left->group_index > right->group_index)
1638                 return false;
1639 
1640         return false;
1641 }
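
/*
 * Illustrative sketch (not part of the kernel build): the comparator above
 * orders events by the three-part key {cpu, cgroup id, group_index}.  The
 * minimal userspace program below, with made-up struct and function names,
 * applies the same lexicographic key with qsort() to show that entries end
 * up bucketed per CPU (and per cgroup), in insertion order within a bucket.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct demo_group {
	int		cpu;
	uint64_t	cgrp_id;	/* 0 == no cgroup; sorts first	*/
	uint64_t	group_index;	/* assigned as ++groups->index	*/
};

static int demo_group_cmp(const void *pa, const void *pb)
{
	const struct demo_group *a = pa, *b = pb;

	if (a->cpu != b->cpu)
		return a->cpu < b->cpu ? -1 : 1;
	if (a->cgrp_id != b->cgrp_id)
		return a->cgrp_id < b->cgrp_id ? -1 : 1;
	if (a->group_index != b->group_index)
		return a->group_index < b->group_index ? -1 : 1;
	return 0;
}

int main(void)
{
	struct demo_group g[] = {
		{ 1, 0, 3 }, { 0, 7, 2 }, { 0, 0, 1 }, { 1, 7, 4 },
	};
	int i;

	qsort(g, 4, sizeof(g[0]), demo_group_cmp);

	for (i = 0; i < 4; i++)	/* CPU 0 entries print first, etc. */
		printf("cpu=%d cgrp=%llu idx=%llu\n", g[i].cpu,
		       (unsigned long long)g[i].cgrp_id,
		       (unsigned long long)g[i].group_index);
	return 0;
}
#endif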
1642 
1643 /*
1644  * Insert @event into @groups' tree, using {@event->cpu, ++@groups->index} as
1645  * the key (see perf_event_groups_less()). This places it last inside the CPU
1646  * subtree.
1647  */
1648 static void
1649 perf_event_groups_insert(struct perf_event_groups *groups,
1650                          struct perf_event *event)
1651 {
1652         struct perf_event *node_event;
1653         struct rb_node *parent;
1654         struct rb_node **node;
1655 
1656         event->group_index = ++groups->index;
1657 
1658         node = &groups->tree.rb_node;
1659         parent = *node;
1660 
1661         while (*node) {
1662                 parent = *node;
1663                 node_event = container_of(*node, struct perf_event, group_node);
1664 
1665                 if (perf_event_groups_less(event, node_event))
1666                         node = &parent->rb_left;
1667                 else
1668                         node = &parent->rb_right;
1669         }
1670 
1671         rb_link_node(&event->group_node, parent, node);
1672         rb_insert_color(&event->group_node, &groups->tree);
1673 }
1674 
1675 /*
1676  * Helper function to insert event into the pinned or flexible groups.
1677  */
1678 static void
1679 add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
1680 {
1681         struct perf_event_groups *groups;
1682 
1683         groups = get_event_groups(event, ctx);
1684         perf_event_groups_insert(groups, event);
1685 }
1686 
1687 /*
1688  * Delete a group from a tree.
1689  */
1690 static void
1691 perf_event_groups_delete(struct perf_event_groups *groups,
1692                          struct perf_event *event)
1693 {
1694         WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
1695                      RB_EMPTY_ROOT(&groups->tree));
1696 
1697         rb_erase(&event->group_node, &groups->tree);
1698         init_event_group(event);
1699 }
1700 
1701 /*
1702  * Helper function to delete event from its groups.
1703  */
1704 static void
1705 del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
1706 {
1707         struct perf_event_groups *groups;
1708 
1709         groups = get_event_groups(event, ctx);
1710         perf_event_groups_delete(groups, event);
1711 }
1712 
1713 /*
1714  * Get the leftmost event in the cpu/cgroup subtree.
1715  */
1716 static struct perf_event *
1717 perf_event_groups_first(struct perf_event_groups *groups, int cpu,
1718                         struct cgroup *cgrp)
1719 {
1720         struct perf_event *node_event = NULL, *match = NULL;
1721         struct rb_node *node = groups->tree.rb_node;
1722 #ifdef CONFIG_CGROUP_PERF
1723         u64 node_cgrp_id, cgrp_id = 0;
1724 
1725         if (cgrp)
1726                 cgrp_id = cgrp->kn->id;
1727 #endif
1728 
1729         while (node) {
1730                 node_event = container_of(node, struct perf_event, group_node);
1731 
1732                 if (cpu < node_event->cpu) {
1733                         node = node->rb_left;
1734                         continue;
1735                 }
1736                 if (cpu > node_event->cpu) {
1737                         node = node->rb_right;
1738                         continue;
1739                 }
1740 #ifdef CONFIG_CGROUP_PERF
1741                 node_cgrp_id = 0;
1742                 if (node_event->cgrp && node_event->cgrp->css.cgroup)
1743                         node_cgrp_id = node_event->cgrp->css.cgroup->kn->id;
1744 
1745                 if (cgrp_id < node_cgrp_id) {
1746                         node = node->rb_left;
1747                         continue;
1748                 }
1749                 if (cgrp_id > node_cgrp_id) {
1750                         node = node->rb_right;
1751                         continue;
1752                 }
1753 #endif
1754                 match = node_event;
1755                 node = node->rb_left;
1756         }
1757 
1758         return match;
1759 }
1760 
1761 /*
1762  * Like rb_entry_safe(rb_next()), but constrained to the @cpu subtree.
1763  */
1764 static struct perf_event *
1765 perf_event_groups_next(struct perf_event *event)
1766 {
1767         struct perf_event *next;
1768 #ifdef CONFIG_CGROUP_PERF
1769         u64 curr_cgrp_id = 0;
1770         u64 next_cgrp_id = 0;
1771 #endif
1772 
1773         next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
1774         if (next == NULL || next->cpu != event->cpu)
1775                 return NULL;
1776 
1777 #ifdef CONFIG_CGROUP_PERF
1778         if (event->cgrp && event->cgrp->css.cgroup)
1779                 curr_cgrp_id = event->cgrp->css.cgroup->kn->id;
1780 
1781         if (next->cgrp && next->cgrp->css.cgroup)
1782                 next_cgrp_id = next->cgrp->css.cgroup->kn->id;
1783 
1784         if (curr_cgrp_id != next_cgrp_id)
1785                 return NULL;
1786 #endif
1787         return next;
1788 }
1789 
1790 /*
1791  * Iterate through the whole groups tree.
1792  */
1793 #define perf_event_groups_for_each(event, groups)                       \
1794         for (event = rb_entry_safe(rb_first(&((groups)->tree)),         \
1795                                 typeof(*event), group_node); event;     \
1796                 event = rb_entry_safe(rb_next(&event->group_node),      \
1797                                 typeof(*event), group_node))
1798 
1799 /*
1800  * Add an event to the lists for its context.
1801  * Must be called with ctx->mutex and ctx->lock held.
1802  */
1803 static void
1804 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1805 {
1806         lockdep_assert_held(&ctx->lock);
1807 
1808         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1809         event->attach_state |= PERF_ATTACH_CONTEXT;
1810 
1811         event->tstamp = perf_event_time(event);
1812 
1813         /*
1814          * If we're a standalone event or group leader, we go to the context
1815          * list; group events are kept attached to the group so that
1816          * perf_group_detach() can, at all times, locate all siblings.
1817          */
1818         if (event->group_leader == event) {
1819                 event->group_caps = event->event_caps;
1820                 add_event_to_groups(event, ctx);
1821         }
1822 
1823         list_add_rcu(&event->event_entry, &ctx->event_list);
1824         ctx->nr_events++;
1825         if (event->attr.inherit_stat)
1826                 ctx->nr_stat++;
1827 
1828         if (event->state > PERF_EVENT_STATE_OFF)
1829                 perf_cgroup_event_enable(event, ctx);
1830 
1831         ctx->generation++;
1832 }
1833 
1834 /*
1835  * Initialize event state based on the perf_event_attr::disabled.
1836  */
1837 static inline void perf_event__state_init(struct perf_event *event)
1838 {
1839         event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1840                                               PERF_EVENT_STATE_INACTIVE;
1841 }
1842 
1843 static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1844 {
1845         int entry = sizeof(u64); /* value */
1846         int size = 0;
1847         int nr = 1;
1848 
1849         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1850                 size += sizeof(u64);
1851 
1852         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1853                 size += sizeof(u64);
1854 
1855         if (event->attr.read_format & PERF_FORMAT_ID)
1856                 entry += sizeof(u64);
1857 
1858         if (event->attr.read_format & PERF_FORMAT_GROUP) {
1859                 nr += nr_siblings;
1860                 size += sizeof(u64);
1861         }
1862 
1863         size += entry * nr;
1864         event->read_size = size;
1865 }
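
/*
 * Illustrative sketch (not part of the kernel build): the read_size computed
 * above describes the buffer a userspace read() on the event fd returns, as
 * documented in perf_event_open(2).  The struct shows the layout for a group
 * read with PERF_FORMAT_ID and both time fields, and expected_read_size()
 * repeats the same arithmetic as __perf_event_read_size() for comparison.
 */
#if 0
#include <stdint.h>
#include <stddef.h>
#include <linux/perf_event.h>

/*
 * read() layout for PERF_FORMAT_GROUP | PERF_FORMAT_ID |
 * PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING.
 */
struct group_read {
	uint64_t nr;		/* number of events in the group	*/
	uint64_t time_enabled;	/* PERF_FORMAT_TOTAL_TIME_ENABLED	*/
	uint64_t time_running;	/* PERF_FORMAT_TOTAL_TIME_RUNNING	*/
	struct {
		uint64_t value;
		uint64_t id;	/* PERF_FORMAT_ID			*/
	} cnt[];
};

static size_t expected_read_size(uint64_t read_format, int nr_siblings)
{
	size_t entry = sizeof(uint64_t);	/* value		*/
	size_t size = 0;
	int nr = 1;

	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		size += sizeof(uint64_t);
	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		size += sizeof(uint64_t);
	if (read_format & PERF_FORMAT_ID)
		entry += sizeof(uint64_t);
	if (read_format & PERF_FORMAT_GROUP) {
		nr += nr_siblings;
		size += sizeof(uint64_t);	/* the 'nr' header	*/
	}
	return size + entry * nr;
}
#endif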
1866 
1867 static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1868 {
1869         struct perf_sample_data *data;
1870         u16 size = 0;
1871 
1872         if (sample_type & PERF_SAMPLE_IP)
1873                 size += sizeof(data->ip);
1874 
1875         if (sample_type & PERF_SAMPLE_ADDR)
1876                 size += sizeof(data->addr);
1877 
1878         if (sample_type & PERF_SAMPLE_PERIOD)
1879                 size += sizeof(data->period);
1880 
1881         if (sample_type & PERF_SAMPLE_WEIGHT)
1882                 size += sizeof(data->weight);
1883 
1884         if (sample_type & PERF_SAMPLE_READ)
1885                 size += event->read_size;
1886 
1887         if (sample_type & PERF_SAMPLE_DATA_SRC)
1888                 size += sizeof(data->data_src.val);
1889 
1890         if (sample_type & PERF_SAMPLE_TRANSACTION)
1891                 size += sizeof(data->txn);
1892 
1893         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
1894                 size += sizeof(data->phys_addr);
1895 
1896         if (sample_type & PERF_SAMPLE_CGROUP)
1897                 size += sizeof(data->cgroup);
1898 
1899         event->header_size = size;
1900 }
1901 
1902 /*
1903  * Called at perf_event creation and when events are attached/detached from a
1904  * group.
1905  */
1906 static void perf_event__header_size(struct perf_event *event)
1907 {
1908         __perf_event_read_size(event,
1909                                event->group_leader->nr_siblings);
1910         __perf_event_header_size(event, event->attr.sample_type);
1911 }
1912 
1913 static void perf_event__id_header_size(struct perf_event *event)
1914 {
1915         struct perf_sample_data *data;
1916         u64 sample_type = event->attr.sample_type;
1917         u16 size = 0;
1918 
1919         if (sample_type & PERF_SAMPLE_TID)
1920                 size += sizeof(data->tid_entry);
1921 
1922         if (sample_type & PERF_SAMPLE_TIME)
1923                 size += sizeof(data->time);
1924 
1925         if (sample_type & PERF_SAMPLE_IDENTIFIER)
1926                 size += sizeof(data->id);
1927 
1928         if (sample_type & PERF_SAMPLE_ID)
1929                 size += sizeof(data->id);
1930 
1931         if (sample_type & PERF_SAMPLE_STREAM_ID)
1932                 size += sizeof(data->stream_id);
1933 
1934         if (sample_type & PERF_SAMPLE_CPU)
1935                 size += sizeof(data->cpu_entry);
1936 
1937         event->id_header_size = size;
1938 }
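
/*
 * Illustrative sketch (not part of the kernel build): the bytes counted above
 * make up the "sample_id" trailer appended to non-sample records when
 * perf_event_attr::sample_id_all is set.  Which fields are present depends on
 * sample_type; the ordering below follows the perf_event_open(2) description.
 */
#if 0
#include <stdint.h>

struct sample_id {
	/* if PERF_SAMPLE_TID		*/ uint32_t pid, tid;
	/* if PERF_SAMPLE_TIME		*/ uint64_t time;
	/* if PERF_SAMPLE_ID		*/ uint64_t id;
	/* if PERF_SAMPLE_STREAM_ID	*/ uint64_t stream_id;
	/* if PERF_SAMPLE_CPU		*/ uint32_t cpu, res;
	/* if PERF_SAMPLE_IDENTIFIER	*/ uint64_t identifier;
};
#endif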
1939 
1940 static bool perf_event_validate_size(struct perf_event *event)
1941 {
1942         /*
1943          * The values computed here will be over-written when we actually
1944          * attach the event.
1945          */
1946         __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1947         __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1948         perf_event__id_header_size(event);
1949 
1950         /*
1951          * Sum the lot; should not exceed the 64k limit we have on records.
1952          * Conservative limit to allow for callchains and other variable fields.
1953          */
1954         if (event->read_size + event->header_size +
1955             event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1956                 return false;
1957 
1958         return true;
1959 }
1960 
1961 static void perf_group_attach(struct perf_event *event)
1962 {
1963         struct perf_event *group_leader = event->group_leader, *pos;
1964 
1965         lockdep_assert_held(&event->ctx->lock);
1966 
1967         /*
1968          * We can have double attach due to group movement in perf_event_open.
1969          */
1970         if (event->attach_state & PERF_ATTACH_GROUP)
1971                 return;
1972 
1973         event->attach_state |= PERF_ATTACH_GROUP;
1974 
1975         if (group_leader == event)
1976                 return;
1977 
1978         WARN_ON_ONCE(group_leader->ctx != event->ctx);
1979 
1980         group_leader->group_caps &= event->event_caps;
1981 
1982         list_add_tail(&event->sibling_list, &group_leader->sibling_list);
1983         group_leader->nr_siblings++;
1984 
1985         perf_event__header_size(group_leader);
1986 
1987         for_each_sibling_event(pos, group_leader)
1988                 perf_event__header_size(pos);
1989 }
1990 
1991 /*
1992  * Remove an event from the lists for its context.
1993  * Must be called with ctx->mutex and ctx->lock held.
1994  */
1995 static void
1996 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1997 {
1998         WARN_ON_ONCE(event->ctx != ctx);
1999         lockdep_assert_held(&ctx->lock);
2000 
2001         /*
2002          * We can have double detach due to exit/hot-unplug + close.
2003          */
2004         if (!(event->attach_state & PERF_ATTACH_CONTEXT))
2005                 return;
2006 
2007         event->attach_state &= ~PERF_ATTACH_CONTEXT;
2008 
2009         ctx->nr_events--;
2010         if (event->attr.inherit_stat)
2011                 ctx->nr_stat--;
2012 
2013         list_del_rcu(&event->event_entry);
2014 
2015         if (event->group_leader == event)
2016                 del_event_from_groups(event, ctx);
2017 
2018         /*
2019          * If event was in error state, then keep it
2020          * that way, otherwise bogus counts will be
2021          * returned on read(). The only way to get out
2022          * of error state is by explicit re-enabling
2023          * of the event
2024          */
2025         if (event->state > PERF_EVENT_STATE_OFF) {
2026                 perf_cgroup_event_disable(event, ctx);
2027                 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2028         }
2029 
2030         ctx->generation++;
2031 }
2032 
2033 static int
2034 perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
2035 {
2036         if (!has_aux(aux_event))
2037                 return 0;
2038 
2039         if (!event->pmu->aux_output_match)
2040                 return 0;
2041 
2042         return event->pmu->aux_output_match(aux_event);
2043 }
2044 
2045 static void put_event(struct perf_event *event);
2046 static void event_sched_out(struct perf_event *event,
2047                             struct perf_cpu_context *cpuctx,
2048                             struct perf_event_context *ctx);
2049 
2050 static void perf_put_aux_event(struct perf_event *event)
2051 {
2052         struct perf_event_context *ctx = event->ctx;
2053         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2054         struct perf_event *iter;
2055 
2056         /*
2057          * If the event uses an aux_event, tear down the link.
2058          */
2059         if (event->aux_event) {
2060                 iter = event->aux_event;
2061                 event->aux_event = NULL;
2062                 put_event(iter);
2063                 return;
2064         }
2065 
2066         /*
2067          * If the event is an aux_event, tear down all links to
2068          * it from other events.
2069          */
2070         for_each_sibling_event(iter, event->group_leader) {
2071                 if (iter->aux_event != event)
2072                         continue;
2073 
2074                 iter->aux_event = NULL;
2075                 put_event(event);
2076 
2077                 /*
2078                  * If it's ACTIVE, schedule it out and put it into ERROR
2079                  * state so that we don't try to schedule it again. Note
2080                  * that perf_event_enable() will clear the ERROR status.
2081                  */
2082                 event_sched_out(iter, cpuctx, ctx);
2083                 perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2084         }
2085 }
2086 
2087 static bool perf_need_aux_event(struct perf_event *event)
2088 {
2089         return !!event->attr.aux_output || !!event->attr.aux_sample_size;
2090 }
2091 
2092 static int perf_get_aux_event(struct perf_event *event,
2093                               struct perf_event *group_leader)
2094 {
2095         /*
2096          * Our group leader must be an aux event if we want to be
2097          * an aux_output. This way, the aux event will precede its
2098          * aux_output events in the group, and therefore will always
2099          * schedule first.
2100          */
2101         if (!group_leader)
2102                 return 0;
2103 
2104         /*
2105          * aux_output and aux_sample_size are mutually exclusive.
2106          */
2107         if (event->attr.aux_output && event->attr.aux_sample_size)
2108                 return 0;
2109 
2110         if (event->attr.aux_output &&
2111             !perf_aux_output_match(event, group_leader))
2112                 return 0;
2113 
2114         if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
2115                 return 0;
2116 
2117         if (!atomic_long_inc_not_zero(&group_leader->refcount))
2118                 return 0;
2119 
2120         /*
2121          * Link aux_outputs to their aux event; this is undone in
2122          * perf_group_detach() by perf_put_aux_event(). When the
2123          * group is torn down, the aux_output events lose their
2124          * link to the aux_event and can't schedule any more.
2125          */
2126         event->aux_event = group_leader;
2127 
2128         return 1;
2129 }
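
/*
 * Illustrative sketch (not part of the kernel build): what perf_get_aux_event()
 * validates is a grouping set up from userspace roughly as below, where an
 * event with attr.aux_output is opened with group_fd pointing at an
 * AUX-capable leader (intel_pt, for example).  The PMU type of the leader is
 * normally read from /sys/bus/event_source/devices/<pmu>/type; here it is just
 * a parameter, and whether a given member may use aux_output is PMU specific.
 */
#if 0
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

static void open_aux_output_group(int aux_pmu_type)
{
	struct perf_event_attr leader, member;
	int leader_fd, member_fd;

	memset(&leader, 0, sizeof(leader));
	leader.size = sizeof(leader);
	leader.type = aux_pmu_type;		/* AUX-capable PMU	*/
	leader.disabled = 1;

	memset(&member, 0, sizeof(member));
	member.size = sizeof(member);
	member.type = PERF_TYPE_HARDWARE;
	member.config = PERF_COUNT_HW_INSTRUCTIONS;
	member.sample_period = 100000;
	member.aux_output = 1;			/* needs the AUX leader	*/

	/* error handling omitted for brevity */
	leader_fd = perf_event_open(&leader, 0, -1, -1, 0);
	/* fails with EINVAL if the leader cannot act as the aux event */
	member_fd = perf_event_open(&member, 0, -1, leader_fd, 0);

	/* ... mmap the AUX area on leader_fd, enable, use, then: ... */
	close(member_fd);
	close(leader_fd);
}
#endif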
2130 
2131 static inline struct list_head *get_event_list(struct perf_event *event)
2132 {
2133         struct perf_event_context *ctx = event->ctx;
2134         return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
2135 }
2136 
2137 static void perf_group_detach(struct perf_event *event)
2138 {
2139         struct perf_event *sibling, *tmp;
2140         struct perf_event_context *ctx = event->ctx;
2141 
2142         lockdep_assert_held(&ctx->lock);
2143 
2144         /*
2145          * We can have double detach due to exit/hot-unplug + close.
2146          */
2147         if (!(event->attach_state & PERF_ATTACH_GROUP))
2148                 return;
2149 
2150         event->attach_state &= ~PERF_ATTACH_GROUP;
2151 
2152         perf_put_aux_event(event);
2153 
2154         /*
2155          * If this is a sibling, remove it from its group.
2156          */
2157         if (event->group_leader != event) {
2158                 list_del_init(&event->sibling_list);
2159                 event->group_leader->nr_siblings--;
2160                 goto out;
2161         }
2162 
2163         /*
2164          * If this was a group event with sibling events then
2165          * upgrade the siblings to singleton events by adding them
2166          * to whatever list we are on.
2167          */
2168         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
2169 
2170                 sibling->group_leader = sibling;
2171                 list_del_init(&sibling->sibling_list);
2172 
2173                 /* Inherit group flags from the previous leader */
2174                 sibling->group_caps = event->group_caps;
2175 
2176                 if (!RB_EMPTY_NODE(&event->group_node)) {
2177                         add_event_to_groups(sibling, event->ctx);
2178 
2179                         if (sibling->state == PERF_EVENT_STATE_ACTIVE)
2180                                 list_add_tail(&sibling->active_list, get_event_list(sibling));
2181                 }
2182 
2183                 WARN_ON_ONCE(sibling->ctx != event->ctx);
2184         }
2185 
2186 out:
2187         perf_event__header_size(event->group_leader);
2188 
2189         for_each_sibling_event(tmp, event->group_leader)
2190                 perf_event__header_size(tmp);
2191 }
2192 
2193 static bool is_orphaned_event(struct perf_event *event)
2194 {
2195         return event->state == PERF_EVENT_STATE_DEAD;
2196 }
2197 
2198 static inline int __pmu_filter_match(struct perf_event *event)
2199 {
2200         struct pmu *pmu = event->pmu;
2201         return pmu->filter_match ? pmu->filter_match(event) : 1;
2202 }
2203 
2204 /*
2205  * Check whether we should attempt to schedule an event group based on
2206  * PMU-specific filtering. An event group can consist of HW and SW events,
2207  * potentially with a SW leader, so we must check all the filters to
2208  * determine whether a group is schedulable.
2209  */
2210 static inline int pmu_filter_match(struct perf_event *event)
2211 {
2212         struct perf_event *sibling;
2213 
2214         if (!__pmu_filter_match(event))
2215                 return 0;
2216 
2217         for_each_sibling_event(sibling, event) {
2218                 if (!__pmu_filter_match(sibling))
2219                         return 0;
2220         }
2221 
2222         return 1;
2223 }
2224 
2225 static inline int
2226 event_filter_match(struct perf_event *event)
2227 {
2228         return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
2229                perf_cgroup_match(event) && pmu_filter_match(event);
2230 }
2231 
2232 static void
2233 event_sched_out(struct perf_event *event,
2234                   struct perf_cpu_context *cpuctx,
2235                   struct perf_event_context *ctx)
2236 {
2237         enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
2238 
2239         WARN_ON_ONCE(event->ctx != ctx);
2240         lockdep_assert_held(&ctx->lock);
2241 
2242         if (event->state != PERF_EVENT_STATE_ACTIVE)
2243                 return;
2244 
2245         /*
2246          * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
2247          * we can schedule events _OUT_ individually through things like
2248          * __perf_remove_from_context().
2249          */
2250         list_del_init(&event->active_list);
2251 
2252         perf_pmu_disable(event->pmu);
2253 
2254         event->pmu->del(event, 0);
2255         event->oncpu = -1;
2256 
2257         if (READ_ONCE(event->pending_disable) >= 0) {
2258                 WRITE_ONCE(event->pending_disable, -1);
2259                 perf_cgroup_event_disable(event, ctx);
2260                 state = PERF_EVENT_STATE_OFF;
2261         }
2262         perf_event_set_state(event, state);
2263 
2264         if (!is_software_event(event))
2265                 cpuctx->active_oncpu--;
2266         if (!--ctx->nr_active)
2267                 perf_event_ctx_deactivate(ctx);
2268         if (event->attr.freq && event->attr.sample_freq)
2269                 ctx->nr_freq--;
2270         if (event->attr.exclusive || !cpuctx->active_oncpu)
2271                 cpuctx->exclusive = 0;
2272 
2273         perf_pmu_enable(event->pmu);
2274 }
2275 
2276 static void
2277 group_sched_out(struct perf_event *group_event,
2278                 struct perf_cpu_context *cpuctx,
2279                 struct perf_event_context *ctx)
2280 {
2281         struct perf_event *event;
2282 
2283         if (group_event->state != PERF_EVENT_STATE_ACTIVE)
2284                 return;
2285 
2286         perf_pmu_disable(ctx->pmu);
2287 
2288         event_sched_out(group_event, cpuctx, ctx);
2289 
2290         /*
2291          * Schedule out siblings (if any):
2292          */
2293         for_each_sibling_event(event, group_event)
2294                 event_sched_out(event, cpuctx, ctx);
2295 
2296         perf_pmu_enable(ctx->pmu);
2297 
2298         if (group_event->attr.exclusive)
2299                 cpuctx->exclusive = 0;
2300 }
2301 
2302 #define DETACH_GROUP    0x01UL
2303 
2304 /*
2305  * Cross CPU call to remove a performance event
2306  *
2307  * We disable the event on the hardware level first. After that we
2308  * remove it from the context list.
2309  */
2310 static void
2311 __perf_remove_from_context(struct perf_event *event,
2312                            struct perf_cpu_context *cpuctx,
2313                            struct perf_event_context *ctx,
2314                            void *info)
2315 {
2316         unsigned long flags = (unsigned long)info;
2317 
2318         if (ctx->is_active & EVENT_TIME) {
2319                 update_context_time(ctx);
2320                 update_cgrp_time_from_cpuctx(cpuctx);
2321         }
2322 
2323         event_sched_out(event, cpuctx, ctx);
2324         if (flags & DETACH_GROUP)
2325                 perf_group_detach(event);
2326         list_del_event(event, ctx);
2327 
2328         if (!ctx->nr_events && ctx->is_active) {
2329                 ctx->is_active = 0;
2330                 ctx->rotate_necessary = 0;
2331                 if (ctx->task) {
2332                         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2333                         cpuctx->task_ctx = NULL;
2334                 }
2335         }
2336 }
2337 
2338 /*
2339  * Remove the event from a task's (or a CPU's) list of events.
2340  *
2341  * If event->ctx is a cloned context, callers must make sure that
2342  * every task struct that event->ctx->task could possibly point to
2343  * remains valid.  This is OK when called from perf_release since
2344  * that only calls us on the top-level context, which can't be a clone.
2345  * When called from perf_event_exit_task, it's OK because the
2346  * context has been detached from its task.
2347  */
2348 static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
2349 {
2350         struct perf_event_context *ctx = event->ctx;
2351 
2352         lockdep_assert_held(&ctx->mutex);
2353 
2354         event_function_call(event, __perf_remove_from_context, (void *)flags);
2355 
2356         /*
2357          * The above event_function_call() can NO-OP when it hits
2358          * TASK_TOMBSTONE. In that case we must already have been detached
2359          * from the context (by perf_event_exit_event()) but the grouping
2360          * might still be intact.
2361          */
2362         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
2363         if ((flags & DETACH_GROUP) &&
2364             (event->attach_state & PERF_ATTACH_GROUP)) {
2365                 /*
2366                  * Since in that case we cannot possibly be scheduled, simply
2367                  * detach now.
2368                  */
2369                 raw_spin_lock_irq(&ctx->lock);
2370                 perf_group_detach(event);
2371                 raw_spin_unlock_irq(&ctx->lock);
2372         }
2373 }
2374 
2375 /*
2376  * Cross CPU call to disable a performance event
2377  */
2378 static void __perf_event_disable(struct perf_event *event,
2379                                  struct perf_cpu_context *cpuctx,
2380                                  struct perf_event_context *ctx,
2381                                  void *info)
2382 {
2383         if (event->state < PERF_EVENT_STATE_INACTIVE)
2384                 return;
2385 
2386         if (ctx->is_active & EVENT_TIME) {
2387                 update_context_time(ctx);
2388                 update_cgrp_time_from_event(event);
2389         }
2390 
2391         if (event == event->group_leader)
2392                 group_sched_out(event, cpuctx, ctx);
2393         else
2394                 event_sched_out(event, cpuctx, ctx);
2395 
2396         perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2397         perf_cgroup_event_disable(event, ctx);
2398 }
2399 
2400 /*
2401  * Disable an event.
2402  *
2403  * If event->ctx is a cloned context, callers must make sure that
2404  * every task struct that event->ctx->task could possibly point to
2405  * remains valid.  This condition is satisfied when called through
2406  * perf_event_for_each_child or perf_event_for_each because they
2407  * hold the top-level event's child_mutex, so any descendant that
2408  * goes to exit will block in perf_event_exit_event().
2409  *
2410  * When called from perf_pending_event it's OK because event->ctx
2411  * is the current context on this CPU and preemption is disabled,
2412  * hence we can't get into perf_event_task_sched_out for this context.
2413  */
2414 static void _perf_event_disable(struct perf_event *event)
2415 {
2416         struct perf_event_context *ctx = event->ctx;
2417 
2418         raw_spin_lock_irq(&ctx->lock);
2419         if (event->state <= PERF_EVENT_STATE_OFF) {
2420                 raw_spin_unlock_irq(&ctx->lock);
2421                 return;
2422         }
2423         raw_spin_unlock_irq(&ctx->lock);
2424 
2425         event_function_call(event, __perf_event_disable, NULL);
2426 }
2427 
2428 void perf_event_disable_local(struct perf_event *event)
2429 {
2430         event_function_local(event, __perf_event_disable, NULL);
2431 }
2432 
2433 /*
2434  * Strictly speaking kernel users cannot create groups and therefore this
2435  * interface does not need the perf_event_ctx_lock() magic.
2436  */
2437 void perf_event_disable(struct perf_event *event)
2438 {
2439         struct perf_event_context *ctx;
2440 
2441         ctx = perf_event_ctx_lock(event);
2442         _perf_event_disable(event);
2443         perf_event_ctx_unlock(event, ctx);
2444 }
2445 EXPORT_SYMBOL_GPL(perf_event_disable);
2446 
2447 void perf_event_disable_inatomic(struct perf_event *event)
2448 {
2449         WRITE_ONCE(event->pending_disable, smp_processor_id());
2450         /* can fail, see perf_pending_event_disable() */
2451         irq_work_queue(&event->pending);
2452 }
2453 
2454 static void perf_set_shadow_time(struct perf_event *event,
2455                                  struct perf_event_context *ctx)
2456 {
2457         /*
2458          * use the correct time source for the time snapshot
2459          *
2460          * We could get by without this by leveraging the
2461          * fact that to get to this function, the caller
2462          * has most likely already called update_context_time()
2463          * and update_cgrp_time_xx() and thus both timestamps
2464          * are identical (or very close). Given that tstamp is
2465          * already adjusted for cgroup, we could say that:
2466          *    tstamp - ctx->timestamp
2467          * is equivalent to
2468          *    tstamp - cgrp->timestamp.
2469          *
2470          * Then, in perf_output_read(), the calculation would
2471          * work with no changes because:
2472          * - event is guaranteed scheduled in
2473          * - no scheduled out in between
2474          * - thus the timestamp would be the same
2475          *
2476          * But this is a bit hairy.
2477          *
2478          * So instead, we have an explicit cgroup call to remain
2479          * within the time source all along. We believe it
2480          * is cleaner and simpler to understand.
2481          */
2482         if (is_cgroup_event(event))
2483                 perf_cgroup_set_shadow_time(event, event->tstamp);
2484         else
2485                 event->shadow_ctx_time = event->tstamp - ctx->timestamp;
2486 }
2487 
2488 #define MAX_INTERRUPTS (~0ULL)
2489 
2490 static void perf_log_throttle(struct perf_event *event, int enable);
2491 static void perf_log_itrace_start(struct perf_event *event);
2492 
2493 static int
2494 event_sched_in(struct perf_event *event,
2495                  struct perf_cpu_context *cpuctx,
2496                  struct perf_event_context *ctx)
2497 {
2498         int ret = 0;
2499 
2500         WARN_ON_ONCE(event->ctx != ctx);
2501 
2502         lockdep_assert_held(&ctx->lock);
2503 
2504         if (event->state <= PERF_EVENT_STATE_OFF)
2505                 return 0;
2506 
2507         WRITE_ONCE(event->oncpu, smp_processor_id());
2508         /*
2509          * Order event::oncpu write to happen before the ACTIVE state is
2510          * visible. This allows perf_event_{stop,read}() to observe the correct
2511          * ->oncpu if it sees ACTIVE.
2512          */
2513         smp_wmb();
2514         perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
2515 
2516         /*
2517          * Unthrottle events: since we were just scheduled in we might have
2518          * missed several ticks already, and for a heavily scheduling task
2519          * there is little guarantee it'll get a tick in a timely manner.
2520          */
2521         if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
2522                 perf_log_throttle(event, 1);
2523                 event->hw.interrupts = 0;
2524         }
2525 
2526         perf_pmu_disable(event->pmu);
2527 
2528         perf_set_shadow_time(event, ctx);
2529 
2530         perf_log_itrace_start(event);
2531 
2532         if (event->pmu->add(event, PERF_EF_START)) {
2533                 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2534                 event->oncpu = -1;
2535                 ret = -EAGAIN;
2536                 goto out;
2537         }
2538 
2539         if (!is_software_event(event))
2540                 cpuctx->active_oncpu++;
2541         if (!ctx->nr_active++)
2542                 perf_event_ctx_activate(ctx);
2543         if (event->attr.freq && event->attr.sample_freq)
2544                 ctx->nr_freq++;
2545 
2546         if (event->attr.exclusive)
2547                 cpuctx->exclusive = 1;
2548 
2549 out:
2550         perf_pmu_enable(event->pmu);
2551 
2552         return ret;
2553 }
2554 
2555 static int
2556 group_sched_in(struct perf_event *group_event,
2557                struct perf_cpu_context *cpuctx,
2558                struct perf_event_context *ctx)
2559 {
2560         struct perf_event *event, *partial_group = NULL;
2561         struct pmu *pmu = ctx->pmu;
2562 
2563         if (group_event->state == PERF_EVENT_STATE_OFF)
2564                 return 0;
2565 
2566         pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2567 
2568         if (event_sched_in(group_event, cpuctx, ctx))
2569                 goto error;
2570 
2571         /*
2572          * Schedule in siblings as one group (if any):
2573          */
2574         for_each_sibling_event(event, group_event) {
2575                 if (event_sched_in(event, cpuctx, ctx)) {
2576                         partial_group = event;
2577                         goto group_error;
2578                 }
2579         }
2580 
2581         if (!pmu->commit_txn(pmu))
2582                 return 0;
2583 
2584 group_error:
2585         /*
2586          * Groups can be scheduled in as one unit only, so undo any
2587          * partial group before returning; the events up to the failed
2588          * event are scheduled out normally.
2589          */
2590         for_each_sibling_event(event, group_event) {
2591                 if (event == partial_group)
2592                         break;
2593 
2594                 event_sched_out(event, cpuctx, ctx);
2595         }
2596         event_sched_out(group_event, cpuctx, ctx);
2597 
2598 error:
2599         pmu->cancel_txn(pmu);
2600         return -EAGAIN;
2601 }
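
/*
 * Illustrative sketch (not part of the kernel build): the group scheduled as a
 * single unit above is created from userspace by passing the leader's fd as
 * group_fd to the second perf_event_open(), and PERF_IOC_FLAG_GROUP applies
 * the enable ioctl to the leader and all of its siblings at once.
 */
#if 0
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

static void open_and_enable_group(void)
{
	struct perf_event_attr attr;
	int leader_fd, sibling_fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.disabled = 1;			/* start the group off	*/

	leader_fd = perf_event_open(&attr, 0, -1, -1, 0);

	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	attr.disabled = 0;			/* siblings follow the leader */
	sibling_fd = perf_event_open(&attr, 0, -1, leader_fd, 0);

	/*
	 * Enable leader and sibling together; they are also put on the
	 * PMU together, or not at all.
	 */
	ioctl(leader_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);

	close(sibling_fd);
	close(leader_fd);
}
#endif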
2602 
2603 /*
2604  * Work out whether we can put this event group on the CPU now.
2605  */
2606 static int group_can_go_on(struct perf_event *event,
2607                            struct perf_cpu_context *cpuctx,
2608                            int can_add_hw)
2609 {
2610         /*
2611          * Groups consisting entirely of software events can always go on.
2612          */
2613         if (event->group_caps & PERF_EV_CAP_SOFTWARE)
2614                 return 1;
2615         /*
2616          * If an exclusive group is already on, no other hardware
2617          * events can go on.
2618          */
2619         if (cpuctx->exclusive)
2620                 return 0;
2621         /*
2622          * If this group is exclusive and there are already
2623          * events on the CPU, it can't go on.
2624          */
2625         if (event->attr.exclusive && cpuctx->active_oncpu)
2626                 return 0;
2627         /*
2628          * Otherwise, try to add it if all previous groups were able
2629          * to go on.
2630          */
2631         return can_add_hw;
2632 }
2633 
2634 static void add_event_to_ctx(struct perf_event *event,
2635                                struct perf_event_context *ctx)
2636 {
2637         list_add_event(event, ctx);
2638         perf_group_attach(event);
2639 }
2640 
2641 static void ctx_sched_out(struct perf_event_context *ctx,
2642                           struct perf_cpu_context *cpuctx,
2643                           enum event_type_t event_type);
2644 static void
2645 ctx_sched_in(struct perf_event_context *ctx,
2646              struct perf_cpu_context *cpuctx,
2647              enum event_type_t event_type,
2648              struct task_struct *task);
2649 
2650 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2651                                struct perf_event_context *ctx,
2652                                enum event_type_t event_type)
2653 {
2654         if (!cpuctx->task_ctx)
2655                 return;
2656 
2657         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2658                 return;
2659 
2660         ctx_sched_out(ctx, cpuctx, event_type);
2661 }
2662 
2663 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2664                                 struct perf_event_context *ctx,
2665                                 struct task_struct *task)
2666 {
2667         cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2668         if (ctx)
2669                 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2670         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2671         if (ctx)
2672                 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2673 }
2674 
2675 /*
2676  * We want to maintain the following priority of scheduling:
2677  *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
2678  *  - task pinned (EVENT_PINNED)
2679  *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
2680  *  - task flexible (EVENT_FLEXIBLE).
2681  *
2682  * In order to avoid unscheduling and scheduling back in everything every
2683  * time an event is added, only do it for the groups of equal priority and
2684  * below.
2685  *
2686  * This can be called after a batch operation on task events, in which case
2687  * event_type is a bit mask of the types of events involved. For CPU events,
2688  * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
2689  */
2690 static void ctx_resched(struct perf_cpu_context *cpuctx,
2691                         struct perf_event_context *task_ctx,
2692                         enum event_type_t event_type)
2693 {
2694         enum event_type_t ctx_event_type;
2695         bool cpu_event = !!(event_type & EVENT_CPU);
2696 
2697         /*
2698          * If pinned groups are involved, flexible groups also need to be
2699          * scheduled out.
2700          */
2701         if (event_type & EVENT_PINNED)
2702                 event_type |= EVENT_FLEXIBLE;
2703 
2704         ctx_event_type = event_type & EVENT_ALL;
2705 
2706         perf_pmu_disable(cpuctx->ctx.pmu);
2707         if (task_ctx)
2708                 task_ctx_sched_out(cpuctx, task_ctx, event_type);
2709 
2710         /*
2711          * Decide which cpu ctx groups to schedule out based on the types
2712          * of events that caused rescheduling:
2713          *  - EVENT_CPU: schedule out corresponding groups;
2714          *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
2715          *  - otherwise, do nothing more.
2716          */
2717         if (cpu_event)
2718                 cpu_ctx_sched_out(cpuctx, ctx_event_type);
2719         else if (ctx_event_type & EVENT_PINNED)
2720                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2721 
2722         perf_event_sched_in(cpuctx, task_ctx, current);
2723         perf_pmu_enable(cpuctx->ctx.pmu);
2724 }
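
/*
 * Illustrative sketch (not part of the kernel build): the four scheduling
 * classes listed above are selected from userspace by perf_event_attr::pinned
 * and by whether the event targets a CPU (pid == -1, cpu >= 0, which may
 * require CAP_PERFMON) or a task (pid >= 0).  A pinned event that cannot be
 * kept on the PMU goes into error state instead of being rotated like a
 * flexible one.
 */
#if 0
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

static void open_pinned_and_flexible(void)
{
	struct perf_event_attr attr;
	int cpu_pinned, task_flexible;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;

	/* CPU pinned: the highest class (EVENT_CPU | EVENT_PINNED). */
	attr.pinned = 1;
	cpu_pinned = perf_event_open(&attr, -1, 0, -1, 0);

	/* Task flexible: the lowest class, rotated when over-committed. */
	attr.pinned = 0;
	task_flexible = perf_event_open(&attr, 0, -1, -1, 0);

	close(cpu_pinned);
	close(task_flexible);
}
#endif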
2725 
2726 void perf_pmu_resched(struct pmu *pmu)
2727 {
2728         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2729         struct perf_event_context *task_ctx = cpuctx->task_ctx;
2730 
2731         perf_ctx_lock(cpuctx, task_ctx);
2732         ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
2733         perf_ctx_unlock(cpuctx, task_ctx);
2734 }
2735 
2736 /*
2737  * Cross CPU call to install and enable a performance event
2738  *
2739  * Very similar to remote_function() + event_function() but cannot assume that
2740  * things like ctx->is_active and cpuctx->task_ctx are set.
2741  */
2742 static int  __perf_install_in_context(void *info)
2743 {
2744         struct perf_event *event = info;
2745         struct perf_event_context *ctx = event->ctx;
2746         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2747         struct perf_event_context *task_ctx = cpuctx->task_ctx;
2748         bool reprogram = true;
2749         int ret = 0;
2750 
2751         raw_spin_lock(&cpuctx->ctx.lock);
2752         if (ctx->task) {
2753                 raw_spin_lock(&ctx->lock);
2754                 task_ctx = ctx;
2755 
2756                 reprogram = (ctx->task == current);
2757 
2758                 /*
2759                  * If the task is running, it must be running on this CPU,
2760                  * otherwise we cannot reprogram things.
2761                  *
2762          * If it's not running, we don't care; ctx->lock will
2763                  * serialize against it becoming runnable.
2764                  */
2765                 if (task_curr(ctx->task) && !reprogram) {
2766                         ret = -ESRCH;
2767                         goto unlock;
2768                 }
2769 
2770                 WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2771         } else if (task_ctx) {
2772                 raw_spin_lock(&task_ctx->lock);
2773         }
2774 
2775 #ifdef CONFIG_CGROUP_PERF
2776         if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
2777                 /*
2778                  * If the current cgroup doesn't match the event's
2779                  * cgroup, we should not try to schedule it.
2780                  */
2781                 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
2782                 reprogram = cgroup_is_descendant(cgrp->css.cgroup,
2783                                         event->cgrp->css.cgroup);
2784         }
2785 #endif
2786 
2787         if (reprogram) {
2788                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2789                 add_event_to_ctx(event, ctx);
2790                 ctx_resched(cpuctx, task_ctx, get_event_type(event));
2791         } else {
2792                 add_event_to_ctx(event, ctx);
2793         }
2794 
2795 unlock:
2796         perf_ctx_unlock(cpuctx, task_ctx);
2797 
2798         return ret;
2799 }
2800 
2801 static bool exclusive_event_installable(struct perf_event *event,
2802                                         struct perf_event_context *ctx);
2803 
2804 /*
2805  * Attach a performance event to a context.
2806  *
2807  * Very similar to event_function_call, see comment there.
2808  */
2809 static void
2810 perf_install_in_context(struct perf_event_context *ctx,
2811                         struct perf_event *event,
2812                         int cpu)
2813 {
2814         struct task_struct *task = READ_ONCE(ctx->task);
2815 
2816         lockdep_assert_held(&ctx->mutex);
2817 
2818         WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
2819 
2820         if (event->cpu != -1)
2821                 event->cpu = cpu;
2822 
2823         /*
2824          * Ensures that if we can observe event->ctx, both the event and ctx
2825          * will be 'complete'. See perf_iterate_sb_cpu().
2826          */
2827         smp_store_release(&event->ctx, ctx);
2828 
2829         /*
2830          * perf_event_attr::disabled events will not run and can be initialized
2831          * without an IPI. Except when this is the first event for the context, in
2832          * which case we need the magic of the IPI to set ctx->is_active.
2833          *
2834          * The IOC_ENABLE that is sure to follow the creation of a disabled
2835          * event will issue the IPI and reprogram the hardware.
2836          */
2837         if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
2838                 raw_spin_lock_irq(&ctx->lock);
2839                 if (ctx->task == TASK_TOMBSTONE) {
2840                         raw_spin_unlock_irq(&ctx->lock);
2841                         return;
2842                 }
2843                 add_event_to_ctx(event, ctx);
2844                 raw_spin_unlock_irq(&ctx->lock);
2845                 return;
2846         }
2847 
2848         if (!task) {
2849                 cpu_function_call(cpu, __perf_install_in_context, event);
2850                 return;
2851         }
2852 
2853         /*
2854          * Should not happen; we validate that the ctx is still alive before calling.
2855          */
2856         if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2857                 return;
2858 
2859         /*
2860          * Installing events is tricky because we cannot rely on ctx->is_active
2861          * to be set in case this is the nr_events 0 -> 1 transition.
2862          *
2863          * Instead we use task_curr(), which tells us if the task is running.
2864          * However, since we use task_curr() outside of rq::lock, we can race
2865          * against the actual state. This means the result can be wrong.
2866          *
2867          * If we get a false positive, we retry, this is harmless.
2868          *
2869          * If we get a false negative, things are complicated. If we are after
2870          * perf_event_context_sched_in() ctx::lock will serialize us, and the
2871          * value must be correct. If we're before, it doesn't matter since
2872          * perf_event_context_sched_in() will program the counter.
2873          *
2874          * However, this hinges on the remote context switch having observed
2875          * our task->perf_event_ctxp[] store, such that it will in fact take
2876          * ctx::lock in perf_event_context_sched_in().
2877          *
2878          * We do this by task_function_call(); if the IPI fails to hit the task,
2879          * we know any future context switch of the task must see the
2880          * perf_event_ctxp[] store.
2881          */
2882 
2883         /*
2884          * This smp_mb() orders the task->perf_event_ctxp[] store with the
2885          * task_cpu() load, such that if the IPI then does not find the task
2886          * running, a future context switch of that task must observe the
2887          * store.
2888          */
2889         smp_mb();
2890 again:
2891         if (!task_function_call(task, __perf_install_in_context, event))
2892                 return;
2893 
2894         raw_spin_lock_irq(&ctx->lock);
2895         task = ctx->task;
2896         if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
2897                 /*
2898                  * Cannot happen because we already checked above (which also
2899                  * cannot happen), and we hold ctx->mutex, which serializes us
2900                  * against perf_event_exit_task_context().
2901                  */
2902                 raw_spin_unlock_irq(&ctx->lock);
2903                 return;
2904         }
2905         /*
2906          * If the task is not running, ctx->lock will avoid it becoming so,
2907          * thus we can safely install the event.
2908          */
2909         if (task_curr(task)) {
2910                 raw_spin_unlock_irq(&ctx->lock);
2911                 goto again;
2912         }
2913         add_event_to_ctx(event, ctx);
2914         raw_spin_unlock_irq(&ctx->lock);
2915 }
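
/*
 * Illustrative sketch (not part of the kernel build): the usual userspace
 * counterpart of the "disabled + IOC_ENABLE" path described in the comment
 * above.  The event is created with attr.disabled = 1, so it is installed
 * without being programmed, and the later enable/disable ioctls bracket the
 * region of interest.
 */
#if 0
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

static uint64_t count_region(void (*fn)(void))
{
	struct perf_event_attr attr;
	uint64_t count = 0;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	attr.disabled = 1;		/* starts in the OFF state	*/
	attr.exclude_kernel = 1;

	fd = perf_event_open(&attr, 0, -1, -1, 0);

	ioctl(fd, PERF_EVENT_IOC_RESET, 0);
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);	/* reprograms via IPI	*/
	fn();
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

	read(fd, &count, sizeof(count));
	close(fd);
	return count;
}
#endif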
2916 
2917 /*
2918  * Cross CPU call to enable a performance event
2919  */
2920 static void __perf_event_enable(struct perf_event *event,
2921                                 struct perf_cpu_context *cpuctx,
2922                                 struct perf_event_context *ctx,
2923                                 void *info)
2924 {
2925         struct perf_event *leader = event->group_leader;
2926         struct perf_event_context *task_ctx;
2927 
2928         if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2929             event->state <= PERF_EVENT_STATE_ERROR)
2930                 return;
2931 
2932         if (ctx->is_active)
2933                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2934 
2935         perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2936         perf_cgroup_event_enable(event, ctx);
2937 
2938         if (!ctx->is_active)
2939                 return;
2940 
2941         if (!event_filter_match(event)) {
2942                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2943                 return;
2944         }
2945 
2946         /*
2947          * If the event is in a group and isn't the group leader,
2948          * then don't put it on unless the group is on.
2949          */
2950         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
2951                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2952                 return;
2953         }
2954 
2955         task_ctx = cpuctx->task_ctx;
2956         if (ctx->task)
2957                 WARN_ON_ONCE(task_ctx != ctx);
2958 
2959         ctx_resched(cpuctx, task_ctx, get_event_type(event));
2960 }
2961 
2962 /*
2963  * Enable an event.
2964  *
2965  * If event->ctx is a cloned context, callers must make sure that
2966  * every task struct that event->ctx->task could possibly point to
2967  * remains valid.  This condition is satisfied when called through
2968  * perf_event_for_each_child or perf_event_for_each as described
2969  * for perf_event_disable.
2970  */
2971 static void _perf_event_enable(struct perf_event *event)
2972 {
2973         struct perf_event_context *ctx = event->ctx;
2974 
2975         raw_spin_lock_irq(&ctx->lock);
2976         if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2977             event->state <  PERF_EVENT_STATE_ERROR) {
2978                 raw_spin_unlock_irq(&ctx->lock);
2979                 return;
2980         }
2981 
2982         /*
2983          * If the event is in error state, clear that first.
2984          *
2985          * That way, if we see the event in error state below, we know that it
2986          * has gone back into error state, as distinct from the task having
2987          * been scheduled away before the cross-call arrived.
2988          */
2989         if (event->state == PERF_EVENT_STATE_ERROR)
2990                 event->state = PERF_EVENT_STATE_OFF;
2991         raw_spin_unlock_irq(&ctx->lock);
2992 
2993         event_function_call(event, __perf_event_enable, NULL);
2994 }
2995 
2996 /*
2997  * See perf_event_disable();
2998  */
2999 void perf_event_enable(struct perf_event *event)
3000 {
3001         struct perf_event_context *ctx;
3002 
3003         ctx = perf_event_ctx_lock(event);
3004         _perf_event_enable(event);
3005         perf_event_ctx_unlock(event, ctx);
3006 }
3007 EXPORT_SYMBOL_GPL(perf_event_enable);
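
/*
 * Illustrative sketch (not part of the kernel build): in-kernel users pair the
 * exported perf_event_enable()/perf_event_disable() with
 * perf_event_create_kernel_counter(), in the style of the hardlockup
 * watchdog.  Names below are made up and error handling is minimal.
 */
#if 0
static struct perf_event *example_counter;

static int example_counter_start(int cpu)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_HARDWARE,
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.size		= sizeof(struct perf_event_attr),
		.disabled	= 1,		/* enable explicitly below */
	};

	example_counter = perf_event_create_kernel_counter(&attr, cpu, NULL,
							    NULL, NULL);
	if (IS_ERR(example_counter))
		return PTR_ERR(example_counter);

	perf_event_enable(example_counter);
	return 0;
}

static void example_counter_stop(void)
{
	u64 enabled, running;

	perf_event_disable(example_counter);
	pr_info("cycles: %llu\n",
		perf_event_read_value(example_counter, &enabled, &running));
	perf_event_release_kernel(example_counter);
}
#endif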
3008 
3009 struct stop_event_data {
3010         struct perf_event       *event;
3011         unsigned int            restart;
3012 };
3013 
3014 static int __perf_event_stop(void *info)
3015 {
3016         struct stop_event_data *sd = info;
3017         struct perf_event *event = sd->event;
3018 
3019         /* if it's already INACTIVE, do nothing */
3020         if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3021                 return 0;
3022 
3023         /* matches smp_wmb() in event_sched_in() */
3024         smp_rmb();
3025 
3026         /*
3027          * There is a window with interrupts enabled before we get here,
3028          * so we need to check again lest we try to stop another CPU's event.
3029          */
3030         if (READ_ONCE(event->oncpu) != smp_processor_id())
3031                 return -EAGAIN;
3032 
3033         event->pmu->stop(event, PERF_EF_UPDATE);
3034 
3035         /*
3036          * May race with the actual stop (through perf_pmu_output_stop()),
3037          * but it is only used for events with an AUX ring buffer, and such
3038          * events will refuse to restart because of rb::aux_mmap_count==0,
3039          * see comments in perf_aux_output_begin().
3040          *
3041          * Since this is happening on an event-local CPU, no trace is lost
3042          * while restarting.
3043          */
3044         if (sd->restart)
3045                 event->pmu->start(event, 0);
3046 
3047         return 0;
3048 }
3049 
3050 static int perf_event_stop(struct perf_event *event, int restart)
3051 {
3052         struct stop_event_data sd = {
3053                 .event          = event,
3054                 .restart        = restart,
3055         };
3056         int ret = 0;
3057 
3058         do {
3059                 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3060                         return 0;
3061 
3062                 /* matches smp_wmb() in event_sched_in() */
3063                 smp_rmb();
3064 
3065                 /*
3066                  * We only want to restart ACTIVE events, so if the event goes
3067                  * inactive here (event->oncpu==-1), there's nothing more to do;
3068                  * fall through with ret==-ENXIO.
3069                  */
3070                 ret = cpu_function_call(READ_ONCE(event->oncpu),
3071                                         __perf_event_stop, &sd);
3072         } while (ret == -EAGAIN);
3073 
3074         return ret;
3075 }
3076 
3077 /*
3078  * In order to contain the amount of raciness and trickiness in the address
3079  * filter configuration management, it is a two-part process:
3080  *
3081  * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
3082  *      we update the addresses of corresponding vmas in
3083  *      event::addr_filter_ranges array and bump the event::addr_filters_gen;
3084  * (p2) when an event is scheduled in (pmu::add), it calls
3085  *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
3086  *      if the generation has changed since the previous call.
3087  *
3088  * If (p1) happens while the event is active, we restart it to force (p2).
3089  *
3090  * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
3091  *     pre-existing mappings, called once when new filters arrive via SET_FILTER
3092  *     ioctl;
3093  * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
3094  *     registered mapping, called for every new mmap(), with mm::mmap_lock down
3095  *     for reading;
3096  * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
3097  *     of exec.
3098  */
3099 void perf_event_addr_filters_sync(struct perf_event *event)
3100 {
3101         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
3102 
3103         if (!has_addr_filter(event))
3104                 return;
3105 
3106         raw_spin_lock(&ifh->lock);
3107         if (event->addr_filters_gen != event->hw.addr_filters_gen) {
3108                 event->pmu->addr_filters_sync(event);
3109                 event->hw.addr_filters_gen = event->addr_filters_gen;
3110         }
3111         raw_spin_unlock(&ifh->lock);
3112 }
3113 EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
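
/*
 * Illustrative sketch (not part of the kernel build): the filters described
 * above reach the kernel through PERF_EVENT_IOC_SET_FILTER.  The string
 * syntax is PMU specific; the one below follows the documented intel_pt
 * address-filter form "filter|start|stop <address>/<size>[@<object file>]",
 * and the address and path are only examples.
 */
#if 0
#include <sys/ioctl.h>
#include <linux/perf_event.h>

static int set_address_filter(int pt_fd)
{
	/* trace only the given 4KiB of text inside the named binary */
	return ioctl(pt_fd, PERF_EVENT_IOC_SET_FILTER,
		     "filter 0x401000/0x1000@/usr/bin/example");
}
#endif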
3114 
3115 static int _perf_event_refresh(struct perf_event *event, int refresh)
3116 {
3117         /*
3118          * not supported on inherited events
3119          */
3120         if (event->attr.inherit || !is_sampling_event(event))
3121                 return -EINVAL;
3122 
3123         atomic_add(refresh, &event->event_limit);
3124         _perf_event_enable(event);
3125 
3126         return 0;
3127 }
3128 
3129 /*
3130  * See perf_event_disable()
3131  */
3132 int perf_event_refresh(struct perf_event *event, int refresh)
3133 {
3134         struct perf_event_context *ctx;
3135         int ret;
3136 
3137         ctx = perf_event_ctx_lock(event);
3138         ret = _perf_event_refresh(event, refresh);
3139         perf_event_ctx_unlock(event, ctx);
3140 
3141         return ret;
3142 }
3143 EXPORT_SYMBOL_GPL(perf_event_refresh);
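/*
 * A minimal userspace sketch of the refresh mechanism above (illustrative;
 * assumes fd is a sampling event opened with attr.disabled = 1). Each
 * PERF_EVENT_IOC_REFRESH call adds to event_limit and enables the event;
 * after that many overflows the event disables itself and signals POLL_HUP.
 *
 *      #include <sys/ioctl.h>
 *      #include <linux/perf_event.h>
 *
 *      // Allow the next 4 overflows before the event auto-disables again.
 *      if (ioctl(fd, PERF_EVENT_IOC_REFRESH, 4) < 0)
 *              perror("PERF_EVENT_IOC_REFRESH");
 */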
3144 
3145 static int perf_event_modify_breakpoint(struct perf_event *bp,
3146                                          struct perf_event_attr *attr)
3147 {
3148         int err;
3149 
3150         _perf_event_disable(bp);
3151 
3152         err = modify_user_hw_breakpoint_check(bp, attr, true);
3153 
3154         if (!bp->attr.disabled)
3155                 _perf_event_enable(bp);
3156 
3157         return err;
3158 }
3159 
3160 static int perf_event_modify_attr(struct perf_event *event,
3161                                   struct perf_event_attr *attr)
3162 {
3163         if (event->attr.type != attr->type)
3164                 return -EINVAL;
3165 
3166         switch (event->attr.type) {
3167         case PERF_TYPE_BREAKPOINT:
3168                 return perf_event_modify_breakpoint(event, attr);
3169         default:
3170                 /* Place holder for future additions. */
3171                 return -EOPNOTSUPP;
3172         }
3173 }
3174 
3175 static void ctx_sched_out(struct perf_event_context *ctx,
3176                           struct perf_cpu_context *cpuctx,
3177                           enum event_type_t event_type)
3178 {
3179         struct perf_event *event, *tmp;
3180         int is_active = ctx->is_active;
3181 
3182         lockdep_assert_held(&ctx->lock);
3183 
3184         if (likely(!ctx->nr_events)) {
3185                 /*
3186                  * See __perf_remove_from_context().
3187                  */
3188                 WARN_ON_ONCE(ctx->is_active);
3189                 if (ctx->task)
3190                         WARN_ON_ONCE(cpuctx->task_ctx);
3191                 return;
3192         }
3193 
3194         ctx->is_active &= ~event_type;
3195         if (!(ctx->is_active & EVENT_ALL))
3196                 ctx->is_active = 0;
3197 
3198         if (ctx->task) {
3199                 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3200                 if (!ctx->is_active)
3201                         cpuctx->task_ctx = NULL;
3202         }
3203 
3204         /*
3205          * Always update time if it was set; not only when it changes.
3206          * Otherwise we can 'forget' to update time for any but the last
3207          * context we sched out. For example:
3208          *
3209          *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
3210          *   ctx_sched_out(.event_type = EVENT_PINNED)
3211          *
3212          * would only update time for the pinned events.
3213          */
3214         if (is_active & EVENT_TIME) {
3215                 /* update (and stop) ctx time */
3216                 update_context_time(ctx);
3217                 update_cgrp_time_from_cpuctx(cpuctx);
3218         }
3219 
3220         is_active ^= ctx->is_active; /* changed bits */
3221 
3222         if (!ctx->nr_active || !(is_active & EVENT_ALL))
3223                 return;
3224 
3225         perf_pmu_disable(ctx->pmu);
3226         if (is_active & EVENT_PINNED) {
3227                 list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
3228                         group_sched_out(event, cpuctx, ctx);
3229         }
3230 
3231         if (is_active & EVENT_FLEXIBLE) {
3232                 list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
3233                         group_sched_out(event, cpuctx, ctx);
3234 
3235                 /*
3236                  * Since we cleared EVENT_FLEXIBLE, also clear
3237                  * rotate_necessary; it will be reset by
3238                  * ctx_flexible_sched_in() when needed.
3239                  */
3240                 ctx->rotate_necessary = 0;
3241         }
3242         perf_pmu_enable(ctx->pmu);
3243 }
3244 
3245 /*
3246  * Test whether two contexts are equivalent, i.e. whether they have both been
3247  * cloned from the same version of the same context.
3248  *
3249  * Equivalence is measured using a generation number in the context that is
3250  * incremented on each modification to it; see unclone_ctx(), list_add_event()
3251  * and list_del_event().
3252  */
3253 static int context_equiv(struct perf_event_context *ctx1,
3254                          struct perf_event_context *ctx2)
3255 {
3256         lockdep_assert_held(&ctx1->lock);
3257         lockdep_assert_held(&ctx2->lock);
3258 
3259         /* Pinning disables the swap optimization */
3260         if (ctx1->pin_count || ctx2->pin_count)
3261                 return 0;
3262 
3263         /* If ctx1 is the parent of ctx2 */
3264         if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
3265                 return 1;
3266 
3267         /* If ctx2 is the parent of ctx1 */
3268         if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
3269                 return 1;
3270 
3271         /*
3272          * If ctx1 and ctx2 have the same parent; we flatten the parent
3273          * hierarchy, see perf_event_init_context().
3274          */
3275         if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
3276                         ctx1->parent_gen == ctx2->parent_gen)
3277                 return 1;
3278 
3279         /* Unmatched */
3280         return 0;
3281 }
3282 
3283 static void __perf_event_sync_stat(struct perf_event *event,
3284                                      struct perf_event *next_event)
3285 {
3286         u64 value;
3287 
3288         if (!event->attr.inherit_stat)
3289                 return;
3290 
3291         /*
3292          * Update the event value; we cannot use perf_event_read()
3293          * because we're in the middle of a context switch and have IRQs
3294          * disabled, which upsets smp_call_function_single(). However,
3295          * we know the event must be on the current CPU, therefore we
3296          * don't need to use it.
3297          */
3298         if (event->state == PERF_EVENT_STATE_ACTIVE)
3299                 event->pmu->read(event);
3300 
3301         perf_event_update_time(event);
3302 
3303         /*
3304          * In order to keep per-task stats reliable we need to flip the event
3305          * values when we flip the contexts.
3306          */
3307         value = local64_read(&next_event->count);
3308         value = local64_xchg(&event->count, value);
3309         local64_set(&next_event->count, value);
3310 
3311         swap(event->total_time_enabled, next_event->total_time_enabled);
3312         swap(event->total_time_running, next_event->total_time_running);
3313 
3314         /*
3315          * Since we swizzled the values, update the user visible data too.
3316          */
3317         perf_event_update_userpage(event);
3318         perf_event_update_userpage(next_event);
3319 }
3320 
3321 static void perf_event_sync_stat(struct perf_event_context *ctx,
3322                                    struct perf_event_context *next_ctx)
3323 {
3324         struct perf_event *event, *next_event;
3325 
3326         if (!ctx->nr_stat)
3327                 return;
3328 
3329         update_context_time(ctx);
3330 
3331         event = list_first_entry(&ctx->event_list,
3332                                    struct perf_event, event_entry);
3333 
3334         next_event = list_first_entry(&next_ctx->event_list,
3335                                         struct perf_event, event_entry);
3336 
3337         while (&event->event_entry != &ctx->event_list &&
3338                &next_event->event_entry != &next_ctx->event_list) {
3339 
3340                 __perf_event_sync_stat(event, next_event);
3341 
3342                 event = list_next_entry(event, event_entry);
3343                 next_event = list_next_entry(next_event, event_entry);
3344         }
3345 }
3346 
3347 static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
3348                                          struct task_struct *next)
3349 {
3350         struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
3351         struct perf_event_context *next_ctx;
3352         struct perf_event_context *parent, *next_parent;
3353         struct perf_cpu_context *cpuctx;
3354         int do_switch = 1;
3355 
3356         if (likely(!ctx))
3357                 return;
3358 
3359         cpuctx = __get_cpu_context(ctx);
3360         if (!cpuctx->task_ctx)
3361                 return;
3362 
3363         rcu_read_lock();
3364         next_ctx = next->perf_event_ctxp[ctxn];
3365         if (!next_ctx)
3366                 goto unlock;
3367 
3368         parent = rcu_dereference(ctx->parent_ctx);
3369         next_parent = rcu_dereference(next_ctx->parent_ctx);
3370 
3371         /* If neither context has a parent context, they cannot be clones. */
3372         if (!parent && !next_parent)
3373                 goto unlock;
3374 
3375         if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
3376                 /*
3377                  * Looks like the two contexts are clones, so we might be
3378                  * able to optimize the context switch.  We lock both
3379                  * contexts and check that they are clones under the
3380                  * lock (including re-checking that neither has been
3381                  * uncloned in the meantime).  It doesn't matter which
3382                  * order we take the locks because no other cpu could
3383                  * be trying to lock both of these tasks.
3384                  */
3385                 raw_spin_lock(&ctx->lock);
3386                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
3387                 if (context_equiv(ctx, next_ctx)) {
3388                         struct pmu *pmu = ctx->pmu;
3389 
3390                         WRITE_ONCE(ctx->task, next);
3391                         WRITE_ONCE(next_ctx->task, task);
3392 
3393                         /*
3394                          * PMU specific parts of task perf context can require
3395                          * additional synchronization. As an example of such
3396                          * synchronization, see the implementation details of
3397                          * Intel LBR call stack data profiling.
3398                          */
3399                         if (pmu->swap_task_ctx)
3400                                 pmu->swap_task_ctx(ctx, next_ctx);
3401                         else
3402                                 swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
3403 
3404                         /*
3405                          * RCU_INIT_POINTER here is safe because we've not
3406                          * modified the ctx and the above modification of
3407                          * ctx->task and ctx->task_ctx_data are immaterial
3408                          * since those values are always verified under
3409                          * ctx->lock which we're now holding.
3410                          */
3411                         RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
3412                         RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
3413 
3414                         do_switch = 0;
3415 
3416                         perf_event_sync_stat(ctx, next_ctx);
3417                 }
3418                 raw_spin_unlock(&next_ctx->lock);
3419                 raw_spin_unlock(&ctx->lock);
3420         }
3421 unlock:
3422         rcu_read_unlock();
3423 
3424         if (do_switch) {
3425                 raw_spin_lock(&ctx->lock);
3426                 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
3427                 raw_spin_unlock(&ctx->lock);
3428         }
3429 }
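/*
 * An illustrative example of the optimization above (hypothetical workload):
 * under "perf record -e cycles -- make -j2", both child processes inherit
 * contexts cloned from the same parent context. When the scheduler switches
 * between the two children, context_equiv() succeeds and only the context
 * pointers (plus any PMU task_ctx_data) are swapped, instead of scheduling
 * every event out of one task and back into the other.
 */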
3430 
3431 static DEFINE_PER_CPU(struct list_head, sched_cb_list);
3432 
3433 void perf_sched_cb_dec(struct pmu *pmu)
3434 {
3435         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3436 
3437         this_cpu_dec(perf_sched_cb_usages);
3438 
3439         if (!--cpuctx->sched_cb_usage)
3440                 list_del(&cpuctx->sched_cb_entry);
3441 }
3442 
3443 
3444 void perf_sched_cb_inc(struct pmu *pmu)
3445 {
3446         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3447 
3448         if (!cpuctx->sched_cb_usage++)
3449                 list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
3450 
3451         this_cpu_inc(perf_sched_cb_usages);
3452 }
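/*
 * A minimal sketch of how a PMU driver might pair the two helpers above
 * (hypothetical foo_pmu_* functions, not from this file): the count is
 * bumped while at least one of the driver's events needs the sched_task()
 * callback, typically from its pmu::add()/pmu::del() paths.
 *
 *      static void foo_pmu_need_sched_cb(struct perf_event *event)
 *      {
 *              // Ask the core to call pmu->sched_task() on context switches.
 *              perf_sched_cb_inc(event->ctx->pmu);
 *      }
 *
 *      static void foo_pmu_drop_sched_cb(struct perf_event *event)
 *      {
 *              // Drop the request once none of our events needs it anymore.
 *              perf_sched_cb_dec(event->ctx->pmu);
 *      }
 */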
3453 
3454 /*
3455  * This function provides the context switch callback to the lower code
3456  * layer. It is invoked ONLY when the context switch callback is enabled.
3457  *
3458  * This callback is relevant even to per-cpu events; for example multi event
3459  * PEBS requires this to provide PID/TID information. This requires we flush
3460  * all queued PEBS records before we context switch to a new task.
3461  */
3462 static void perf_pmu_sched_task(struct task_struct *prev,
3463                                 struct task_struct *next,
3464                                 bool sched_in)
3465 {
3466         struct perf_cpu_context *cpuctx;
3467         struct pmu *pmu;
3468 
3469         if (prev == next)
3470                 return;
3471 
3472         list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
3473                 pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
3474 
3475                 if (WARN_ON_ONCE(!pmu->sched_task))
3476                         continue;
3477 
3478                 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3479                 perf_pmu_disable(pmu);
3480 
3481                 pmu->sched_task(cpuctx->task_ctx, sched_in);
3482 
3483                 perf_pmu_enable(pmu);
3484                 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3485         }
3486 }
3487 
3488 static void perf_event_switch(struct task_struct *task,
3489                               struct task_struct *next_prev, bool sched_in);
3490 
3491 #define for_each_task_context_nr(ctxn)                                  \
3492         for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
3493 
3494 /*
3495  * Called from scheduler to remove the events of the current task,
3496  * with interrupts disabled.
3497  *
3498  * We stop each event and update the event value in event->count.
3499  *
3500  * This does not protect us against NMI, but disable()
3501  * sets the disabled bit in the control field of event _before_
3502  * accessing the event control register. If an NMI hits, then it will
3503  * not restart the event.
3504  */
3505 void __perf_event_task_sched_out(struct task_struct *task,
3506                                  struct task_struct *next)
3507 {
3508         int ctxn;
3509 
3510         if (__this_cpu_read(perf_sched_cb_usages))
3511                 perf_pmu_sched_task(task, next, false);
3512 
3513         if (atomic_read(&nr_switch_events))
3514                 perf_event_switch(task, next, false);
3515 
3516         for_each_task_context_nr(ctxn)
3517                 perf_event_context_sched_out(task, ctxn, next);
3518 
3519         /*
3520          * if cgroup events exist on this CPU, then we need
3521          * to check if we have to switch out PMU state.
3522          * cgroup events are system-wide mode only.
3523          */
3524         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3525                 perf_cgroup_sched_out(task, next);
3526 }
3527 
3528 /*
3529  * Called with IRQs disabled
3530  */
3531 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
3532                               enum event_type_t event_type)
3533 {
3534         ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
3535 }
3536 
3537 static bool perf_less_group_idx(const void *l, const void *r)
3538 {
3539         const struct perf_event *le = *(const struct perf_event **)l;
3540         const struct perf_event *re = *(const struct perf_event **)r;
3541 
3542         return le->group_index < re->group_index;
3543 }
3544 
3545 static void swap_ptr(void *l, void *r)
3546 {
3547         void **lp = l, **rp = r;
3548 
3549         swap(*lp, *rp);
3550 }
3551 
3552 static const struct min_heap_callbacks perf_min_heap = {
3553         .elem_size = sizeof(struct perf_event *),
3554         .less = perf_less_group_idx,
3555         .swp = swap_ptr,
3556 };
3557 
3558 static void __heap_add(struct min_heap *heap, struct perf_event *event)
3559 {
3560         struct perf_event **itrs = heap->data;
3561 
3562         if (event) {
3563                 itrs[heap->nr] = event;
3564                 heap->nr++;
3565         }
3566 }
3567 
3568 static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
3569                                 struct perf_event_groups *groups, int cpu,
3570                                 int (*func)(struct perf_event *, void *),
3571                                 void *data)
3572 {
3573 #ifdef CONFIG_CGROUP_PERF
3574         struct cgroup_subsys_state *css = NULL;
3575 #endif
3576         /* Space for per CPU and/or any CPU event iterators. */
3577         struct perf_event *itrs[2];
3578         struct min_heap event_heap;
3579         struct perf_event **evt;
3580         int ret;
3581 
3582         if (cpuctx) {
3583                 event_heap = (struct min_heap){
3584                         .data = cpuctx->heap,
3585                         .nr = 0,
3586                         .size = cpuctx->heap_size,
3587                 };
3588 
3589                 lockdep_assert_held(&cpuctx->ctx.lock);
3590 
3591 #ifdef CONFIG_CGROUP_PERF
3592                 if (cpuctx->cgrp)
3593                         css = &cpuctx->cgrp->css;
3594 #endif
3595         } else {
3596                 event_heap = (struct min_heap){
3597                         .data = itrs,
3598                         .nr = 0,
3599                         .size = ARRAY_SIZE(itrs),
3600                 };
3601                 /* Events not within a CPU context may be on any CPU. */
3602                 __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
3603         }
3604         evt = event_heap.data;
3605 
3606         __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
3607 
3608 #ifdef CONFIG_CGROUP_PERF
3609         for (; css; css = css->parent)
3610                 __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
3611 #endif
3612 
3613         min_heapify_all(&event_heap, &perf_min_heap);
3614 
3615         while (event_heap.nr) {
3616                 ret = func(*evt, data);
3617                 if (ret)
3618                         return ret;
3619 
3620                 *evt = perf_event_groups_next(*evt);
3621                 if (*evt)
3622                         min_heapify(&event_heap, 0, &perf_min_heap);
3623                 else
3624                         min_heap_pop(&event_heap, &perf_min_heap);
3625         }
3626 
3627         return 0;
3628 }
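/*
 * A worked example of the merge above (illustrative numbers): suppose the
 * per-CPU iterator yields groups with group_index {1, 5}, the any-CPU (-1)
 * iterator yields {3}, and there is no cgroup iterator. The min-heap is
 * seeded with the first group of each iterator and always pops the smallest
 * group_index, so func() visits the groups in the global order 1, 3, 5,
 * i.e. strictly by insertion order across all iterators.
 */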
3629 
3630 static int merge_sched_in(struct perf_event *event, void *data)
3631 {
3632         struct perf_event_context *ctx = event->ctx;
3633         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3634         int *can_add_hw = data;
3635 
3636         if (event->state <= PERF_EVENT_STATE_OFF)
3637                 return 0;
3638 
3639         if (!event_filter_match(event))
3640                 return 0;
3641 
3642         if (group_can_go_on(event, cpuctx, *can_add_hw)) {
3643                 if (!group_sched_in(event, cpuctx, ctx))
3644                         list_add_tail(&event->active_list, get_event_list(event));
3645         }
3646 
3647         if (event->state == PERF_EVENT_STATE_INACTIVE) {
3648                 if (event->attr.pinned) {
3649                         perf_cgroup_event_disable(event, ctx);
3650                         perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3651                 }
3652 
3653                 *can_add_hw = 0;
3654                 ctx->rotate_necessary = 1;
3655                 perf_mux_hrtimer_restart(cpuctx);
3656         }
3657 
3658         return 0;
3659 }
3660 
3661 static void
3662 ctx_pinned_sched_in(struct perf_event_context *ctx,
3663                     struct perf_cpu_context *cpuctx)
3664 {
3665         int can_add_hw = 1;
3666 
3667         if (ctx != &cpuctx->ctx)
3668                 cpuctx = NULL;
3669 
3670         visit_groups_merge(cpuctx, &ctx->pinned_groups,
3671                            smp_processor_id(),
3672                            merge_sched_in, &can_add_hw);
3673 }
3674 
3675 static void
3676 ctx_flexible_sched_in(struct perf_event_context *ctx,
3677                       struct perf_cpu_context *cpuctx)
3678 {
3679         int can_add_hw = 1;
3680 
3681         if (ctx != &cpuctx->ctx)
3682                 cpuctx = NULL;
3683 
3684         visit_groups_merge(cpuctx, &ctx->flexible_groups,
3685                            smp_processor_id(),
3686                            merge_sched_in, &can_add_hw);
3687 }
3688 
3689 static void
3690 ctx_sched_in(struct perf_event_context *ctx,
3691              struct perf_cpu_context *cpuctx,
3692              enum event_type_t event_type,
3693              struct task_struct *task)
3694 {
3695         int is_active = ctx->is_active;
3696         u64 now;
3697 
3698         lockdep_assert_held(&ctx->lock);
3699 
3700         if (likely(!ctx->nr_events))
3701                 return;
3702 
3703         ctx->is_active |= (event_type | EVENT_TIME);
3704         if (ctx->task) {
3705                 if (!is_active)
3706                         cpuctx->task_ctx = ctx;
3707                 else
3708                         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3709         }
3710 
3711         is_active ^= ctx->is_active; /* changed bits */
3712 
3713         if (is_active & EVENT_TIME) {
3714                 /* start ctx time */
3715                 now = perf_clock();
3716                 ctx->timestamp = now;
3717                 perf_cgroup_set_timestamp(task, ctx);
3718         }
3719 
3720         /*
3721          * First go through the list and put on any pinned groups
3722          * in order to give them the best chance of going on.
3723          */
3724         if (is_active & EVENT_PINNED)
3725                 ctx_pinned_sched_in(ctx, cpuctx);
3726 
3727         /* Then walk through the lower prio flexible groups */
3728         if (is_active & EVENT_FLEXIBLE)
3729                 ctx_flexible_sched_in(ctx, cpuctx);
3730 }
3731 
3732 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
3733                              enum event_type_t event_type,
3734                              struct task_struct *task)
3735 {
3736         struct perf_event_context *ctx = &cpuctx->ctx;
3737 
3738         ctx_sched_in(ctx, cpuctx, event_type, task);
3739 }
3740 
3741 static void perf_event_context_sched_in(struct perf_event_context *ctx,
3742                                         struct task_struct *task)
3743 {
3744         struct perf_cpu_context *cpuctx;
3745 
3746         cpuctx = __get_cpu_context(ctx);
3747         if (cpuctx->task_ctx == ctx)
3748                 return;
3749 
3750         perf_ctx_lock(cpuctx, ctx);
3751         /*
3752          * We must check ctx->nr_events while holding ctx->lock, such
3753          * that we serialize against perf_install_in_context().
3754          */
3755         if (!ctx->nr_events)
3756                 goto unlock;
3757 
3758         perf_pmu_disable(ctx->pmu);
3759         /*
3760          * We want to keep the following priority order:
3761          * cpu pinned (that don't need to move), task pinned,
3762          * cpu flexible, task flexible.
3763          *
3764          * However, if task's ctx is not carrying any pinned
3765          * events, no need to flip the cpuctx's events around.
3766          */
3767         if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
3768                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3769         perf_event_sched_in(cpuctx, ctx, task);
3770         perf_pmu_enable(ctx->pmu);
3771 
3772 unlock:
3773         perf_ctx_unlock(cpuctx, ctx);
3774 }
3775 
3776 /*
3777  * Called from scheduler to add the events of the current task
3778  * with interrupts disabled.
3779  *
3780  * We restore the event value and then enable it.
3781  *
3782  * This does not protect us against NMI, but enable()
3783  * sets the enabled bit in the control field of event _before_
3784  * accessing the event control register. If an NMI hits, then it will
3785  * keep the event running.
3786  */
3787 void __perf_event_task_sched_in(struct task_struct *prev,
3788                                 struct task_struct *task)
3789 {
3790         struct perf_event_context *ctx;
3791         int ctxn;
3792 
3793         /*
3794          * If cgroup events exist on this CPU, then we need to check if we have
3795          * to switch in PMU state; cgroup event are system-wide mode only.
3796          *
3797          * Since cgroup events are CPU events, we must schedule these in before
3798          * we schedule in the task events.
3799          */
3800         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3801                 perf_cgroup_sched_in(prev, task);
3802 
3803         for_each_task_context_nr(ctxn) {
3804                 ctx = task->perf_event_ctxp[ctxn];
3805                 if (likely(!ctx))
3806                         continue;
3807 
3808                 perf_event_context_sched_in(ctx, task);
3809         }
3810 
3811         if (atomic_read(&nr_switch_events))
3812                 perf_event_switch(task, prev, true);
3813 
3814         if (__this_cpu_read(perf_sched_cb_usages))
3815                 perf_pmu_sched_task(prev, task, true);
3816 }
3817 
3818 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
3819 {
3820         u64 frequency = event->attr.sample_freq;
3821         u64 sec = NSEC_PER_SEC;
3822         u64 divisor, dividend;
3823 
3824         int count_fls, nsec_fls, frequency_fls, sec_fls;
3825 
3826         count_fls = fls64(count);
3827         nsec_fls = fls64(nsec);
3828         frequency_fls = fls64(frequency);
3829         sec_fls = 30;
3830 
3831         /*
3832          * We got @count in @nsec, with a target of sample_freq HZ
3833          * the target period becomes:
3834          *
3835          *             @count * 10^9
3836          * period = -------------------
3837          *          @nsec * sample_freq
3838          *
3839          */
3840 
3841         /*
3842          * Reduce accuracy by one bit such that @a and @b converge
3843          * to a similar magnitude.
3844          */
3845 #define REDUCE_FLS(a, b)                \
3846 do {                                    \
3847         if (a##_fls > b##_fls) {        \
3848                 a >>= 1;                \
3849                 a##_fls--;              \
3850         } else {                        \
3851                 b >>= 1;                \
3852                 b##_fls--;              \
3853         }                               \
3854 } while (0)
3855 
3856         /*
3857          * Reduce accuracy until either term fits in a u64, then proceed with
3858          * the other, so that finally we can do a u64/u64 division.
3859          */
3860         while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
3861                 REDUCE_FLS(nsec, frequency);
3862                 REDUCE_FLS(sec, count);
3863         }
3864 
3865         if (count_fls + sec_fls > 64) {
3866                 divisor = nsec * frequency;
3867 
3868                 while (count_fls + sec_fls > 64) {
3869                         REDUCE_FLS(count, sec);
3870                         divisor >>= 1;
3871                 }
3872 
3873                 dividend = count * sec;
3874         } else {
3875                 dividend = count * sec;
3876 
3877                 while (nsec_fls + frequency_fls > 64) {
3878                         REDUCE_FLS(nsec, frequency);
3879                         dividend >>= 1;
3880                 }
3881 
3882                 divisor = nsec * frequency;
3883         }
3884 
3885         if (!divisor)
3886                 return dividend;
3887 
3888         return div64_u64(dividend, divisor);
3889 }
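/*
 * A worked example of the formula above (illustrative numbers): if the event
 * counted 1,000,000 events in 10 ms (nsec = 10,000,000) with
 * sample_freq = 1000 Hz, then
 *
 *      period = 1,000,000 * 10^9 / (10,000,000 * 1000) = 100,000
 *
 * i.e. at the observed rate of 10^8 events/sec, sampling every 100,000
 * events yields roughly 1000 samples per second. The REDUCE_FLS() games
 * above only exist to keep the intermediate products within 64 bits.
 */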
3890 
3891 static DEFINE_PER_CPU(int, perf_throttled_count);
3892 static DEFINE_PER_CPU(u64, perf_throttled_seq);
3893 
3894 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
3895 {
3896         struct hw_perf_event *hwc = &event->hw;
3897         s64 period, sample_period;
3898         s64 delta;
3899 
3900         period = perf_calculate_period(event, nsec, count);
3901 
3902         delta = (s64)(period - hwc->sample_period);
3903         delta = (delta + 7) / 8; /* low pass filter */
3904 
3905         sample_period = hwc->sample_period + delta;
3906 
3907         if (!sample_period)
3908                 sample_period = 1;
3909 
3910         hwc->sample_period = sample_period;
3911 
3912         if (local64_read(&hwc->period_left) > 8*sample_period) {
3913                 if (disable)
3914                         event->pmu->stop(event, PERF_EF_UPDATE);
3915 
3916                 local64_set(&hwc->period_left, 0);
3917 
3918                 if (disable)
3919                         event->pmu->start(event, PERF_EF_RELOAD);
3920         }
3921 }
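/*
 * A worked example of the low-pass filter above (illustrative numbers): with
 * hwc->sample_period = 100,000 and a newly computed period of 180,000,
 * delta = (80,000 + 7) / 8 = 10,000, so sample_period only moves to 110,000
 * on this adjustment. Large swings are thus spread over several ticks rather
 * than applied at once, and period_left is only reset once it has drifted
 * beyond eight periods' worth of events.
 */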
3922 
3923 /*
3924  * combine freq adjustment with unthrottling to avoid two passes over the
3925  * events. At the same time, make sure that having freq events does not change
3926  * the rate of unthrottling as that would introduce bias.
3927  */
3928 static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
3929                                            int needs_unthr)
3930 {
3931         struct perf_event *event;
3932         struct hw_perf_event *hwc;
3933         u64 now, period = TICK_NSEC;
3934         s64 delta;
3935 
3936         /*
3937          * only need to iterate over all events if:
3938          * - the context has events in frequency mode (needs freq adjust)
3939          * - there are events to unthrottle on this cpu
3940          */
3941         if (!(ctx->nr_freq || needs_unthr))
3942                 return;
3943 
3944         raw_spin_lock(&ctx->lock);
3945         perf_pmu_disable(ctx->pmu);
3946 
3947         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3948                 if (event->state != PERF_EVENT_STATE_ACTIVE)
3949                         continue;
3950 
3951                 if (!event_filter_match(event))
3952                         continue;
3953 
3954                 perf_pmu_disable(event->pmu);
3955 
3956                 hwc = &event->hw;
3957 
3958                 if (hwc->interrupts == MAX_INTERRUPTS) {
3959                         hwc->interrupts = 0;
3960                         perf_log_throttle(event, 1);
3961                         event->pmu->start(event, 0);
3962                 }
3963 
3964                 if (!event->attr.freq || !event->attr.sample_freq)
3965                         goto next;
3966 
3967                 /*
3968                  * stop the event and update event->count
3969                  */
3970                 event->pmu->stop(event, PERF_EF_UPDATE);
3971 
3972                 now = local64_read(&event->count);
3973                 delta = now - hwc->freq_count_stamp;
3974                 hwc->freq_count_stamp = now;
3975 
3976                 /*
3977                  * Restart the event, reloading only if the value
3978                  * has changed. Since we have already stopped the
3979                  * event above, pass disable == false so that
3980                  * perf_adjust_period() does not stop and start it
3981                  * a second time.
3982                  */
3983                 if (delta > 0)
3984                         perf_adjust_period(event, period, delta, false);
3985 
3986                 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
3987         next:
3988                 perf_pmu_enable(event->pmu);
3989         }
3990 
3991         perf_pmu_enable(ctx->pmu);
3992         raw_spin_unlock(&ctx->lock);
3993 }
3994 
3995 /*
3996  * Move @event to the tail of @ctx's eligible events.
3997  */
3998 static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
3999 {
4000         /*
4001          * Rotate the first entry of the non-pinned groups to the tail.
4002          * Rotation might be disabled by the inheritance code.
4003          */
4004         if (ctx->rotate_disable)
4005                 return;
4006 
4007         perf_event_groups_delete(&ctx->flexible_groups, event);
4008         perf_event_groups_insert(&ctx->flexible_groups, event);
4009 }
4010 
4011 /* pick an event from the flexible_groups to rotate */
4012 static inline struct perf_event *
4013 ctx_event_to_rotate(struct perf_event_context *ctx)
4014 {
4015         struct perf_event *event;
4016 
4017         /* pick the first active flexible event */
4018         event = list_first_entry_or_null(&ctx->flexible_active,
4019                                          struct perf_event, active_list);
4020 
4021         /* if no active flexible event, pick the first event */
4022         if (!event) {
4023                 event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
4024                                       typeof(*event), group_node);
4025         }
4026 
4027         /*
4028          * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
4029          * finds there are unschedulable events, it will set it again.
4030          */
4031         ctx->rotate_necessary = 0;
4032 
4033         return event;
4034 }
4035 
4036 static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
4037 {
4038         struct perf_event *cpu_event = NULL, *task_event = NULL;
4039         struct perf_event_context *task_ctx = NULL;
4040         int cpu_rotate, task_rotate;
4041 
4042         /*
4043          * Since we run this from IRQ context, nobody can install new
4044          * events, thus the event count values are stable.
4045          */
4046 
4047         cpu_rotate = cpuctx->ctx.rotate_necessary;
4048         task_ctx = cpuctx->task_ctx;
4049         task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
4050 
4051         if (!(cpu_rotate || task_rotate))
4052                 return false;
4053 
4054         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
4055         perf_pmu_disable(cpuctx->ctx.pmu);
4056 
4057         if (task_rotate)
4058                 task_event = ctx_event_to_rotate(task_ctx);
4059         if (cpu_rotate)
4060                 cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
4061 
4062         /*
4063          * As per the order given at ctx_resched(), first 'pop' the task's
4064          * flexible events and then, if needed, the CPU's flexible events.
4065          */
4066         if (task_event || (task_ctx && cpu_event))
4067                 ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
4068         if (cpu_event)
4069                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
4070 
4071         if (task_event)
4072                 rotate_ctx(task_ctx, task_event);
4073         if (cpu_event)
4074                 rotate_ctx(&cpuctx->ctx, cpu_event);
4075 
4076         perf_event_sched_in(cpuctx, task_ctx, current);
4077 
4078         perf_pmu_enable(cpuctx->ctx.pmu);
4079         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
4080 
4081         return true;
4082 }
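/*
 * An illustrative example of the rotation above: if a CPU context has
 * flexible groups A, B and C but the PMU can only accommodate two of them,
 * one rotation moves the first (active) flexible group to the tail of the
 * tree, so the group that previously missed out is scheduled by the next
 * perf_event_sched_in(). Over successive ticks every flexible group gets a
 * share of the hardware.
 */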
4083 
4084 void perf_event_task_tick(void)
4085 {
4086         struct list_head *head = this_cpu_ptr(&active_ctx_list);
4087         struct perf_event_context *ctx, *tmp;
4088         int throttled;
4089 
4090         lockdep_assert_irqs_disabled();
4091 
4092         __this_cpu_inc(perf_throttled_seq);
4093         throttled = __this_cpu_xchg(perf_throttled_count, 0);
4094         tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
4095 
4096         list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
4097                 perf_adjust_freq_unthr_context(ctx, throttled);
4098 }
4099 
4100 static int event_enable_on_exec(struct perf_event *event,
4101                                 struct perf_event_context *ctx)
4102 {
4103         if (!event->attr.enable_on_exec)
4104                 return 0;
4105 
4106         event->attr.enable_on_exec = 0;
4107         if (event->state >= PERF_EVENT_STATE_INACTIVE)
4108                 return 0;
4109 
4110         perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
4111 
4112         return 1;
4113 }
4114 
4115 /*
4116  * Enable all of a task's events that have been marked enable-on-exec.
4117  * This expects task == current.
4118  */
4119 static void perf_event_enable_on_exec(int ctxn)
4120 {
4121         struct perf_event_context *ctx, *clone_ctx = NULL;
4122         enum event_type_t event_type = 0;
4123         struct perf_cpu_context *cpuctx;
4124         struct perf_event *event;
4125         unsigned long flags;
4126         int enabled = 0;
4127 
4128         local_irq_save(flags);
4129         ctx = current->perf_event_ctxp[ctxn];
4130         if (!ctx || !ctx->nr_events)
4131                 goto out;
4132 
4133         cpuctx = __get_cpu_context(ctx);
4134         perf_ctx_lock(cpuctx, ctx);
4135         ctx_sched_out(ctx, cpuctx, EVENT_TIME);
4136         list_for_each_entry(event, &ctx->event_list, event_entry) {
4137                 enabled |= event_enable_on_exec(event, ctx);
4138                 event_type |= get_event_type(event);
4139         }
4140 
4141         /*
4142          * Unclone and reschedule this context if we enabled any event.
4143          */
4144         if (enabled) {
4145                 clone_ctx = unclone_ctx(ctx);
4146                 ctx_resched(cpuctx, ctx, event_type);
4147         } else {
4148                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
4149         }
4150         perf_ctx_unlock(cpuctx, ctx);
4151 
4152 out:
4153         local_irq_restore(flags);
4154 
4155         if (clone_ctx)
4156                 put_ctx(clone_ctx);
4157 }
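/*
 * A minimal userspace sketch of the attribute handled above (illustrative;
 * this is the pattern tools use to start counting only once the target
 * program has actually exec'd):
 *
 *      #include <unistd.h>
 *      #include <sys/syscall.h>
 *      #include <linux/perf_event.h>
 *
 *      struct perf_event_attr attr = {
 *              .type           = PERF_TYPE_HARDWARE,
 *              .config         = PERF_COUNT_HW_CPU_CYCLES,
 *              .size           = sizeof(attr),
 *              .disabled       = 1,    // start off...
 *              .enable_on_exec = 1,    // ...and turn on at execve()
 *      };
 *
 *      // Open on ourselves, then exec the workload (argv is assumed); the
 *      // counter only starts once the new program image is in place.
 *      int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 *      execvp(argv[1], &argv[1]);
 */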
4158 
4159 struct perf_read_data {
4160         struct perf_event *event;
4161         bool group;
4162         int ret;
4163 };
4164 
4165 static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
4166 {
4167         u16 local_pkg, event_pkg;
4168 
4169         if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
4170                 int local_cpu = smp_processor_id();
4171 
4172                 event_pkg = topology_physical_package_id(event_cpu);
4173                 local_pkg = topology_physical_package_id(local_cpu);
4174 
4175                 if (event_pkg == local_pkg)
4176                         return local_cpu;
4177         }
4178 
4179         return event_cpu;
4180 }
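/*
 * An illustrative example for the helper above: with two four-core packages,
 * an event bound to CPU 2 (package 0) can be read locally from CPU 1 (also
 * package 0) when the PMU advertises PERF_EV_CAP_READ_ACTIVE_PKG (typical
 * for uncore-style PMUs), saving the cross-CPU IPI; a reader on CPU 5
 * (package 1) still has to target CPU 2.
 */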
4181 
4182 /*
4183  * Cross CPU call to read the hardware event
4184  */
4185 static void __perf_event_read(void *info)
4186 {
4187         struct perf_read_data *data = info;
4188         struct perf_event *sub, *event = data->event;
4189         struct perf_event_context *ctx = event->ctx;
4190         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
4191         struct pmu *pmu = event->pmu;
4192 
4193         /*
4194          * If this is a task context, we need to check whether it is
4195          * the current task context of this cpu.  If not, it has been
4196          * scheduled out before the smp call arrived.  In that case
4197          * event->count would have been updated to a recent sample
4198          * when the event was scheduled out.
4199          */
4200         if (ctx->task && cpuctx->task_ctx != ctx)
4201                 return;
4202 
4203         raw_spin_lock(&ctx->lock);
4204         if (ctx->is_active & EVENT_TIME) {
4205                 update_context_time(ctx);
4206                 update_cgrp_time_from_event(event);
4207         }
4208 
4209         perf_event_update_time(event);
4210         if (data->group)
4211                 perf_event_update_sibling_time(event);
4212 
4213         if (event->state != PERF_EVENT_STATE_ACTIVE)
4214                 goto unlock;
4215 
4216         if (!data->group) {
4217                 pmu->read(event);
4218                 data->ret = 0;
4219                 goto unlock;
4220         }
4221 
4222         pmu->start_txn(pmu, PERF_PMU_TXN_READ);
4223 
4224         pmu->read(event);
4225 
4226         for_each_sibling_event(sub, event) {
4227                 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
4228                         /*
4229                          * Use sibling's PMU rather than @event's since
4230                          * sibling could be on a different (e.g. software) PMU.
4231                          */
4232                         sub->pmu->read(sub);
4233                 }
4234         }
4235 
4236         data->ret = pmu->commit_txn(pmu);
4237 
4238 unlock:
4239         raw_spin_unlock(&ctx->lock);
4240 }
4241 
4242 static inline u64 perf_event_count(struct perf_event *event)
4243 {
4244         return local64_read(&event->count) + atomic64_read(&event->child_count);
4245 }
4246 
4247 /*
4248  * NMI-safe method to read a local event, that is an event that
4249  * is:
4250  *   - either for the current task, or for this CPU
4251  *   - does not have inherit set, for inherited task events
4252  *     will not be local and we cannot read them atomically
4253  *   - must not have a pmu::count method
4254  */
4255 int perf_event_read_local(struct perf_event *event, u64 *value,
4256                           u64 *enabled, u64 *running)
4257 {
4258         unsigned long flags;
4259         int ret = 0;
4260 
4261         /*
4262          * Disabling interrupts avoids all counter scheduling (context
4263          * switches, timer based rotation and IPIs).
4264          */
4265         local_irq_save(flags);
4266 
4267         /*
4268          * It must not be an event with inherit set, we cannot read
4269          * all child counters from atomic context.
4270          */
4271         if (event->attr.inherit) {
4272                 ret = -EOPNOTSUPP;
4273                 goto out;
4274         }
4275 
4276         /* If this is a per-task event, it must be for current */
4277         if ((event->attach_state & PERF_ATTACH_TASK) &&
4278             event->hw.target != current) {
4279                 ret = -EINVAL;
4280                 goto out;
4281         }
4282 
4283         /* If this is a per-CPU event, it must be for this CPU */
4284         if (!(event->attach_state & PERF_ATTACH_TASK) &&
4285             event->cpu != smp_processor_id()) {
4286                 ret = -EINVAL;
4287                 goto out;
4288         }
4289 
4290         /* If this is a pinned event, it must be running on this CPU */
4291         if (event->attr.pinned && event->oncpu != smp_processor_id()) {
4292                 ret = -EBUSY;
4293                 goto out;
4294         }
4295 
4296         /*
4297          * If the event is currently on this CPU, it's either a per-task event,
4298          * or local to this CPU. Furthermore, it means it's ACTIVE (otherwise
4299          * oncpu == -1).
4300          */
4301         if (event->oncpu == smp_processor_id())
4302                 event->pmu->read(event);
4303 
4304         *value = local64_read(&event->count);
4305         if (enabled || running) {
4306                 u64 now = event->shadow_ctx_time + perf_clock();
4307                 u64 __enabled, __running;
4308 
4309                 __perf_update_times(event, now, &__enabled, &__running);
4310                 if (enabled)
4311                         *enabled = __enabled;
4312                 if (running)
4313                         *running = __running;
4314         }
4315 out:
4316         local_irq_restore(flags);
4317 
4318         return ret;
4319 }
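/*
 * A minimal in-kernel usage sketch (illustrative; assumes the caller already
 * holds a valid reference on a local event, much as the BPF helpers built on
 * top of this function do):
 *
 *      u64 value, enabled, running;
 *      int err;
 *
 *      // Safe from NMI/IRQ context; fails with -EOPNOTSUPP/-EINVAL/-EBUSY
 *      // if the event is not local in the sense described above.
 *      err = perf_event_read_local(event, &value, &enabled, &running);
 *      if (!err)
 *              pr_debug("count=%llu enabled=%llu running=%llu\n",
 *                       value, enabled, running);
 */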
4320 
4321 static int perf_event_read(struct perf_event *event, bool group)
4322 {
4323         enum perf_event_state state = READ_ONCE(event->state);
4324         int event_cpu, ret = 0;
4325 
4326         /*
4327          * If event is enabled and currently active on a CPU, update the
4328          * value in the event structure:
4329          */
4330 again:
4331         if (state == PERF_EVENT_STATE_ACTIVE) {
4332                 struct perf_read_data data;
4333 
4334                 /*
4335                  * Orders the ->state and ->oncpu loads such that if we see
4336                  * ACTIVE we must also see the right ->oncpu.
4337                  *
4338                  * Matches the smp_wmb() from event_sched_in().
4339                  */
4340                 smp_rmb();
4341 
4342                 event_cpu = READ_ONCE(event->oncpu);
4343                 if ((unsigned)event_cpu >= nr_cpu_ids)
4344                         return 0;
4345 
4346                 data = (struct perf_read_data){
4347                         .event = event,
4348                         .group = group,
4349                         .ret = 0,
4350                 };
4351 
4352                 preempt_disable();
4353                 event_cpu = __perf_event_read_cpu(event, event_cpu);
4354 
4355                 /*
4356                  * Purposely ignore the smp_call_function_single() return
4357                  * value.
4358                  *
4359                  * If event_cpu isn't a valid CPU it means the event got
4360                  * scheduled out and that will have updated the event count.
4361                  *
4362                  * Therefore, either way, we'll have an up-to-date event count
4363                  * after this.
4364                  */
4365                 (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
4366                 preempt_enable();
4367                 ret = data.ret;
4368 
4369         } else if (state == PERF_EVENT_STATE_INACTIVE) {
4370                 struct perf_event_context *ctx = event->ctx;
4371                 unsigned long flags;
4372 
4373                 raw_spin_lock_irqsave(&ctx->lock, flags);
4374                 state = event->state;
4375                 if (state != PERF_EVENT_STATE_INACTIVE) {
4376                         raw_spin_unlock_irqrestore(&ctx->lock, flags);
4377                         goto again;
4378                 }
4379 
4380                 /*
4381                  * May read while context is not active (e.g., thread is
4382                  * blocked); in that case we cannot update the context time.
4383                  */
4384                 if (ctx->is_active & EVENT_TIME) {
4385                         update_context_time(ctx);
4386                         update_cgrp_time_from_event(event);
4387                 }
4388 
4389                 perf_event_update_time(event);
4390                 if (group)
4391                         perf_event_update_sibling_time(event);
4392                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4393         }
4394 
4395         return ret;
4396 }
4397 
4398 /*
4399  * Initialize the perf_event context in a task_struct:
4400  */
4401 static void __perf_event_init_context(struct perf_event_context *ctx)
4402 {
4403         raw_spin_lock_init(&ctx->lock);
4404         mutex_init(&ctx->mutex);
4405         INIT_LIST_HEAD(&ctx->active_ctx_list);
4406         perf_event_groups_init(&ctx->pinned_groups);
4407         perf_event_groups_init(&ctx->flexible_groups);
4408         INIT_LIST_HEAD(&ctx->event_list);
4409         INIT_LIST_HEAD(&ctx->pinned_active);
4410         INIT_LIST_HEAD(&ctx->flexible_active);
4411         refcount_set(&ctx->refcount, 1);
4412 }
4413 
4414 static struct perf_event_context *
4415 alloc_perf_context(struct pmu *pmu, struct task_struct *task)
4416 {
4417         struct perf_event_context *ctx;
4418 
4419         ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4420         if (!ctx)
4421                 return NULL;
4422 
4423         __perf_event_init_context(ctx);
4424         if (task)
4425                 ctx->task = get_task_struct(task);
4426         ctx->pmu = pmu;
4427 
4428         return ctx;
4429 }
4430 
4431 static struct task_struct *
4432 find_lively_task_by_vpid(pid_t vpid)
4433 {
4434         struct task_struct *task;
4435 
4436         rcu_read_lock();
4437         if (!vpid)
4438                 task = current;
4439         else
4440                 task = find_task_by_vpid(vpid);
4441         if (task)
4442                 get_task_struct(task);
4443         rcu_read_unlock();
4444 
4445         if (!task)
4446                 return ERR_PTR(-ESRCH);
4447 
4448         return task;
4449 }
4450 
4451 /*
4452  * Returns a matching context with refcount and pincount.
4453  */
4454 static struct perf_event_context *
4455 find_get_context(struct pmu *pmu, struct task_struct *task,
4456                 struct perf_event *event)
4457 {
4458         struct perf_event_context *ctx, *clone_ctx = NULL;
4459         struct perf_cpu_context *cpuctx;
4460         void *task_ctx_data = NULL;
4461         unsigned long flags;
4462         int ctxn, err;
4463         int cpu = event->cpu;
4464 
4465         if (!task) {
4466                 /* Must be root to operate on a CPU event: */
4467                 err = perf_allow_cpu(&event->attr);
4468                 if (err)
4469                         return ERR_PTR(err);
4470 
4471                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
4472                 ctx = &cpuctx->ctx;
4473                 get_ctx(ctx);
4474                 ++ctx->pin_count;
4475 
4476                 return ctx;
4477         }
4478 
4479         err = -EINVAL;
4480         ctxn = pmu->task_ctx_nr;
4481         if (ctxn < 0)
4482                 goto errout;
4483 
4484         if (event->attach_state & PERF_ATTACH_TASK_DATA) {
4485                 task_ctx_data = alloc_task_ctx_data(pmu);
4486                 if (!task_ctx_data) {
4487                         err = -ENOMEM;
4488                         goto errout;
4489                 }
4490         }
4491 
4492 retry:
4493         ctx = perf_lock_task_context(task, ctxn, &flags);
4494         if (ctx) {
4495                 clone_ctx = unclone_ctx(ctx);
4496                 ++ctx->pin_count;
4497 
4498                 if (task_ctx_data && !ctx->task_ctx_data) {
4499                         ctx->task_ctx_data = task_ctx_data;
4500                         task_ctx_data = NULL;
4501                 }
4502                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4503 
4504                 if (clone_ctx)
4505                         put_ctx(clone_ctx);
4506         } else {
4507                 ctx = alloc_perf_context(pmu, task);
4508                 err = -ENOMEM;
4509                 if (!ctx)
4510                         goto errout;
4511 
4512                 if (task_ctx_data) {
4513                         ctx->task_ctx_data = task_ctx_data;
4514                         task_ctx_data = NULL;
4515                 }
4516 
4517                 err = 0;
4518                 mutex_lock(&task->perf_event_mutex);
4519                 /*
4520                  * If it has already passed perf_event_exit_task(),
4521                  * we must see PF_EXITING; it takes this mutex too.
4522                  */
4523                 if (task->flags & PF_EXITING)
4524                         err = -ESRCH;
4525                 else if (task->perf_event_ctxp[ctxn])
4526                         err = -EAGAIN;
4527                 else {
4528                         get_ctx(ctx);
4529                         ++ctx->pin_count;
4530                         rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
4531                 }
4532                 mutex_unlock(&task->perf_event_mutex);
4533 
4534                 if (unlikely(err)) {
4535                         put_ctx(ctx);
4536 
4537                         if (err == -EAGAIN)
4538                                 goto retry;
4539                         goto errout;
4540                 }
4541         }
4542 
4543         free_task_ctx_data(pmu, task_ctx_data);
4544         return ctx;
4545 
4546 errout:
4547         free_task_ctx_data(pmu, task_ctx_data);
4548         return ERR_PTR(err);
4549 }
4550 
4551 static void perf_event_free_filter(struct perf_event *event);
4552 static void perf_event_free_bpf_prog(struct perf_event *event);
4553 
4554 static void free_event_rcu(struct rcu_head *head)
4555 {
4556         struct perf_event *event;
4557 
4558         event = container_of(head, struct perf_event, rcu_head);
4559         if (event->ns)
4560                 put_pid_ns(event->ns);
4561         perf_event_free_filter(event);
4562         kfree(event);
4563 }
4564 
4565 static void ring_buffer_attach(struct perf_event *event,
4566                                struct perf_buffer *rb);
4567 
4568 static void detach_sb_event(struct perf_event *event)
4569 {
4570         struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
4571 
4572         raw_spin_lock(&pel->lock);
4573         list_del_rcu(&event->sb_list);
4574         raw_spin_unlock(&pel->lock);
4575 }
4576 
4577 static bool is_sb_event(struct perf_event *event)
4578 {
4579         struct perf_event_attr *attr = &event->attr;
4580 
4581         if (event->parent)
4582                 return false;
4583 
4584         if (event->attach_state & PERF_ATTACH_TASK)
4585                 return false;
4586 
4587         if (attr->mmap || attr->mmap_data || attr->mmap2 ||
4588             attr->comm || attr->comm_exec ||
4589             attr->task || attr->ksymbol ||
4590             attr->context_switch || attr->text_poke ||
4591             attr->bpf_event)
4592                 return true;
4593         return false;
4594 }
4595 
4596 static void unaccount_pmu_sb_event(struct perf_event *event)
4597 {
4598         if (is_sb_event(event))
4599                 detach_sb_event(event);
4600 }
4601 
4602 static void unaccount_event_cpu(struct perf_event *event, int cpu)
4603 {
4604         if (event->parent)
4605                 return;
4606 
4607         if (is_cgroup_event(event))
4608                 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
4609 }
4610 
4611 #ifdef CONFIG_NO_HZ_FULL
4612 static DEFINE_SPINLOCK(nr_freq_lock);
4613 #endif
4614 
4615 static void unaccount_freq_event_nohz(void)
4616 {
4617 #ifdef CONFIG_NO_HZ_FULL
4618         spin_lock(&nr_freq_lock);
4619         if (atomic_dec_and_test(&nr_freq_events))
4620                 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
4621         spin_unlock(&nr_freq_lock);
4622 #endif
4623 }
4624 
4625 static void unaccount_freq_event(void)
4626 {
4627         if (tick_nohz_full_enabled())
4628                 unaccount_freq_event_nohz();
4629         else
4630                 atomic_dec(&nr_freq_events);
4631 }
4632 
4633 static void unaccount_event(struct perf_event *event)
4634 {
4635         bool dec = false;
4636 
4637         if (event->parent)
4638                 return;
4639 
4640         if (event->attach_state & PERF_ATTACH_TASK)
4641                 dec = true;
4642         if (event->attr.mmap || event->attr.mmap_data)
4643                 atomic_dec(&nr_mmap_events);
4644         if (event->attr.comm)
4645                 atomic_dec(&nr_comm_events);
4646         if (event->attr.namespaces)
4647                 atomic_dec(&nr_namespaces_events);
4648         if (event->attr.cgroup)
4649                 atomic_dec(&nr_cgroup_events);
4650         if (event->attr.task)
4651                 atomic_dec(&nr_task_events);
4652         if (event->attr.freq)
4653                 unaccount_freq_event();
4654         if (event->attr.context_switch) {
4655                 dec = true;
4656                 atomic_dec(&nr_switch_events);
4657         }
4658         if (is_cgroup_event(event))
4659                 dec = true;
4660         if (has_branch_stack(event))
4661                 dec = true;
4662         if (event->attr.ksymbol)
4663                 atomic_dec(&nr_ksymbol_events);
4664         if (event->attr.bpf_event)
4665                 atomic_dec(&nr_bpf_events);
4666         if (event->attr.text_poke)
4667                 atomic_dec(&nr_text_poke_events);
4668 
4669         if (dec) {
4670                 if (!atomic_add_unless(&perf_sched_count, -1, 1))
4671                         schedule_delayed_work(&perf_sched_work, HZ);
4672         }
4673 
4674         unaccount_event_cpu(event, event->cpu);
4675 
4676         unaccount_pmu_sb_event(event);
4677 }
4678 
4679 static void perf_sched_delayed(struct work_struct *work)
4680 {
4681         mutex_lock(&perf_sched_mutex);
4682         if (atomic_dec_and_test(&perf_sched_count))
4683                 static_branch_disable(&perf_sched_events);
4684         mutex_unlock(&perf_sched_mutex);
4685 }
4686 
4687 /*
4688  * The following functions implement mutual exclusion of events on "exclusive" pmus
4689  * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
4690  * at a time, so we disallow creating events that might conflict, namely:
4691  *
4692  *  1) cpu-wide events in the presence of per-task events,
4693  *  2) per-task events in the presence of cpu-wide events,
4694  *  3) two matching events on the same context.
4695  *
4696  * The former two cases are handled in the allocation path (perf_event_alloc(),
4697  * _free_event()), the latter -- before the first perf_install_in_context().
4698  */
4699 static int exclusive_event_init(struct perf_event *event)
4700 {
4701         struct pmu *pmu = event->pmu;
4702 
4703         if (!is_exclusive_pmu(pmu))
4704                 return 0;
4705 
4706         /*
4707          * Prevent co-existence of per-task and cpu-wide events on the
4708          * same exclusive pmu.
4709          *
4710          * Negative pmu::exclusive_cnt means there are cpu-wide
4711          * events on this "exclusive" pmu, positive means there are
4712          * per-task events.
4713          *
4714          * Since this is called in perf_event_alloc() path, event::ctx
4715          * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
4716          * to mean "per-task event", because unlike other attach states it
4717          * never gets cleared.
4718          */
4719         if (event->attach_state & PERF_ATTACH_TASK) {
4720                 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
4721                         return -EBUSY;
4722         } else {
4723                 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
4724                         return -EBUSY;
4725         }
4726 
4727         return 0;
4728 }
4729 
4730 static void exclusive_event_destroy(struct perf_event *event)
4731 {
4732         struct pmu *pmu = event->pmu;
4733 
4734         if (!is_exclusive_pmu(pmu))
4735                 return;
4736 
4737         /* see comment in exclusive_event_init() */
4738         if (event->attach_state & PERF_ATTACH_TASK)
4739                 atomic_dec(&pmu->exclusive_cnt);
4740         else
4741                 atomic_inc(&pmu->exclusive_cnt);
4742 }
4743 
4744 static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
4745 {
4746         if ((e1->pmu == e2->pmu) &&
4747             (e1->cpu == e2->cpu ||
4748              e1->cpu == -1 ||
4749              e2->cpu == -1))
4750                 return true;
4751         return false;
4752 }
4753 
4754 static bool exclusive_event_installable(struct perf_event *event,
4755                                         struct perf_event_context *ctx)
4756 {
4757         struct perf_event *iter_event;
4758         struct pmu *pmu = event->pmu;
4759 
4760         lockdep_assert_held(&ctx->mutex);
4761 
4762         if (!is_exclusive_pmu(pmu))
4763                 return true;
4764 
4765         list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
4766                 if (exclusive_event_match(iter_event, event))
4767                         return false;
4768         }
4769 
4770         return true;
4771 }
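
/*
 * [Editor's sketch, not part of core.c] A minimal illustration of how a PMU
 * driver opts into the exclusion rules implemented above: it only needs to
 * advertise PERF_PMU_CAP_EXCLUSIVE when registering. The my_* callback names
 * are hypothetical placeholders, not real kernel symbols.
 */
#if 0	/* illustration only */
static struct pmu my_exclusive_pmu = {
	.module		= THIS_MODULE,
	.capabilities	= PERF_PMU_CAP_EXCLUSIVE,	/* one event at a time */
	.event_init	= my_event_init,		/* hypothetical callbacks */
	.add		= my_event_add,
	.del		= my_event_del,
	.start		= my_event_start,
	.stop		= my_event_stop,
	.read		= my_event_read,
};

static int __init my_pmu_init(void)
{
	/* type -1: let the core allocate a dynamic PMU type id */
	return perf_pmu_register(&my_exclusive_pmu, "my_exclusive_pmu", -1);
}
#endif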
4772 
4773 static void perf_addr_filters_splice(struct perf_event *event,
4774                                        struct list_head *head);
4775 
4776 static void _free_event(struct perf_event *event)
4777 {
4778         irq_work_sync(&event->pending);
4779 
4780         unaccount_event(event);
4781 
4782         security_perf_event_free(event);
4783 
4784         if (event->rb) {
4785                 /*
4786                  * Can happen when we close an event with re-directed output.
4787                  *
4788                  * Since we have a 0 refcount, perf_mmap_close() will skip
4789                  * over us; possibly making our ring_buffer_put() the last.
4790                  */
4791                 mutex_lock(&event->mmap_mutex);
4792                 ring_buffer_attach(event, NULL);
4793                 mutex_unlock(&event->mmap_mutex);
4794         }
4795 
4796         if (is_cgroup_event(event))
4797                 perf_detach_cgroup(event);
4798 
4799         if (!event->parent) {
4800                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
4801                         put_callchain_buffers();
4802         }
4803 
4804         perf_event_free_bpf_prog(event);
4805         perf_addr_filters_splice(event, NULL);
4806         kfree(event->addr_filter_ranges);
4807 
4808         if (event->destroy)
4809                 event->destroy(event);
4810 
4811         /*
4812          * Must be after ->destroy(), due to uprobe_perf_close() using
4813          * hw.target.
4814          */
4815         if (event->hw.target)
4816                 put_task_struct(event->hw.target);
4817 
4818         /*
4819          * perf_event_free_task() relies on put_ctx() being 'last', in particular
4820          * all task references must be cleaned up.
4821          */
4822         if (event->ctx)
4823                 put_ctx(event->ctx);
4824 
4825         exclusive_event_destroy(event);
4826         module_put(event->pmu->module);
4827 
4828         call_rcu(&event->rcu_head, free_event_rcu);
4829 }
4830 
4831 /*
4832  * Used to free events with a known refcount of 1, such as error paths where
4833  * the event isn't yet exposed, and inherited events.
4834  */
4835 static void free_event(struct perf_event *event)
4836 {
4837         if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
4838                                 "unexpected event refcount: %ld; ptr=%p\n",
4839                                 atomic_long_read(&event->refcount), event)) {
4840                 /* leak to avoid use-after-free */
4841                 return;
4842         }
4843 
4844         _free_event(event);
4845 }
4846 
4847 /*
4848  * Remove user event from the owner task.
4849  */
4850 static void perf_remove_from_owner(struct perf_event *event)
4851 {
4852         struct task_struct *owner;
4853 
4854         rcu_read_lock();
4855         /*
4856          * Matches the smp_store_release() in perf_event_exit_task(). If we
4857          * observe !owner it means the list deletion is complete and we can
4858          * indeed free this event, otherwise we need to serialize on
4859          * owner->perf_event_mutex.
4860          */
4861         owner = READ_ONCE(event->owner);
4862         if (owner) {
4863                 /*
4864                  * Since delayed_put_task_struct() also drops the last
4865                  * task reference we can safely take a new reference
4866                  * while holding the rcu_read_lock().
4867                  */
4868                 get_task_struct(owner);
4869         }
4870         rcu_read_unlock();
4871 
4872         if (owner) {
4873                 /*
4874                  * If we're here through perf_event_exit_task() we're already
4875                  * holding ctx->mutex which would be an inversion wrt. the
4876                  * normal lock order.
4877                  *
4878                  * However, we can safely take this lock because it's the child
4879                  * ctx->mutex.
4880                  */
4881                 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
4882 
4883                 /*
4884                  * We have to re-check the event->owner field; if it is cleared
4885                  * we raced with perf_event_exit_task(). Acquiring the mutex
4886                  * ensures they're done, and we can proceed with freeing the
4887                  * event.
4888                  */
4889                 if (event->owner) {
4890                         list_del_init(&event->owner_entry);
4891                         smp_store_release(&event->owner, NULL);
4892                 }
4893                 mutex_unlock(&owner->perf_event_mutex);
4894                 put_task_struct(owner);
4895         }
4896 }
4897 
4898 static void put_event(struct perf_event *event)
4899 {
4900         if (!atomic_long_dec_and_test(&event->refcount))
4901                 return;
4902 
4903         _free_event(event);
4904 }
4905 
4906 /*
4907  * Kill an event dead; while event:refcount will preserve the event
4908  * object, it will not preserve its functionality. Once the last 'user'
4909  * gives up the object, we'll destroy the thing.
4910  */
4911 int perf_event_release_kernel(struct perf_event *event)
4912 {
4913         struct perf_event_context *ctx = event->ctx;
4914         struct perf_event *child, *tmp;
4915         LIST_HEAD(free_list);
4916 
4917         /*
4918          * If we got here through err_file: fput(event_file); we will not have
4919          * attached to a context yet.
4920          */
4921         if (!ctx) {
4922                 WARN_ON_ONCE(event->attach_state &
4923                                 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
4924                 goto no_ctx;
4925         }
4926 
4927         if (!is_kernel_event(event))
4928                 perf_remove_from_owner(event);
4929 
4930         ctx = perf_event_ctx_lock(event);
4931         WARN_ON_ONCE(ctx->parent_ctx);
4932         perf_remove_from_context(event, DETACH_GROUP);
4933 
4934         raw_spin_lock_irq(&ctx->lock);
4935         /*
4936          * Mark this event as STATE_DEAD, there is no external reference to it
4937          * anymore.
4938          *
4939          * Anybody acquiring event->child_mutex after the below loop _must_
4940          * also see this, most importantly inherit_event() which will avoid
4941          * placing more children on the list.
4942          *
4943          * Thus this guarantees that we will in fact observe and kill _ALL_
4944          * child events.
4945          */
4946         event->state = PERF_EVENT_STATE_DEAD;
4947         raw_spin_unlock_irq(&ctx->lock);
4948 
4949         perf_event_ctx_unlock(event, ctx);
4950 
4951 again:
4952         mutex_lock(&event->child_mutex);
4953         list_for_each_entry(child, &event->child_list, child_list) {
4954 
4955                 /*
4956                  * Cannot change, child events are not migrated, see the
4957                  * comment with perf_event_ctx_lock_nested().
4958                  */
4959                 ctx = READ_ONCE(child->ctx);
4960                 /*
4961                  * Since child_mutex nests inside ctx::mutex, we must jump
4962                  * through hoops. We start by grabbing a reference on the ctx.
4963                  *
4964                  * Since the event cannot get freed while we hold the
4965                  * child_mutex, the context must also exist and have a !0
4966                  * reference count.
4967                  */
4968                 get_ctx(ctx);
4969 
4970                 /*
4971                  * Now that we have a ctx ref, we can drop child_mutex, and
4972                  * acquire ctx::mutex without fear of it going away. Then we
4973                  * can re-acquire child_mutex.
4974                  */
4975                 mutex_unlock(&event->child_mutex);
4976                 mutex_lock(&ctx->mutex);
4977                 mutex_lock(&event->child_mutex);
4978 
4979                 /*
4980                  * Now that we hold ctx::mutex and child_mutex, revalidate our
4981                  * state; if child is still the first entry, it didn't get freed
4982                  * and we can go on removing it.
4983                  */
4984                 tmp = list_first_entry_or_null(&event->child_list,
4985                                                struct perf_event, child_list);
4986                 if (tmp == child) {
4987                         perf_remove_from_context(child, DETACH_GROUP);
4988                         list_move(&child->child_list, &free_list);
4989                         /*
4990                          * This matches the refcount bump in inherit_event();
4991                          * this can't be the last reference.
4992                          */
4993                         put_event(event);
4994                 }
4995 
4996                 mutex_unlock(&event->child_mutex);
4997                 mutex_unlock(&ctx->mutex);
4998                 put_ctx(ctx);
4999                 goto again;
5000         }
5001         mutex_unlock(&event->child_mutex);
5002 
5003         list_for_each_entry_safe(child, tmp, &free_list, child_list) {
5004                 void *var = &child->ctx->refcount;
5005 
5006                 list_del(&child->child_list);
5007                 free_event(child);
5008 
5009                 /*
5010                  * Wake any perf_event_free_task() waiting for this event to be
5011                  * freed.
5012                  */
5013                 smp_mb(); /* pairs with wait_var_event() */
5014                 wake_up_var(var);
5015         }
5016 
5017 no_ctx:
5018         put_event(event); /* Must be the 'last' reference */
5019         return 0;
5020 }
5021 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
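
/*
 * [Editor's sketch, not part of core.c] How an in-kernel user would typically
 * pair event creation with perf_event_release_kernel(); the demo_* names are
 * hypothetical. A CPU-bound cycles counter is created and later torn down
 * through the release path above.
 */
#if 0	/* illustration only */
static struct perf_event *demo_event;

static int demo_create(int cpu)
{
	struct perf_event_attr attr = {
		.type	= PERF_TYPE_HARDWARE,
		.config	= PERF_COUNT_HW_CPU_CYCLES,
		.size	= sizeof(attr),
		.pinned	= 1,
	};

	/* CPU-bound counter: no task, no overflow handler */
	demo_event = perf_event_create_kernel_counter(&attr, cpu, NULL,
						      NULL, NULL);
	return IS_ERR(demo_event) ? PTR_ERR(demo_event) : 0;
}

static void demo_destroy(void)
{
	/* Kills the event; the last reference frees it via _free_event() */
	perf_event_release_kernel(demo_event);
}
#endif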
5022 
5023 /*
5024  * Called when the last reference to the file is gone.
5025  */
5026 static int perf_release(struct inode *inode, struct file *file)
5027 {
5028         perf_event_release_kernel(file->private_data);
5029         return 0;
5030 }
5031 
5032 static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5033 {
5034         struct perf_event *child;
5035         u64 total = 0;
5036 
5037         *enabled = 0;
5038         *running = 0;
5039 
5040         mutex_lock(&event->child_mutex);
5041 
5042         (void)perf_event_read(event, false);
5043         total += perf_event_count(event);
5044 
5045         *enabled += event->total_time_enabled +
5046                         atomic64_read(&event->child_total_time_enabled);
5047         *running += event->total_time_running +
5048                         atomic64_read(&event->child_total_time_running);
5049 
5050         list_for_each_entry(child, &event->child_list, child_list) {
5051                 (void)perf_event_read(child, false);
5052                 total += perf_event_count(child);
5053                 *enabled += child->total_time_enabled;
5054                 *running += child->total_time_running;
5055         }
5056         mutex_unlock(&event->child_mutex);
5057 
5058         return total;
5059 }
5060 
5061 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5062 {
5063         struct perf_event_context *ctx;
5064         u64 count;
5065 
5066         ctx = perf_event_ctx_lock(event);
5067         count = __perf_event_read_value(event, enabled, running);
5068         perf_event_ctx_unlock(event, ctx);
5069 
5070         return count;
5071 }
5072 EXPORT_SYMBOL_GPL(perf_event_read_value);
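
/*
 * [Editor's sketch, not part of core.c] Reading a kernel-owned counter with
 * the exported helper above; demo_read() is a hypothetical caller. The
 * enabled/running times allow scaling when the event was multiplexed.
 */
#if 0	/* illustration only */
static u64 demo_read(struct perf_event *event)
{
	u64 enabled, running, count;

	/* Sums the event and all of its inherited children */
	count = perf_event_read_value(event, &enabled, &running);

	/* If running < enabled the event was multiplexed; scale if needed */
	return count;
}
#endif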
5073 
5074 static int __perf_read_group_add(struct perf_event *leader,
5075                                         u64 read_format, u64 *values)
5076 {
5077         struct perf_event_context *ctx = leader->ctx;
5078         struct perf_event *sub;
5079         unsigned long flags;
5080         int n = 1; /* skip @nr */
5081         int ret;
5082 
5083         ret = perf_event_read(leader, true);
5084         if (ret)
5085                 return ret;
5086 
5087         raw_spin_lock_irqsave(&ctx->lock, flags);
5088 
5089         /*
5090          * Since we co-schedule groups, {enabled,running} times of siblings
5091          * will be identical to those of the leader, so we only publish one
5092          * set.
5093          */
5094         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5095                 values[n++] += leader->total_time_enabled +
5096                         atomic64_read(&leader->child_total_time_enabled);
5097         }
5098 
5099         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5100                 values[n++] += leader->total_time_running +
5101                         atomic64_read(&leader->child_total_time_running);
5102         }
5103 
5104         /*
5105          * Write {count,id} tuples for every sibling.
5106          */
5107         values[n++] += perf_event_count(leader);
5108         if (read_format & PERF_FORMAT_ID)
5109                 values[n++] = primary_event_id(leader);
5110 
5111         for_each_sibling_event(sub, leader) {
5112                 values[n++] += perf_event_count(sub);
5113                 if (read_format & PERF_FORMAT_ID)
5114                         values[n++] = primary_event_id(sub);
5115         }
5116 
5117         raw_spin_unlock_irqrestore(&ctx->lock, flags);
5118         return 0;
5119 }
5120 
5121 static int perf_read_group(struct perf_event *event,
5122                                    u64 read_format, char __user *buf)
5123 {
5124         struct perf_event *leader = event->group_leader, *child;
5125         struct perf_event_context *ctx = leader->ctx;
5126         int ret;
5127         u64 *values;
5128 
5129         lockdep_assert_held(&ctx->mutex);
5130 
5131         values = kzalloc(event->read_size, GFP_KERNEL);
5132         if (!values)
5133                 return -ENOMEM;
5134 
5135         values[0] = 1 + leader->nr_siblings;
5136 
5137         /*
5138          * By locking the child_mutex of the leader we effectively
5139          * lock the child list of all siblings. XXX: explain how.
5140          */
5141         mutex_lock(&leader->child_mutex);
5142 
5143         ret = __perf_read_group_add(leader, read_format, values);
5144         if (ret)
5145                 goto unlock;
5146 
5147         list_for_each_entry(child, &leader->child_list, child_list) {
5148                 ret = __perf_read_group_add(child, read_format, values);
5149                 if (ret)
5150                         goto unlock;
5151         }
5152 
5153         mutex_unlock(&leader->child_mutex);
5154 
5155         ret = event->read_size;
5156         if (copy_to_user(buf, values, event->read_size))
5157                 ret = -EFAULT;
5158         goto out;
5159 
5160 unlock:
5161         mutex_unlock(&leader->child_mutex);
5162 out:
5163         kfree(values);
5164         return ret;
5165 }
5166 
5167 static int perf_read_one(struct perf_event *event,
5168                                  u64 read_format, char __user *buf)
5169 {
5170         u64 enabled, running;
5171         u64 values[4];
5172         int n = 0;
5173 
5174         values[n++] = __perf_event_read_value(event, &enabled, &running);
5175         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5176                 values[n++] = enabled;
5177         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5178                 values[n++] = running;
5179         if (read_format & PERF_FORMAT_ID)
5180                 values[n++] = primary_event_id(event);
5181 
5182         if (copy_to_user(buf, values, n * sizeof(u64)))
5183                 return -EFAULT;
5184 
5185         return n * sizeof(u64);
5186 }
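
/*
 * [Editor's sketch, not part of core.c] Userspace view of the layout that
 * perf_read_one() produces, assuming the event was opened with
 * read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING
 * | PERF_FORMAT_ID. The demo_* names are hypothetical.
 */
#if 0	/* illustration only: standalone userspace code */
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

struct demo_read_value {
	uint64_t value;
	uint64_t time_enabled;	/* PERF_FORMAT_TOTAL_TIME_ENABLED */
	uint64_t time_running;	/* PERF_FORMAT_TOTAL_TIME_RUNNING */
	uint64_t id;		/* PERF_FORMAT_ID */
};

static int demo_read_counter(int perf_fd)
{
	struct demo_read_value v;

	if (read(perf_fd, &v, sizeof(v)) != sizeof(v))
		return -1;

	/* time_running <= time_enabled; scale for multiplexing if needed */
	printf("count=%llu enabled=%llu running=%llu id=%llu\n",
	       (unsigned long long)v.value,
	       (unsigned long long)v.time_enabled,
	       (unsigned long long)v.time_running,
	       (unsigned long long)v.id);
	return 0;
}
#endif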
5187 
5188 static bool is_event_hup(struct perf_event *event)
5189 {
5190         bool no_children;
5191 
5192         if (event->state > PERF_EVENT_STATE_EXIT)
5193                 return false;
5194 
5195         mutex_lock(&event->child_mutex);
5196         no_children = list_empty(&event->child_list);
5197         mutex_unlock(&event->child_mutex);
5198         return no_children;
5199 }
5200 
5201 /*
5202  * Read the performance event - simple non-blocking version for now
5203  */
5204 static ssize_t
5205 __perf_read(struct perf_event *event, char __user *buf, size_t count)
5206 {
5207         u64 read_format = event->attr.read_format;
5208         int ret;
5209 
5210         /*
5211          * Return end-of-file for a read on an event that is in
5212          * error state (i.e. because it was pinned but it couldn't be
5213          * scheduled on to the CPU at some point).
5214          */
5215         if (event->state == PERF_EVENT_STATE_ERROR)
5216                 return 0;
5217 
5218         if (count < event->read_size)
5219                 return -ENOSPC;
5220 
5221         WARN_ON_ONCE(event->ctx->parent_ctx);
5222         if (read_format & PERF_FORMAT_GROUP)
5223                 ret = perf_read_group(event, read_format, buf);
5224         else
5225                 ret = perf_read_one(event, read_format, buf);
5226 
5227         return ret;
5228 }
5229 
5230 static ssize_t
5231 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
5232 {
5233         struct perf_event *event = file->private_data;
5234         struct perf_event_context *ctx;
5235         int ret;
5236 
5237         ret = security_perf_event_read(event);
5238         if (ret)
5239                 return ret;
5240 
5241         ctx = perf_event_ctx_lock(event);
5242         ret = __perf_read(event, buf, count);
5243         perf_event_ctx_unlock(event, ctx);
5244 
5245         return ret;
5246 }
5247 
5248 static __poll_t perf_poll(struct file *file, poll_table *wait)
5249 {
5250         struct perf_event *event = file->private_data;
5251         struct perf_buffer *rb;
5252         __poll_t events = EPOLLHUP;
5253 
5254         poll_wait(file, &event->waitq, wait);
5255 
5256         if (is_event_hup(event))
5257                 return events;
5258 
5259         /*
5260          * Pin the event->rb by taking event->mmap_mutex; otherwise
5261          * perf_event_set_output() can swizzle our rb and make us miss wakeups.
5262          */
5263         mutex_lock(&event->mmap_mutex);
5264         rb = event->rb;
5265         if (rb)
5266                 events = atomic_xchg(&rb->poll, 0);
5267         mutex_unlock(&event->mmap_mutex);
5268         return events;
5269 }
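
/*
 * [Editor's sketch, not part of core.c] Userspace counterpart of perf_poll():
 * wait for ring-buffer data (POLLIN) on a perf fd; POLLHUP is reported once
 * the event is past EXIT and has no children left. demo_* is hypothetical.
 */
#if 0	/* illustration only: standalone userspace code */
#include <poll.h>

static int demo_wait_for_samples(int perf_fd, int timeout_ms)
{
	struct pollfd pfd = { .fd = perf_fd, .events = POLLIN };

	return poll(&pfd, 1, timeout_ms);	/* >0: check pfd.revents */
}
#endif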
5270 
5271 static void _perf_event_reset(struct perf_event *event)
5272 {
5273         (void)perf_event_read(event, false);
5274         local64_set(&event->count, 0);
5275         perf_event_update_userpage(event);
5276 }
5277 
5278 /* Assume it's not an event with inherit set. */
5279 u64 perf_event_pause(struct perf_event *event, bool reset)
5280 {
5281         struct perf_event_context *ctx;
5282         u64 count;
5283 
5284         ctx = perf_event_ctx_lock(event);
5285         WARN_ON_ONCE(event->attr.inherit);
5286         _perf_event_disable(event);
5287         count = local64_read(&event->count);
5288         if (reset)
5289                 local64_set(&event->count, 0);
5290         perf_event_ctx_unlock(event, ctx);
5291 
5292         return count;
5293 }
5294 EXPORT_SYMBOL_GPL(perf_event_pause);
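
/*
 * [Editor's sketch, not part of core.c] A hypothetical in-kernel caller of
 * perf_event_pause(): stop the event and read out its count, optionally
 * zeroing it so the next measurement window starts fresh.
 */
#if 0	/* illustration only */
static u64 demo_stop_and_sample(struct perf_event *event)
{
	/* true: reset event->count to 0 after reading it */
	return perf_event_pause(event, true);
}
#endif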
5295 
5296 /*
5297  * Holding the top-level event's child_mutex means that any
5298  * descendant process that has inherited this event will block
5299  * in perf_event_exit_event() if it goes to exit, thus satisfying the
5300  * task existence requirements of perf_event_enable/disable.
5301  */
5302 static void perf_event_for_each_child(struct perf_event *event,
5303                                         void (*func)(struct perf_event *))
5304 {
5305         struct perf_event *child;
5306 
5307         WARN_ON_ONCE(event->ctx->parent_ctx);
5308 
5309         mutex_lock(&event->child_mutex);
5310         func(event);
5311         list_for_each_entry(child, &event->child_list, child_list)
5312                 func(child);
5313         mutex_unlock(&event->child_mutex);
5314 }
5315 
5316 static void perf_event_for_each(struct perf_event *event,
5317                                   void (*func)(struct perf_event *))
5318 {
5319         struct perf_event_context *ctx = event->ctx;
5320         struct perf_event *sibling;
5321 
5322         lockdep_assert_held(&ctx->mutex);
5323 
5324         event = event->group_leader;
5325 
5326         perf_event_for_each_child(event, func);
5327         for_each_sibling_event(sibling, event)
5328                 perf_event_for_each_child(sibling, func);
5329 }
5330 
5331 static void __perf_event_period(struct perf_event *event,
5332                                 struct perf_cpu_context *cpuctx,
5333                                 struct perf_event_context *ctx,
5334                                 void *info)
5335 {
5336         u64 value = *((u64 *)info);
5337         bool active;
5338 
5339         if (event->attr.freq) {
5340                 event->attr.sample_freq = value;
5341         } else {
5342                 event->attr.sample_period = value;
5343                 event->hw.sample_period = value;
5344         }
5345 
5346         active = (event->state == PERF_EVENT_STATE_ACTIVE);
5347         if (active) {
5348                 perf_pmu_disable(ctx->pmu);
5349                 /*
5350                  * We could be throttled; unthrottle now to avoid the tick
5351                  * trying to unthrottle while we already re-started the event.
5352                  */
5353                 if (event->hw.interrupts == MAX_INTERRUPTS) {
5354                         event->hw.interrupts = 0;
5355                         perf_log_throttle(event, 1);
5356                 }
5357                 event->pmu->stop(event, PERF_EF_UPDATE);
5358         }
5359 
5360         local64_set(&event->hw.period_left, 0);
5361 
5362         if (active) {
5363                 event->pmu->start(event, PERF_EF_RELOAD);
5364                 perf_pmu_enable(ctx->pmu);
5365         }
5366 }
5367 
5368 static int perf_event_check_period(struct perf_event *event, u64 value)
5369 {
5370         return event->pmu->check_period(event, value);
5371 }
5372 
5373 static int _perf_event_period(struct perf_event *event, u64 value)
5374 {
5375         if (!is_sampling_event(event))
5376                 return -EINVAL;
5377 
5378         if (!value)
5379                 return -EINVAL;
5380 
5381         if (event->attr.freq && value > sysctl_perf_event_sample_rate)
5382                 return -EINVAL;
5383 
5384         if (perf_event_check_period(event, value))
5385                 return -EINVAL;
5386 
5387         if (!event->attr.freq && (value & (1ULL << 63)))
5388                 return -EINVAL;
5389 
5390         event_function_call(event, __perf_event_period, &value);
5391 
5392         return 0;
5393 }
5394 
5395 int perf_event_period(struct perf_event *event, u64 value)
5396 {
5397         struct perf_event_context *ctx;
5398         int ret;
5399 
5400         ctx = perf_event_ctx_lock(event);
5401         ret = _perf_event_period(event, value);
5402         perf_event_ctx_unlock(event, ctx);
5403 
5404         return ret;
5405 }
5406 EXPORT_SYMBOL_GPL(perf_event_period);
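
/*
 * [Editor's sketch, not part of core.c] The userspace path into
 * _perf_event_period() above is PERF_EVENT_IOC_PERIOD, which takes a
 * pointer to a u64; a value of 0 is rejected. demo_* is hypothetical.
 */
#if 0	/* illustration only: standalone userspace code */
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>

static int demo_set_period(int perf_fd, uint64_t period)
{
	return ioctl(perf_fd, PERF_EVENT_IOC_PERIOD, &period);
}
#endif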
5407 
5408 static const struct file_operations perf_fops;
5409 
5410 static inline int perf_fget_light(int fd, struct fd *p)
5411 {
5412         struct fd f = fdget(fd);
5413         if (!f.file)
5414                 return -EBADF;
5415 
5416         if (f.file->f_op != &perf_fops) {
5417                 fdput(f);
5418                 return -EBADF;
5419         }
5420         *p = f;
5421         return 0;
5422 }
5423 
5424 static int perf_event_set_output(struct perf_event *event,
5425                                  struct perf_event *output_event);
5426 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
5427 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
5428 static int perf_copy_attr(struct perf_event_attr __user *uattr,
5429                           struct perf_event_attr *attr);
5430 
5431 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
5432 {
5433         void (*func)(struct perf_event *);
5434         u32 flags = arg;
5435 
5436         switch (cmd) {
5437         case PERF_EVENT_IOC_ENABLE:
5438                 func = _perf_event_enable;
5439                 break;
5440         case PERF_EVENT_IOC_DISABLE:
5441                 func = _perf_event_disable;
5442                 break;
5443         case PERF_EVENT_IOC_RESET:
5444                 func = _perf_event_reset;
5445                 break;
5446 
5447         case PERF_EVENT_IOC_REFRESH:
5448                 return _perf_event_refresh(event, arg);
5449 
5450         case PERF_EVENT_IOC_PERIOD:
5451         {
5452                 u64 value;
5453 
5454                 if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
5455                         return -EFAULT;
5456 
5457                 return _perf_event_period(event, value);
5458         }
5459         case PERF_EVENT_IOC_ID:
5460         {
5461                 u64 id = primary_event_id(event);
5462 
5463                 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
5464                         return -EFAULT;
5465                 return 0;
5466         }
5467 
5468         case PERF_EVENT_IOC_SET_OUTPUT:
5469         {
5470                 int ret;
5471                 if (arg != -1) {
5472                         struct perf_event *output_event;
5473                         struct fd output;
5474                         ret = perf_fget_light(arg, &output);
5475                         if (ret)
5476                                 return ret;
5477                         output_event = output.file->private_data;
5478                         ret = perf_event_set_output(event, output_event);
5479                         fdput(output);
5480                 } else {
5481                         ret = perf_event_set_output(event, NULL);
5482                 }
5483                 return ret;
5484         }
5485 
5486         case PERF_EVENT_IOC_SET_FILTER:
5487                 return perf_event_set_filter(event, (void __user *)arg);
5488 
5489         case PERF_EVENT_IOC_SET_BPF:
5490                 return perf_event_set_bpf_prog(event, arg);
5491 
5492         case PERF_EVENT_IOC_PAUSE_OUTPUT: {
5493                 struct perf_buffer *rb;
5494 
5495                 rcu_read_lock();
5496                 rb = rcu_dereference(event->rb);
5497                 if (!rb || !rb->nr_pages) {
5498                         rcu_read_unlock();
5499                         return -EINVAL;
5500                 }
5501                 rb_toggle_paused(rb, !!arg);
5502                 rcu_read_unlock();
5503                 return 0;
5504         }
5505 
5506         case PERF_EVENT_IOC_QUERY_BPF:
5507                 return perf_event_query_prog_array(event, (void __user *)arg);
5508 
5509         case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
5510                 struct perf_event_attr new_attr;
5511                 int err = perf_copy_attr((struct perf_event_attr __user *)arg,
5512                                          &new_attr);
5513 
5514                 if (err)
5515                         return err;
5516 
5517                 return perf_event_modify_attr(event,  &new_attr);
5518         }
5519         default:
5520                 return -ENOTTY;
5521         }
5522 
5523         if (flags & PERF_IOC_FLAG_GROUP)
5524                 perf_event_for_each(event, func);
5525         else
5526                 perf_event_for_each_child(event, func);
5527 
5528         return 0;
5529 }
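
/*
 * [Editor's sketch, not part of core.c] Typical userspace use of the ioctls
 * dispatched above: reset, enable, run a workload, disable. Passing
 * PERF_IOC_FLAG_GROUP makes the operation apply to the whole event group.
 * demo_* is hypothetical.
 */
#if 0	/* illustration only: standalone userspace code */
#include <sys/ioctl.h>
#include <linux/perf_event.h>

static void demo_measure(int group_fd, void (*workload)(void))
{
	ioctl(group_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
	ioctl(group_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
	workload();
	ioctl(group_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
}
#endif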
5530 
5531 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
5532 {
5533         struct perf_event *event = file->private_data;
5534         struct perf_event_context *ctx;
5535         long ret;
5536 
5537         /* Treat ioctl like writes as it is likely a mutating operation. */
5538         ret = security_perf_event_write(event);
5539         if (ret)
5540                 return ret;
5541 
5542         ctx = perf_event_ctx_lock(event);
5543         ret = _perf_ioctl(event, cmd, arg);
5544         perf_event_ctx_unlock(event, ctx);
5545 
5546         return ret;
5547 }
5548 
5549 #ifdef CONFIG_COMPAT
5550 static long perf_compat_ioctl(struct file *file, unsigned int cmd,
5551                                 unsigned long arg)
5552 {
5553         switch (_IOC_NR(cmd)) {
5554         case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
5555         case _IOC_NR(PERF_EVENT_IOC_ID):
5556         case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
5557         case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
5558                 /* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case) */
5559                 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
5560                         cmd &= ~IOCSIZE_MASK;
5561                         cmd |= sizeof(void *) << IOCSIZE_SHIFT;
5562                 }
5563                 break;
5564         }
5565         return perf_ioctl(file, cmd, arg);
5566 }
5567 #else
5568 # define perf_compat_ioctl NULL
5569 #endif
5570 
5571 int perf_event_task_enable(void)
5572 {
5573         struct perf_event_context *ctx;
5574         struct perf_event *event;
5575 
5576         mutex_lock(&current->perf_event_mutex);
5577         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5578                 ctx = perf_event_ctx_lock(event);
5579                 perf_event_for_each_child(event, _perf_event_enable);
5580                 perf_event_ctx_unlock(event, ctx);
5581         }
5582         mutex_unlock(&current->perf_event_mutex);
5583 
5584         return 0;
5585 }
5586 
5587 int perf_event_task_disable(void)
5588 {
5589         struct perf_event_context *ctx;
5590         struct perf_event *event;
5591 
5592         mutex_lock(&current->perf_event_mutex);
5593         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5594                 ctx = perf_event_ctx_lock(event);
5595                 perf_event_for_each_child(event, _perf_event_disable);
5596                 perf_event_ctx_unlock(event, ctx);
5597         }
5598         mutex_unlock(&current->perf_event_mutex);
5599 
5600         return 0;
5601 }
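
/*
 * [Editor's sketch, not part of core.c] perf_event_task_enable()/_disable()
 * are reached from userspace via prctl(); a task can use this to exclude a
 * code region from all counters it owns. demo_* is hypothetical.
 */
#if 0	/* illustration only: standalone userspace code */
#include <sys/prctl.h>

static void demo_uncounted_region(void (*fn)(void))
{
	prctl(PR_TASK_PERF_EVENTS_DISABLE);
	fn();				/* not counted */
	prctl(PR_TASK_PERF_EVENTS_ENABLE);
}
#endif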
5602 
5603 static int perf_event_index(struct perf_event *event)
5604 {
5605         if (event->hw.state & PERF_HES_STOPPED)
5606                 return 0;
5607 
5608         if (event->state != PERF_EVENT_STATE_ACTIVE)
5609                 return 0;
5610 
5611         return event->pmu->event_idx(event);
5612 }
5613 
5614 static void calc_timer_values(struct perf_event *event,
5615                                 u64 *now,
5616                                 u64 *enabled,
5617                                 u64 *running)
5618 {
5619         u64 ctx_time;
5620 
5621         *now = perf_clock();
5622         ctx_time = event->shadow_ctx_time + *now;
5623         __perf_update_times(event, ctx_time, enabled, running);
5624 }
5625 
5626 static void perf_event_init_userpage(struct perf_event *event)
5627 {
5628         struct perf_event_mmap_page *userpg;
5629         struct perf_buffer *rb;
5630 
5631         rcu_read_lock();
5632         rb = rcu_dereference(event->rb);
5633         if (!rb)
5634                 goto unlock;
5635 
5636         userpg = rb->user_page;
5637 
5638         /* Allow new userspace to detect that bit 0 is deprecated */
5639         userpg->cap_bit0_is_deprecated = 1;
5640         userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
5641         userpg->data_offset = PAGE_SIZE;
5642         userpg->data_size = perf_data_size(rb);
5643 
5644 unlock:
5645         rcu_read_unlock();
5646 }
5647 
5648 void __weak arch_perf_update_userpage(
5649         struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
5650 {
5651 }
5652 
5653 /*
5654  * Callers need to ensure there can be no nesting of this function, otherwise
5655  * the seqlock logic goes bad. We can not serialize this because the arch
5656  * code calls this from NMI context.
5657  */
5658 void perf_event_update_userpage(struct perf_event *event)
5659 {
5660         struct perf_event_mmap_page *userpg;
5661         struct perf_buffer *rb;
5662         u64 enabled, running, now;
5663 
5664         rcu_read_lock();
5665         rb = rcu_dereference(event->rb);
5666         if (!rb)
5667                 goto unlock;
5668 
5669         /*
5670          * compute total_time_enabled, total_time_running
5671          * based on snapshot values taken when the event
5672          * was last scheduled in.
5673          *
5674          * we cannot simply call update_context_time()
5675          * because of locking issues, as we can be called in
5676          * NMI context
5677          */
5678         calc_timer_values(event, &now, &enabled, &running);
5679 
5680         userpg = rb->user_page;
5681         /*
5682          * Disable preemption to guarantee consistent time stamps are stored to
5683          * the user page.
5684          */
5685         preempt_disable();
5686         ++userpg->lock;
5687         barrier();
5688         userpg->index = perf_event_index(event);
5689         userpg->offset = perf_event_count(event);
5690         if (userpg->index)
5691                 userpg->offset -= local64_read(&event->hw.prev_count);
5692 
5693         userpg->time_enabled = enabled +
5694                         atomic64_read(&event->child_total_time_enabled);
5695 
5696         userpg->time_running = running +
5697                         atomic64_read(&event->child_total_time_running);
5698 
5699         arch_perf_update_userpage(event, userpg, now);
5700 
5701         barrier();
5702         ++userpg->lock;
5703         preempt_enable();
5704 unlock:
5705         rcu_read_unlock();
5706 }
5707 EXPORT_SYMBOL_GPL(perf_event_update_userpage);
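
/*
 * [Editor's sketch, not part of core.c] Userspace reader matching the writer
 * above: the lock field acts as a sequence counter, bumped before and after
 * the update, so readers retry until they observe a stable page. demo_* is
 * hypothetical; __sync_synchronize() stands in for a read memory barrier.
 */
#if 0	/* illustration only: standalone userspace code */
#include <stdint.h>
#include <linux/perf_event.h>

static void demo_read_userpage(volatile struct perf_event_mmap_page *pc,
			       uint64_t *enabled, uint64_t *running)
{
	uint32_t seq;

	do {
		seq = pc->lock;
		__sync_synchronize();
		*enabled = pc->time_enabled;
		*running = pc->time_running;
		__sync_synchronize();
	} while (pc->lock != seq);	/* writer was active: retry */
}
#endif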
5708 
5709 static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
5710 {
5711         struct perf_event *event = vmf->vma->vm_file->private_data;
5712         struct perf_buffer *rb;
5713         vm_fault_t ret = VM_FAULT_SIGBUS;
5714 
5715         if (vmf->flags & FAULT_FLAG_MKWRITE) {
5716                 if (vmf->pgoff == 0)
5717                         ret = 0;
5718                 return ret;
5719         }
5720 
5721         rcu_read_lock();
5722         rb = rcu_dereference(event->rb);
5723         if (!rb)
5724                 goto unlock;
5725 
5726         if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
5727                 goto unlock;
5728 
5729         vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
5730         if (!vmf->page)
5731                 goto unlock;
5732 
5733         get_page(vmf->page);
5734         vmf->page->mapping = vmf->vma->vm_file->f_mapping;
5735         vmf->page->index   = vmf->pgoff;
5736 
5737         ret = 0;
5738 unlock:
5739         rcu_read_unlock();
5740 
5741         return ret;
5742 }
5743 
5744 static void ring_buffer_attach(struct perf_event *event,
5745                                struct perf_buffer *rb)
5746 {
5747         struct perf_buffer *old_rb = NULL;
5748         unsigned long flags;
5749 
5750         if (event->rb) {
5751                 /*
5752                  * Should be impossible, we set this when removing
5753                  * event->rb_entry and wait/clear when adding event->rb_entry.
5754                  */
5755                 WARN_ON_ONCE(event->rcu_pending);
5756 
5757                 old_rb = event->rb;
5758                 spin_lock_irqsave(&old_rb->event_lock, flags);
5759                 list_del_rcu(&event->rb_entry);
5760                 spin_unlock_irqrestore(&old_rb->event_lock, flags);
5761 
5762                 event->rcu_batches = get_state_synchronize_rcu();
5763                 event->rcu_pending = 1;
5764         }
5765 
5766         if (rb) {
5767                 if (event->rcu_pending) {
5768                         cond_synchronize_rcu(event->rcu_batches);
5769                         event->rcu_pending = 0;
5770                 }
5771 
5772                 spin_lock_irqsave(&rb->event_lock, flags);
5773                 list_add_rcu(&event->rb_entry, &rb->event_list);
5774                 spin_unlock_irqrestore(&rb->event_lock, flags);
5775         }
5776 
5777         /*
5778          * Avoid racing with perf_mmap_close(AUX): stop the event
5779          * before swizzling the event::rb pointer; if it's getting
5780          * unmapped, its aux_mmap_count will be 0 and it won't
5781          * restart. See the comment in __perf_pmu_output_stop().
5782          *
5783          * Data will inevitably be lost when set_output is done in
5784          * mid-air, but then again, whoever does it like this is
5785          * not in for the data anyway.
5786          */
5787         if (has_aux(event))
5788                 perf_event_stop(event, 0);
5789 
5790         rcu_assign_pointer(event->rb, rb);
5791 
5792         if (old_rb) {
5793                 ring_buffer_put(old_rb);
5794                 /*
5795                  * Since we detached before setting the new rb (so that we
5796                  * could attach the new rb), we could have missed a wakeup.
5797                  * Provide it now.
5798                  */
5799                 wake_up_all(&event->waitq);
5800         }
5801 }
5802 
5803 static void ring_buffer_wakeup(struct perf_event *event)
5804 {
5805         struct perf_buffer *rb;
5806 
5807         rcu_read_lock();
5808         rb = rcu_dereference(event->rb);
5809         if (rb) {
5810                 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
5811                         wake_up_all(&event->waitq);
5812         }
5813         rcu_read_unlock();
5814 }
5815 
5816 struct perf_buffer *ring_buffer_get(struct perf_event *event)
5817 {
5818         struct perf_buffer *rb;
5819 
5820         rcu_read_lock();
5821         rb = rcu_dereference(event->rb);
5822         if (rb) {
5823                 if (!refcount_inc_not_zero(&rb->refcount))
5824                         rb = NULL;
5825         }
5826         rcu_read_unlock();
5827 
5828         return rb;
5829 }
5830 
5831 void ring_buffer_put(struct perf_buffer *rb)
5832 {
5833         if (!refcount_dec_and_test(&rb->refcount))
5834                 return;
5835 
5836         WARN_ON_ONCE(!list_empty(&rb->event_list));
5837 
5838         call_rcu(&rb->rcu_head, rb_free_rcu);
5839 }
5840 
5841 static void perf_mmap_open(struct vm_area_struct *vma)
5842 {
5843         struct perf_event *event = vma->vm_file->private_data;
5844 
5845         atomic_inc(&event->mmap_count);
5846         atomic_inc(&event->rb->mmap_count);
5847 
5848         if (vma->vm_pgoff)
5849                 atomic_inc(&event->rb->aux_mmap_count);
5850 
5851         if (event->pmu->event_mapped)
5852                 event->pmu->event_mapped(event, vma->vm_mm);
5853 }
5854 
5855 static void perf_pmu_output_stop(struct perf_event *event);
5856 
5857 /*
5858  * A buffer can be mmap()ed multiple times; either directly through the same
5859  * event, or through other events by use of perf_event_set_output().
5860  *
5861  * In order to undo the VM accounting done by perf_mmap() we need to destroy
5862  * the buffer here, where we still have a VM context. This means we need
5863  * to detach all events redirecting to us.
5864  */
5865 static void perf_mmap_close(struct vm_area_struct *vma)
5866 {
5867         struct perf_event *event = vma->vm_file->private_data;
5868         struct perf_buffer *rb = ring_buffer_get(event);
5869         struct user_struct *mmap_user = rb->mmap_user;
5870         int mmap_locked = rb->mmap_locked;
5871         unsigned long size = perf_data_size(rb);
5872         bool detach_rest = false;
5873 
5874         if (event->pmu->event_unmapped)
5875                 event->pmu->event_unmapped(event, vma->vm_mm);
5876 
5877         /*
5878          * rb->aux_mmap_count will always drop before rb->mmap_count and
5879          * event->mmap_count, so it is ok to use event->mmap_mutex to
5880          * serialize with perf_mmap here.
5881          */
5882         if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
5883             atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
5884                 /*
5885                  * Stop all AUX events that are writing to this buffer,
5886                  * so that we can free its AUX pages and corresponding PMU
5887                  * data. Note that after rb::aux_mmap_count dropped to zero,
5888                  * they won't start any more (see perf_aux_output_begin()).
5889                  */
5890                 perf_pmu_output_stop(event);
5891 
5892                 /* now it's safe to free the pages */
5893                 atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
5894                 atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
5895 
5896                 /* this has to be the last one */
5897                 rb_free_aux(rb);
5898                 WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
5899 
5900                 mutex_unlock(&event->mmap_mutex);
5901         }
5902 
5903         if (atomic_dec_and_test(&rb->mmap_count))
5904                 detach_rest = true;
5905 
5906         if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
5907                 goto out_put;
5908 
5909         ring_buffer_attach(event, NULL);
5910         mutex_unlock(&event->mmap_mutex);
5911 
5912         /* If there's still other mmap()s of this buffer, we're done. */
5913         if (!detach_rest)
5914                 goto out_put;
5915 
5916         /*
5917          * No other mmap()s, detach from all other events that might redirect
5918          * into the now unreachable buffer. Somewhat complicated by the
5919          * fact that rb::event_lock otherwise nests inside mmap_mutex.
5920          */
5921 again:
5922         rcu_read_lock();
5923         list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
5924                 if (!atomic_long_inc_not_zero(&event->refcount)) {
5925                         /*
5926                          * This event is en-route to free_event() which will
5927                          * detach it and remove it from the list.
5928                          */
5929                         continue;
5930                 }
5931                 rcu_read_unlock();
5932 
5933                 mutex_lock(&event->mmap_mutex);
5934                 /*
5935                  * Check we didn't race with perf_event_set_output() which can
5936                  * swizzle the rb from under us while we were waiting to
5937                  * acquire mmap_mutex.
5938                  *
5939                  * If we find a different rb; ignore this event, a next
5940                  * iteration will no longer find it on the list. We have to
5941                  * still restart the iteration to make sure we're not now
5942                  * iterating the wrong list.
5943                  */
5944                 if (event->rb == rb)
5945                         ring_buffer_attach(event, NULL);
5946 
5947                 mutex_unlock(&event->mmap_mutex);
5948                 put_event(event);
5949 
5950                 /*
5951                  * Restart the iteration; either we're on the wrong list or
5952                  * we destroyed its integrity by doing a deletion.
5953                  */
5954                 goto again;
5955         }
5956         rcu_read_unlock();
5957 
5958         /*
5959          * It could be there's still a few 0-ref events on the list; they'll
5960          * get cleaned up by free_event() -- they'll also still have their
5961          * ref on the rb and will free it whenever they are done with it.
5962          *
5963          * Aside from that, this buffer is 'fully' detached and unmapped,
5964          * undo the VM accounting.
5965          */
5966 
5967         atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
5968                         &mmap_user->locked_vm);
5969         atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
5970         free_uid(mmap_user);
5971 
5972 out_put:
5973         ring_buffer_put(rb); /* could be last */
5974 }
5975 
5976 static const struct vm_operations_struct perf_mmap_vmops = {
5977         .open           = perf_mmap_open,
5978         .close          = perf_mmap_close, /* non mergeable */
5979         .fault          = perf_mmap_fault,
5980         .page_mkwrite   = perf_mmap_fault,
5981 };
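
/*
 * [Editor's sketch, not part of core.c] Userspace counterpart of perf_mmap()
 * below for the vm_pgoff == 0 case: the mapping covers the user page plus a
 * power-of-two number of data pages. demo_* is hypothetical; the caller must
 * check for MAP_FAILED.
 */
#if 0	/* illustration only: standalone userspace code */
#include <sys/mman.h>
#include <unistd.h>

static void *demo_map_ring(int perf_fd, unsigned int data_pages /* 2^n */)
{
	size_t len = (size_t)(1 + data_pages) * sysconf(_SC_PAGESIZE);

	return mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, perf_fd, 0);
}
#endif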
5982 
5983 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
5984 {
5985         struct perf_event *event = file->private_data;
5986         unsigned long user_locked, user_lock_limit;
5987         struct user_struct *user = current_user();
5988         struct perf_buffer *rb = NULL;
5989         unsigned long locked, lock_limit;
5990         unsigned long vma_size;
5991         unsigned long nr_pages;
5992         long user_extra = 0, extra = 0;
5993         int ret = 0, flags = 0;
5994 
5995         /*
5996          * Don't allow mmap() of inherited per-task counters. This would
5997          * create a performance issue due to all children writing to the
5998          * same rb.
5999          */
6000         if (event->cpu == -1 && event->attr.inherit)
6001                 return -EINVAL;
6002 
6003         if (!(vma->vm_flags & VM_SHARED))
6004                 return -EINVAL;
6005 
6006         ret = security_perf_event_read(event);
6007         if (ret)
6008                 return ret;
6009 
6010         vma_size = vma->vm_end - vma->vm_start;
6011 
6012         if (vma->vm_pgoff == 0) {
6013                 nr_pages = (vma_size / PAGE_SIZE) - 1;
6014         } else {
6015                 /*
6016                  * AUX area mapping: if rb->aux_nr_pages != 0, it's already
6017                  * mapped, all subsequent mappings should have the same size
6018                  * and offset. Must be above the normal perf buffer.
6019                  */
6020                 u64 aux_offset, aux_size;
6021 
6022                 if (!event->rb)
6023                         return -EINVAL;
6024 
6025                 nr_pages = vma_size / PAGE_SIZE;
6026 
6027                 mutex_lock(&event->mmap_mutex);
6028                 ret = -EINVAL;
6029 
6030                 rb = event->rb;
6031                 if (!rb)
6032                         goto aux_unlock;