TOMOYO Linux Cross Reference
Linux/kernel/events/core.c

  1 // SPDX-License-Identifier: GPL-2.0
  2 /*
  3  * Performance events core code:
  4  *
  5  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
  6  *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
  7  *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
  8  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  9  */
 10 
 11 #include <linux/fs.h>
 12 #include <linux/mm.h>
 13 #include <linux/cpu.h>
 14 #include <linux/smp.h>
 15 #include <linux/idr.h>
 16 #include <linux/file.h>
 17 #include <linux/poll.h>
 18 #include <linux/slab.h>
 19 #include <linux/hash.h>
 20 #include <linux/tick.h>
 21 #include <linux/sysfs.h>
 22 #include <linux/dcache.h>
 23 #include <linux/percpu.h>
 24 #include <linux/ptrace.h>
 25 #include <linux/reboot.h>
 26 #include <linux/vmstat.h>
 27 #include <linux/device.h>
 28 #include <linux/export.h>
 29 #include <linux/vmalloc.h>
 30 #include <linux/hardirq.h>
 31 #include <linux/hugetlb.h>
 32 #include <linux/rculist.h>
 33 #include <linux/uaccess.h>
 34 #include <linux/syscalls.h>
 35 #include <linux/anon_inodes.h>
 36 #include <linux/kernel_stat.h>
 37 #include <linux/cgroup.h>
 38 #include <linux/perf_event.h>
 39 #include <linux/trace_events.h>
 40 #include <linux/hw_breakpoint.h>
 41 #include <linux/mm_types.h>
 42 #include <linux/module.h>
 43 #include <linux/mman.h>
 44 #include <linux/compat.h>
 45 #include <linux/bpf.h>
 46 #include <linux/filter.h>
 47 #include <linux/namei.h>
 48 #include <linux/parser.h>
 49 #include <linux/sched/clock.h>
 50 #include <linux/sched/mm.h>
 51 #include <linux/proc_ns.h>
 52 #include <linux/mount.h>
 53 #include <linux/min_heap.h>
 54 #include <linux/highmem.h>
 55 #include <linux/pgtable.h>
 56 
 57 #include "internal.h"
 58 
 59 #include <asm/irq_regs.h>
 60 
 61 typedef int (*remote_function_f)(void *);
 62 
 63 struct remote_function_call {
 64         struct task_struct      *p;
 65         remote_function_f       func;
 66         void                    *info;
 67         int                     ret;
 68 };
 69 
 70 static void remote_function(void *data)
 71 {
 72         struct remote_function_call *tfc = data;
 73         struct task_struct *p = tfc->p;
 74 
 75         if (p) {
 76                 /* task moved away; leave tfc->ret at -EAGAIN so the caller retries */
 77                 if (task_cpu(p) != smp_processor_id())
 78                         return;
 79 
 80                 /*
 81                  * Now that we're on the right CPU with IRQs disabled, we can test
 82                  * if we hit the right task without races.
 83                  */
 84 
 85                 tfc->ret = -ESRCH; /* No such (running) process */
 86                 if (p != current)
 87                         return;
 88         }
 89 
 90         tfc->ret = tfc->func(tfc->info);
 91 }
 92 
 93 /**
 94  * task_function_call - call a function on the cpu on which a task runs
 95  * @p:          the task to evaluate
 96  * @func:       the function to be called
 97  * @info:       the function call argument
 98  *
 99  * Calls the function @func when the task is currently running. This might
 100  * be on the current CPU, in which case the function is called directly.  This will
101  * retry due to any failures in smp_call_function_single(), such as if the
102  * task_cpu() goes offline concurrently.
103  *
 104  * returns @func's return value, or -ESRCH when the task isn't running, or -ENXIO when its CPU is offline
105  */
106 static int
107 task_function_call(struct task_struct *p, remote_function_f func, void *info)
108 {
109         struct remote_function_call data = {
110                 .p      = p,
111                 .func   = func,
112                 .info   = info,
113                 .ret    = -EAGAIN,
114         };
115         int ret;
116 
117         for (;;) {
118                 ret = smp_call_function_single(task_cpu(p), remote_function,
119                                                &data, 1);
120                 if (!ret)
121                         ret = data.ret;
122 
123                 if (ret != -EAGAIN)
124                         break;
125 
126                 cond_resched();
127         }
128 
129         return ret;
130 }
131 
132 /**
 133  * cpu_function_call - call a function on @cpu
134  * @func:       the function to be called
135  * @info:       the function call argument
136  *
137  * Calls the function @func on the remote cpu.
138  *
139  * returns: @func return value or -ENXIO when the cpu is offline
140  */
141 static int cpu_function_call(int cpu, remote_function_f func, void *info)
142 {
143         struct remote_function_call data = {
144                 .p      = NULL,
145                 .func   = func,
146                 .info   = info,
147                 .ret    = -ENXIO, /* No such CPU */
148         };
149 
150         smp_call_function_single(cpu, remote_function, &data, 1);
151 
152         return data.ret;
153 }
154 
155 static inline struct perf_cpu_context *
156 __get_cpu_context(struct perf_event_context *ctx)
157 {
158         return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
159 }
160 
161 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
162                           struct perf_event_context *ctx)
163 {
164         raw_spin_lock(&cpuctx->ctx.lock);
165         if (ctx)
166                 raw_spin_lock(&ctx->lock);
167 }
168 
169 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
170                             struct perf_event_context *ctx)
171 {
172         if (ctx)
173                 raw_spin_unlock(&ctx->lock);
174         raw_spin_unlock(&cpuctx->ctx.lock);
175 }
176 
177 #define TASK_TOMBSTONE ((void *)-1L)
178 
179 static bool is_kernel_event(struct perf_event *event)
180 {
181         return READ_ONCE(event->owner) == TASK_TOMBSTONE;
182 }
183 
184 /*
185  * On task ctx scheduling...
186  *
187  * When !ctx->nr_events a task context will not be scheduled. This means
188  * we can disable the scheduler hooks (for performance) without leaving
189  * pending task ctx state.
190  *
191  * This however results in two special cases:
192  *
 193  *  - removing the last event from a task ctx; this is relatively
 194  *    straightforward and is done in __perf_remove_from_context.
195  *
196  *  - adding the first event to a task ctx; this is tricky because we cannot
197  *    rely on ctx->is_active and therefore cannot use event_function_call().
198  *    See perf_install_in_context().
199  *
200  * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
201  */
202 
203 typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
204                         struct perf_event_context *, void *);
205 
206 struct event_function_struct {
207         struct perf_event *event;
208         event_f func;
209         void *data;
210 };
211 
212 static int event_function(void *info)
213 {
214         struct event_function_struct *efs = info;
215         struct perf_event *event = efs->event;
216         struct perf_event_context *ctx = event->ctx;
217         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
218         struct perf_event_context *task_ctx = cpuctx->task_ctx;
219         int ret = 0;
220 
221         lockdep_assert_irqs_disabled();
222 
223         perf_ctx_lock(cpuctx, task_ctx);
224         /*
 225          * Since we do the IPI call without holding ctx->lock, things can have
 226          * changed; double check we hit the task we set out to hit.
227          */
228         if (ctx->task) {
229                 if (ctx->task != current) {
230                         ret = -ESRCH;
231                         goto unlock;
232                 }
233 
234                 /*
235                  * We only use event_function_call() on established contexts,
236                  * and event_function() is only ever called when active (or
237                  * rather, we'll have bailed in task_function_call() or the
238                  * above ctx->task != current test), therefore we must have
239                  * ctx->is_active here.
240                  */
241                 WARN_ON_ONCE(!ctx->is_active);
242                 /*
243                  * And since we have ctx->is_active, cpuctx->task_ctx must
244                  * match.
245                  */
246                 WARN_ON_ONCE(task_ctx != ctx);
247         } else {
248                 WARN_ON_ONCE(&cpuctx->ctx != ctx);
249         }
250 
251         efs->func(event, cpuctx, ctx, efs->data);
252 unlock:
253         perf_ctx_unlock(cpuctx, task_ctx);
254 
255         return ret;
256 }
257 
258 static void event_function_call(struct perf_event *event, event_f func, void *data)
259 {
260         struct perf_event_context *ctx = event->ctx;
261         struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
262         struct event_function_struct efs = {
263                 .event = event,
264                 .func = func,
265                 .data = data,
266         };
267 
268         if (!event->parent) {
269                 /*
270                  * If this is a !child event, we must hold ctx::mutex to
 271                  * stabilize the event->ctx relation. See
272                  * perf_event_ctx_lock().
273                  */
274                 lockdep_assert_held(&ctx->mutex);
275         }
276 
277         if (!task) {
278                 cpu_function_call(event->cpu, event_function, &efs);
279                 return;
280         }
281 
282         if (task == TASK_TOMBSTONE)
283                 return;
284 
285 again:
286         if (!task_function_call(task, event_function, &efs))
287                 return;
288 
289         raw_spin_lock_irq(&ctx->lock);
290         /*
291          * Reload the task pointer, it might have been changed by
292          * a concurrent perf_event_context_sched_out().
293          */
294         task = ctx->task;
295         if (task == TASK_TOMBSTONE) {
296                 raw_spin_unlock_irq(&ctx->lock);
297                 return;
298         }
299         if (ctx->is_active) {
300                 raw_spin_unlock_irq(&ctx->lock);
301                 goto again;
302         }
303         func(event, NULL, ctx, data);
304         raw_spin_unlock_irq(&ctx->lock);
305 }
306 
307 /*
308  * Similar to event_function_call() + event_function(), but hard assumes IRQs
309  * are already disabled and we're on the right CPU.
310  */
311 static void event_function_local(struct perf_event *event, event_f func, void *data)
312 {
313         struct perf_event_context *ctx = event->ctx;
314         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
315         struct task_struct *task = READ_ONCE(ctx->task);
316         struct perf_event_context *task_ctx = NULL;
317 
318         lockdep_assert_irqs_disabled();
319 
320         if (task) {
321                 if (task == TASK_TOMBSTONE)
322                         return;
323 
324                 task_ctx = ctx;
325         }
326 
327         perf_ctx_lock(cpuctx, task_ctx);
328 
329         task = ctx->task;
330         if (task == TASK_TOMBSTONE)
331                 goto unlock;
332 
333         if (task) {
334                 /*
335                  * We must be either inactive or active and the right task,
336                  * otherwise we're screwed, since we cannot IPI to somewhere
337                  * else.
338                  */
339                 if (ctx->is_active) {
340                         if (WARN_ON_ONCE(task != current))
341                                 goto unlock;
342 
343                         if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
344                                 goto unlock;
345                 }
346         } else {
347                 WARN_ON_ONCE(&cpuctx->ctx != ctx);
348         }
349 
350         func(event, cpuctx, ctx, data);
351 unlock:
352         perf_ctx_unlock(cpuctx, task_ctx);
353 }
354 
355 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
356                        PERF_FLAG_FD_OUTPUT  |\
357                        PERF_FLAG_PID_CGROUP |\
358                        PERF_FLAG_FD_CLOEXEC)
359 
360 /*
361  * branch priv levels that need permission checks
362  */
363 #define PERF_SAMPLE_BRANCH_PERM_PLM \
364         (PERF_SAMPLE_BRANCH_KERNEL |\
365          PERF_SAMPLE_BRANCH_HV)
366 
367 enum event_type_t {
368         EVENT_FLEXIBLE = 0x1,
369         EVENT_PINNED = 0x2,
370         EVENT_TIME = 0x4,
371         /* see ctx_resched() for details */
372         EVENT_CPU = 0x8,
373         EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
374 };
375 
376 /*
377  * perf_sched_events : >0 events exist
378  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
379  */
380 
381 static void perf_sched_delayed(struct work_struct *work);
382 DEFINE_STATIC_KEY_FALSE(perf_sched_events);
383 static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
384 static DEFINE_MUTEX(perf_sched_mutex);
385 static atomic_t perf_sched_count;
386 
387 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
388 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
389 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
390 
391 static atomic_t nr_mmap_events __read_mostly;
392 static atomic_t nr_comm_events __read_mostly;
393 static atomic_t nr_namespaces_events __read_mostly;
394 static atomic_t nr_task_events __read_mostly;
395 static atomic_t nr_freq_events __read_mostly;
396 static atomic_t nr_switch_events __read_mostly;
397 static atomic_t nr_ksymbol_events __read_mostly;
398 static atomic_t nr_bpf_events __read_mostly;
399 static atomic_t nr_cgroup_events __read_mostly;
400 static atomic_t nr_text_poke_events __read_mostly;
401 
402 static LIST_HEAD(pmus);
403 static DEFINE_MUTEX(pmus_lock);
404 static struct srcu_struct pmus_srcu;
405 static cpumask_var_t perf_online_mask;
406 
407 /*
408  * perf event paranoia level:
409  *  -1 - not paranoid at all
410  *   0 - disallow raw tracepoint access for unpriv
411  *   1 - disallow cpu events for unpriv
412  *   2 - disallow kernel profiling for unpriv
413  */
414 int sysctl_perf_event_paranoid __read_mostly = 2;
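/*
 * The level can be changed at runtime through the kernel.perf_event_paranoid
 * sysctl; a rough illustration (run as root):
 *
 *   # sysctl kernel.perf_event_paranoid=1
 *   # echo -1 > /proc/sys/kernel/perf_event_paranoid
 *
 * With the default of 2, unprivileged users are limited to user-space
 * profiling of their own tasks; tasks with CAP_PERFMON (or CAP_SYS_ADMIN on
 * kernels without it) bypass these checks.
 */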
415 
416 /* Minimum for 512 kiB + 1 user control page */
417 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
418 
419 /*
420  * max perf event sample rate
421  */
422 #define DEFAULT_MAX_SAMPLE_RATE         100000
423 #define DEFAULT_SAMPLE_PERIOD_NS        (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
424 #define DEFAULT_CPU_TIME_MAX_PERCENT    25
425 
426 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
427 
428 static int max_samples_per_tick __read_mostly   = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
429 static int perf_sample_period_ns __read_mostly  = DEFAULT_SAMPLE_PERIOD_NS;
430 
431 static int perf_sample_allowed_ns __read_mostly =
432         DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
433 
434 static void update_perf_cpu_limits(void)
435 {
436         u64 tmp = perf_sample_period_ns;
437 
438         tmp *= sysctl_perf_cpu_time_max_percent;
439         tmp = div_u64(tmp, 100);
440         if (!tmp)
441                 tmp = 1;
442 
443         WRITE_ONCE(perf_sample_allowed_ns, tmp);
444 }
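/*
 * Worked example with the defaults above: perf_sample_period_ns starts out
 * as NSEC_PER_SEC / 100000 = 10000ns and sysctl_perf_cpu_time_max_percent
 * is 25, so perf_sample_allowed_ns becomes 10000 * 25 / 100 = 2500ns, i.e.
 * sampling may consume roughly 2.5us of CPU per 10us sample period before
 * perf_sample_event_took() starts backing the sample rate off.
 */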
445 
446 static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
447 
448 int perf_proc_update_handler(struct ctl_table *table, int write,
449                 void *buffer, size_t *lenp, loff_t *ppos)
450 {
451         int ret;
452         int perf_cpu = sysctl_perf_cpu_time_max_percent;
453         /*
 454          * If throttling is disabled, don't allow the write:
455          */
456         if (write && (perf_cpu == 100 || perf_cpu == 0))
457                 return -EINVAL;
458 
459         ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
460         if (ret || !write)
461                 return ret;
462 
463         max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
464         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
465         update_perf_cpu_limits();
466 
467         return 0;
468 }
469 
470 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
471 
472 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
473                 void *buffer, size_t *lenp, loff_t *ppos)
474 {
475         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
476 
477         if (ret || !write)
478                 return ret;
479 
480         if (sysctl_perf_cpu_time_max_percent == 100 ||
481             sysctl_perf_cpu_time_max_percent == 0) {
482                 printk(KERN_WARNING
483                        "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
484                 WRITE_ONCE(perf_sample_allowed_ns, 0);
485         } else {
486                 update_perf_cpu_limits();
487         }
488 
489         return 0;
490 }
491 
492 /*
493  * perf samples are done in some very critical code paths (NMIs).
494  * If they take too much CPU time, the system can lock up and not
495  * get any real work done.  This will drop the sample rate when
496  * we detect that events are taking too long.
497  */
498 #define NR_ACCUMULATED_SAMPLES 128
499 static DEFINE_PER_CPU(u64, running_sample_length);
500 
501 static u64 __report_avg;
502 static u64 __report_allowed;
503 
504 static void perf_duration_warn(struct irq_work *w)
505 {
506         printk_ratelimited(KERN_INFO
507                 "perf: interrupt took too long (%lld > %lld), lowering "
508                 "kernel.perf_event_max_sample_rate to %d\n",
509                 __report_avg, __report_allowed,
510                 sysctl_perf_event_sample_rate);
511 }
512 
513 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
514 
515 void perf_sample_event_took(u64 sample_len_ns)
516 {
517         u64 max_len = READ_ONCE(perf_sample_allowed_ns);
518         u64 running_len;
519         u64 avg_len;
520         u32 max;
521 
522         if (max_len == 0)
523                 return;
524 
525         /* Decay the counter by 1 average sample. */
526         running_len = __this_cpu_read(running_sample_length);
527         running_len -= running_len/NR_ACCUMULATED_SAMPLES;
528         running_len += sample_len_ns;
529         __this_cpu_write(running_sample_length, running_len);
530 
531         /*
 532          * Note: this will be biased artificially low until we have
533          * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
534          * from having to maintain a count.
535          */
536         avg_len = running_len/NR_ACCUMULATED_SAMPLES;
537         if (avg_len <= max_len)
538                 return;
539 
540         __report_avg = avg_len;
541         __report_allowed = max_len;
542 
543         /*
 544          * Compute a new limit with 25% headroom over the current average duration.
545          */
546         avg_len += avg_len / 4;
547         max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
548         if (avg_len < max)
549                 max /= (u32)avg_len;
550         else
551                 max = 1;
552 
553         WRITE_ONCE(perf_sample_allowed_ns, avg_len);
554         WRITE_ONCE(max_samples_per_tick, max);
555 
556         sysctl_perf_event_sample_rate = max * HZ;
557         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
558 
559         if (!irq_work_queue(&perf_duration_work)) {
560                 early_printk("perf: interrupt took too long (%lld > %lld), lowering "
561                              "kernel.perf_event_max_sample_rate to %d\n",
562                              __report_avg, __report_allowed,
563                              sysctl_perf_event_sample_rate);
564         }
565 }
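/*
 * Rough numerical sketch of the throttling above, assuming HZ=1000 (so
 * TICK_NSEC = 1000000ns) and the default 25% budget: running_sample_length
 * decays by 1/128 per sample, so avg_len approximates the recent cost of
 * one sample.  If avg_len = 4000ns exceeds max_len = 2500ns, it is inflated
 * by 25% to 5000ns and:
 *
 *   max = (1000000 / 100) * 25 = 250000;  max /= 5000  =>  50 samples/tick
 *
 * which lowers kernel.perf_event_max_sample_rate to 50 * HZ = 50000 and
 * raises perf_sample_allowed_ns to 5000ns.
 */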
566 
567 static atomic64_t perf_event_id;
568 
569 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
570                               enum event_type_t event_type);
571 
572 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
573                              enum event_type_t event_type,
574                              struct task_struct *task);
575 
576 static void update_context_time(struct perf_event_context *ctx);
577 static u64 perf_event_time(struct perf_event *event);
578 
579 void __weak perf_event_print_debug(void)        { }
580 
581 extern __weak const char *perf_pmu_name(void)
582 {
583         return "pmu";
584 }
585 
586 static inline u64 perf_clock(void)
587 {
588         return local_clock();
589 }
590 
591 static inline u64 perf_event_clock(struct perf_event *event)
592 {
593         return event->clock();
594 }
595 
596 /*
597  * State based event timekeeping...
598  *
599  * The basic idea is to use event->state to determine which (if any) time
600  * fields to increment with the current delta. This means we only need to
601  * update timestamps when we change state or when they are explicitly requested
602  * (read).
603  *
604  * Event groups make things a little more complicated, but not terribly so. The
605  * rules for a group are that if the group leader is OFF the entire group is
 606  * OFF, irrespective of what the group member states are. This results in
607  * __perf_effective_state().
608  *
 609  * A further ramification is that when a group leader flips between OFF and
610  * !OFF, we need to update all group member times.
611  *
612  *
613  * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
614  * need to make sure the relevant context time is updated before we try and
615  * update our timestamps.
616  */
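/*
 * For example: an event enabled at t=0, scheduled onto the PMU at t=2 and
 * read at t=5 while still ACTIVE reports total_time_enabled = 5 and
 * total_time_running = 3.  Had its group leader been OFF the whole time,
 * __perf_effective_state() would report OFF and neither time would advance.
 */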
617 
618 static __always_inline enum perf_event_state
619 __perf_effective_state(struct perf_event *event)
620 {
621         struct perf_event *leader = event->group_leader;
622 
623         if (leader->state <= PERF_EVENT_STATE_OFF)
624                 return leader->state;
625 
626         return event->state;
627 }
628 
629 static __always_inline void
630 __perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
631 {
632         enum perf_event_state state = __perf_effective_state(event);
633         u64 delta = now - event->tstamp;
634 
635         *enabled = event->total_time_enabled;
636         if (state >= PERF_EVENT_STATE_INACTIVE)
637                 *enabled += delta;
638 
639         *running = event->total_time_running;
640         if (state >= PERF_EVENT_STATE_ACTIVE)
641                 *running += delta;
642 }
643 
644 static void perf_event_update_time(struct perf_event *event)
645 {
646         u64 now = perf_event_time(event);
647 
648         __perf_update_times(event, now, &event->total_time_enabled,
649                                         &event->total_time_running);
650         event->tstamp = now;
651 }
652 
653 static void perf_event_update_sibling_time(struct perf_event *leader)
654 {
655         struct perf_event *sibling;
656 
657         for_each_sibling_event(sibling, leader)
658                 perf_event_update_time(sibling);
659 }
660 
661 static void
662 perf_event_set_state(struct perf_event *event, enum perf_event_state state)
663 {
664         if (event->state == state)
665                 return;
666 
667         perf_event_update_time(event);
668         /*
669          * If a group leader gets enabled/disabled all its siblings
670          * are affected too.
671          */
672         if ((event->state < 0) ^ (state < 0))
673                 perf_event_update_sibling_time(event);
674 
675         WRITE_ONCE(event->state, state);
676 }
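/*
 * Note on the (event->state < 0) ^ (state < 0) test above: with
 * PERF_EVENT_STATE_OFF = -1, INACTIVE = 0 and ACTIVE = 1, "state < 0" is
 * exactly the "leader is OFF (or worse)" condition used by
 * __perf_effective_state(), so sibling times only need refreshing when that
 * boundary is crossed (e.g. OFF -> INACTIVE, but not INACTIVE -> ACTIVE).
 */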
677 
678 #ifdef CONFIG_CGROUP_PERF
679 
680 static inline bool
681 perf_cgroup_match(struct perf_event *event)
682 {
683         struct perf_event_context *ctx = event->ctx;
684         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
685 
686         /* @event doesn't care about cgroup */
687         if (!event->cgrp)
688                 return true;
689 
690         /* wants specific cgroup scope but @cpuctx isn't associated with any */
691         if (!cpuctx->cgrp)
692                 return false;
693 
694         /*
695          * Cgroup scoping is recursive.  An event enabled for a cgroup is
696          * also enabled for all its descendant cgroups.  If @cpuctx's
 697          * cgroup is a descendant of @event's (the test also covers the identity
698          * case), it's a match.
699          */
700         return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
701                                     event->cgrp->css.cgroup);
702 }
703 
704 static inline void perf_detach_cgroup(struct perf_event *event)
705 {
706         css_put(&event->cgrp->css);
707         event->cgrp = NULL;
708 }
709 
710 static inline int is_cgroup_event(struct perf_event *event)
711 {
712         return event->cgrp != NULL;
713 }
714 
715 static inline u64 perf_cgroup_event_time(struct perf_event *event)
716 {
717         struct perf_cgroup_info *t;
718 
719         t = per_cpu_ptr(event->cgrp->info, event->cpu);
720         return t->time;
721 }
722 
723 static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
724 {
725         struct perf_cgroup_info *info;
726         u64 now;
727 
728         now = perf_clock();
729 
730         info = this_cpu_ptr(cgrp->info);
731 
732         info->time += now - info->timestamp;
733         info->timestamp = now;
734 }
735 
736 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
737 {
738         struct perf_cgroup *cgrp = cpuctx->cgrp;
739         struct cgroup_subsys_state *css;
740 
741         if (cgrp) {
742                 for (css = &cgrp->css; css; css = css->parent) {
743                         cgrp = container_of(css, struct perf_cgroup, css);
744                         __update_cgrp_time(cgrp);
745                 }
746         }
747 }
748 
749 static inline void update_cgrp_time_from_event(struct perf_event *event)
750 {
751         struct perf_cgroup *cgrp;
752 
753         /*
754          * ensure we access cgroup data only when needed and
755          * when we know the cgroup is pinned (css_get)
756          */
757         if (!is_cgroup_event(event))
758                 return;
759 
760         cgrp = perf_cgroup_from_task(current, event->ctx);
761         /*
 762          * Do not update time when the cgroup is not active
763          */
764         if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
765                 __update_cgrp_time(event->cgrp);
766 }
767 
768 static inline void
769 perf_cgroup_set_timestamp(struct task_struct *task,
770                           struct perf_event_context *ctx)
771 {
772         struct perf_cgroup *cgrp;
773         struct perf_cgroup_info *info;
774         struct cgroup_subsys_state *css;
775 
776         /*
777          * ctx->lock held by caller
778          * ensure we do not access cgroup data
779          * unless we have the cgroup pinned (css_get)
780          */
781         if (!task || !ctx->nr_cgroups)
782                 return;
783 
784         cgrp = perf_cgroup_from_task(task, ctx);
785 
786         for (css = &cgrp->css; css; css = css->parent) {
787                 cgrp = container_of(css, struct perf_cgroup, css);
788                 info = this_cpu_ptr(cgrp->info);
789                 info->timestamp = ctx->timestamp;
790         }
791 }
792 
793 static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
794 
795 #define PERF_CGROUP_SWOUT       0x1 /* cgroup switch out every event */
796 #define PERF_CGROUP_SWIN        0x2 /* cgroup switch in events based on task */
797 
798 /*
 799  * reschedule events based on the cgroup constraint of the task.
 800  *
 801  * mode SWOUT : schedule out everything
 802  * mode SWIN  : schedule in based on the cgroup of the incoming task
803  */
804 static void perf_cgroup_switch(struct task_struct *task, int mode)
805 {
806         struct perf_cpu_context *cpuctx;
807         struct list_head *list;
808         unsigned long flags;
809 
810         /*
811          * Disable interrupts and preemption to avoid this CPU's
812          * cgrp_cpuctx_entry to change under us.
813          */
814         local_irq_save(flags);
815 
816         list = this_cpu_ptr(&cgrp_cpuctx_list);
817         list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
818                 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
819 
820                 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
821                 perf_pmu_disable(cpuctx->ctx.pmu);
822 
823                 if (mode & PERF_CGROUP_SWOUT) {
824                         cpu_ctx_sched_out(cpuctx, EVENT_ALL);
825                         /*
826                          * must not be done before ctxswout due
827                          * to event_filter_match() in event_sched_out()
828                          */
829                         cpuctx->cgrp = NULL;
830                 }
831 
832                 if (mode & PERF_CGROUP_SWIN) {
833                         WARN_ON_ONCE(cpuctx->cgrp);
834                         /*
835                          * set cgrp before ctxsw in to allow
836                          * event_filter_match() to not have to pass
837                          * task around
838                          * we pass the cpuctx->ctx to perf_cgroup_from_task()
 839                          * because cgroup events are only per-cpu
840                          */
841                         cpuctx->cgrp = perf_cgroup_from_task(task,
842                                                              &cpuctx->ctx);
843                         cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
844                 }
845                 perf_pmu_enable(cpuctx->ctx.pmu);
846                 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
847         }
848 
849         local_irq_restore(flags);
850 }
851 
852 static inline void perf_cgroup_sched_out(struct task_struct *task,
853                                          struct task_struct *next)
854 {
855         struct perf_cgroup *cgrp1;
856         struct perf_cgroup *cgrp2 = NULL;
857 
858         rcu_read_lock();
859         /*
860          * we come here when we know perf_cgroup_events > 0
861          * we do not need to pass the ctx here because we know
862          * we are holding the rcu lock
863          */
864         cgrp1 = perf_cgroup_from_task(task, NULL);
865         cgrp2 = perf_cgroup_from_task(next, NULL);
866 
867         /*
868          * only schedule out current cgroup events if we know
869          * that we are switching to a different cgroup. Otherwise,
870          * do no touch the cgroup events.
871          */
872         if (cgrp1 != cgrp2)
873                 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
874 
875         rcu_read_unlock();
876 }
877 
878 static inline void perf_cgroup_sched_in(struct task_struct *prev,
879                                         struct task_struct *task)
880 {
881         struct perf_cgroup *cgrp1;
882         struct perf_cgroup *cgrp2 = NULL;
883 
884         rcu_read_lock();
885         /*
886          * we come here when we know perf_cgroup_events > 0
887          * we do not need to pass the ctx here because we know
888          * we are holding the rcu lock
889          */
890         cgrp1 = perf_cgroup_from_task(task, NULL);
891         cgrp2 = perf_cgroup_from_task(prev, NULL);
892 
893         /*
894          * only need to schedule in cgroup events if we are changing
895          * cgroup during ctxsw. Cgroup events were not scheduled
 896          * out during ctxsw if that was not the case.
897          */
898         if (cgrp1 != cgrp2)
899                 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
900 
901         rcu_read_unlock();
902 }
903 
904 static int perf_cgroup_ensure_storage(struct perf_event *event,
905                                 struct cgroup_subsys_state *css)
906 {
907         struct perf_cpu_context *cpuctx;
908         struct perf_event **storage;
909         int cpu, heap_size, ret = 0;
910 
911         /*
 912          * Allow storage to have sufficient space for an iterator for each
913          * possibly nested cgroup plus an iterator for events with no cgroup.
914          */
915         for (heap_size = 1; css; css = css->parent)
916                 heap_size++;
917 
918         for_each_possible_cpu(cpu) {
919                 cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
920                 if (heap_size <= cpuctx->heap_size)
921                         continue;
922 
923                 storage = kmalloc_node(heap_size * sizeof(struct perf_event *),
924                                        GFP_KERNEL, cpu_to_node(cpu));
925                 if (!storage) {
926                         ret = -ENOMEM;
927                         break;
928                 }
929 
930                 raw_spin_lock_irq(&cpuctx->ctx.lock);
931                 if (cpuctx->heap_size < heap_size) {
932                         swap(cpuctx->heap, storage);
933                         if (storage == cpuctx->heap_default)
934                                 storage = NULL;
935                         cpuctx->heap_size = heap_size;
936                 }
937                 raw_spin_unlock_irq(&cpuctx->ctx.lock);
938 
939                 kfree(storage);
940         }
941 
942         return ret;
943 }
944 
945 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
946                                       struct perf_event_attr *attr,
947                                       struct perf_event *group_leader)
948 {
949         struct perf_cgroup *cgrp;
950         struct cgroup_subsys_state *css;
951         struct fd f = fdget(fd);
952         int ret = 0;
953 
954         if (!f.file)
955                 return -EBADF;
956 
957         css = css_tryget_online_from_dir(f.file->f_path.dentry,
958                                          &perf_event_cgrp_subsys);
959         if (IS_ERR(css)) {
960                 ret = PTR_ERR(css);
961                 goto out;
962         }
963 
964         ret = perf_cgroup_ensure_storage(event, css);
965         if (ret)
966                 goto out;
967 
968         cgrp = container_of(css, struct perf_cgroup, css);
969         event->cgrp = cgrp;
970 
971         /*
972          * all events in a group must monitor
973          * the same cgroup because a task belongs
974          * to only one perf cgroup at a time
975          */
976         if (group_leader && group_leader->cgrp != cgrp) {
977                 perf_detach_cgroup(event);
978                 ret = -EINVAL;
979         }
980 out:
981         fdput(f);
982         return ret;
983 }
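/*
 * Userspace reaches this path by passing PERF_FLAG_PID_CGROUP to
 * perf_event_open(), with the pid argument reinterpreted as a file
 * descriptor for a cgroup directory.  A minimal sketch (illustrative only,
 * the cgroup path is made up and error handling is omitted):
 *
 *   int cgrp_fd = open("/sys/fs/cgroup/perf_event/mygrp", O_RDONLY);
 *   int ev_fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, cpu,
 *                       -1, PERF_FLAG_PID_CGROUP);
 *
 * A real cpu number is required because cgroup events are always per-cpu
 * events, and all events in a group must monitor the same cgroup.
 */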
984 
985 static inline void
986 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
987 {
988         struct perf_cgroup_info *t;
989         t = per_cpu_ptr(event->cgrp->info, event->cpu);
990         event->shadow_ctx_time = now - t->timestamp;
991 }
992 
993 static inline void
994 perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
995 {
996         struct perf_cpu_context *cpuctx;
997 
998         if (!is_cgroup_event(event))
999                 return;
1000 
1001         /*
1002          * Because cgroup events are always per-cpu events,
1003          * @ctx == &cpuctx->ctx.
1004          */
1005         cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
1006 
1007         /*
1008          * Since setting cpuctx->cgrp is conditional on the current @cgrp
1009          * matching the event's cgroup, we must do this for every new event,
1010          * because if the first would mismatch, the second would not try again
1011          * and we would leave cpuctx->cgrp unset.
1012          */
1013         if (ctx->is_active && !cpuctx->cgrp) {
1014                 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
1015 
1016                 if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
1017                         cpuctx->cgrp = cgrp;
1018         }
1019 
1020         if (ctx->nr_cgroups++)
1021                 return;
1022 
1023         list_add(&cpuctx->cgrp_cpuctx_entry,
1024                         per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
1025 }
1026 
1027 static inline void
1028 perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1029 {
1030         struct perf_cpu_context *cpuctx;
1031 
1032         if (!is_cgroup_event(event))
1033                 return;
1034 
1035         /*
1036          * Because cgroup events are always per-cpu events,
1037          * @ctx == &cpuctx->ctx.
1038          */
1039         cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
1040 
1041         if (--ctx->nr_cgroups)
1042                 return;
1043 
1044         if (ctx->is_active && cpuctx->cgrp)
1045                 cpuctx->cgrp = NULL;
1046 
1047         list_del(&cpuctx->cgrp_cpuctx_entry);
1048 }
1049 
1050 #else /* !CONFIG_CGROUP_PERF */
1051 
1052 static inline bool
1053 perf_cgroup_match(struct perf_event *event)
1054 {
1055         return true;
1056 }
1057 
1058 static inline void perf_detach_cgroup(struct perf_event *event)
1059 {}
1060 
1061 static inline int is_cgroup_event(struct perf_event *event)
1062 {
1063         return 0;
1064 }
1065 
1066 static inline void update_cgrp_time_from_event(struct perf_event *event)
1067 {
1068 }
1069 
1070 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
1071 {
1072 }
1073 
1074 static inline void perf_cgroup_sched_out(struct task_struct *task,
1075                                          struct task_struct *next)
1076 {
1077 }
1078 
1079 static inline void perf_cgroup_sched_in(struct task_struct *prev,
1080                                         struct task_struct *task)
1081 {
1082 }
1083 
1084 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
1085                                       struct perf_event_attr *attr,
1086                                       struct perf_event *group_leader)
1087 {
1088         return -EINVAL;
1089 }
1090 
1091 static inline void
1092 perf_cgroup_set_timestamp(struct task_struct *task,
1093                           struct perf_event_context *ctx)
1094 {
1095 }
1096 
1097 static inline void
1098 perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
1099 {
1100 }
1101 
1102 static inline void
1103 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
1104 {
1105 }
1106 
1107 static inline u64 perf_cgroup_event_time(struct perf_event *event)
1108 {
1109         return 0;
1110 }
1111 
1112 static inline void
1113 perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
1114 {
1115 }
1116 
1117 static inline void
1118 perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1119 {
1120 }
1121 #endif
1122 
1123 /*
1124  * set default to be dependent on timer tick just
1125  * like the original code
1126  */
1127 #define PERF_CPU_HRTIMER (1000 / HZ)
1128 /*
1129  * function must be called with interrupts disabled
1130  */
1131 static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
1132 {
1133         struct perf_cpu_context *cpuctx;
1134         bool rotations;
1135 
1136         lockdep_assert_irqs_disabled();
1137 
1138         cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
1139         rotations = perf_rotate_context(cpuctx);
1140 
1141         raw_spin_lock(&cpuctx->hrtimer_lock);
1142         if (rotations)
1143                 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
1144         else
1145                 cpuctx->hrtimer_active = 0;
1146         raw_spin_unlock(&cpuctx->hrtimer_lock);
1147 
1148         return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
1149 }
1150 
1151 static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
1152 {
1153         struct hrtimer *timer = &cpuctx->hrtimer;
1154         struct pmu *pmu = cpuctx->ctx.pmu;
1155         u64 interval;
1156 
1157         /* no multiplexing needed for SW PMU */
1158         if (pmu->task_ctx_nr == perf_sw_context)
1159                 return;
1160 
1161         /*
1162          * check that the default is sane; if not set, force it to the
1163          * default interval (1/tick)
1164          */
1165         interval = pmu->hrtimer_interval_ms;
1166         if (interval < 1)
1167                 interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
1168 
1169         cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
1170 
1171         raw_spin_lock_init(&cpuctx->hrtimer_lock);
1172         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
1173         timer->function = perf_mux_hrtimer_handler;
1174 }
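/*
 * With the default PERF_CPU_HRTIMER of (1000 / HZ) milliseconds the
 * multiplexing interval works out to one scheduler tick, e.g. 4ms with
 * HZ=250 or 1ms with HZ=1000, unless the PMU overrides hrtimer_interval_ms
 * (for instance via its perf_event_mux_interval_ms sysfs attribute).
 */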
1175 
1176 static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
1177 {
1178         struct hrtimer *timer = &cpuctx->hrtimer;
1179         struct pmu *pmu = cpuctx->ctx.pmu;
1180         unsigned long flags;
1181 
1182         /* not for SW PMU */
1183         if (pmu->task_ctx_nr == perf_sw_context)
1184                 return 0;
1185 
1186         raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
1187         if (!cpuctx->hrtimer_active) {
1188                 cpuctx->hrtimer_active = 1;
1189                 hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1190                 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
1191         }
1192         raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
1193 
1194         return 0;
1195 }
1196 
1197 void perf_pmu_disable(struct pmu *pmu)
1198 {
1199         int *count = this_cpu_ptr(pmu->pmu_disable_count);
1200         if (!(*count)++)
1201                 pmu->pmu_disable(pmu);
1202 }
1203 
1204 void perf_pmu_enable(struct pmu *pmu)
1205 {
1206         int *count = this_cpu_ptr(pmu->pmu_disable_count);
1207         if (!--(*count))
1208                 pmu->pmu_enable(pmu);
1209 }
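/*
 * perf_pmu_disable()/perf_pmu_enable() nest via the per-cpu
 * pmu_disable_count, so only the outermost pair touches the hardware:
 *
 *   perf_pmu_disable(pmu);        // count 0 -> 1: calls pmu->pmu_disable()
 *   perf_pmu_disable(pmu);        // count 1 -> 2: no-op
 *   perf_pmu_enable(pmu);         // count 2 -> 1: no-op
 *   perf_pmu_enable(pmu);         // count 1 -> 0: calls pmu->pmu_enable()
 */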
1210 
1211 static DEFINE_PER_CPU(struct list_head, active_ctx_list);
1212 
1213 /*
1214  * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
1215  * perf_event_task_tick() are fully serialized because they're strictly cpu
1216  * affine and perf_event_ctx{activate,deactivate} are called with IRQs
1217  * disabled, while perf_event_task_tick is called from IRQ context.
1218  */
1219 static void perf_event_ctx_activate(struct perf_event_context *ctx)
1220 {
1221         struct list_head *head = this_cpu_ptr(&active_ctx_list);
1222 
1223         lockdep_assert_irqs_disabled();
1224 
1225         WARN_ON(!list_empty(&ctx->active_ctx_list));
1226 
1227         list_add(&ctx->active_ctx_list, head);
1228 }
1229 
1230 static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
1231 {
1232         lockdep_assert_irqs_disabled();
1233 
1234         WARN_ON(list_empty(&ctx->active_ctx_list));
1235 
1236         list_del_init(&ctx->active_ctx_list);
1237 }
1238 
1239 static void get_ctx(struct perf_event_context *ctx)
1240 {
1241         refcount_inc(&ctx->refcount);
1242 }
1243 
1244 static void *alloc_task_ctx_data(struct pmu *pmu)
1245 {
1246         if (pmu->task_ctx_cache)
1247                 return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
1248 
1249         return NULL;
1250 }
1251 
1252 static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
1253 {
1254         if (pmu->task_ctx_cache && task_ctx_data)
1255                 kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
1256 }
1257 
1258 static void free_ctx(struct rcu_head *head)
1259 {
1260         struct perf_event_context *ctx;
1261 
1262         ctx = container_of(head, struct perf_event_context, rcu_head);
1263         free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
1264         kfree(ctx);
1265 }
1266 
1267 static void put_ctx(struct perf_event_context *ctx)
1268 {
1269         if (refcount_dec_and_test(&ctx->refcount)) {
1270                 if (ctx->parent_ctx)
1271                         put_ctx(ctx->parent_ctx);
1272                 if (ctx->task && ctx->task != TASK_TOMBSTONE)
1273                         put_task_struct(ctx->task);
1274                 call_rcu(&ctx->rcu_head, free_ctx);
1275         }
1276 }
1277 
1278 /*
1279  * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
1280  * perf_pmu_migrate_context() we need some magic.
1281  *
1282  * Those places that change perf_event::ctx will hold both
1283  * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
1284  *
1285  * Lock ordering is by mutex address. There are two other sites where
1286  * perf_event_context::mutex nests and those are:
1287  *
1288  *  - perf_event_exit_task_context()    [ child , 0 ]
1289  *      perf_event_exit_event()
1290  *        put_event()                   [ parent, 1 ]
1291  *
1292  *  - perf_event_init_context()         [ parent, 0 ]
1293  *      inherit_task_group()
1294  *        inherit_group()
1295  *          inherit_event()
1296  *            perf_event_alloc()
1297  *              perf_init_event()
1298  *                perf_try_init_event() [ child , 1 ]
1299  *
1300  * While it appears there is an obvious deadlock here (the parent and child
1301  * nesting levels are inverted between the two), this is in fact safe because
1302  * life-time rules separate them. That is, an exiting task cannot fork, and a
1303  * spawning task cannot (yet) exit.
1304  *
1305  * But remember that these are parent<->child context relations, and
1306  * migration does not affect children, therefore these two orderings should not
1307  * interact.
1308  *
1309  * The change in perf_event::ctx does not affect children (as claimed above)
1310  * because the sys_perf_event_open() case will install a new event and break
1311  * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
1312  * concerned with cpuctx and that doesn't have children.
1313  *
1314  * The places that change perf_event::ctx will issue:
1315  *
1316  *   perf_remove_from_context();
1317  *   synchronize_rcu();
1318  *   perf_install_in_context();
1319  *
1320  * to effect the change. The remove_from_context() + synchronize_rcu() should
1321  * quiesce the event, after which we can install it in the new location. This
1322  * means that only external vectors (perf_fops, prctl) can perturb the event
1323  * while in transit. Therefore all such accessors should also acquire
1324  * perf_event_context::mutex to serialize against this.
1325  *
1326  * However, because event->ctx can change while we're waiting to acquire
1327  * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
1328  * function.
1329  *
1330  * Lock order:
1331  *    exec_update_lock
1332  *      task_struct::perf_event_mutex
1333  *        perf_event_context::mutex
1334  *          perf_event::child_mutex;
1335  *            perf_event_context::lock
1336  *          perf_event::mmap_mutex
1337  *          mmap_lock
1338  *            perf_addr_filters_head::lock
1339  *
1340  *    cpu_hotplug_lock
1341  *      pmus_lock
1342  *        cpuctx->mutex / perf_event_context::mutex
1343  */
1344 static struct perf_event_context *
1345 perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
1346 {
1347         struct perf_event_context *ctx;
1348 
1349 again:
1350         rcu_read_lock();
1351         ctx = READ_ONCE(event->ctx);
1352         if (!refcount_inc_not_zero(&ctx->refcount)) {
1353                 rcu_read_unlock();
1354                 goto again;
1355         }
1356         rcu_read_unlock();
1357 
1358         mutex_lock_nested(&ctx->mutex, nesting);
1359         if (event->ctx != ctx) {
1360                 mutex_unlock(&ctx->mutex);
1361                 put_ctx(ctx);
1362                 goto again;
1363         }
1364 
1365         return ctx;
1366 }
1367 
1368 static inline struct perf_event_context *
1369 perf_event_ctx_lock(struct perf_event *event)
1370 {
1371         return perf_event_ctx_lock_nested(event, 0);
1372 }
1373 
1374 static void perf_event_ctx_unlock(struct perf_event *event,
1375                                   struct perf_event_context *ctx)
1376 {
1377         mutex_unlock(&ctx->mutex);
1378         put_ctx(ctx);
1379 }
1380 
1381 /*
1382  * This must be done under the ctx->lock, so as to serialize against
1383  * context_equiv(), therefore we cannot call put_ctx() since that might end up
1384  * calling scheduler related locks and ctx->lock nests inside those.
1385  */
1386 static __must_check struct perf_event_context *
1387 unclone_ctx(struct perf_event_context *ctx)
1388 {
1389         struct perf_event_context *parent_ctx = ctx->parent_ctx;
1390 
1391         lockdep_assert_held(&ctx->lock);
1392 
1393         if (parent_ctx)
1394                 ctx->parent_ctx = NULL;
1395         ctx->generation++;
1396 
1397         return parent_ctx;
1398 }
1399 
1400 static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
1401                                 enum pid_type type)
1402 {
1403         u32 nr;
1404         /*
1405          * only top level events have the pid namespace they were created in
1406          */
1407         if (event->parent)
1408                 event = event->parent;
1409 
1410         nr = __task_pid_nr_ns(p, type, event->ns);
1411         /* avoid -1 if it is the idle thread or runs in another ns */
1412         if (!nr && !pid_alive(p))
1413                 nr = -1;
1414         return nr;
1415 }
1416 
1417 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1418 {
1419         return perf_event_pid_type(event, p, PIDTYPE_TGID);
1420 }
1421 
1422 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1423 {
1424         return perf_event_pid_type(event, p, PIDTYPE_PID);
1425 }
1426 
1427 /*
1428  * If we inherit events we want to return the parent event id
1429  * to userspace.
1430  */
1431 static u64 primary_event_id(struct perf_event *event)
1432 {
1433         u64 id = event->id;
1434 
1435         if (event->parent)
1436                 id = event->parent->id;
1437 
1438         return id;
1439 }
1440 
1441 /*
1442  * Get the perf_event_context for a task and lock it.
1443  *
1444  * This has to cope with the fact that until it is locked,
1445  * the context could get moved to another task.
1446  */
1447 static struct perf_event_context *
1448 perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1449 {
1450         struct perf_event_context *ctx;
1451 
1452 retry:
1453         /*
1454          * One of the few rules of preemptible RCU is that one cannot do
1455          * rcu_read_unlock() while holding a scheduler (or nested) lock when
1456          * part of the read side critical section was irqs-enabled -- see
1457          * rcu_read_unlock_special().
1458          *
1459          * Since ctx->lock nests under rq->lock we must ensure the entire read
1460          * side critical section has interrupts disabled.
1461          */
1462         local_irq_save(*flags);
1463         rcu_read_lock();
1464         ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1465         if (ctx) {
1466                 /*
1467                  * If this context is a clone of another, it might
1468                  * get swapped for another underneath us by
1469                  * perf_event_task_sched_out, though the
1470                  * rcu_read_lock() protects us from any context
1471                  * getting freed.  Lock the context and check if it
1472                  * got swapped before we could get the lock, and retry
1473                  * if so.  If we locked the right context, then it
1474                  * can't get swapped on us any more.
1475                  */
1476                 raw_spin_lock(&ctx->lock);
1477                 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1478                         raw_spin_unlock(&ctx->lock);
1479                         rcu_read_unlock();
1480                         local_irq_restore(*flags);
1481                         goto retry;
1482                 }
1483 
1484                 if (ctx->task == TASK_TOMBSTONE ||
1485                     !refcount_inc_not_zero(&ctx->refcount)) {
1486                         raw_spin_unlock(&ctx->lock);
1487                         ctx = NULL;
1488                 } else {
1489                         WARN_ON_ONCE(ctx->task != task);
1490                 }
1491         }
1492         rcu_read_unlock();
1493         if (!ctx)
1494                 local_irq_restore(*flags);
1495         return ctx;
1496 }
1497 
1498 /*
1499  * Get the context for a task and increment its pin_count so it
1500  * can't get swapped to another task.  This also increments its
1501  * reference count so that the context can't get freed.
1502  */
1503 static struct perf_event_context *
1504 perf_pin_task_context(struct task_struct *task, int ctxn)
1505 {
1506         struct perf_event_context *ctx;
1507         unsigned long flags;
1508 
1509         ctx = perf_lock_task_context(task, ctxn, &flags);
1510         if (ctx) {
1511                 ++ctx->pin_count;
1512                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1513         }
1514         return ctx;
1515 }
1516 
1517 static void perf_unpin_context(struct perf_event_context *ctx)
1518 {
1519         unsigned long flags;
1520 
1521         raw_spin_lock_irqsave(&ctx->lock, flags);
1522         --ctx->pin_count;
1523         raw_spin_unlock_irqrestore(&ctx->lock, flags);
1524 }
1525 
1526 /*
1527  * Update the record of the current time in a context.
1528  */
1529 static void update_context_time(struct perf_event_context *ctx)
1530 {
1531         u64 now = perf_clock();
1532 
1533         ctx->time += now - ctx->timestamp;
1534         ctx->timestamp = now;
1535 }
1536 
1537 static u64 perf_event_time(struct perf_event *event)
1538 {
1539         struct perf_event_context *ctx = event->ctx;
1540 
1541         if (is_cgroup_event(event))
1542                 return perf_cgroup_event_time(event);
1543 
1544         return ctx ? ctx->time : 0;
1545 }
1546 
1547 static enum event_type_t get_event_type(struct perf_event *event)
1548 {
1549         struct perf_event_context *ctx = event->ctx;
1550         enum event_type_t event_type;
1551 
1552         lockdep_assert_held(&ctx->lock);
1553 
1554         /*
1555          * It's 'group type', really, because if our group leader is
1556          * pinned, so are we.
1557          */
1558         if (event->group_leader != event)
1559                 event = event->group_leader;
1560 
1561         event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
1562         if (!ctx->task)
1563                 event_type |= EVENT_CPU;
1564 
1565         return event_type;
1566 }
1567 
1568 /*
1569  * Helper function to initialize event group nodes.
1570  */
1571 static void init_event_group(struct perf_event *event)
1572 {
1573         RB_CLEAR_NODE(&event->group_node);
1574         event->group_index = 0;
1575 }
1576 
1577 /*
1578  * Extract pinned or flexible groups from the context
1579  * based on event attrs bits.
1580  */
1581 static struct perf_event_groups *
1582 get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
1583 {
1584         if (event->attr.pinned)
1585                 return &ctx->pinned_groups;
1586         else
1587                 return &ctx->flexible_groups;
1588 }
1589 
1590 /*
1591  * Helper function to initialize perf_event_group trees.
1592  */
1593 static void perf_event_groups_init(struct perf_event_groups *groups)
1594 {
1595         groups->tree = RB_ROOT;
1596         groups->index = 0;
1597 }
1598 
1599 /*
1600  * Compare function for event groups;
1601  *
1602  * Implements complex key that first sorts by CPU and then by virtual index
1603  * which provides ordering when rotating groups for the same CPU.
1604  */
1605 static bool
1606 perf_event_groups_less(struct perf_event *left, struct perf_event *right)
1607 {
1608         if (left->cpu < right->cpu)
1609                 return true;
1610         if (left->cpu > right->cpu)
1611                 return false;
1612 
1613 #ifdef CONFIG_CGROUP_PERF
1614         if (left->cgrp != right->cgrp) {
1615                 if (!left->cgrp || !left->cgrp->css.cgroup) {
1616                         /*
1617                          * Left has no cgroup but right does; events
1618                          * without a cgroup sort first.
1619                          */
1620                         return true;
1621                 }
1622                 if (!right->cgrp || !right->cgrp->css.cgroup) {
1623                         /*
1624                          * Right has no cgroup but left does; events
1625                          * without a cgroup sort first.
1626                          */
1627                         return false;
1628                 }
1629                 /* Two dissimilar cgroups, order by id. */
1630                 if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id)
1631                         return true;
1632 
1633                 return false;
1634         }
1635 #endif
1636 
1637         if (left->group_index < right->group_index)
1638                 return true;
1639         if (left->group_index > right->group_index)
1640                 return false;
1641 
1642         return false;
1643 }
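
/*
 * Editor's note, a small worked example of the key above (ids are
 * illustrative): with CONFIG_CGROUP_PERF the effective key is
 * {cpu, cgroup id, group_index}, compared left to right, so
 *
 *	{cpu 0, cgrp A, idx 1} < {cpu 0, cgrp A, idx 5}
 *	                       < {cpu 0, cgrp B, idx 2}  (assuming A's id < B's)
 *	                       < {cpu 1, cgrp A, idx 3}
 *
 * Events without a cgroup sort before events that have one, and within an
 * equal {cpu, cgroup} prefix the ever-increasing group_index preserves
 * insertion order.  Without CONFIG_CGROUP_PERF the key is just
 * {cpu, group_index}.
 */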
1644 
1645 /*
1646  * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
1647  * key (see perf_event_groups_less). This places it last inside the CPU
1648  * subtree.
1649  */
1650 static void
1651 perf_event_groups_insert(struct perf_event_groups *groups,
1652                          struct perf_event *event)
1653 {
1654         struct perf_event *node_event;
1655         struct rb_node *parent;
1656         struct rb_node **node;
1657 
1658         event->group_index = ++groups->index;
1659 
1660         node = &groups->tree.rb_node;
1661         parent = *node;
1662 
1663         while (*node) {
1664                 parent = *node;
1665                 node_event = container_of(*node, struct perf_event, group_node);
1666 
1667                 if (perf_event_groups_less(event, node_event))
1668                         node = &parent->rb_left;
1669                 else
1670                         node = &parent->rb_right;
1671         }
1672 
1673         rb_link_node(&event->group_node, parent, node);
1674         rb_insert_color(&event->group_node, &groups->tree);
1675 }
1676 
1677 /*
1678  * Helper function to insert event into the pinned or flexible groups.
1679  */
1680 static void
1681 add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
1682 {
1683         struct perf_event_groups *groups;
1684 
1685         groups = get_event_groups(event, ctx);
1686         perf_event_groups_insert(groups, event);
1687 }
1688 
1689 /*
1690  * Delete a group from a tree.
1691  */
1692 static void
1693 perf_event_groups_delete(struct perf_event_groups *groups,
1694                          struct perf_event *event)
1695 {
1696         WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
1697                      RB_EMPTY_ROOT(&groups->tree));
1698 
1699         rb_erase(&event->group_node, &groups->tree);
1700         init_event_group(event);
1701 }
1702 
1703 /*
1704  * Helper function to delete event from its groups.
1705  */
1706 static void
1707 del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
1708 {
1709         struct perf_event_groups *groups;
1710 
1711         groups = get_event_groups(event, ctx);
1712         perf_event_groups_delete(groups, event);
1713 }
1714 
1715 /*
1716  * Get the leftmost event in the cpu/cgroup subtree.
1717  */
1718 static struct perf_event *
1719 perf_event_groups_first(struct perf_event_groups *groups, int cpu,
1720                         struct cgroup *cgrp)
1721 {
1722         struct perf_event *node_event = NULL, *match = NULL;
1723         struct rb_node *node = groups->tree.rb_node;
1724 #ifdef CONFIG_CGROUP_PERF
1725         u64 node_cgrp_id, cgrp_id = 0;
1726 
1727         if (cgrp)
1728                 cgrp_id = cgrp->kn->id;
1729 #endif
1730 
1731         while (node) {
1732                 node_event = container_of(node, struct perf_event, group_node);
1733 
1734                 if (cpu < node_event->cpu) {
1735                         node = node->rb_left;
1736                         continue;
1737                 }
1738                 if (cpu > node_event->cpu) {
1739                         node = node->rb_right;
1740                         continue;
1741                 }
1742 #ifdef CONFIG_CGROUP_PERF
1743                 node_cgrp_id = 0;
1744                 if (node_event->cgrp && node_event->cgrp->css.cgroup)
1745                         node_cgrp_id = node_event->cgrp->css.cgroup->kn->id;
1746 
1747                 if (cgrp_id < node_cgrp_id) {
1748                         node = node->rb_left;
1749                         continue;
1750                 }
1751                 if (cgrp_id > node_cgrp_id) {
1752                         node = node->rb_right;
1753                         continue;
1754                 }
1755 #endif
1756                 match = node_event;
1757                 node = node->rb_left;
1758         }
1759 
1760         return match;
1761 }
1762 
1763 /*
1764  * Like rb_entry_safe(rb_next()) but confined to the same {cpu, cgroup} subtree.
1765  */
1766 static struct perf_event *
1767 perf_event_groups_next(struct perf_event *event)
1768 {
1769         struct perf_event *next;
1770 #ifdef CONFIG_CGROUP_PERF
1771         u64 curr_cgrp_id = 0;
1772         u64 next_cgrp_id = 0;
1773 #endif
1774 
1775         next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
1776         if (next == NULL || next->cpu != event->cpu)
1777                 return NULL;
1778 
1779 #ifdef CONFIG_CGROUP_PERF
1780         if (event->cgrp && event->cgrp->css.cgroup)
1781                 curr_cgrp_id = event->cgrp->css.cgroup->kn->id;
1782 
1783         if (next->cgrp && next->cgrp->css.cgroup)
1784                 next_cgrp_id = next->cgrp->css.cgroup->kn->id;
1785 
1786         if (curr_cgrp_id != next_cgrp_id)
1787                 return NULL;
1788 #endif
1789         return next;
1790 }
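
/*
 * Editor's illustrative sketch (not part of the original file): walking one
 * {cpu, cgroup} subtree by combining the two helpers above.  The caller must
 * hold ctx->lock; everything except the helpers themselves is hypothetical:
 *
 *	struct perf_event *event;
 *
 *	for (event = perf_event_groups_first(groups, cpu, cgrp);
 *	     event;
 *	     event = perf_event_groups_next(event))
 *		(visit one group leader for this cpu/cgroup)
 */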
1791 
1792 /*
1793  * Iterate through the whole groups tree.
1794  */
1795 #define perf_event_groups_for_each(event, groups)                       \
1796         for (event = rb_entry_safe(rb_first(&((groups)->tree)),         \
1797                                 typeof(*event), group_node); event;     \
1798                 event = rb_entry_safe(rb_next(&event->group_node),      \
1799                                 typeof(*event), group_node))
1800 
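/*
 * Editor's illustrative sketch (not part of the original file): counting all
 * group leaders in a tree with the iterator above.  The caller must hold
 * ctx->lock; the variables are hypothetical:
 *
 *	struct perf_event *event;
 *	int nr = 0;
 *
 *	perf_event_groups_for_each(event, &ctx->pinned_groups)
 *		nr++;
 */
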
1801 /*
1802  * Add an event to the lists for its context.
1803  * Must be called with ctx->mutex and ctx->lock held.
1804  */
1805 static void
1806 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1807 {
1808         lockdep_assert_held(&ctx->lock);
1809 
1810         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1811         event->attach_state |= PERF_ATTACH_CONTEXT;
1812 
1813         event->tstamp = perf_event_time(event);
1814 
1815         /*
1816          * If we're a standalone event or group leader, we go to the context
1817          * list; group events are kept attached to the group so that
1818          * perf_group_detach() can, at all times, locate all siblings.
1819          */
1820         if (event->group_leader == event) {
1821                 event->group_caps = event->event_caps;
1822                 add_event_to_groups(event, ctx);
1823         }
1824 
1825         list_add_rcu(&event->event_entry, &ctx->event_list);
1826         ctx->nr_events++;
1827         if (event->attr.inherit_stat)
1828                 ctx->nr_stat++;
1829 
1830         if (event->state > PERF_EVENT_STATE_OFF)
1831                 perf_cgroup_event_enable(event, ctx);
1832 
1833         ctx->generation++;
1834 }
1835 
1836 /*
1837  * Initialize event state based on the perf_event_attr::disabled.
1838  */
1839 static inline void perf_event__state_init(struct perf_event *event)
1840 {
1841         event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1842                                               PERF_EVENT_STATE_INACTIVE;
1843 }
1844 
1845 static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1846 {
1847         int entry = sizeof(u64); /* value */
1848         int size = 0;
1849         int nr = 1;
1850 
1851         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1852                 size += sizeof(u64);
1853 
1854         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1855                 size += sizeof(u64);
1856 
1857         if (event->attr.read_format & PERF_FORMAT_ID)
1858                 entry += sizeof(u64);
1859 
1860         if (event->attr.read_format & PERF_FORMAT_GROUP) {
1861                 nr += nr_siblings;
1862                 size += sizeof(u64);
1863         }
1864 
1865         size += entry * nr;
1866         event->read_size = size;
1867 }
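
/*
 * Editor's note, a worked example of the sizing above (all fields are u64):
 * with read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID on a leader with two
 * siblings:
 *
 *	entry = 8 (value) + 8 (id)   = 16 bytes per event
 *	size  = 8 (nr)    + 3 * 16   = 56 bytes total
 *
 * matching the PERF_FORMAT_GROUP read layout described in
 * include/uapi/linux/perf_event.h.
 */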
1868 
1869 static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1870 {
1871         struct perf_sample_data *data;
1872         u16 size = 0;
1873 
1874         if (sample_type & PERF_SAMPLE_IP)
1875                 size += sizeof(data->ip);
1876 
1877         if (sample_type & PERF_SAMPLE_ADDR)
1878                 size += sizeof(data->addr);
1879 
1880         if (sample_type & PERF_SAMPLE_PERIOD)
1881                 size += sizeof(data->period);
1882 
1883         if (sample_type & PERF_SAMPLE_WEIGHT)
1884                 size += sizeof(data->weight);
1885 
1886         if (sample_type & PERF_SAMPLE_READ)
1887                 size += event->read_size;
1888 
1889         if (sample_type & PERF_SAMPLE_DATA_SRC)
1890                 size += sizeof(data->data_src.val);
1891 
1892         if (sample_type & PERF_SAMPLE_TRANSACTION)
1893                 size += sizeof(data->txn);
1894 
1895         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
1896                 size += sizeof(data->phys_addr);
1897 
1898         if (sample_type & PERF_SAMPLE_CGROUP)
1899                 size += sizeof(data->cgroup);
1900 
1901         if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
1902                 size += sizeof(data->data_page_size);
1903 
1904         if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
1905                 size += sizeof(data->code_page_size);
1906 
1907         event->header_size = size;
1908 }
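
/*
 * Editor's note, a worked example for the function above: with
 * sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR | PERF_SAMPLE_PERIOD,
 * header_size = 3 * sizeof(u64) = 24 bytes.  This covers only the fixed-size
 * sample fields; the id header and variable-length data (callchains, raw
 * data, ...) are accounted for separately.
 */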
1909 
1910 /*
1911  * Called at perf_event creation and when events are attached/detached from a
1912  * group.
1913  */
1914 static void perf_event__header_size(struct perf_event *event)
1915 {
1916         __perf_event_read_size(event,
1917                                event->group_leader->nr_siblings);
1918         __perf_event_header_size(event, event->attr.sample_type);
1919 }
1920 
1921 static void perf_event__id_header_size(struct perf_event *event)
1922 {
1923         struct perf_sample_data *data;
1924         u64 sample_type = event->attr.sample_type;
1925         u16 size = 0;
1926 
1927         if (sample_type & PERF_SAMPLE_TID)
1928                 size += sizeof(data->tid_entry);
1929 
1930         if (sample_type & PERF_SAMPLE_TIME)
1931                 size += sizeof(data->time);
1932 
1933         if (sample_type & PERF_SAMPLE_IDENTIFIER)
1934                 size += sizeof(data->id);
1935 
1936         if (sample_type & PERF_SAMPLE_ID)
1937                 size += sizeof(data->id);
1938 
1939         if (sample_type & PERF_SAMPLE_STREAM_ID)
1940                 size += sizeof(data->stream_id);
1941 
1942         if (sample_type & PERF_SAMPLE_CPU)
1943                 size += sizeof(data->cpu_entry);
1944 
1945         event->id_header_size = size;
1946 }
1947 
1948 static bool perf_event_validate_size(struct perf_event *event)
1949 {
1950         /*
1951          * The values computed here will be over-written when we actually
1952          * attach the event.
1953          */
1954         __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1955         __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1956         perf_event__id_header_size(event);
1957 
1958         /*
1959          * Sum the lot; should not exceed the 64k limit we have on records.
1960          * Conservative limit to allow for callchains and other variable fields.
1961          */
1962         if (event->read_size + event->header_size +
1963             event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1964                 return false;
1965 
1966         return true;
1967 }
1968 
1969 static void perf_group_attach(struct perf_event *event)
1970 {
1971         struct perf_event *group_leader = event->group_leader, *pos;
1972 
1973         lockdep_assert_held(&event->ctx->lock);
1974 
1975         /*
1976          * We can have double attach due to group movement in perf_event_open.
1977          */
1978         if (event->attach_state & PERF_ATTACH_GROUP)
1979                 return;
1980 
1981         event->attach_state |= PERF_ATTACH_GROUP;
1982 
1983         if (group_leader == event)
1984                 return;
1985 
1986         WARN_ON_ONCE(group_leader->ctx != event->ctx);
1987 
1988         group_leader->group_caps &= event->event_caps;
1989 
1990         list_add_tail(&event->sibling_list, &group_leader->sibling_list);
1991         group_leader->nr_siblings++;
1992 
1993         perf_event__header_size(group_leader);
1994 
1995         for_each_sibling_event(pos, group_leader)
1996                 perf_event__header_size(pos);
1997 }
1998 
1999 /*
2000  * Remove an event from the lists for its context.
2001  * Must be called with ctx->mutex and ctx->lock held.
2002  */
2003 static void
2004 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
2005 {
2006         WARN_ON_ONCE(event->ctx != ctx);
2007         lockdep_assert_held(&ctx->lock);
2008 
2009         /*
2010          * We can have double detach due to exit/hot-unplug + close.
2011          */
2012         if (!(event->attach_state & PERF_ATTACH_CONTEXT))
2013                 return;
2014 
2015         event->attach_state &= ~PERF_ATTACH_CONTEXT;
2016 
2017         ctx->nr_events--;
2018         if (event->attr.inherit_stat)
2019                 ctx->nr_stat--;
2020 
2021         list_del_rcu(&event->event_entry);
2022 
2023         if (event->group_leader == event)
2024                 del_event_from_groups(event, ctx);
2025 
2026         /*
2027          * If event was in error state, then keep it
2028          * that way, otherwise bogus counts will be
2029          * returned on read(). The only way to get out
2030          * of error state is by explicit re-enabling
2031          * of the event
2032          */
2033         if (event->state > PERF_EVENT_STATE_OFF) {
2034                 perf_cgroup_event_disable(event, ctx);
2035                 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2036         }
2037 
2038         ctx->generation++;
2039 }
2040 
2041 static int
2042 perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
2043 {
2044         if (!has_aux(aux_event))
2045                 return 0;
2046 
2047         if (!event->pmu->aux_output_match)
2048                 return 0;
2049 
2050         return event->pmu->aux_output_match(aux_event);
2051 }
2052 
2053 static void put_event(struct perf_event *event);
2054 static void event_sched_out(struct perf_event *event,
2055                             struct perf_cpu_context *cpuctx,
2056                             struct perf_event_context *ctx);
2057 
2058 static void perf_put_aux_event(struct perf_event *event)
2059 {
2060         struct perf_event_context *ctx = event->ctx;
2061         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2062         struct perf_event *iter;
2063 
2064         /*
2065          * If the event uses an aux_event, tear down the link.
2066          */
2067         if (event->aux_event) {
2068                 iter = event->aux_event;
2069                 event->aux_event = NULL;
2070                 put_event(iter);
2071                 return;
2072         }
2073 
2074         /*
2075          * If the event is an aux_event, tear down all links to
2076          * it from other events.
2077          */
2078         for_each_sibling_event(iter, event->group_leader) {
2079                 if (iter->aux_event != event)
2080                         continue;
2081 
2082                 iter->aux_event = NULL;
2083                 put_event(event);
2084 
2085                 /*
2086                  * If it's ACTIVE, schedule it out and put it into ERROR
2087                  * state so that we don't try to schedule it again. Note
2088                  * that perf_event_enable() will clear the ERROR status.
2089                  */
2090                 event_sched_out(iter, cpuctx, ctx);
2091                 perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2092         }
2093 }
2094 
2095 static bool perf_need_aux_event(struct perf_event *event)
2096 {
2097         return !!event->attr.aux_output || !!event->attr.aux_sample_size;
2098 }
2099 
2100 static int perf_get_aux_event(struct perf_event *event,
2101                               struct perf_event *group_leader)
2102 {
2103         /*
2104          * Our group leader must be an aux event if we want to be
2105          * an aux_output. This way, the aux event will precede its
2106          * aux_output events in the group, and therefore will always
2107          * schedule first.
2108          */
2109         if (!group_leader)
2110                 return 0;
2111 
2112         /*
2113          * aux_output and aux_sample_size are mutually exclusive.
2114          */
2115         if (event->attr.aux_output && event->attr.aux_sample_size)
2116                 return 0;
2117 
2118         if (event->attr.aux_output &&
2119             !perf_aux_output_match(event, group_leader))
2120                 return 0;
2121 
2122         if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
2123                 return 0;
2124 
2125         if (!atomic_long_inc_not_zero(&group_leader->refcount))
2126                 return 0;
2127 
2128         /*
2129          * Link aux_outputs to their aux event; this is undone in
2130          * perf_group_detach() by perf_put_aux_event(). When the
2131          * group is torn down, the aux_output events lose their
2132          * link to the aux_event and can't schedule any more.
2133          */
2134         event->aux_event = group_leader;
2135 
2136         return 1;
2137 }
2138 
2139 static inline struct list_head *get_event_list(struct perf_event *event)
2140 {
2141         struct perf_event_context *ctx = event->ctx;
2142         return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
2143 }
2144 
2145 /*
2146  * Events that have PERF_EV_CAP_SIBLING require being part of a group and
2147  * cannot exist on their own; schedule them out and move them into the ERROR
2148  * state. Also see _perf_event_enable(); it will not be able to recover
2149  * this ERROR state.
2150  */
2151 static inline void perf_remove_sibling_event(struct perf_event *event)
2152 {
2153         struct perf_event_context *ctx = event->ctx;
2154         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2155 
2156         event_sched_out(event, cpuctx, ctx);
2157         perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2158 }
2159 
2160 static void perf_group_detach(struct perf_event *event)
2161 {
2162         struct perf_event *leader = event->group_leader;
2163         struct perf_event *sibling, *tmp;
2164         struct perf_event_context *ctx = event->ctx;
2165 
2166         lockdep_assert_held(&ctx->lock);
2167 
2168         /*
2169          * We can have double detach due to exit/hot-unplug + close.
2170          */
2171         if (!(event->attach_state & PERF_ATTACH_GROUP))
2172                 return;
2173 
2174         event->attach_state &= ~PERF_ATTACH_GROUP;
2175 
2176         perf_put_aux_event(event);
2177 
2178         /*
2179          * If this is a sibling, remove it from its group.
2180          */
2181         if (leader != event) {
2182                 list_del_init(&event->sibling_list);
2183                 event->group_leader->nr_siblings--;
2184                 goto out;
2185         }
2186 
2187         /*
2188          * If this was a group event with sibling events then
2189          * upgrade the siblings to singleton events by adding them
2190          * to whatever list we are on.
2191          */
2192         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
2193 
2194                 if (sibling->event_caps & PERF_EV_CAP_SIBLING)
2195                         perf_remove_sibling_event(sibling);
2196 
2197                 sibling->group_leader = sibling;
2198                 list_del_init(&sibling->sibling_list);
2199 
2200                 /* Inherit group flags from the previous leader */
2201                 sibling->group_caps = event->group_caps;
2202 
2203                 if (!RB_EMPTY_NODE(&event->group_node)) {
2204                         add_event_to_groups(sibling, event->ctx);
2205 
2206                         if (sibling->state == PERF_EVENT_STATE_ACTIVE)
2207                                 list_add_tail(&sibling->active_list, get_event_list(sibling));
2208                 }
2209 
2210                 WARN_ON_ONCE(sibling->ctx != event->ctx);
2211         }
2212 
2213 out:
2214         for_each_sibling_event(tmp, leader)
2215                 perf_event__header_size(tmp);
2216 
2217         perf_event__header_size(leader);
2218 }
2219 
2220 static void sync_child_event(struct perf_event *child_event);
2221 
2222 static void perf_child_detach(struct perf_event *event)
2223 {
2224         struct perf_event *parent_event = event->parent;
2225 
2226         if (!(event->attach_state & PERF_ATTACH_CHILD))
2227                 return;
2228 
2229         event->attach_state &= ~PERF_ATTACH_CHILD;
2230 
2231         if (WARN_ON_ONCE(!parent_event))
2232                 return;
2233 
2234         lockdep_assert_held(&parent_event->child_mutex);
2235 
2236         sync_child_event(event);
2237         list_del_init(&event->child_list);
2238 }
2239 
2240 static bool is_orphaned_event(struct perf_event *event)
2241 {
2242         return event->state == PERF_EVENT_STATE_DEAD;
2243 }
2244 
2245 static inline int __pmu_filter_match(struct perf_event *event)
2246 {
2247         struct pmu *pmu = event->pmu;
2248         return pmu->filter_match ? pmu->filter_match(event) : 1;
2249 }
2250 
2251 /*
2252  * Check whether we should attempt to schedule an event group based on
2253  * PMU-specific filtering. An event group can consist of HW and SW events,
2254  * potentially with a SW leader, so we must check all the filters, to
2255  * determine whether a group is schedulable:
2256  */
2257 static inline int pmu_filter_match(struct perf_event *event)
2258 {
2259         struct perf_event *sibling;
2260 
2261         if (!__pmu_filter_match(event))
2262                 return 0;
2263 
2264         for_each_sibling_event(sibling, event) {
2265                 if (!__pmu_filter_match(sibling))
2266                         return 0;
2267         }
2268 
2269         return 1;
2270 }
2271 
2272 static inline int
2273 event_filter_match(struct perf_event *event)
2274 {
2275         return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
2276                perf_cgroup_match(event) && pmu_filter_match(event);
2277 }
2278 
2279 static void
2280 event_sched_out(struct perf_event *event,
2281                   struct perf_cpu_context *cpuctx,
2282                   struct perf_event_context *ctx)
2283 {
2284         enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
2285 
2286         WARN_ON_ONCE(event->ctx != ctx);
2287         lockdep_assert_held(&ctx->lock);
2288 
2289         if (event->state != PERF_EVENT_STATE_ACTIVE)
2290                 return;
2291 
2292         /*
2293          * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
2294          * we can schedule events _OUT_ individually through things like
2295          * __perf_remove_from_context().
2296          */
2297         list_del_init(&event->active_list);
2298 
2299         perf_pmu_disable(event->pmu);
2300 
2301         event->pmu->del(event, 0);
2302         event->oncpu = -1;
2303 
2304         if (READ_ONCE(event->pending_disable) >= 0) {
2305                 WRITE_ONCE(event->pending_disable, -1);
2306                 perf_cgroup_event_disable(event, ctx);
2307                 state = PERF_EVENT_STATE_OFF;
2308         }
2309         perf_event_set_state(event, state);
2310 
2311         if (!is_software_event(event))
2312                 cpuctx->active_oncpu--;
2313         if (!--ctx->nr_active)
2314                 perf_event_ctx_deactivate(ctx);
2315         if (event->attr.freq && event->attr.sample_freq)
2316                 ctx->nr_freq--;
2317         if (event->attr.exclusive || !cpuctx->active_oncpu)
2318                 cpuctx->exclusive = 0;
2319 
2320         perf_pmu_enable(event->pmu);
2321 }
2322 
2323 static void
2324 group_sched_out(struct perf_event *group_event,
2325                 struct perf_cpu_context *cpuctx,
2326                 struct perf_event_context *ctx)
2327 {
2328         struct perf_event *event;
2329 
2330         if (group_event->state != PERF_EVENT_STATE_ACTIVE)
2331                 return;
2332 
2333         perf_pmu_disable(ctx->pmu);
2334 
2335         event_sched_out(group_event, cpuctx, ctx);
2336 
2337         /*
2338          * Schedule out siblings (if any):
2339          */
2340         for_each_sibling_event(event, group_event)
2341                 event_sched_out(event, cpuctx, ctx);
2342 
2343         perf_pmu_enable(ctx->pmu);
2344 }
2345 
2346 #define DETACH_GROUP    0x01UL
2347 #define DETACH_CHILD    0x02UL
2348 
2349 /*
2350  * Cross CPU call to remove a performance event
2351  *
2352  * We disable the event on the hardware level first. After that we
2353  * remove it from the context list.
2354  */
2355 static void
2356 __perf_remove_from_context(struct perf_event *event,
2357                            struct perf_cpu_context *cpuctx,
2358                            struct perf_event_context *ctx,
2359                            void *info)
2360 {
2361         unsigned long flags = (unsigned long)info;
2362 
2363         if (ctx->is_active & EVENT_TIME) {
2364                 update_context_time(ctx);
2365                 update_cgrp_time_from_cpuctx(cpuctx);
2366         }
2367 
2368         event_sched_out(event, cpuctx, ctx);
2369         if (flags & DETACH_GROUP)
2370                 perf_group_detach(event);
2371         if (flags & DETACH_CHILD)
2372                 perf_child_detach(event);
2373         list_del_event(event, ctx);
2374 
2375         if (!ctx->nr_events && ctx->is_active) {
2376                 ctx->is_active = 0;
2377                 ctx->rotate_necessary = 0;
2378                 if (ctx->task) {
2379                         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2380                         cpuctx->task_ctx = NULL;
2381                 }
2382         }
2383 }
2384 
2385 /*
2386  * Remove the event from a task's (or a CPU's) list of events.
2387  *
2388  * If event->ctx is a cloned context, callers must make sure that
2389  * every task struct that event->ctx->task could possibly point to
2390  * remains valid.  This is OK when called from perf_release since
2391  * that only calls us on the top-level context, which can't be a clone.
2392  * When called from perf_event_exit_task, it's OK because the
2393  * context has been detached from its task.
2394  */
2395 static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
2396 {
2397         struct perf_event_context *ctx = event->ctx;
2398 
2399         lockdep_assert_held(&ctx->mutex);
2400 
2401         /*
2402          * Because of perf_event_exit_task(), perf_remove_from_context() ought
2403          * to work in the face of TASK_TOMBSTONE, unlike every other
2404          * event_function_call() user.
2405          */
2406         raw_spin_lock_irq(&ctx->lock);
2407         if (!ctx->is_active) {
2408                 __perf_remove_from_context(event, __get_cpu_context(ctx),
2409                                            ctx, (void *)flags);
2410                 raw_spin_unlock_irq(&ctx->lock);
2411                 return;
2412         }
2413         raw_spin_unlock_irq(&ctx->lock);
2414 
2415         event_function_call(event, __perf_remove_from_context, (void *)flags);
2416 }
2417 
2418 /*
2419  * Cross CPU call to disable a performance event
2420  */
2421 static void __perf_event_disable(struct perf_event *event,
2422                                  struct perf_cpu_context *cpuctx,
2423                                  struct perf_event_context *ctx,
2424                                  void *info)
2425 {
2426         if (event->state < PERF_EVENT_STATE_INACTIVE)
2427                 return;
2428 
2429         if (ctx->is_active & EVENT_TIME) {
2430                 update_context_time(ctx);
2431                 update_cgrp_time_from_event(event);
2432         }
2433 
2434         if (event == event->group_leader)
2435                 group_sched_out(event, cpuctx, ctx);
2436         else
2437                 event_sched_out(event, cpuctx, ctx);
2438 
2439         perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2440         perf_cgroup_event_disable(event, ctx);
2441 }
2442 
2443 /*
2444  * Disable an event.
2445  *
2446  * If event->ctx is a cloned context, callers must make sure that
2447  * every task struct that event->ctx->task could possibly point to
2448  * remains valid.  This condition is satisfied when called through
2449  * perf_event_for_each_child or perf_event_for_each because they
2450  * hold the top-level event's child_mutex, so any descendant that
2451  * goes to exit will block in perf_event_exit_event().
2452  *
2453  * When called from perf_pending_event it's OK because event->ctx
2454  * is the current context on this CPU and preemption is disabled,
2455  * hence we can't get into perf_event_task_sched_out for this context.
2456  */
2457 static void _perf_event_disable(struct perf_event *event)
2458 {
2459         struct perf_event_context *ctx = event->ctx;
2460 
2461         raw_spin_lock_irq(&ctx->lock);
2462         if (event->state <= PERF_EVENT_STATE_OFF) {
2463                 raw_spin_unlock_irq(&ctx->lock);
2464                 return;
2465         }
2466         raw_spin_unlock_irq(&ctx->lock);
2467 
2468         event_function_call(event, __perf_event_disable, NULL);
2469 }
2470 
2471 void perf_event_disable_local(struct perf_event *event)
2472 {
2473         event_function_local(event, __perf_event_disable, NULL);
2474 }
2475 
2476 /*
2477  * Strictly speaking kernel users cannot create groups and therefore this
2478  * interface does not need the perf_event_ctx_lock() magic.
2479  */
2480 void perf_event_disable(struct perf_event *event)
2481 {
2482         struct perf_event_context *ctx;
2483 
2484         ctx = perf_event_ctx_lock(event);
2485         _perf_event_disable(event);
2486         perf_event_ctx_unlock(event, ctx);
2487 }
2488 EXPORT_SYMBOL_GPL(perf_event_disable);
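
/*
 * Editor's illustrative sketch (not part of the original file): how an
 * in-kernel user typically drives this interface.  A counter created with
 * perf_event_create_kernel_counter() (as the hard-lockup detector does, for
 * example) can be paused and resumed without being torn down:
 *
 *	perf_event_disable(event);
 *	(reconfigure, survive hotplug, etc.)
 *	perf_event_enable(event);
 */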
2489 
2490 void perf_event_disable_inatomic(struct perf_event *event)
2491 {
2492         WRITE_ONCE(event->pending_disable, smp_processor_id());
2493         /* can fail, see perf_pending_event_disable() */
2494         irq_work_queue(&event->pending);
2495 }
2496 
2497 static void perf_set_shadow_time(struct perf_event *event,
2498                                  struct perf_event_context *ctx)
2499 {
2500         /*
2501          * use the correct time source for the time snapshot
2502          *
2503          * We could get by without this by leveraging the
2504          * fact that to get to this function, the caller
2505          * has most likely already called update_context_time()
2506          * and update_cgrp_time_xx() and thus both timestamps
2507          * are identical (or very close). Given that tstamp is
2508          * already adjusted for cgroup, we could say that:
2509          *    tstamp - ctx->timestamp
2510          * is equivalent to
2511          *    tstamp - cgrp->timestamp.
2512          *
2513          * Then, in perf_output_read(), the calculation would
2514          * work with no changes because:
2515          * - event is guaranteed scheduled in
2516          * - no scheduled out in between
2517          * - thus the timestamp would be the same
2518          *
2519          * But this is a bit hairy.
2520          *
2521          * So instead, we have an explicit cgroup call to remain
2522          * within the same time source all along. We believe it
2523          * is cleaner and simpler to understand.
2524          */
2525         if (is_cgroup_event(event))
2526                 perf_cgroup_set_shadow_time(event, event->tstamp);
2527         else
2528                 event->shadow_ctx_time = event->tstamp - ctx->timestamp;
2529 }
2530 
2531 #define MAX_INTERRUPTS (~0ULL)
2532 
2533 static void perf_log_throttle(struct perf_event *event, int enable);
2534 static void perf_log_itrace_start(struct perf_event *event);
2535 
2536 static int
2537 event_sched_in(struct perf_event *event,
2538                  struct perf_cpu_context *cpuctx,
2539                  struct perf_event_context *ctx)
2540 {
2541         int ret = 0;
2542 
2543         WARN_ON_ONCE(event->ctx != ctx);
2544 
2545         lockdep_assert_held(&ctx->lock);
2546 
2547         if (event->state <= PERF_EVENT_STATE_OFF)
2548                 return 0;
2549 
2550         WRITE_ONCE(event->oncpu, smp_processor_id());
2551         /*
2552          * Order event::oncpu write to happen before the ACTIVE state is
2553          * visible. This allows perf_event_{stop,read}() to observe the correct
2554          * ->oncpu if it sees ACTIVE.
2555          */
2556         smp_wmb();
2557         perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
2558 
2559         /*
2560          * Unthrottle events: since we just scheduled in we might have missed
2561          * several ticks already, and for a heavily scheduling task there is
2562          * little guarantee it'll get a tick in a timely manner.
2563          */
2564         if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
2565                 perf_log_throttle(event, 1);
2566                 event->hw.interrupts = 0;
2567         }
2568 
2569         perf_pmu_disable(event->pmu);
2570 
2571         perf_set_shadow_time(event, ctx);
2572 
2573         perf_log_itrace_start(event);
2574 
2575         if (event->pmu->add(event, PERF_EF_START)) {
2576                 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2577                 event->oncpu = -1;
2578                 ret = -EAGAIN;
2579                 goto out;
2580         }
2581 
2582         if (!is_software_event(event))
2583                 cpuctx->active_oncpu++;
2584         if (!ctx->nr_active++)
2585                 perf_event_ctx_activate(ctx);
2586         if (event->attr.freq && event->attr.sample_freq)
2587                 ctx->nr_freq++;
2588 
2589         if (event->attr.exclusive)
2590                 cpuctx->exclusive = 1;
2591 
2592 out:
2593         perf_pmu_enable(event->pmu);
2594 
2595         return ret;
2596 }
2597 
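/*
 * Schedule in a group and all of its siblings as one unit, using the PMU
 * transaction interface: start_txn(PERF_PMU_TXN_ADD) opens the transaction,
 * commit_txn() succeeds only if the whole group fits on the hardware, and on
 * failure the error paths below schedule out the partial group and
 * cancel_txn() rolls the transaction back.
 */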
2598 static int
2599 group_sched_in(struct perf_event *group_event,
2600                struct perf_cpu_context *cpuctx,
2601                struct perf_event_context *ctx)
2602 {
2603         struct perf_event *event, *partial_group = NULL;
2604         struct pmu *pmu = ctx->pmu;
2605 
2606         if (group_event->state == PERF_EVENT_STATE_OFF)
2607                 return 0;
2608 
2609         pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2610 
2611         if (event_sched_in(group_event, cpuctx, ctx))
2612                 goto error;
2613 
2614         /*
2615          * Schedule in siblings as one group (if any):
2616          */
2617         for_each_sibling_event(event, group_event) {
2618                 if (event_sched_in(event, cpuctx, ctx)) {
2619                         partial_group = event;
2620                         goto group_error;
2621                 }
2622         }
2623 
2624         if (!pmu->commit_txn(pmu))
2625                 return 0;
2626 
2627 group_error:
2628         /*
2629          * Groups can be scheduled in as one unit only, so undo any
2630          * partial group before returning:
2631          * The events up to the failed event are scheduled out normally.
2632          */
2633         for_each_sibling_event(event, group_event) {
2634                 if (event == partial_group)
2635                         break;
2636 
2637                 event_sched_out(event, cpuctx, ctx);
2638         }
2639         event_sched_out(group_event, cpuctx, ctx);
2640 
2641 error:
2642         pmu->cancel_txn(pmu);
2643         return -EAGAIN;
2644 }
2645 
2646 /*
2647  * Work out whether we can put this event group on the CPU now.
2648  */
2649 static int group_can_go_on(struct perf_event *event,
2650                            struct perf_cpu_context *cpuctx,
2651                            int can_add_hw)
2652 {
2653         /*
2654          * Groups consisting entirely of software events can always go on.
2655          */
2656         if (event->group_caps & PERF_EV_CAP_SOFTWARE)
2657                 return 1;
2658         /*
2659          * If an exclusive group is already on, no other hardware
2660          * events can go on.
2661          */
2662         if (cpuctx->exclusive)
2663                 return 0;
2664         /*
2665          * If this group is exclusive and there are already
2666          * events on the CPU, it can't go on.
2667          */
2668         if (event->attr.exclusive && !list_empty(get_event_list(event)))
2669                 return 0;
2670         /*
2671          * Otherwise, try to add it if all previous groups were able
2672          * to go on.
2673          */
2674         return can_add_hw;
2675 }
2676 
2677 static void add_event_to_ctx(struct perf_event *event,
2678                                struct perf_event_context *ctx)
2679 {
2680         list_add_event(event, ctx);
2681         perf_group_attach(event);
2682 }
2683 
2684 static void ctx_sched_out(struct perf_event_context *ctx,
2685                           struct perf_cpu_context *cpuctx,
2686                           enum event_type_t event_type);
2687 static void
2688 ctx_sched_in(struct perf_event_context *ctx,
2689              struct perf_cpu_context *cpuctx,
2690              enum event_type_t event_type,
2691              struct task_struct *task);
2692 
2693 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2694                                struct perf_event_context *ctx,
2695                                enum event_type_t event_type)
2696 {
2697         if (!cpuctx->task_ctx)
2698                 return;
2699 
2700         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2701                 return;
2702 
2703         ctx_sched_out(ctx, cpuctx, event_type);
2704 }
2705 
2706 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2707                                 struct perf_event_context *ctx,
2708                                 struct task_struct *task)
2709 {
2710         cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2711         if (ctx)
2712                 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2713         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2714         if (ctx)
2715                 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2716 }
2717 
2718 /*
2719  * We want to maintain the following priority of scheduling:
2720  *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
2721  *  - task pinned (EVENT_PINNED)
2722  *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
2723  *  - task flexible (EVENT_FLEXIBLE).
2724  *
2725  * In order to avoid unscheduling and scheduling back in everything every
2726  * time an event is added, only do it for the groups of equal priority and
2727  * below.
2728  *
2729  * This can be called after a batch operation on task events, in which case
2730  * event_type is a bit mask of the types of events involved. For CPU events,
2731  * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
2732  */
2733 static void ctx_resched(struct perf_cpu_context *cpuctx,
2734                         struct perf_event_context *task_ctx,
2735                         enum event_type_t event_type)
2736 {
2737         enum event_type_t ctx_event_type;
2738         bool cpu_event = !!(event_type & EVENT_CPU);
2739 
2740         /*
2741          * If pinned groups are involved, flexible groups also need to be
2742          * scheduled out.
2743          */
2744         if (event_type & EVENT_PINNED)
2745                 event_type |= EVENT_FLEXIBLE;
2746 
2747         ctx_event_type = event_type & EVENT_ALL;
2748 
2749         perf_pmu_disable(cpuctx->ctx.pmu);
2750         if (task_ctx)
2751                 task_ctx_sched_out(cpuctx, task_ctx, event_type);
2752 
2753         /*
2754          * Decide which cpu ctx groups to schedule out based on the types
2755          * of events that caused rescheduling:
2756          *  - EVENT_CPU: schedule out corresponding groups;
2757          *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
2758          *  - otherwise, do nothing more.
2759          */
2760         if (cpu_event)
2761                 cpu_ctx_sched_out(cpuctx, ctx_event_type);
2762         else if (ctx_event_type & EVENT_PINNED)
2763                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2764 
2765         perf_event_sched_in(cpuctx, task_ctx, current);
2766         perf_pmu_enable(cpuctx->ctx.pmu);
2767 }
2768 
2769 void perf_pmu_resched(struct pmu *pmu)
2770 {
2771         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2772         struct perf_event_context *task_ctx = cpuctx->task_ctx;
2773 
2774         perf_ctx_lock(cpuctx, task_ctx);
2775         ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
2776         perf_ctx_unlock(cpuctx, task_ctx);
2777 }
2778 
2779 /*
2780  * Cross CPU call to install and enable a performance event
2781  *
2782  * Very similar to remote_function() + event_function() but cannot assume that
2783  * things like ctx->is_active and cpuctx->task_ctx are set.
2784  */
2785 static int  __perf_install_in_context(void *info)
2786 {
2787         struct perf_event *event = info;
2788         struct perf_event_context *ctx = event->ctx;
2789         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2790         struct perf_event_context *task_ctx = cpuctx->task_ctx;
2791         bool reprogram = true;
2792         int ret = 0;
2793 
2794         raw_spin_lock(&cpuctx->ctx.lock);
2795         if (ctx->task) {
2796                 raw_spin_lock(&ctx->lock);
2797                 task_ctx = ctx;
2798 
2799                 reprogram = (ctx->task == current);
2800 
2801                 /*
2802                  * If the task is running, it must be running on this CPU,
2803                  * otherwise we cannot reprogram things.
2804                  *
2805          * If it's not running, we don't care; ctx->lock will
2806                  * serialize against it becoming runnable.
2807                  */
2808                 if (task_curr(ctx->task) && !reprogram) {
2809                         ret = -ESRCH;
2810                         goto unlock;
2811                 }
2812 
2813                 WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2814         } else if (task_ctx) {
2815                 raw_spin_lock(&task_ctx->lock);
2816         }
2817 
2818 #ifdef CONFIG_CGROUP_PERF
2819         if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
2820                 /*
2821                  * If the current cgroup doesn't match the event's
2822                  * cgroup, we should not try to schedule it.
2823                  */
2824                 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
2825                 reprogram = cgroup_is_descendant(cgrp->css.cgroup,
2826                                         event->cgrp->css.cgroup);
2827         }
2828 #endif
2829 
2830         if (reprogram) {
2831                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2832                 add_event_to_ctx(event, ctx);
2833                 ctx_resched(cpuctx, task_ctx, get_event_type(event));
2834         } else {
2835                 add_event_to_ctx(event, ctx);
2836         }
2837 
2838 unlock:
2839         perf_ctx_unlock(cpuctx, task_ctx);
2840 
2841         return ret;
2842 }
2843 
2844 static bool exclusive_event_installable(struct perf_event *event,
2845                                         struct perf_event_context *ctx);
2846 
2847 /*
2848  * Attach a performance event to a context.
2849  *
2850  * Very similar to event_function_call, see comment there.
2851  */
2852 static void
2853 perf_install_in_context(struct perf_event_context *ctx,
2854                         struct perf_event *event,
2855                         int cpu)
2856 {
2857         struct task_struct *task = READ_ONCE(ctx->task);
2858 
2859         lockdep_assert_held(&ctx->mutex);
2860 
2861         WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
2862 
2863         if (event->cpu != -1)
2864                 event->cpu = cpu;
2865 
2866         /*
2867          * Ensures that if we can observe event->ctx, both the event and ctx
2868          * will be 'complete'. See perf_iterate_sb_cpu().
2869          */
2870         smp_store_release(&event->ctx, ctx);
2871 
2872         /*
2873          * perf_event_attr::disabled events will not run and can be initialized
2874          * without IPI. Except when this is the first event for the context, in
2875          * that case we need the magic of the IPI to set ctx->is_active.
2876          *
2877          * The IOC_ENABLE that is sure to follow the creation of a disabled
2878          * event will issue the IPI and reprogram the hardware.
2879          */
2880         if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
2881                 raw_spin_lock_irq(&ctx->lock);
2882                 if (ctx->task == TASK_TOMBSTONE) {
2883                         raw_spin_unlock_irq(&ctx->lock);
2884                         return;
2885                 }
2886                 add_event_to_ctx(event, ctx);
2887                 raw_spin_unlock_irq(&ctx->lock);
2888                 return;
2889         }
2890 
2891         if (!task) {
2892                 cpu_function_call(cpu, __perf_install_in_context, event);
2893                 return;
2894         }
2895 
2896         /*
2897          * Should not happen, we validate the ctx is still alive before calling.
2898          */
2899         if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2900                 return;
2901 
2902         /*
2903          * Installing events is tricky because we cannot rely on ctx->is_active
2904          * to be set in case this is the nr_events 0 -> 1 transition.
2905          *
2906          * Instead we use task_curr(), which tells us if the task is running.
2907          * However, since we use task_curr() outside of rq::lock, we can race
2908          * against the actual state. This means the result can be wrong.
2909          *
2910          * If we get a false positive, we retry, this is harmless.
2911          *
2912          * If we get a false negative, things are complicated. If we are after
2913          * perf_event_context_sched_in() ctx::lock will serialize us, and the
2914          * value must be correct. If we're before, it doesn't matter since
2915          * perf_event_context_sched_in() will program the counter.
2916          *
2917          * However, this hinges on the remote context switch having observed
2918          * our task->perf_event_ctxp[] store, such that it will in fact take
2919          * ctx::lock in perf_event_context_sched_in().
2920          *
2921          * We do this by task_function_call(); if the IPI fails to hit the task
2922          * we know any future context switch of the task must see the
2923          * perf_event_ctxp[] store.
2924          */
2925 
2926         /*
2927          * This smp_mb() orders the task->perf_event_ctxp[] store with the
2928          * task_cpu() load, such that if the IPI then does not find the task
2929          * running, a future context switch of that task must observe the
2930          * store.
2931          */
2932         smp_mb();
2933 again:
2934         if (!task_function_call(task, __perf_install_in_context, event))
2935                 return;
2936 
2937         raw_spin_lock_irq(&ctx->lock);
2938         task = ctx->task;
2939         if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
2940                 /*
2941                  * Cannot happen because we already checked above (which also
2942                  * cannot happen), and we hold ctx->mutex, which serializes us
2943                  * against perf_event_exit_task_context().
2944                  */
2945                 raw_spin_unlock_irq(&ctx->lock);
2946                 return;
2947         }
2948         /*
2949          * If the task is not running, ctx->lock will avoid it becoming so,
2950          * thus we can safely install the event.
2951          */
2952         if (task_curr(task)) {
2953                 raw_spin_unlock_irq(&ctx->lock);
2954                 goto again;
2955         }
2956         add_event_to_ctx(event, ctx);
2957         raw_spin_unlock_irq(&ctx->lock);
2958 }
2959 
2960 /*
2961  * Cross CPU call to enable a performance event
2962  */
2963 static void __perf_event_enable(struct perf_event *event,
2964                                 struct perf_cpu_context *cpuctx,
2965                                 struct perf_event_context *ctx,
2966                                 void *info)
2967 {
2968         struct perf_event *leader = event->group_leader;
2969         struct perf_event_context *task_ctx;
2970 
2971         if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2972             event->state <= PERF_EVENT_STATE_ERROR)
2973                 return;
2974 
2975         if (ctx->is_active)
2976                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2977 
2978         perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2979         perf_cgroup_event_enable(event, ctx);
2980 
2981         if (!ctx->is_active)
2982                 return;
2983 
2984         if (!event_filter_match(event)) {
2985                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2986                 return;
2987         }
2988 
2989         /*
2990          * If the event is in a group and isn't the group leader,
2991          * then don't put it on unless the group is on.
2992          */
2993         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
2994                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2995                 return;
2996         }
2997 
2998         task_ctx = cpuctx->task_ctx;
2999         if (ctx->task)
3000                 WARN_ON_ONCE(task_ctx != ctx);
3001 
3002         ctx_resched(cpuctx, task_ctx, get_event_type(event));
3003 }
3004 
3005 /*
3006  * Enable an event.
3007  *
3008  * If event->ctx is a cloned context, callers must make sure that
3009  * every task struct that event->ctx->task could possibly point to
3010  * remains valid.  This condition is satisfied when called through
3011  * perf_event_for_each_child or perf_event_for_each as described
3012  * for perf_event_disable.
3013  */
3014 static void _perf_event_enable(struct perf_event *event)
3015 {
3016         struct perf_event_context *ctx = event->ctx;
3017 
3018         raw_spin_lock_irq(&ctx->lock);
3019         if (event->state >= PERF_EVENT_STATE_INACTIVE ||
3020             event->state <  PERF_EVENT_STATE_ERROR) {
3021 out:
3022                 raw_spin_unlock_irq(&ctx->lock);
3023                 return;
3024         }
3025 
3026         /*
3027          * If the event is in error state, clear that first.
3028          *
3029          * That way, if we see the event in error state below, we know that it
3030          * has gone back into error state, as distinct from the task having
3031          * been scheduled away before the cross-call arrived.
3032          */
3033         if (event->state == PERF_EVENT_STATE_ERROR) {
3034                 /*
3035                  * Detached SIBLING events cannot leave ERROR state.
3036                  */
3037                 if (event->event_caps & PERF_EV_CAP_SIBLING &&
3038                     event->group_leader == event)
3039                         goto out;
3040 
3041                 event->state = PERF_EVENT_STATE_OFF;
3042         }
3043         raw_spin_unlock_irq(&ctx->lock);
3044 
3045         event_function_call(event, __perf_event_enable, NULL);
3046 }
3047 
3048 /*
3049  * See perf_event_disable();
3050  */
3051 void perf_event_enable(struct perf_event *event)
3052 {
3053         struct perf_event_context *ctx;
3054 
3055         ctx = perf_event_ctx_lock(event);
3056         _perf_event_enable(event);
3057         perf_event_ctx_unlock(event, ctx);
3058 }
3059 EXPORT_SYMBOL_GPL(perf_event_enable);
3060 
3061 struct stop_event_data {
3062         struct perf_event       *event;
3063         unsigned int            restart;
3064 };
3065 
3066 static int __perf_event_stop(void *info)
3067 {
3068         struct stop_event_data *sd = info;
3069         struct perf_event *event = sd->event;
3070 
3071         /* if it's already INACTIVE, do nothing */
3072         if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3073                 return 0;
3074 
3075         /* matches smp_wmb() in event_sched_in() */
3076         smp_rmb();
3077 
3078         /*
3079          * There is a window with interrupts enabled before we get here,
3080          * so we need to check again lest we try to stop another CPU's event.
3081          */
3082         if (READ_ONCE(event->oncpu) != smp_processor_id())
3083                 return -EAGAIN;
3084 
3085         event->pmu->stop(event, PERF_EF_UPDATE);
3086 
3087         /*
3088          * May race with the actual stop (through perf_pmu_output_stop()),
3089          * but it is only used for events with AUX ring buffer, and such
3090          * events will refuse to restart because of rb::aux_mmap_count==0,
3091          * see comments in perf_aux_output_begin().
3092          *
3093          * Since this is happening on an event-local CPU, no trace is lost
3094          * while restarting.
3095          */
3096         if (sd->restart)
3097                 event->pmu->start(event, 0);
3098 
3099         return 0;
3100 }
3101 
3102 static int perf_event_stop(struct perf_event *event, int restart)
3103 {
3104         struct stop_event_data sd = {
3105                 .event          = event,
3106                 .restart        = restart,
3107         };
3108         int ret = 0;
3109 
3110         do {
3111                 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3112                         return 0;
3113 
3114                 /* matches smp_wmb() in event_sched_in() */
3115                 smp_rmb();
3116 
3117                 /*
3118                  * We only want to restart ACTIVE events, so if the event goes
3119                  * inactive here (event->oncpu==-1), there's nothing more to do;
3120                  * fall through with ret==-ENXIO.
3121                  */
3122                 ret = cpu_function_call(READ_ONCE(event->oncpu),
3123                                         __perf_event_stop, &sd);
3124         } while (ret == -EAGAIN);
3125 
3126         return ret;
3127 }
3128 
3129 /*
3130  * In order to contain the amount of racy and tricky code in the address
3131  * filter configuration management, it is a two-part process:
3132  *
3133  * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
3134  *      we update the addresses of corresponding vmas in
3135  *      event::addr_filter_ranges array and bump the event::addr_filters_gen;
3136  * (p2) when an event is scheduled in (pmu::add), it calls
3137  *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
3138  *      if the generation has changed since the previous call.
3139  *
3140  * If (p1) happens while the event is active, we restart it to force (p2).
3141  *
3142  * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
3143  *     pre-existing mappings, called once when new filters arrive via SET_FILTER
3144  *     ioctl;
3145  * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
3146  *     registered mapping, called for every new mmap(), with mm::mmap_lock down
3147  *     for reading;
3148  * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
3149  *     of exec.
3150  */
3151 void perf_event_addr_filters_sync(struct perf_event *event)
3152 {
3153         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
3154 
3155         if (!has_addr_filter(event))
3156                 return;
3157 
3158         raw_spin_lock(&ifh->lock);
3159         if (event->addr_filters_gen != event->hw.addr_filters_gen) {
3160                 event->pmu->addr_filters_sync(event);
3161                 event->hw.addr_filters_gen = event->addr_filters_gen;
3162         }
3163         raw_spin_unlock(&ifh->lock);
3164 }
3165 EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
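/*
 * Illustrative sketch (not part of this file): per step (p2) above, a PMU
 * driver with address-filtering support is expected to call
 * perf_event_addr_filters_sync() from its scheduling-in path, i.e. somewhere
 * inside its pmu::add() implementation. With a hypothetical driver callback
 * foo_pmu_add():
 *
 *        static int foo_pmu_add(struct perf_event *event, int flags)
 *        {
 *                ...
 *                perf_event_addr_filters_sync(event);
 *                ...
 *        }
 *
 * The generation comparison above makes the call cheap when no filter
 * addresses have changed since the last sync.
 */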
3166 
3167 static int _perf_event_refresh(struct perf_event *event, int refresh)
3168 {
3169         /*
3170          * not supported on inherited events
3171          */
3172         if (event->attr.inherit || !is_sampling_event(event))
3173                 return -EINVAL;
3174 
3175         atomic_add(refresh, &event->event_limit);
3176         _perf_event_enable(event);
3177 
3178         return 0;
3179 }
3180 
3181 /*
3182  * See perf_event_disable()
3183  */
3184 int perf_event_refresh(struct perf_event *event, int refresh)
3185 {
3186         struct perf_event_context *ctx;
3187         int ret;
3188 
3189         ctx = perf_event_ctx_lock(event);
3190         ret = _perf_event_refresh(event, refresh);
3191         perf_event_ctx_unlock(event, ctx);
3192 
3193         return ret;
3194 }
3195 EXPORT_SYMBOL_GPL(perf_event_refresh);
3196 
3197 static int perf_event_modify_breakpoint(struct perf_event *bp,
3198                                          struct perf_event_attr *attr)
3199 {
3200         int err;
3201 
3202         _perf_event_disable(bp);
3203 
3204         err = modify_user_hw_breakpoint_check(bp, attr, true);
3205 
3206         if (!bp->attr.disabled)
3207                 _perf_event_enable(bp);
3208 
3209         return err;
3210 }
3211 
3212 static int perf_event_modify_attr(struct perf_event *event,
3213                                   struct perf_event_attr *attr)
3214 {
3215         if (event->attr.type != attr->type)
3216                 return -EINVAL;
3217 
3218         switch (event->attr.type) {
3219         case PERF_TYPE_BREAKPOINT:
3220                 return perf_event_modify_breakpoint(event, attr);
3221         default:
3222                 /* Placeholder for future additions. */
3223                 return -EOPNOTSUPP;
3224         }
3225 }
3226 
3227 static void ctx_sched_out(struct perf_event_context *ctx,
3228                           struct perf_cpu_context *cpuctx,
3229                           enum event_type_t event_type)
3230 {
3231         struct perf_event *event, *tmp;
3232         int is_active = ctx->is_active;
3233 
3234         lockdep_assert_held(&ctx->lock);
3235 
3236         if (likely(!ctx->nr_events)) {
3237                 /*
3238                  * See __perf_remove_from_context().
3239                  */
3240                 WARN_ON_ONCE(ctx->is_active);
3241                 if (ctx->task)
3242                         WARN_ON_ONCE(cpuctx->task_ctx);
3243                 return;
3244         }
3245 
3246         ctx->is_active &= ~event_type;
3247         if (!(ctx->is_active & EVENT_ALL))
3248                 ctx->is_active = 0;
3249 
3250         if (ctx->task) {
3251                 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3252                 if (!ctx->is_active)
3253                         cpuctx->task_ctx = NULL;
3254         }
3255 
3256         /*
3257          * Always update time if it was set; not only when it changes.
3258          * Otherwise we can 'forget' to update time for any but the last
3259          * context we sched out. For example:
3260          *
3261          *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
3262          *   ctx_sched_out(.event_type = EVENT_PINNED)
3263          *
3264          * would only update time for the pinned events.
3265          */
3266         if (is_active & EVENT_TIME) {
3267                 /* update (and stop) ctx time */
3268                 update_context_time(ctx);
3269                 update_cgrp_time_from_cpuctx(cpuctx);
3270         }
3271 
3272         is_active ^= ctx->is_active; /* changed bits */
3273 
3274         if (!ctx->nr_active || !(is_active & EVENT_ALL))
3275                 return;
3276 
3277         perf_pmu_disable(ctx->pmu);
3278         if (is_active & EVENT_PINNED) {
3279                 list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
3280                         group_sched_out(event, cpuctx, ctx);
3281         }
3282 
3283         if (is_active & EVENT_FLEXIBLE) {
3284                 list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
3285                         group_sched_out(event, cpuctx, ctx);
3286 
3287                 /*
3288                  * Since we cleared EVENT_FLEXIBLE, also clear
3289                  * rotate_necessary; it will be reset by
3290                  * ctx_flexible_sched_in() when needed.
3291                  */
3292                 ctx->rotate_necessary = 0;
3293         }
3294         perf_pmu_enable(ctx->pmu);
3295 }
3296 
3297 /*
3298  * Test whether two contexts are equivalent, i.e. whether they have both been
3299  * cloned from the same version of the same context.
3300  *
3301  * Equivalence is measured using a generation number in the context that is
3302  * incremented on each modification to it; see unclone_ctx(), list_add_event()
3303  * and list_del_event().
3304  */
3305 static int context_equiv(struct perf_event_context *ctx1,
3306                          struct perf_event_context *ctx2)
3307 {
3308         lockdep_assert_held(&ctx1->lock);
3309         lockdep_assert_held(&ctx2->lock);
3310 
3311         /* Pinning disables the swap optimization */
3312         if (ctx1->pin_count || ctx2->pin_count)
3313                 return 0;
3314 
3315         /* If ctx1 is the parent of ctx2 */
3316         if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
3317                 return 1;
3318 
3319         /* If ctx2 is the parent of ctx1 */
3320         if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
3321                 return 1;
3322 
3323         /*
3324          * If ctx1 and ctx2 have the same parent, we flatten the parent
3325          * hierarchy, see perf_event_init_context().
3326          */
3327         if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
3328                         ctx1->parent_gen == ctx2->parent_gen)
3329                 return 1;
3330 
3331         /* Unmatched */
3332         return 0;
3333 }
3334 
3335 static void __perf_event_sync_stat(struct perf_event *event,
3336                                      struct perf_event *next_event)
3337 {
3338         u64 value;
3339 
3340         if (!event->attr.inherit_stat)
3341                 return;
3342 
3343         /*
3344          * Update the event value. We cannot use perf_event_read()
3345          * because we're in the middle of a context switch and have IRQs
3346          * disabled, which upsets smp_call_function_single(). However,
3347          * we know the event must be on the current CPU, therefore we
3348          * don't need to use it.
3349          */
3350         if (event->state == PERF_EVENT_STATE_ACTIVE)
3351                 event->pmu->read(event);
3352 
3353         perf_event_update_time(event);
3354 
3355         /*
3356          * In order to keep per-task stats reliable we need to flip the event
3357          * values when we flip the contexts.
3358          */
3359         value = local64_read(&next_event->count);
3360         value = local64_xchg(&event->count, value);
3361         local64_set(&next_event->count, value);
3362 
3363         swap(event->total_time_enabled, next_event->total_time_enabled);
3364         swap(event->total_time_running, next_event->total_time_running);
3365 
3366         /*
3367          * Since we swizzled the values, update the user visible data too.
3368          */
3369         perf_event_update_userpage(event);
3370         perf_event_update_userpage(next_event);
3371 }
3372 
3373 static void perf_event_sync_stat(struct perf_event_context *ctx,
3374                                    struct perf_event_context *next_ctx)
3375 {
3376         struct perf_event *event, *next_event;
3377 
3378         if (!ctx->nr_stat)
3379                 return;
3380 
3381         update_context_time(ctx);
3382 
3383         event = list_first_entry(&ctx->event_list,
3384                                    struct perf_event, event_entry);
3385 
3386         next_event = list_first_entry(&next_ctx->event_list,
3387                                         struct perf_event, event_entry);
3388 
3389         while (&event->event_entry != &ctx->event_list &&
3390                &next_event->event_entry != &next_ctx->event_list) {
3391 
3392                 __perf_event_sync_stat(event, next_event);
3393 
3394                 event = list_next_entry(event, event_entry);
3395                 next_event = list_next_entry(next_event, event_entry);
3396         }
3397 }
3398 
3399 static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
3400                                          struct task_struct *next)
3401 {
3402         struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
3403         struct perf_event_context *next_ctx;
3404         struct perf_event_context *parent, *next_parent;
3405         struct perf_cpu_context *cpuctx;
3406         int do_switch = 1;
3407         struct pmu *pmu;
3408 
3409         if (likely(!ctx))
3410                 return;
3411 
3412         pmu = ctx->pmu;
3413         cpuctx = __get_cpu_context(ctx);
3414         if (!cpuctx->task_ctx)
3415                 return;
3416 
3417         rcu_read_lock();
3418         next_ctx = next->perf_event_ctxp[ctxn];
3419         if (!next_ctx)
3420                 goto unlock;
3421 
3422         parent = rcu_dereference(ctx->parent_ctx);
3423         next_parent = rcu_dereference(next_ctx->parent_ctx);
3424 
3425         /* If neither context has a parent context, they cannot be clones. */
3426         if (!parent && !next_parent)
3427                 goto unlock;
3428 
3429         if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
3430                 /*
3431                  * Looks like the two contexts are clones, so we might be
3432                  * able to optimize the context switch.  We lock both
3433                  * contexts and check that they are clones under the
3434                  * lock (including re-checking that neither has been
3435                  * uncloned in the meantime).  It doesn't matter which
3436                  * order we take the locks because no other cpu could
3437                  * be trying to lock both of these tasks.
3438                  */
3439                 raw_spin_lock(&ctx->lock);
3440                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
3441                 if (context_equiv(ctx, next_ctx)) {
3442 
3443                         WRITE_ONCE(ctx->task, next);
3444                         WRITE_ONCE(next_ctx->task, task);
3445 
3446                         perf_pmu_disable(pmu);
3447 
3448                         if (cpuctx->sched_cb_usage && pmu->sched_task)
3449                                 pmu->sched_task(ctx, false);
3450 
3451                         /*
3452                          * PMU-specific parts of the task perf context can require
3453                          * additional synchronization. As an example of such
3454                          * synchronization, see the implementation details of Intel
3455                          * LBR call stack data profiling.
3456                          */
3457                         if (pmu->swap_task_ctx)
3458                                 pmu->swap_task_ctx(ctx, next_ctx);
3459                         else
3460                                 swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
3461 
3462                         perf_pmu_enable(pmu);
3463 
3464                         /*
3465                          * RCU_INIT_POINTER here is safe because we've not
3466                          * modified the ctx and the above modification of
3467                          * ctx->task and ctx->task_ctx_data are immaterial
3468                          * since those values are always verified under
3469                          * ctx->lock which we're now holding.
3470                          */
3471                         RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
3472                         RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
3473 
3474                         do_switch = 0;
3475 
3476                         perf_event_sync_stat(ctx, next_ctx);
3477                 }
3478                 raw_spin_unlock(&next_ctx->lock);
3479                 raw_spin_unlock(&ctx->lock);
3480         }
3481 unlock:
3482         rcu_read_unlock();
3483 
3484         if (do_switch) {
3485                 raw_spin_lock(&ctx->lock);
3486                 perf_pmu_disable(pmu);
3487 
3488                 if (cpuctx->sched_cb_usage && pmu->sched_task)
3489                         pmu->sched_task(ctx, false);
3490                 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
3491 
3492                 perf_pmu_enable(pmu);
3493                 raw_spin_unlock(&ctx->lock);
3494         }
3495 }
3496 
3497 static DEFINE_PER_CPU(struct list_head, sched_cb_list);
3498 
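/*
 * perf_sched_cb_{dec,inc}() below track how many users on this CPU want the
 * pmu::sched_task() context-switch callback: the per-CPU perf_sched_cb_usages
 * counter gates __perf_event_task_sched_{in,out}(), while the per-PMU
 * cpuctx->sched_cb_usage count controls membership on the per-CPU
 * sched_cb_list walked by perf_pmu_sched_task(). Calls to the two functions
 * are expected to be balanced.
 */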
3499 void perf_sched_cb_dec(struct pmu *pmu)
3500 {
3501         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3502 
3503         this_cpu_dec(perf_sched_cb_usages);
3504 
3505         if (!--cpuctx->sched_cb_usage)
3506                 list_del(&cpuctx->sched_cb_entry);
3507 }
3508 
3509 
3510 void perf_sched_cb_inc(struct pmu *pmu)
3511 {
3512         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3513 
3514         if (!cpuctx->sched_cb_usage++)
3515                 list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
3516 
3517         this_cpu_inc(perf_sched_cb_usages);
3518 }
3519 
3520 /*
3521  * This function provides the context switch callback to the lower code
3522  * layer. It is invoked ONLY when the context switch callback is enabled.
3523  *
3524  * This callback is relevant even to per-cpu events; for example, multi-event
3525  * PEBS requires this to provide PID/TID information. This requires that we flush
3526  * all queued PEBS records before we context switch to a new task.
3527  */
3528 static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
3529 {
3530         struct pmu *pmu;
3531 
3532         pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
3533 
3534         if (WARN_ON_ONCE(!pmu->sched_task))
3535                 return;
3536 
3537         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3538         perf_pmu_disable(pmu);
3539 
3540         pmu->sched_task(cpuctx->task_ctx, sched_in);
3541 
3542         perf_pmu_enable(pmu);
3543         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3544 }
3545 
3546 static void perf_pmu_sched_task(struct task_struct *prev,
3547                                 struct task_struct *next,
3548                                 bool sched_in)
3549 {
3550         struct perf_cpu_context *cpuctx;
3551 
3552         if (prev == next)
3553                 return;
3554 
3555         list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
3556                 /* will be handled in perf_event_context_sched_in/out */
3557                 if (cpuctx->task_ctx)
3558                         continue;
3559 
3560                 __perf_pmu_sched_task(cpuctx, sched_in);
3561         }
3562 }
3563 
3564 static void perf_event_switch(struct task_struct *task,
3565                               struct task_struct *next_prev, bool sched_in);
3566 
3567 #define for_each_task_context_nr(ctxn)                                  \
3568         for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
3569 
3570 /*
3571  * Called from scheduler to remove the events of the current task,
3572  * with interrupts disabled.
3573  *
3574  * We stop each event and update the event value in event->count.
3575  *
3576  * This does not protect us against NMI, but disable()
3577  * sets the disabled bit in the control field of event _before_
3578  * accessing the event control register. If an NMI hits, then it will
3579  * not restart the event.
3580  */
3581 void __perf_event_task_sched_out(struct task_struct *task,
3582                                  struct task_struct *next)
3583 {
3584         int ctxn;
3585 
3586         if (__this_cpu_read(perf_sched_cb_usages))
3587                 perf_pmu_sched_task(task, next, false);
3588 
3589         if (atomic_read(&nr_switch_events))
3590                 perf_event_switch(task, next, false);
3591 
3592         for_each_task_context_nr(ctxn)
3593                 perf_event_context_sched_out(task, ctxn, next);
3594 
3595         /*
3596          * If cgroup events exist on this CPU, then we need
3597          * to check if we have to switch out PMU state;
3598          * cgroup events are system-wide mode only.
3599          */
3600         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3601                 perf_cgroup_sched_out(task, next);
3602 }
3603 
3604 /*
3605  * Called with IRQs disabled
3606  */
3607 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
3608                               enum event_type_t event_type)
3609 {
3610         ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
3611 }
3612 
3613 static bool perf_less_group_idx(const void *l, const void *r)
3614 {
3615         const struct perf_event *le = *(const struct perf_event **)l;
3616         const struct perf_event *re = *(const struct perf_event **)r;
3617 
3618         return le->group_index < re->group_index;
3619 }
3620 
3621 static void swap_ptr(void *l, void *r)
3622 {
3623         void **lp = l, **rp = r;
3624 
3625         swap(*lp, *rp);
3626 }
3627 
3628 static const struct min_heap_callbacks perf_min_heap = {
3629         .elem_size = sizeof(struct perf_event *),
3630         .less = perf_less_group_idx,
3631         .swp = swap_ptr,
3632 };
3633 
3634 static void __heap_add(struct min_heap *heap, struct perf_event *event)
3635 {
3636         struct perf_event **itrs = heap->data;
3637 
3638         if (event) {
3639                 itrs[heap->nr] = event;
3640                 heap->nr++;
3641         }
3642 }
3643 
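/*
 * visit_groups_merge() below performs a k-way merge over the group tree: it
 * seeds a min-heap with the first event of each relevant (cpu, cgroup)
 * sub-sequence (the CPU-local events, the "any CPU" (-1) events for task
 * contexts and, under CONFIG_CGROUP_PERF, one sub-sequence per cgroup
 * ancestor), then repeatedly hands the event with the smallest group_index
 * to @func and advances that iterator, so events are visited in global
 * group_index order.
 */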
3644 static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
3645                                 struct perf_event_groups *groups, int cpu,
3646                                 int (*func)(struct perf_event *, void *),
3647                                 void *data)
3648 {
3649 #ifdef CONFIG_CGROUP_PERF
3650         struct cgroup_subsys_state *css = NULL;
3651 #endif
3652         /* Space for per CPU and/or any CPU event iterators. */
3653         struct perf_event *itrs[2];
3654         struct min_heap event_heap;
3655         struct perf_event **evt;
3656         int ret;
3657 
3658         if (cpuctx) {
3659                 event_heap = (struct min_heap){
3660                         .data = cpuctx->heap,
3661                         .nr = 0,
3662                         .size = cpuctx->heap_size,
3663                 };
3664 
3665                 lockdep_assert_held(&cpuctx->ctx.lock);
3666 
3667 #ifdef CONFIG_CGROUP_PERF
3668                 if (cpuctx->cgrp)
3669                         css = &cpuctx->cgrp->css;
3670 #endif
3671         } else {
3672                 event_heap = (struct min_heap){
3673                         .data = itrs,
3674                         .nr = 0,
3675                         .size = ARRAY_SIZE(itrs),
3676                 };
3677                 /* Events not within a CPU context may be on any CPU. */
3678                 __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
3679         }
3680         evt = event_heap.data;
3681 
3682         __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
3683 
3684 #ifdef CONFIG_CGROUP_PERF
3685         for (; css; css = css->parent)
3686                 __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
3687 #endif
3688 
3689         min_heapify_all(&event_heap, &perf_min_heap);
3690 
3691         while (event_heap.nr) {
3692                 ret = func(*evt, data);
3693                 if (ret)
3694                         return ret;
3695 
3696                 *evt = perf_event_groups_next(*evt);
3697                 if (*evt)
3698                         min_heapify(&event_heap, 0, &perf_min_heap);
3699                 else
3700                         min_heap_pop(&event_heap, &perf_min_heap);
3701         }
3702 
3703         return 0;
3704 }
3705 
3706 static int merge_sched_in(struct perf_event *event, void *data)
3707 {
3708         struct perf_event_context *ctx = event->ctx;
3709         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3710         int *can_add_hw = data;
3711 
3712         if (event->state <= PERF_EVENT_STATE_OFF)
3713                 return 0;
3714 
3715         if (!event_filter_match(event))
3716                 return 0;
3717 
3718         if (group_can_go_on(event, cpuctx, *can_add_hw)) {
3719                 if (!group_sched_in(event, cpuctx, ctx))
3720                         list_add_tail(&event->active_list, get_event_list(event));
3721         }
3722 
3723         if (event->state == PERF_EVENT_STATE_INACTIVE) {
3724                 if (event->attr.pinned) {
3725                         perf_cgroup_event_disable(event, ctx);
3726                         perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3727                 }
3728 
3729                 *can_add_hw = 0;
3730                 ctx->rotate_necessary = 1;
3731                 perf_mux_hrtimer_restart(cpuctx);
3732         }
3733 
3734         return 0;
3735 }
3736 
3737 static void
3738 ctx_pinned_sched_in(struct perf_event_context *ctx,
3739                     struct perf_cpu_context *cpuctx)
3740 {
3741         int can_add_hw = 1;
3742 
3743         if (ctx != &cpuctx->ctx)
3744                 cpuctx = NULL;
3745 
3746         visit_groups_merge(cpuctx, &ctx->pinned_groups,
3747                            smp_processor_id(),
3748                            merge_sched_in, &can_add_hw);
3749 }
3750 
3751 static void
3752 ctx_flexible_sched_in(struct perf_event_context *ctx,
3753                       struct perf_cpu_context *cpuctx)
3754 {
3755         int can_add_hw = 1;
3756 
3757         if (ctx != &cpuctx->ctx)
3758                 cpuctx = NULL;
3759 
3760         visit_groups_merge(cpuctx, &ctx->flexible_groups,
3761                            smp_processor_id(),
3762                            merge_sched_in, &can_add_hw);
3763 }
3764 
3765 static void
3766 ctx_sched_in(struct perf_event_context *ctx,
3767              struct perf_cpu_context *cpuctx,
3768              enum event_type_t event_type,
3769              struct task_struct *task)
3770 {
3771         int is_active = ctx->is_active;
3772         u64 now;
3773 
3774         lockdep_assert_held(&ctx->lock);
3775 
3776         if (likely(!ctx->nr_events))
3777                 return;
3778 
3779         ctx->is_active |= (event_type | EVENT_TIME);
3780         if (ctx->task) {
3781                 if (!is_active)
3782                         cpuctx->task_ctx = ctx;
3783                 else
3784                         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3785         }
3786 
3787         is_active ^= ctx->is_active; /* changed bits */
3788 
3789         if (is_active & EVENT_TIME) {
3790                 /* start ctx time */
3791                 now = perf_clock();
3792                 ctx->timestamp = now;
3793                 perf_cgroup_set_timestamp(task, ctx);
3794         }
3795 
3796         /*
3797          * First go through the list and put on any pinned groups
3798          * in order to give them the best chance of going on.
3799          */
3800         if (is_active & EVENT_PINNED)
3801                 ctx_pinned_sched_in(ctx, cpuctx);
3802 
3803         /* Then walk through the lower prio flexible groups */
3804         if (is_active & EVENT_FLEXIBLE)
3805                 ctx_flexible_sched_in(ctx, cpuctx);
3806 }
3807 
3808 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
3809                              enum event_type_t event_type,
3810                              struct task_struct *task)
3811 {
3812         struct perf_event_context *ctx = &cpuctx->ctx;
3813 
3814         ctx_sched_in(ctx, cpuctx, event_type, task);
3815 }
3816 
3817 static void perf_event_context_sched_in(struct perf_event_context *ctx,
3818                                         struct task_struct *task)
3819 {
3820         struct perf_cpu_context *cpuctx;
3821         struct pmu *pmu = ctx->pmu;
3822 
3823         cpuctx = __get_cpu_context(ctx);
3824         if (cpuctx->task_ctx == ctx) {
3825                 if (cpuctx->sched_cb_usage)
3826                         __perf_pmu_sched_task(cpuctx, true);
3827                 return;
3828         }
3829 
3830         perf_ctx_lock(cpuctx, ctx);
3831         /*
3832          * We must check ctx->nr_events while holding ctx->lock, such
3833          * that we serialize against perf_install_in_context().
3834          */
3835         if (!ctx->nr_events)
3836                 goto unlock;
3837 
3838         perf_pmu_disable(pmu);
3839         /*
3840          * We want to keep the following priority order:
3841          * cpu pinned (that don't need to move), task pinned,
3842          * cpu flexible, task flexible.
3843          *
3844          * However, if task's ctx is not carrying any pinned
3845          * events, no need to flip the cpuctx's events around.
3846          */
3847         if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
3848                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3849         perf_event_sched_in(cpuctx, ctx, task);
3850 
3851         if (cpuctx->sched_cb_usage && pmu->sched_task)
3852                 pmu->sched_task(cpuctx->task_ctx, true);
3853 
3854         perf_pmu_enable(pmu);
3855 
3856 unlock:
3857         perf_ctx_unlock(cpuctx, ctx);
3858 }
3859 
3860 /*
3861  * Called from scheduler to add the events of the current task
3862  * with interrupts disabled.
3863  *
3864  * We restore the event value and then enable it.
3865  *
3866  * This does not protect us against NMI, but enable()
3867  * sets the enabled bit in the control field of event _before_
3868  * accessing the event control register. If an NMI hits, then it will
3869  * keep the event running.
3870  */
3871 void __perf_event_task_sched_in(struct task_struct *prev,
3872                                 struct task_struct *task)
3873 {
3874         struct perf_event_context *ctx;
3875         int ctxn;
3876 
3877         /*
3878          * If cgroup events exist on this CPU, then we need to check if we have
3879          * to switch in PMU state; cgroup events are system-wide mode only.
3880          *
3881          * Since cgroup events are CPU events, we must schedule these in before
3882          * we schedule in the task events.
3883          */
3884         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3885                 perf_cgroup_sched_in(prev, task);
3886 
3887         for_each_task_context_nr(ctxn) {
3888                 ctx = task->perf_event_ctxp[ctxn];
3889                 if (likely(!ctx))
3890                         continue;
3891 
3892                 perf_event_context_sched_in(ctx, task);
3893         }
3894 
3895         if (atomic_read(&nr_switch_events))
3896                 perf_event_switch(task, prev, true);
3897 
3898         if (__this_cpu_read(perf_sched_cb_usages))
3899                 perf_pmu_sched_task(prev, task, true);
3900 }
3901 
3902 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
3903 {
3904         u64 frequency = event->attr.sample_freq;
3905         u64 sec = NSEC_PER_SEC;
3906         u64 divisor, dividend;
3907 
3908         int count_fls, nsec_fls, frequency_fls, sec_fls;
3909 
3910         count_fls = fls64(count);
3911         nsec_fls = fls64(nsec);
3912         frequency_fls = fls64(frequency);
3913         sec_fls = 30;
3914 
3915         /*
3916          * We got @count in @nsec, with a target of sample_freq HZ
3917          * the target period becomes:
3918          *
3919          *             @count * 10^9
3920          * period = -------------------
3921          *          @nsec * sample_freq
3922          *
3923          */
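        /*
         * Worked example with illustrative numbers: if the event counted
         * @count = 1,000,000 over @nsec = 10,000,000 (10ms) and
         * sample_freq = 1000, the event rate is 10^8/sec, and the period
         * needed for 1000 samples/sec is:
         *
         *   1,000,000 * 10^9 / (10,000,000 * 1000) = 100,000
         */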
3924 
3925         /*
3926          * Reduce accuracy by one bit such that @a and @b converge
3927          * to a similar magnitude.
3928          */
3929 #define REDUCE_FLS(a, b)                \
3930 do {                                    \
3931         if (a##_fls > b##_fls) {        \
3932                 a >>= 1;                \
3933                 a##_fls--;              \
3934         } else {                        \
3935                 b >>= 1;                \
3936                 b##_fls--;              \
3937         }                               \
3938 } while (0)
3939 
3940         /*
3941          * Reduce accuracy until either term fits in a u64, then proceed with
3942          * the other, so that finally we can do a u64/u64 division.
3943          */
3944         while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
3945                 REDUCE_FLS(nsec, frequency);
3946                 REDUCE_FLS(sec, count);
3947         }
3948 
3949         if (count_fls + sec_fls > 64) {
3950                 divisor = nsec * frequency;
3951 
3952                 while (count_fls + sec_fls > 64) {
3953                         REDUCE_FLS(count, sec);
3954                         divisor >>= 1;
3955                 }
3956 
3957                 dividend = count * sec;
3958         } else {
3959                 dividend = count * sec;
3960 
3961                 while (nsec_fls + frequency_fls > 64) {
3962                         REDUCE_FLS(nsec, frequency);
3963                         dividend >>= 1;
3964                 }
3965 
3966                 divisor = nsec * frequency;
3967         }
3968 
3969         if (!divisor)
3970                 return dividend;
3971 
3972         return div64_u64(dividend, divisor);
3973 }
3974 
3975 static DEFINE_PER_CPU(int, perf_throttled_count);
3976 static DEFINE_PER_CPU(u64, perf_throttled_seq);
3977 
3978 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
3979 {
3980         struct hw_perf_event *hwc = &event->hw;
3981         s64 period, sample_period;
3982         s64 delta;
3983 
3984         period = perf_calculate_period(event, nsec, count);
3985 
3986         delta = (s64)(period - hwc->sample_period);
3987         delta = (delta + 7) / 8; /* low pass filter */
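        /*
         * Illustrative example: with hwc->sample_period == 10000 and a newly
         * computed period of 18000, delta is 8000 and the filtered delta is
         * (8000 + 7) / 8 == 1000, so the period only moves to 11000 on this
         * adjustment; repeated adjustments converge on the target gradually.
         */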
3988 
3989         sample_period = hwc->sample_period + delta;
3990 
3991         if (!sample_period)
3992                 sample_period = 1;
3993 
3994         hwc->sample_period = sample_period;
3995 
3996         if (local64_read(&hwc->period_left) > 8*sample_period) {
3997                 if (disable)
3998                         event->pmu->stop(event, PERF_EF_UPDATE);
3999 
4000                 local64_set(&hwc->period_left, 0);
4001 
4002                 if (disable)
4003                         event->pmu->start(event, PERF_EF_RELOAD);
4004         }
4005 }
4006 
4007 /*
4008  * Combine freq adjustment with unthrottling to avoid two passes over the
4009  * events. At the same time, make sure that having freq events does not change
4010  * the rate of unthrottling, as that would introduce bias.
4011  */
4012 static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
4013                                            int needs_unthr)
4014 {
4015         struct perf_event *event;
4016         struct hw_perf_event *hwc;
4017         u64 now, period = TICK_NSEC;
4018         s64 delta;
4019 
4020         /*
4021          * We only need to iterate over all events if:
4022          * - the context has events in frequency mode (needs freq adjust), or
4023          * - there are events to unthrottle on this CPU.
4024          */
4025         if (!(ctx->nr_freq || needs_unthr))
4026                 return;
4027 
4028         raw_spin_lock(&ctx->lock);
4029         perf_pmu_disable(ctx->pmu);
4030 
4031         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4032                 if (event->state != PERF_EVENT_STATE_ACTIVE)
4033                         continue;
4034 
4035                 if (!event_filter_match(event))
4036                         continue;
4037 
4038                 perf_pmu_disable(event->pmu);
4039 
4040                 hwc = &event->hw;
4041 
4042                 if (hwc->interrupts == MAX_INTERRUPTS) {
4043                         hwc->interrupts = 0;
4044                         perf_log_throttle(event, 1);
4045                         event->pmu->start(event, 0);
4046                 }
4047 
4048                 if (!event->attr.freq || !event->attr.sample_freq)
4049                         goto next;
4050 
4051                 /*
4052                  * stop the event and update event->count
4053                  */
4054                 event->pmu->stop(event, PERF_EF_UPDATE);
4055 
4056                 now = local64_read(&event->count);
4057                 delta = now - hwc->freq_count_stamp;
4058                 hwc->freq_count_stamp = now;
4059 
4060                 /*
4061                  * Restart the event; reload only if the value has
4062                  * changed.
4063                  *
4064                  * We have already stopped the event, so tell that to
4065                  * perf_adjust_period() to avoid stopping it twice.
4066                  */
4067                 if (delta > 0)
4068                         perf_adjust_period(event, period, delta, false);
4069 
4070                 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
4071         next:
4072                 perf_pmu_enable(event->pmu);
4073         }
4074 
4075         perf_pmu_enable(ctx->pmu);
4076         raw_spin_unlock(&ctx->lock);
4077 }
4078 
4079 /*
4080  * Move @event to the tail of @ctx's eligible events.
4081  */
4082 static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
4083 {
4084         /*
4085          * Rotate the first entry of the non-pinned groups to the tail. Rotation
4086          * might be disabled by the inheritance code.
4087          */
4088         if (ctx->rotate_disable)
4089                 return;
4090 
4091         perf_event_groups_delete(&ctx->flexible_groups, event);
4092         perf_event_groups_insert(&ctx->flexible_groups, event);
4093 }
4094 
4095 /* pick an event from the flexible_groups to rotate */
4096 static inline struct perf_event *
4097 ctx_event_to_rotate(struct perf_event_context *ctx)
4098 {
4099         struct perf_event *event;
4100 
4101         /* pick the first active flexible event */
4102         event = list_first_entry_or_null(&ctx->flexible_active,
4103                                          struct perf_event, active_list);
4104 
4105         /* if no active flexible event, pick the first event */
4106         if (!event) {
4107                 event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
4108                                       typeof(*event), group_node);
4109         }
4110 
4111         /*
4112          * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
4113          * finds there are unschedulable events, it will set it again.
4114          */
4115         ctx->rotate_necessary = 0;
4116 
4117         return event;
4118 }
4119 
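/*
 * Multiplexing: when not all flexible events could be scheduled on the PMU
 * (rotate_necessary was set by merge_sched_in()), schedule the flexible
 * groups out, re-insert the chosen event at the end of the flexible groups
 * and schedule everything back in, so a different subset of events gets PMU
 * time on each rotation.
 */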
4120 static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
4121 {
4122         struct perf_event *cpu_event = NULL, *task_event = NULL;
4123         struct perf_event_context *task_ctx = NULL;
4124         int cpu_rotate, task_rotate;
4125 
4126         /*
4127          * Since we run this from IRQ context, nobody can install new
4128          * events, thus the event count values are stable.
4129          */
4130 
4131         cpu_rotate = cpuctx->ctx.rotate_necessary;
4132         task_ctx = cpuctx->task_ctx;
4133         task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
4134 
4135         if (!(cpu_rotate || task_rotate))
4136                 return false;
4137 
4138         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
4139         perf_pmu_disable(cpuctx->ctx.pmu);
4140 
4141         if (task_rotate)
4142                 task_event = ctx_event_to_rotate(task_ctx);
4143         if (cpu_rotate)
4144                 cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
4145 
4146         /*
4147          * As per the order given at ctx_resched(), first 'pop' the task's
4148          * flexible events and then, if needed, the CPU's flexible events.
4149          */
4150         if (task_event || (task_ctx && cpu_event))
4151                 ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
4152         if (cpu_event)
4153                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
4154 
4155         if (task_event)
4156                 rotate_ctx(task_ctx, task_event);
4157         if (cpu_event)
4158                 rotate_ctx(&cpuctx->ctx, cpu_event);
4159 
4160         perf_event_sched_in(cpuctx, task_ctx, current);
4161 
4162         perf_pmu_enable(cpuctx->ctx.pmu);
4163         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
4164 
4165         return true;
4166 }
4167 
4168 void perf_event_task_tick(void)
4169 {
4170         struct list_head *head = this_cpu_ptr(&active_ctx_list);
4171         struct perf_event_context *ctx, *tmp;
4172         int throttled;
4173 
4174         lockdep_assert_irqs_disabled();
4175 
4176         __this_cpu_inc(perf_throttled_seq);
4177         throttled = __this_cpu_xchg(perf_throttled_count, 0);
4178         tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
4179 
4180         list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
4181                 perf_adjust_freq_unthr_context(ctx, throttled);
4182 }
4183 
4184 static int event_enable_on_exec(struct perf_event *event,
4185                                 struct perf_event_context *ctx)
4186 {
4187         if (!event->attr.enable_on_exec)
4188                 return 0;
4189 
4190         event->attr.enable_on_exec = 0;
4191         if (event->state >= PERF_EVENT_STATE_INACTIVE)
4192                 return 0;
4193 
4194         perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
4195 
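/*
 * Usage note: perf_event_refresh() is typically reached via the
 * PERF_EVENT_IOC_REFRESH ioctl. It adds @refresh to event->event_limit and
 * enables the event; the overflow path decrements that budget and disables
 * the event again once it reaches zero.
 */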
4196         return 1;
4197 }
4198 
4199 /*
4200  * Enable all of a task's events that have been marked enable-on-exec.
4201  * This expects task == current.
4202  */
4203 static void perf_event_enable_on_exec(int ctxn)
4204 {
4205         struct perf_event_context *ctx, *clone_ctx = NULL;
4206         enum event_type_t event_type = 0;
4207         struct perf_cpu_context *cpuctx;
4208         struct perf_event *event;
4209         unsigned long flags;
4210         int enabled = 0;
4211 
4212         local_irq_save(flags);
4213         ctx = current->perf_event_ctxp[ctxn];
4214         if (!ctx || !ctx->nr_events)
4215                 goto out;
4216 
4217         cpuctx = __get_cpu_context(ctx);
4218         perf_ctx_lock(cpuctx, ctx);
4219         ctx_sched_out(ctx, cpuctx, EVENT_TIME);
4220         list_for_each_entry(event, &ctx->event_list, event_entry) {
4221                 enabled |= event_enable_on_exec(event, ctx);
4222                 event_type |= get_event_type(event);
4223         }
4224 
4225         /*
4226          * Unclone and reschedule this context if we enabled any event.
4227          */
4228         if (enabled) {
4229                 clone_ctx = unclone_ctx(ctx);
4230                 ctx_resched(cpuctx, ctx, event_type);
4231         } else {
4232                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
4233         }
4234         perf_ctx_unlock(cpuctx, ctx);
4235 
4236 out:
4237         local_irq_restore(flags);
4238 
4239         if (clone_ctx)
4240                 put_ctx(clone_ctx);
4241 }
4242 
4243 struct perf_read_data {
4244         struct perf_event *event;
4245         bool group;
4246         int ret;
4247 };
4248 
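/*
 * Events with PERF_EV_CAP_READ_ACTIVE_PKG can be read from any CPU in the
 * package where they are active; preferring the local CPU below lets the
 * smp_call_function_single() in perf_event_read() run the read locally
 * instead of interrupting the event's CPU.
 */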
4249 static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
4250 {
4251         u16 local_pkg, event_pkg;
4252 
4253         if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
4254                 int local_cpu = smp_processor_id();
4255 
4256                 event_pkg = topology_physical_package_id(event_cpu);
4257                 local_pkg = topology_physical_package_id(local_cpu);
4258 
4259                 if (event_pkg == local_pkg)
4260                         return local_cpu;
4261         }
4262 
4263         return event_cpu;
4264 }
4265 
4266 /*
4267  * Cross CPU call to read the hardware event
4268  */
4269 static void __perf_event_read(void *info)
4270 {
4271         struct perf_read_data *data = info;
4272         struct perf_event *sub, *event = data->event;
4273         struct perf_event_context *ctx = event->ctx;
4274         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
4275         struct pmu *pmu = event->pmu;
4276 
4277         /*
4278          * If this is a task context, we need to check whether it is
4279          * the current task context of this CPU.  If not, it has been
4280          * scheduled out before the smp call arrived.  In that case
4281          * event->count would have been updated to a recent sample
4282          * when the event was scheduled out.
4283          */
4284         if (ctx->task && cpuctx->task_ctx != ctx)
4285                 return;
4286 
4287         raw_spin_lock(&ctx->lock);
4288         if (ctx->is_active & EVENT_TIME) {
4289                 update_context_time(ctx);
4290                 update_cgrp_time_from_event(event);
4291         }
4292 
4293         perf_event_update_time(event);
4294         if (data->group)
4295                 perf_event_update_sibling_time(event);
4296 
4297         if (event->state != PERF_EVENT_STATE_ACTIVE)
4298                 goto unlock;
4299 
4300         if (!data->group) {
4301                 pmu->read(event);
4302                 data->ret = 0;
4303                 goto unlock;
4304         }
4305 
4306         pmu->start_txn(pmu, PERF_PMU_TXN_READ);
4307 
4308         pmu->read(event);
4309 
4310         for_each_sibling_event(sub, event) {
4311                 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
4312                         /*
4313                          * Use the sibling's PMU rather than @event's, since the
4314                          * sibling could be on a different (e.g. software) PMU.
4315                          */
4316                         sub->pmu->read(sub);
4317                 }
4318         }
4319 
4320         data->ret = pmu->commit_txn(pmu);
4321 
4322 unlock:
4323         raw_spin_unlock(&ctx->lock);
4324 }
4325 
4326 static inline u64 perf_event_count(struct perf_event *event)
4327 {
4328         return local64_read(&event->count) + atomic64_read(&event->child_count);
4329 }
4330 
4331 /*
4332  * NMI-safe method to read a local event, that is, an event
4333  * that:
4334  *   - is either for the current task, or for this CPU
4335  *   - does not have inherit set, because inherited task events
4336  *     will not be local and we cannot read them atomically
4337  *   - must not have a pmu::count method
4338  */
4339 int perf_event_read_local(struct perf_event *event, u64 *value,
4340                           u64 *enabled, u64 *running)
4341 {
4342         unsigned long flags;
4343         int ret = 0;
4344 
4345         /*
4346          * Disabling interrupts avoids all counter scheduling (context
4347          * switches, timer based rotation and IPIs).
4348          */
4349         local_irq_save(flags);
4350 
4351         /*
4352          * It must not be an event with inherit set, we cannot read
4353          * all child counters from atomic context.
4354          */
4355         if (event->attr.inherit) {
4356                 ret = -EOPNOTSUPP;
4357                 goto out;
4358         }
4359 
4360         /* If this is a per-task event, it must be for current */
4361         if ((event->attach_state & PERF_ATTACH_TASK) &&
4362             event->hw.target != current) {
4363                 ret = -EINVAL;
4364                 goto out;
4365         }
4366 
4367         /* If this is a per-CPU event, it must be for this CPU */
4368         if (!(event->attach_state & PERF_ATTACH_TASK) &&
4369             event->cpu != smp_processor_id()) {
4370                 ret = -EINVAL;
4371                 goto out;
4372         }
4373 
4374         /* If this is a pinned event it must be running on this CPU */
4375         if (event->attr.pinned && event->oncpu != smp_processor_id()) {
4376                 ret = -EBUSY;
4377                 goto out;
4378         }
4379 
4380         /*
4381          * If the event is currently on this CPU, it's either a per-task event,
4382          * or local to this CPU. Furthermore, it means it's ACTIVE (otherwise
4383          * oncpu == -1).
4384          */
4385         if (event->oncpu == smp_processor_id())
4386                 event->pmu->read(event);
4387 
4388         *value = local64_read(&event->count);
4389         if (enabled || running) {
4390                 u64 now = event->shadow_ctx_time + perf_clock();
4391                 u64 __enabled, __running;
4392 
4393                 __perf_update_times(event, now, &__enabled, &__running);
4394                 if (enabled)
4395                         *enabled = __enabled;
4396                 if (running)
4397                         *running = __running;
4398         }
4399 out:
4400         local_irq_restore(flags);
4401 
4402         return ret;
4403 }
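/*
 * Illustrative sketch (not part of this file): a caller holding a suitable
 * local event could read it like so, passing NULL for any time value it
 * does not need:
 *
 *        u64 value;
 *
 *        if (!perf_event_read_local(event, &value, NULL, NULL))
 *                pr_debug("count=%llu\n", value);
 */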
4404 
4405 static int perf_event_read(struct perf_event *event, bool group)
4406 {
4407         enum perf_event_state state = READ_ONCE(event->state);
4408         int event_cpu, ret = 0;
4409 
4410         /*
4411          * If event is enabled and currently active on a CPU, update the
4412          * value in the event structure:
4413          */
4414 again:
4415         if (state == PERF_EVENT_STATE_ACTIVE) {
4416                 struct perf_read_data data;
4417 
4418                 /*
4419                  * Orders the ->state and ->oncpu loads such that if we see
4420                  * ACTIVE we must also see the right ->oncpu.
4421                  *
4422                  * Matches the smp_wmb() from event_sched_in().
4423                  */
4424                 smp_rmb();
4425 
4426                 event_cpu = READ_ONCE(event->oncpu);
4427                 if ((unsigned)event_cpu >= nr_cpu_ids)
4428                         return 0;
4429 
4430                 data = (struct perf_read_data){
4431                         .event = event,
4432                         .group = group,
4433                         .ret = 0,
4434                 };
4435 
4436                 preempt_disable();
4437                 event_cpu = __perf_event_read_cpu(event, event_cpu);
4438 
4439                 /*
4440                  * Purposely ignore the smp_call_function_single() return
4441                  * value.
4442                  *
4443                  * If event_cpu isn't a valid CPU it means the event got
4444                  * scheduled out and that will have updated the event count.
4445                  *
4446                  * Therefore, either way, we'll have an up-to-date event count
4447                  * after this.
4448                  */
4449                 (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
4450                 preempt_enable();
4451                 ret = data.ret;
4452 
4453         } else if (state == PERF_EVENT_STATE_INACTIVE) {
4454                 struct perf_event_context *ctx = event->ctx;
4455                 unsigned long flags;
4456 
4457                 raw_spin_lock_irqsave(&ctx->lock, flags);
4458                 state = event->state;
4459                 if (state != PERF_EVENT_STATE_INACTIVE) {
4460                         raw_spin_unlock_irqrestore(&ctx->lock, flags);
4461                         goto again;
4462                 }
4463 
4464                 /*
4465                  * We may read while the context is not active (e.g., the thread
4466                  * is blocked); in that case we cannot update the context time.
4467                  */
4468                 if (ctx->is_active & EVENT_TIME) {
4469                         update_context_time(ctx);
4470                         update_cgrp_time_from_event(event);
4471                 }
4472 
4473                 perf_event_update_time(event);
4474                 if (group)
4475                         perf_event_update_sibling_time(event);
4476                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4477         }
4478 
4479         return ret;
4480 }
4481 
4482 /*
4483  * Initialize the perf_event context in a task_struct:
4484  */
4485 static void __perf_event_init_context(struct perf_event_context *ctx)
4486 {
4487         raw_spin_lock_init(&ctx->lock);
4488         mutex_init(&ctx->mutex);
4489         INIT_LIST_HEAD(&ctx->active_ctx_list);
4490         perf_event_groups_init(&ctx->pinned_groups);
4491         perf_event_groups_init(&ctx->flexible_groups);
4492         INIT_LIST_HEAD(&ctx->event_list);
4493         INIT_LIST_HEAD(&ctx->pinned_active);
4494         INIT_LIST_HEAD(&ctx->flexible_active);
4495         refcount_set(&ctx->refcount, 1);
4496 }
4497 
4498 static struct perf_event_context *
4499 alloc_perf_context(struct pmu *pmu, struct task_struct *task)
4500 {
4501         struct perf_event_context *ctx;
4502 
4503         ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4504         if (!ctx)
4505                 return NULL;
4506 
4507         __perf_event_init_context(ctx);
4508         if (task)
4509                 ctx->task = get_task_struct(task);
4510         ctx->pmu = pmu;
4511 
4512         return ctx;
4513 }
4514 
4515 static struct task_struct *
4516 find_lively_task_by_vpid(pid_t vpid)
4517 {
4518         struct task_struct *task;
4519 
4520         rcu_read_lock();
4521         if (!vpid)
4522                 task = current;
4523         else
4524                 task = find_task_by_vpid(vpid);
4525         if (task)
4526                 get_task_struct(task);
4527         rcu_read_unlock();
4528 
4529         if (!task)
4530                 return ERR_PTR(-ESRCH);
4531 
4532         return task;
4533 }
4534 
4535 /*
4536  * Returns a matching context with refcount and pincount.
4537  */
4538 static struct perf_event_context *
4539 find_get_context(struct pmu *pmu, struct task_struct *task,
4540                 struct perf_event *event)
4541 {
4542         struct perf_event_context *ctx, *clone_ctx = NULL;
4543         struct perf_cpu_context *cpuctx;
4544         void *task_ctx_data = NULL;
4545         unsigned long flags;
4546         int ctxn, err;
4547         int cpu = event->cpu;
4548 
4549         if (!task) {
4550                 /* Must be root to operate on a CPU event: */
4551                 err = perf_allow_cpu(&event->attr);
4552                 if (err)
4553                         return ERR_PTR(err);
4554 
4555                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
4556                 ctx = &cpuctx->ctx;
4557                 get_ctx(ctx);
4558                 ++ctx->pin_count;
4559 
4560                 return ctx;
4561         }
4562 
4563         err = -EINVAL;
4564         ctxn = pmu->task_ctx_nr;
4565         if (ctxn < 0)
4566                 goto errout;
4567 
4568         if (event->attach_state & PERF_ATTACH_TASK_DATA) {
4569                 task_ctx_data = alloc_task_ctx_data(pmu);
4570                 if (!task_ctx_data) {
4571                         err = -ENOMEM;
4572                         goto errout;
4573                 }
4574         }
4575 
4576 retry:
4577         ctx = perf_lock_task_context(task, ctxn, &flags);
4578         if (ctx) {
4579                 clone_ctx = unclone_ctx(ctx);
4580                 ++ctx->pin_count;
4581 
4582                 if (task_ctx_data && !ctx->task_ctx_data) {
4583                         ctx->task_ctx_data = task_ctx_data;
4584                         task_ctx_data = NULL;
4585                 }
4586                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4587 
4588                 if (clone_ctx)
4589                         put_ctx(clone_ctx);
4590         } else {
4591                 ctx = alloc_perf_context(pmu, task);
4592                 err = -ENOMEM;
4593                 if (!ctx)
4594                         goto errout;
4595 
4596                 if (task_ctx_data) {
4597                         ctx->task_ctx_data = task_ctx_data;
4598                         task_ctx_data = NULL;
4599                 }
4600 
4601                 err = 0;
4602                 mutex_lock(&task->perf_event_mutex);
4603                 /*
4604                  * If it has already passed perf_event_exit_task(),
4605                  * we must see PF_EXITING; it takes this mutex too.
4606                  */
4607                 if (task->flags & PF_EXITING)
4608                         err = -ESRCH;
4609                 else if (task->perf_event_ctxp[ctxn])
4610                         err = -EAGAIN;
4611                 else {
4612                         get_ctx(ctx);
4613                         ++ctx->pin_count;
4614                         rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
4615                 }
4616                 mutex_unlock(&task->perf_event_mutex);
4617 
4618                 if (unlikely(err)) {
4619                         put_ctx(ctx);
4620 
4621                         if (err == -EAGAIN)
4622                                 goto retry;
4623                         goto errout;
4624                 }
4625         }
4626 
4627         free_task_ctx_data(pmu, task_ctx_data);
4628         return ctx;
4629 
4630 errout:
4631         free_task_ctx_data(pmu, task_ctx_data);
4632         return ERR_PTR(err);
4633 }
4634 
4635 static void perf_event_free_filter(struct perf_event *event);
4636 static void perf_event_free_bpf_prog(struct perf_event *event);
4637 
4638 static void free_event_rcu(struct rcu_head *head)
4639 {
4640         struct perf_event *event;
4641 
4642         event = container_of(head, struct perf_event, rcu_head);
4643         if (event->ns)
4644                 put_pid_ns(event->ns);
4645         perf_event_free_filter(event);
4646         kfree(event);
4647 }
4648 
4649 static void ring_buffer_attach(struct perf_event *event,
4650                                struct perf_buffer *rb);
4651 
4652 static void detach_sb_event(struct perf_event *event)
4653 {
4654         struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
4655 
4656         raw_spin_lock(&pel->lock);
4657         list_del_rcu(&event->sb_list);
4658         raw_spin_unlock(&pel->lock);
4659 }
4660 
4661 static bool is_sb_event(struct perf_event *event)
4662 {
4663         struct perf_event_attr *attr = &event->attr;
4664 
4665         if (event->parent)
4666                 return false;
4667 
4668         if (event->attach_state & PERF_ATTACH_TASK)
4669                 return false;
4670 
4671         if (attr->mmap || attr->mmap_data || attr->mmap2 ||
4672             attr->comm || attr->comm_exec ||
4673             attr->task || attr->ksymbol ||
4674             attr->context_switch || attr->text_poke ||
4675             attr->bpf_event)
4676                 return true;
4677         return false;
4678 }
4679 
4680 static void unaccount_pmu_sb_event(struct perf_event *event)
4681 {
4682         if (is_sb_event(event))
4683                 detach_sb_event(event);
4684 }
4685 
4686 static void unaccount_event_cpu(struct perf_event *event, int cpu)
4687 {
4688         if (event->parent)
4689                 return;
4690 
4691         if (is_cgroup_event(event))
4692                 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
4693 }
4694 
4695 #ifdef CONFIG_NO_HZ_FULL
4696 static DEFINE_SPINLOCK(nr_freq_lock);
4697 #endif
4698 
4699 static void unaccount_freq_event_nohz(void)
4700 {
4701 #ifdef CONFIG_NO_HZ_FULL
4702         spin_lock(&nr_freq_lock);
4703         if (atomic_dec_and_test(&nr_freq_events))
4704                 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
4705         spin_unlock(&nr_freq_lock);
4706 #endif
4707 }
4708 
4709 static void unaccount_freq_event(void)
4710 {
4711         if (tick_nohz_full_enabled())
4712                 unaccount_freq_event_nohz();
4713         else
4714                 atomic_dec(&nr_freq_events);
4715 }
4716 
4717 static void unaccount_event(struct perf_event *event)
4718 {
4719         bool dec = false;
4720 
4721         if (event->parent)
4722                 return;
4723 
4724         if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
4725                 dec = true;
4726         if (event->attr.mmap || event->attr.mmap_data)
4727                 atomic_dec(&nr_mmap_events);
4728         if (event->attr.comm)
4729                 atomic_dec(&nr_comm_events);
4730         if (event->attr.namespaces)
4731                 atomic_dec(&nr_namespaces_events);
4732         if (event->attr.cgroup)
4733                 atomic_dec(&nr_cgroup_events);
4734         if (event->attr.task)
4735                 atomic_dec(&nr_task_events);
4736         if (event->attr.freq)
4737                 unaccount_freq_event();
4738         if (event->attr.context_switch) {
4739                 dec = true;
4740                 atomic_dec(&nr_switch_events);
4741         }
4742         if (is_cgroup_event(event))
4743                 dec = true;
4744         if (has_branch_stack(event))
4745                 dec = true;
4746         if (event->attr.ksymbol)
4747                 atomic_dec(&nr_ksymbol_events);
4748         if (event->attr.bpf_event)
4749                 atomic_dec(&nr_bpf_events);
4750         if (event->attr.text_poke)
4751                 atomic_dec(&nr_text_poke_events);
4752 
4753         if (dec) {
4754                 if (!atomic_add_unless(&perf_sched_count, -1, 1))
4755                         schedule_delayed_work(&perf_sched_work, HZ);
4756         }
4757 
4758         unaccount_event_cpu(event, event->cpu);
4759 
4760         unaccount_pmu_sb_event(event);
4761 }
4762 
4763 static void perf_sched_delayed(struct work_struct *work)
4764 {
4765         mutex_lock(&perf_sched_mutex);
4766         if (atomic_dec_and_test(&perf_sched_count))
4767                 static_branch_disable(&perf_sched_events);
4768         mutex_unlock(&perf_sched_mutex);
4769 }
4770 
4771 /*
4772  * The following implement mutual exclusion of events on "exclusive" pmus
4773  * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
4774  * at a time, so we disallow creating events that might conflict, namely:
4775  *
4776  *  1) cpu-wide events in the presence of per-task events,
4777  *  2) per-task events in the presence of cpu-wide events,
4778  *  3) two matching events on the same context.
4779  *
4780  * The former two cases are handled in the allocation path (perf_event_alloc(),
4781  * _free_event()), the latter -- before the first perf_install_in_context().
4782  */
4783 static int exclusive_event_init(struct perf_event *event)
4784 {
4785         struct pmu *pmu = event->pmu;
4786 
4787         if (!is_exclusive_pmu(pmu))
4788                 return 0;
4789 
4790         /*
4791          * Prevent co-existence of per-task and cpu-wide events on the
4792          * same exclusive pmu.
4793          *
4794          * Negative pmu::exclusive_cnt means there are cpu-wide
4795          * events on this "exclusive" pmu, positive means there are
4796          * per-task events.
4797          *
4798          * Since this is called in perf_event_alloc() path, event::ctx
4799          * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
4800          * to mean "per-task event", because unlike other attach states it
4801          * never gets cleared.
4802          */
4803         if (event->attach_state & PERF_ATTACH_TASK) {
4804                 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
4805                         return -EBUSY;
4806         } else {
4807                 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
4808                         return -EBUSY;
4809         }
4810 
4811         return 0;
4812 }
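
/*
 * To illustrate the exclusive_cnt protocol above:
 *
 *	per-task events on an exclusive pmu:	exclusive_cnt 0 -> +1 -> +2 ...
 *	cpu-wide events on an exclusive pmu:	exclusive_cnt 0 -> -1 -> -2 ...
 *
 * Mixing the two directions fails at creation time: with exclusive_cnt == +1
 * a cpu-wide event trips atomic_dec_unless_positive() and gets -EBUSY, and
 * with exclusive_cnt == -1 a per-task event trips
 * atomic_inc_unless_negative() and likewise gets -EBUSY.
 */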
4813 
4814 static void exclusive_event_destroy(struct perf_event *event)
4815 {
4816         struct pmu *pmu = event->pmu;
4817 
4818         if (!is_exclusive_pmu(pmu))
4819                 return;
4820 
4821         /* see comment in exclusive_event_init() */
4822         if (event->attach_state & PERF_ATTACH_TASK)
4823                 atomic_dec(&pmu->exclusive_cnt);
4824         else
4825                 atomic_inc(&pmu->exclusive_cnt);
4826 }
4827 
4828 static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
4829 {
4830         if ((e1->pmu == e2->pmu) &&
4831             (e1->cpu == e2->cpu ||
4832              e1->cpu == -1 ||
4833              e2->cpu == -1))
4834                 return true;
4835         return false;
4836 }
4837 
4838 static bool exclusive_event_installable(struct perf_event *event,
4839                                         struct perf_event_context *ctx)
4840 {
4841         struct perf_event *iter_event;
4842         struct pmu *pmu = event->pmu;
4843 
4844         lockdep_assert_held(&ctx->mutex);
4845 
4846         if (!is_exclusive_pmu(pmu))
4847                 return true;
4848 
4849         list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
4850                 if (exclusive_event_match(iter_event, event))
4851                         return false;
4852         }
4853 
4854         return true;
4855 }
4856 
4857 static void perf_addr_filters_splice(struct perf_event *event,
4858                                        struct list_head *head);
4859 
4860 static void _free_event(struct perf_event *event)
4861 {
4862         irq_work_sync(&event->pending);
4863 
4864         unaccount_event(event);
4865 
4866         security_perf_event_free(event);
4867 
4868         if (event->rb) {
4869                 /*
4870                  * Can happen when we close an event with re-directed output.
4871                  *
4872                  * Since we have a 0 refcount, perf_mmap_close() will skip
4873                  * over us; possibly making our ring_buffer_put() the last.
4874                  */
4875                 mutex_lock(&event->mmap_mutex);
4876                 ring_buffer_attach(event, NULL);
4877                 mutex_unlock(&event->mmap_mutex);
4878         }
4879 
4880         if (is_cgroup_event(event))
4881                 perf_detach_cgroup(event);
4882 
4883         if (!event->parent) {
4884                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
4885                         put_callchain_buffers();
4886         }
4887 
4888         perf_event_free_bpf_prog(event);
4889         perf_addr_filters_splice(event, NULL);
4890         kfree(event->addr_filter_ranges);
4891 
4892         if (event->destroy)
4893                 event->destroy(event);
4894 
4895         /*
4896          * Must be after ->destroy(), due to uprobe_perf_close() using
4897          * hw.target.
4898          */
4899         if (event->hw.target)
4900                 put_task_struct(event->hw.target);
4901 
4902         /*
4903          * perf_event_free_task() relies on put_ctx() being 'last'; in particular,
4904          * all task references must be cleaned up.
4905          */
4906         if (event->ctx)
4907                 put_ctx(event->ctx);
4908 
4909         exclusive_event_destroy(event);
4910         module_put(event->pmu->module);
4911 
4912         call_rcu(&event->rcu_head, free_event_rcu);
4913 }
4914 
4915 /*
4916  * Used to free events which have a known refcount of 1, such as error-path
4917  * events that aren't exposed yet, and inherited events.
4918  */
4919 static void free_event(struct perf_event *event)
4920 {
4921         if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
4922                                 "unexpected event refcount: %ld; ptr=%p\n",
4923                                 atomic_long_read(&event->refcount), event)) {
4924                 /* leak to avoid use-after-free */
4925                 return;
4926         }
4927 
4928         _free_event(event);
4929 }
4930 
4931 /*
4932  * Remove user event from the owner task.
4933  */
4934 static void perf_remove_from_owner(struct perf_event *event)
4935 {
4936         struct task_struct *owner;
4937 
4938         rcu_read_lock();
4939         /*
4940          * Matches the smp_store_release() in perf_event_exit_task(). If we
4941          * observe !owner it means the list deletion is complete and we can
4942          * indeed free this event, otherwise we need to serialize on
4943          * owner->perf_event_mutex.
4944          */
4945         owner = READ_ONCE(event->owner);
4946         if (owner) {
4947                 /*
4948                  * Since delayed_put_task_struct() also drops the last
4949                  * task reference we can safely take a new reference
4950                  * while holding the rcu_read_lock().
4951                  */
4952                 get_task_struct(owner);
4953         }
4954         rcu_read_unlock();
4955 
4956         if (owner) {
4957                 /*
4958                  * If we're here through perf_event_exit_task() we're already
4959                  * holding ctx->mutex which would be an inversion wrt. the
4960                  * normal lock order.
4961                  *
4962                  * However, we can safely take this lock because it's the child
4963                  * ctx->mutex.
4964                  */
4965                 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
4966 
4967                 /*
4968                  * We have to re-check the event->owner field: if it is cleared,
4969                  * we raced with perf_event_exit_task(). Acquiring the mutex
4970                  * ensured they're done, and we can proceed with freeing the
4971                  * event.
4972                  */
4973                 if (event->owner) {
4974                         list_del_init(&event->owner_entry);
4975                         smp_store_release(&event->owner, NULL);
4976                 }
4977                 mutex_unlock(&owner->perf_event_mutex);
4978                 put_task_struct(owner);
4979         }
4980 }
4981 
4982 static void put_event(struct perf_event *event)
4983 {
4984         if (!atomic_long_dec_and_test(&event->refcount))
4985                 return;
4986 
4987         _free_event(event);
4988 }
4989 
4990 /*
4991  * Kill an event dead; while event:refcount will preserve the event
4992  * object, it will not preserve its functionality. Once the last 'user'
4993  * gives up the object, we'll destroy the thing.
4994  */
4995 int perf_event_release_kernel(struct perf_event *event)
4996 {
4997         struct perf_event_context *ctx = event->ctx;
4998         struct perf_event *child, *tmp;
4999         LIST_HEAD(free_list);
5000 
5001         /*
5002          * If we got here through err_file: fput(event_file); we will not have
5003          * attached to a context yet.
5004          */
5005         if (!ctx) {
5006                 WARN_ON_ONCE(event->attach_state &
5007                                 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
5008                 goto no_ctx;
5009         }
5010 
5011         if (!is_kernel_event(event))
5012                 perf_remove_from_owner(event);
5013 
5014         ctx = perf_event_ctx_lock(event);
5015         WARN_ON_ONCE(ctx->parent_ctx);
5016         perf_remove_from_context(event, DETACH_GROUP);
5017 
5018         raw_spin_lock_irq(&ctx->lock);
5019         /*
5020          * Mark this event as STATE_DEAD; there is no external reference to it
5021          * anymore.
5022          *
5023          * Anybody acquiring event->child_mutex after the below loop _must_
5024          * also see this, most importantly inherit_event() which will avoid
5025          * placing more children on the list.
5026          *
5027          * Thus this guarantees that we will in fact observe and kill _ALL_
5028          * child events.
5029          */
5030         event->state = PERF_EVENT_STATE_DEAD;
5031         raw_spin_unlock_irq(&ctx->lock);
5032 
5033         perf_event_ctx_unlock(event, ctx);
5034 
5035 again:
5036         mutex_lock(&event->child_mutex);
5037         list_for_each_entry(child, &event->child_list, child_list) {
5038 
5039                 /*
5040                  * Cannot change, child events are not migrated, see the
5041                  * comment with perf_event_ctx_lock_nested().
5042                  */
5043                 ctx = READ_ONCE(child->ctx);
5044                 /*
5045                  * Since child_mutex nests inside ctx::mutex, we must jump
5046                  * through hoops. We start by grabbing a reference on the ctx.
5047                  *
5048                  * Since the event cannot get freed while we hold the
5049                  * child_mutex, the context must also exist and have a !0
5050                  * reference count.
5051                  */
5052                 get_ctx(ctx);
5053 
5054                 /*
5055                  * Now that we have a ctx ref, we can drop child_mutex, and
5056                  * acquire ctx::mutex without fear of it going away. Then we
5057                  * can re-acquire child_mutex.
5058                  */
5059                 mutex_unlock(&event->child_mutex);
5060                 mutex_lock(&ctx->mutex);
5061                 mutex_lock(&event->child_mutex);
5062 
5063                 /*
5064                  * Now that we hold ctx::mutex and child_mutex, revalidate our
5065                  * state; if child is still the first entry, it didn't get freed
5066                  * and we can continue doing so.
5067                  */
5068                 tmp = list_first_entry_or_null(&event->child_list,
5069                                                struct perf_event, child_list);
5070                 if (tmp == child) {
5071                         perf_remove_from_context(child, DETACH_GROUP);
5072                         list_move(&child->child_list, &free_list);
5073                         /*
5074                          * This matches the refcount bump in inherit_event();
5075                          * this can't be the last reference.
5076                          */
5077                         put_event(event);
5078                 }
5079 
5080                 mutex_unlock(&event->child_mutex);
5081                 mutex_unlock(&ctx->mutex);
5082                 put_ctx(ctx);
5083                 goto again;
5084         }
5085         mutex_unlock(&event->child_mutex);
5086 
5087         list_for_each_entry_safe(child, tmp, &free_list, child_list) {
5088                 void *var = &child->ctx->refcount;
5089 
5090                 list_del(&child->child_list);
5091                 free_event(child);
5092 
5093                 /*
5094                  * Wake any perf_event_free_task() waiting for this event to be
5095                  * freed.
5096                  */
5097                 smp_mb(); /* pairs with wait_var_event() */
5098                 wake_up_var(var);
5099         }
5100 
5101 no_ctx:
5102         put_event(event); /* Must be the 'last' reference */
5103         return 0;
5104 }
5105 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
5106 
5107 /*
5108  * Called when the last reference to the file is gone.
5109  */
5110 static int perf_release(struct inode *inode, struct file *file)
5111 {
5112         perf_event_release_kernel(file->private_data);
5113         return 0;
5114 }
5115 
5116 static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5117 {
5118         struct perf_event *child;
5119         u64 total = 0;
5120 
5121         *enabled = 0;
5122         *running = 0;
5123 
5124         mutex_lock(&event->child_mutex);
5125 
5126         (void)perf_event_read(event, false);
5127         total += perf_event_count(event);
5128 
5129         *enabled += event->total_time_enabled +
5130                         atomic64_read(&event->child_total_time_enabled);
5131         *running += event->total_time_running +
5132                         atomic64_read(&event->child_total_time_running);
5133 
5134         list_for_each_entry(child, &event->child_list, child_list) {
5135                 (void)perf_event_read(child, false);
5136                 total += perf_event_count(child);
5137                 *enabled += child->total_time_enabled;
5138                 *running += child->total_time_running;
5139         }
5140         mutex_unlock(&event->child_mutex);
5141 
5142         return total;
5143 }
5144 
5145 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5146 {
5147         struct perf_event_context *ctx;
5148         u64 count;
5149 
5150         ctx = perf_event_ctx_lock(event);
5151         count = __perf_event_read_value(event, enabled, running);
5152         perf_event_ctx_unlock(event, ctx);
5153 
5154         return count;
5155 }
5156 EXPORT_SYMBOL_GPL(perf_event_read_value);
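
/*
 * A minimal in-kernel usage sketch for the helpers exported in this file
 * (perf_event_create_kernel_counter() is defined further down in this file;
 * the attr values are illustrative and error handling is trimmed):
 *
 *	struct perf_event_attr attr = {
 *		.type	= PERF_TYPE_SOFTWARE,
 *		.config	= PERF_COUNT_SW_CPU_CLOCK,
 *		.size	= sizeof(attr),
 *	};
 *	struct perf_event *event;
 *	u64 count, enabled, running;
 *
 *	event = perf_event_create_kernel_counter(&attr, 0, NULL, NULL, NULL);
 *	if (IS_ERR(event))
 *		return PTR_ERR(event);
 *
 *	count = perf_event_read_value(event, &enabled, &running);
 *	perf_event_release_kernel(event);
 */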
5157 
5158 static int __perf_read_group_add(struct perf_event *leader,
5159                                         u64 read_format, u64 *values)
5160 {
5161         struct perf_event_context *ctx = leader->ctx;
5162         struct perf_event *sub;
5163         unsigned long flags;
5164         int n = 1; /* skip @nr */
5165         int ret;
5166 
5167         ret = perf_event_read(leader, true);
5168         if (ret)
5169                 return ret;
5170 
5171         raw_spin_lock_irqsave(&ctx->lock, flags);
5172 
5173         /*
5174          * Since we co-schedule groups, {enabled,running} times of siblings
5175          * will be identical to those of the leader, so we only publish one
5176          * set.
5177          */
5178         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5179                 values[n++] += leader->total_time_enabled +
5180                         atomic64_read(&leader->child_total_time_enabled);
5181         }
5182 
5183         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5184                 values[n++] += leader->total_time_running +
5185                         atomic64_read(&leader->child_total_time_running);
5186         }
5187 
5188         /*
5189          * Write {count,id} tuples for every sibling.
5190          */
5191         values[n++] += perf_event_count(leader);
5192         if (read_format & PERF_FORMAT_ID)
5193                 values[n++] = primary_event_id(leader);
5194 
5195         for_each_sibling_event(sub, leader) {
5196                 values[n++] += perf_event_count(sub);
5197                 if (read_format & PERF_FORMAT_ID)
5198                         values[n++] = primary_event_id(sub);
5199         }
5200 
5201         raw_spin_unlock_irqrestore(&ctx->lock, flags);
5202         return 0;
5203 }
5204 
5205 static int perf_read_group(struct perf_event *event,
5206                                    u64 read_format, char __user *buf)
5207 {
5208         struct perf_event *leader = event->group_leader, *child;
5209         struct perf_event_context *ctx = leader->ctx;
5210         int ret;
5211         u64 *values;
5212 
5213         lockdep_assert_held(&ctx->mutex);
5214 
5215         values = kzalloc(event->read_size, GFP_KERNEL);
5216         if (!values)
5217                 return -ENOMEM;
5218 
5219         values[0] = 1 + leader->nr_siblings;
5220 
5221         /*
5222          * By locking the child_mutex of the leader we effectively
5223          * lock the child list of all siblings. XXX explain how.
5224          */
5225         mutex_lock(&leader->child_mutex);
5226 
5227         ret = __perf_read_group_add(leader, read_format, values);
5228         if (ret)
5229                 goto unlock;
5230 
5231         list_for_each_entry(child, &leader->child_list, child_list) {
5232                 ret = __perf_read_group_add(child, read_format, values);
5233                 if (ret)
5234                         goto unlock;
5235         }
5236 
5237         mutex_unlock(&leader->child_mutex);
5238 
5239         ret = event->read_size;
5240         if (copy_to_user(buf, values, event->read_size))
5241                 ret = -EFAULT;
5242         goto out;
5243 
5244 unlock:
5245         mutex_unlock(&leader->child_mutex);
5246 out:
5247         kfree(values);
5248         return ret;
5249 }
5250 
5251 static int perf_read_one(struct perf_event *event,
5252                                  u64 read_format, char __user *buf)
5253 {
5254         u64 enabled, running;
5255         u64 values[4];
5256         int n = 0;
5257 
5258         values[n++] = __perf_event_read_value(event, &enabled, &running);
5259         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5260                 values[n++] = enabled;
5261         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5262                 values[n++] = running;
5263         if (read_format & PERF_FORMAT_ID)
5264                 values[n++] = primary_event_id(event);
5265 
5266         if (copy_to_user(buf, values, n * sizeof(u64)))
5267                 return -EFAULT;
5268 
5269         return n * sizeof(u64);
5270 }
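
/*
 * The resulting read() layout, as a stream of u64s (entries in parentheses
 * are only present when the corresponding read_format bit is set):
 *
 *	perf_read_one():   { count, (time_enabled), (time_running), (id) }
 *
 *	perf_read_group(): { nr, (time_enabled), (time_running),
 *			     leader_count, (leader_id),
 *			     sibling_count, (sibling_id), ... }
 */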
5271 
5272 static bool is_event_hup(struct perf_event *event)
5273 {
5274         bool no_children;
5275 
5276         if (event->state > PERF_EVENT_STATE_EXIT)
5277                 return false;
5278 
5279         mutex_lock(&event->child_mutex);
5280         no_children = list_empty(&event->child_list);
5281         mutex_unlock(&event->child_mutex);
5282         return no_children;
5283 }
5284 
5285 /*
5286  * Read the performance event - simple non-blocking version for now
5287  */
5288 static ssize_t
5289 __perf_read(struct perf_event *event, char __user *buf, size_t count)
5290 {
5291         u64 read_format = event->attr.read_format;
5292         int ret;
5293 
5294         /*
5295          * Return end-of-file for a read on an event that is in
5296          * error state (i.e. because it was pinned but it couldn't be
5297          * scheduled on to the CPU at some point).
5298          */
5299         if (event->state == PERF_EVENT_STATE_ERROR)
5300                 return 0;
5301 
5302         if (count < event->read_size)
5303                 return -ENOSPC;
5304 
5305         WARN_ON_ONCE(event->ctx->parent_ctx);
5306         if (read_format & PERF_FORMAT_GROUP)
5307                 ret = perf_read_group(event, read_format, buf);
5308         else
5309                 ret = perf_read_one(event, read_format, buf);
5310 
5311         return ret;
5312 }
5313 
5314 static ssize_t
5315 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
5316 {
5317         struct perf_event *event = file->private_data;
5318         struct perf_event_context *ctx;
5319         int ret;
5320 
5321         ret = security_perf_event_read(event);
5322         if (ret)
5323                 return ret;
5324 
5325         ctx = perf_event_ctx_lock(event);
5326         ret = __perf_read(event, buf, count);
5327         perf_event_ctx_unlock(event, ctx);
5328 
5329         return ret;
5330 }
5331 
5332 static __poll_t perf_poll(struct file *file, poll_table *wait)
5333 {
5334         struct perf_event *event = file->private_data;
5335         struct perf_buffer *rb;
5336         __poll_t events = EPOLLHUP;
5337 
5338         poll_wait(file, &event->waitq, wait);
5339 
5340         if (is_event_hup(event))
5341                 return events;
5342 
5343         /*
5344          * Pin the event->rb by taking event->mmap_mutex; otherwise
5345          * perf_event_set_output() can swizzle our rb and make us miss wakeups.
5346          */
5347         mutex_lock(&event->mmap_mutex);
5348         rb = event->rb;
5349         if (rb)
5350                 events = atomic_xchg(&rb->poll, 0);
5351         mutex_unlock(&event->mmap_mutex);
5352         return events;
5353 }
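
/*
 * User-space side of the poll path above, in sketch form (fd is assumed to
 * come from perf_event_open() with the ring buffer mmap()ed):
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	poll(&pfd, 1, -1);
 *
 * POLLHUP in revents means the event (and all inherited children) has
 * exited; POLLIN means the ring buffer crossed its wakeup threshold.
 */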
5354 
5355 static void _perf_event_reset(struct perf_event *event)
5356 {
5357         (void)perf_event_read(event, false);
5358         local64_set(&event->count, 0);
5359         perf_event_update_userpage(event);
5360 }
5361 
5362 /* Assume it's not an event with inherit set. */
5363 u64 perf_event_pause(struct perf_event *event, bool reset)
5364 {
5365         struct perf_event_context *ctx;
5366         u64 count;
5367 
5368         ctx = perf_event_ctx_lock(event);
5369         WARN_ON_ONCE(event->attr.inherit);
5370         _perf_event_disable(event);
5371         count = local64_read(&event->count);
5372         if (reset)
5373                 local64_set(&event->count, 0);
5374         perf_event_ctx_unlock(event, ctx);
5375 
5376         return count;
5377 }
5378 EXPORT_SYMBOL_GPL(perf_event_pause);
5379 
5380 /*
5381  * Holding the top-level event's child_mutex means that any
5382  * descendant process that has inherited this event will block
5383  * in perf_event_exit_event() if it goes to exit, thus satisfying the
5384  * task existence requirements of perf_event_enable/disable.
5385  */
5386 static void perf_event_for_each_child(struct perf_event *event,
5387                                         void (*func)(struct perf_event *))
5388 {
5389         struct perf_event *child;
5390 
5391         WARN_ON_ONCE(event->ctx->parent_ctx);
5392 
5393         mutex_lock(&event->child_mutex);
5394         func(event);
5395         list_for_each_entry(child, &event->child_list, child_list)
5396                 func(child);
5397         mutex_unlock(&event->child_mutex);
5398 }
5399 
5400 static void perf_event_for_each(struct perf_event *event,
5401                                   void (*func)(struct perf_event *))
5402 {
5403         struct perf_event_context *ctx = event->ctx;
5404         struct perf_event *sibling;
5405 
5406         lockdep_assert_held(&ctx->mutex);
5407 
5408         event = event->group_leader;
5409 
5410         perf_event_for_each_child(event, func);
5411         for_each_sibling_event(sibling, event)
5412                 perf_event_for_each_child(sibling, func);
5413 }
5414 
5415 static void __perf_event_period(struct perf_event *event,
5416                                 struct perf_cpu_context *cpuctx,
5417                                 struct perf_event_context *ctx,
5418                                 void *info)
5419 {
5420         u64 value = *((u64 *)info);
5421         bool active;
5422 
5423         if (event->attr.freq) {
5424                 event->attr.sample_freq = value;
5425         } else {
5426                 event->attr.sample_period = value;
5427                 event->hw.sample_period = value;
5428         }
5429 
5430         active = (event->state == PERF_EVENT_STATE_ACTIVE);
5431         if (active) {
5432                 perf_pmu_disable(ctx->pmu);
5433                 /*
5434                  * We could be throttled; unthrottle now to avoid the tick
5435                  * trying to unthrottle while we already re-started the event.
5436                  */
5437                 if (event->hw.interrupts == MAX_INTERRUPTS) {
5438                         event->hw.interrupts = 0;
5439                         perf_log_throttle(event, 1);
5440                 }
5441                 event->pmu->stop(event, PERF_EF_UPDATE);
5442         }
5443 
5444         local64_set(&event->hw.period_left, 0);
5445 
5446         if (active) {
5447                 event->pmu->start(event, PERF_EF_RELOAD);
5448                 perf_pmu_enable(ctx->pmu);
5449         }
5450 }
5451 
5452 static int perf_event_check_period(struct perf_event *event, u64 value)
5453 {
5454         return event->pmu->check_period(event, value);
5455 }
5456 
5457 static int _perf_event_period(struct perf_event *event, u64 value)
5458 {
5459         if (!is_sampling_event(event))
5460                 return -EINVAL;
5461 
5462         if (!value)
5463                 return -EINVAL;
5464 
5465         if (event->attr.freq && value > sysctl_perf_event_sample_rate)
5466                 return -EINVAL;
5467 
5468         if (perf_event_check_period(event, value))
5469                 return -EINVAL;
5470 
5471         if (!event->attr.freq && (value & (1ULL << 63)))
5472                 return -EINVAL;
5473 
5474         event_function_call(event, __perf_event_period, &value);
5475 
5476         return 0;
5477 }
5478 
5479 int perf_event_period(struct perf_event *event, u64 value)
5480 {
5481         struct perf_event_context *ctx;
5482         int ret;
5483 
5484         ctx = perf_event_ctx_lock(event);
5485         ret = _perf_event_period(event, value);
5486         perf_event_ctx_unlock(event, ctx);
5487 
5488         return ret;
5489 }
5490 EXPORT_SYMBOL_GPL(perf_event_period);
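
/*
 * Sketch of an in-kernel caller combining the two exported helpers above on
 * a sampling event it owns (hypothetical usage; the period is arbitrary).
 * perf_event_period() rejects non-sampling events and a zero value, and
 * perf_event_pause() expects an event without attr.inherit:
 *
 *	err = perf_event_period(event, 100000);		// new sample_period
 *	...
 *	count = perf_event_pause(event, true);		// disable, read, reset
 */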
5491 
5492 static const struct file_operations perf_fops;
5493 
5494 static inline int perf_fget_light(int fd, struct fd *p)
5495 {
5496         struct fd f = fdget(fd);
5497         if (!f.file)
5498                 return -EBADF;
5499 
5500         if (f.file->f_op != &perf_fops) {
5501                 fdput(f);
5502                 return -EBADF;
5503         }
5504         *p = f;
5505         return 0;
5506 }
5507 
5508 static int perf_event_set_output(struct perf_event *event,
5509                                  struct perf_event *output_event);
5510 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
5511 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
5512 static int perf_copy_attr(struct perf_event_attr __user *uattr,
5513                           struct perf_event_attr *attr);
5514 
5515 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
5516 {
5517         void (*func)(struct perf_event *);
5518         u32 flags = arg;
5519 
5520         switch (cmd) {
5521         case PERF_EVENT_IOC_ENABLE:
5522                 func = _perf_event_enable;
5523                 break;
5524         case PERF_EVENT_IOC_DISABLE:
5525                 func = _perf_event_disable;
5526                 break;
5527         case PERF_EVENT_IOC_RESET:
5528                 func = _perf_event_reset;
5529                 break;
5530 
5531         case PERF_EVENT_IOC_REFRESH:
5532                 return _perf_event_refresh(event, arg);
5533 
5534         case PERF_EVENT_IOC_PERIOD:
5535         {
5536                 u64 value;
5537 
5538                 if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
5539                         return -EFAULT;
5540 
5541                 return _perf_event_period(event, value);
5542         }
5543         case PERF_EVENT_IOC_ID:
5544         {
5545                 u64 id = primary_event_id(event);
5546 
5547                 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
5548                         return -EFAULT;
5549                 return 0;
5550         }
5551 
5552         case PERF_EVENT_IOC_SET_OUTPUT:
5553         {
5554                 int ret;
5555                 if (arg != -1) {
5556                         struct perf_event *output_event;
5557                         struct fd output;
5558                         ret = perf_fget_light(arg, &output);
5559                         if (ret)
5560                                 return ret;
5561                         output_event = output.file->private_data;
5562                         ret = perf_event_set_output(event, output_event);
5563                         fdput(output);
5564                 } else {
5565                         ret = perf_event_set_output(event, NULL);
5566                 }
5567                 return ret;
5568         }
5569 
5570         case PERF_EVENT_IOC_SET_FILTER:
5571                 return perf_event_set_filter(event, (void __user *)arg);
5572 
5573         case PERF_EVENT_IOC_SET_BPF:
5574                 return perf_event_set_bpf_prog(event, arg);
5575 
5576         case PERF_EVENT_IOC_PAUSE_OUTPUT: {
5577                 struct perf_buffer *rb;
5578 
5579                 rcu_read_lock();
5580                 rb = rcu_dereference(event->rb);
5581                 if (!rb || !rb->nr_pages) {
5582                         rcu_read_unlock();
5583                         return -EINVAL;
5584                 }
5585                 rb_toggle_paused(rb, !!arg);
5586                 rcu_read_unlock();
5587                 return 0;
5588         }
5589 
5590         case PERF_EVENT_IOC_QUERY_BPF:
5591                 return perf_event_query_prog_array(event, (void __user *)arg);
5592 
5593         case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
5594                 struct perf_event_attr new_attr;
5595                 int err = perf_copy_attr((struct perf_event_attr __user *)arg,
5596                                          &new_attr);
5597 
5598                 if (err)
5599                         return err;
5600 
5601                 return perf_event_modify_attr(event,  &new_attr);
5602         }
5603         default:
5604                 return -ENOTTY;
5605         }
5606 
5607         if (flags & PERF_IOC_FLAG_GROUP)
5608                 perf_event_for_each(event, func);
5609         else
5610                 perf_event_for_each_child(event, func);
5611 
5612         return 0;
5613 }
5614 
5615 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
5616 {
5617         struct perf_event *event = file->private_data;
5618         struct perf_event_context *ctx;
5619         long ret;
5620 
5621         /* Treat ioctl like writes as it is likely a mutating operation. */
5622         ret = security_perf_event_write(event);
5623         if (ret)
5624                 return ret;
5625 
5626         ctx = perf_event_ctx_lock(event);
5627         ret = _perf_ioctl(event, cmd, arg);
5628         perf_event_ctx_unlock(event, ctx);
5629 
5630         return ret;
5631 }
5632 
5633 #ifdef CONFIG_COMPAT
5634 static long perf_compat_ioctl(struct file *file, unsigned int cmd,
5635                                 unsigned long arg)
5636 {
5637         switch (_IOC_NR(cmd)) {
5638         case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
5639         case _IOC_NR(PERF_EVENT_IOC_ID):
5640         case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
5641         case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
5642                 /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case) */
5643                 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
5644                         cmd &= ~IOCSIZE_MASK;
5645                         cmd |= sizeof(void *) << IOCSIZE_SHIFT;
5646                 }
5647                 break;
5648         }
5649         return perf_ioctl(file, cmd, arg);
5650 }
5651 #else
5652 # define perf_compat_ioctl NULL
5653 #endif
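
/*
 * User-space view of the ioctl interface above (a sketch; fd is assumed to
 * come from perf_event_open(), error checks omitted):
 *
 *	__u64 id, period = 4096;
 *
 *	ioctl(fd, PERF_EVENT_IOC_ID, &id);		// fetch primary id
 *	ioctl(fd, PERF_EVENT_IOC_PERIOD, &period);	// pointer to u64
 *	ioctl(fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
 *	ioctl(fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
 *
 * With PERF_IOC_FLAG_GROUP the enable/disable/reset operations act on the
 * whole group via perf_event_for_each(); without it they act on this event
 * and its inherited children via perf_event_for_each_child().
 */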
5654 
5655 int perf_event_task_enable(void)
5656 {
5657         struct perf_event_context *ctx;
5658         struct perf_event *event;
5659 
5660         mutex_lock(&current->perf_event_mutex);
5661         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5662                 ctx = perf_event_ctx_lock(event);
5663                 perf_event_for_each_child(event, _perf_event_enable);
5664                 perf_event_ctx_unlock(event, ctx);
5665         }
5666         mutex_unlock(&current->perf_event_mutex);
5667 
5668         return 0;
5669 }
5670 
5671 int perf_event_task_disable(void)
5672 {
5673         struct perf_event_context *ctx;
5674         struct perf_event *event;
5675 
5676         mutex_lock(&current->perf_event_mutex);
5677         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5678                 ctx = perf_event_ctx_lock(event);
5679                 perf_event_for_each_child(event, _perf_event_disable);
5680                 perf_event_ctx_unlock(event, ctx);
5681         }
5682         mutex_unlock(&current->perf_event_mutex);
5683 
5684         return 0;
5685 }
5686 
5687 static int perf_event_index(struct perf_event *event)
5688 {
5689         if (event->hw.state & PERF_HES_STOPPED)
5690                 return 0;
5691 
5692         if (event->state != PERF_EVENT_STATE_ACTIVE)
5693                 return 0;
5694 
5695         return event->pmu->event_idx(event);
5696 }
5697 
5698 static void calc_timer_values(struct perf_event *event,
5699                                 u64 *now,
5700                                 u64 *enabled,
5701                                 u64 *running)
5702 {
5703         u64 ctx_time;
5704 
5705         *now = perf_clock();
5706         ctx_time = event->shadow_ctx_time + *now;
5707         __perf_update_times(event, ctx_time, enabled, running);
5708 }
5709 
5710 static void perf_event_init_userpage(struct perf_event *event)
5711 {
5712         struct perf_event_mmap_page *userpg;
5713         struct perf_buffer *rb;
5714 
5715         rcu_read_lock();
5716         rb = rcu_dereference(event->rb);
5717         if (!rb)
5718                 goto unlock;
5719 
5720         userpg = rb->user_page;
5721 
5722         /* Allow new userspace to detect that bit 0 is deprecated */
5723         userpg->cap_bit0_is_deprecated = 1;
5724         userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
5725         userpg->data_offset = PAGE_SIZE;
5726         userpg->data_size = perf_data_size(rb);
5727 
5728 unlock:
5729         rcu_read_unlock();
5730 }
5731 
5732 void __weak arch_perf_update_userpage(
5733         struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
5734 {
5735 }
5736 
5737 /*
5738  * Callers need to ensure there can be no nesting of this function, otherwise
5739  * the seqlock logic goes bad. We cannot serialize this because the arch
5740  * code calls this from NMI context.
5741  */
5742 void perf_event_update_userpage(struct perf_event *event)
5743 {
5744         struct perf_event_mmap_page *userpg;
5745         struct perf_buffer *rb;
5746         u64 enabled, running, now;
5747 
5748         rcu_read_lock();
5749         rb = rcu_dereference(event->rb);
5750         if (!rb)
5751                 goto unlock;
5752 
5753         /*
5754          * compute total_time_enabled, total_time_running
5755          * based on snapshot values taken when the event
5756          * was last scheduled in.
5757          *
5758          * We cannot simply call update_context_time()
5759          * because of locking issues, as we can be called in
5760          * NMI context.
5761          */
5762         calc_timer_values(event, &now, &enabled, &running);
5763 
5764         userpg = rb->user_page;
5765         /*
5766          * Disable preemption to guarantee consistent time stamps are stored to
5767          * the user page.
5768          */
5769         preempt_disable();
5770         ++userpg->lock;
5771         barrier();
5772         userpg->index = perf_event_index(event);
5773         userpg->offset = perf_event_count(event);
5774         if (userpg->index)
5775                 userpg->offset -= local64_read(&event->hw.prev_count);
5776 
5777         userpg->time_enabled = enabled +
5778                         atomic64_read(&event->child_total_time_enabled);
5779 
5780         userpg->time_running = running +
5781                         atomic64_read(&event->child_total_time_running);
5782 
5783         arch_perf_update_userpage(event, userpg, now);
5784 
5785         barrier();
5786         ++userpg->lock;
5787         preempt_enable();
5788 unlock:
5789         rcu_read_unlock();
5790 }
5791 EXPORT_SYMBOL_GPL(perf_event_update_userpage);
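
/*
 * The ->lock field above acts as a sequence count towards user space. A
 * reader of the mmap()ed control page is expected to use a loop along these
 * lines (a sketch, with pc pointing at the first page of the mapping; the
 * full protocol, including rdpmc usage, is documented with
 * struct perf_event_mmap_page in include/uapi/linux/perf_event.h):
 *
 *	struct perf_event_mmap_page *pc = mapped_base;
 *	__u32 seq;
 *	__u64 offset, enabled, running;
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *		offset  = pc->offset;
 *		enabled = pc->time_enabled;
 *		running = pc->time_running;
 *		barrier();
 *	} while (pc->lock != seq);
 */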
5792 
5793 static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
5794 {
5795         struct perf_event *event = vmf->vma->vm_file->private_data;
5796         struct perf_buffer *rb;
5797         vm_fault_t ret = VM_FAULT_SIGBUS;
5798 
5799         if (vmf->flags & FAULT_FLAG_MKWRITE) {
5800                 if (vmf->pgoff == 0)
5801                         ret = 0;
5802                 return ret;
5803         }
5804 
5805         rcu_read_lock();
5806         rb = rcu_dereference(event->rb);
5807         if (!rb)
5808                 goto unlock;
5809 
5810         if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
5811                 goto unlock;
5812 
5813         vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
5814         if (!vmf->page)
5815                 goto unlock;
5816 
5817         get_page(vmf->page);
5818         vmf->page->mapping = vmf->vma->vm_file->f_mapping;
5819         vmf->page->index   = vmf->pgoff;
5820 
5821         ret = 0;
5822 unlock:
5823         rcu_read_unlock();
5824 
5825         return ret;
5826 }
5827 
5828 static void ring_buffer_attach(struct perf_event *event,
5829                                struct perf_buffer *rb)
5830 {
5831         struct perf_buffer *old_rb = NULL;
5832         unsigned long flags;
5833 
5834         if (event->rb) {
5835                  * Should be impossible: we set this when removing
5836                  * Should be impossible, we set this when removing
5837                  * event->rb_entry and wait/clear when adding event->rb_entry.
5838                  */
5839                 WARN_ON_ONCE(event->rcu_pending);
5840 
5841                 old_rb = event->rb;
5842                 spin_lock_irqsave(&old_rb->event_lock, flags);
5843                 list_del_rcu(&event->rb_entry);
5844                 spin_unlock_irqrestore(&old_rb->event_lock, flags);
5845 
5846                 event->rcu_batches = get_state_synchronize_rcu();
5847                 event->rcu_pending = 1;
5848         }
5849 
5850         if (rb) {
5851                 if (event->rcu_pending) {
5852                         cond_synchronize_rcu(event->rcu_batches);
5853                         event->rcu_pending = 0;
5854                 }
5855 
5856                 spin_lock_irqsave(&rb->event_lock, flags);
5857                 list_add_rcu(&event->rb_entry, &rb->event_list);
5858                 spin_unlock_irqrestore(&rb->event_lock, flags);
5859         }
5860 
5861         /*
5862          * Avoid racing with perf_mmap_close(AUX): stop the event
5863          * before swizzling the event::rb pointer; if it's getting
5864          * unmapped, its aux_mmap_count will be 0 and it won't
5865          * restart. See the comment in __perf_pmu_output_stop().
5866          *
5867          * Data will inevitably be lost when set_output is done in
5868          * mid-air, but then again, whoever does it like this is
5869          * not in for the data anyway.
5870          */
5871         if (has_aux(event))
5872                 perf_event_stop(event, 0);
5873 
5874         rcu_assign_pointer(event->rb, rb);
5875 
5876         if (old_rb) {
5877                 ring_buffer_put(old_rb);
5878                 /*
5879                  * Since we detached before setting the new rb (so that we
5880                  * could attach the new rb), we could have missed a wakeup.
5881                  * Provide it now.
5882                  */
5883                 wake_up_all(&event->waitq);
5884         }
5885 }
5886 
5887 static void ring_buffer_wakeup(struct perf_event *event)
5888 {
5889         struct perf_buffer *rb;
5890 
5891         rcu_read_lock();
5892         rb = rcu_dereference(event->rb);
5893         if (rb) {
5894                 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
5895                         wake_up_all(&event->waitq);
5896         }
5897         rcu_read_unlock();
5898 }
5899 
5900 struct perf_buffer *ring_buffer_get(struct perf_event *event)
5901 {
5902         struct perf_buffer *rb;
5903 
5904         rcu_read_lock();
5905         rb = rcu_dereference(event->rb);
5906         if (rb) {
5907                 if (!refcount_inc_not_zero(&rb->refcount))
5908                         rb = NULL;
5909         }
5910         rcu_read_unlock();
5911 
5912         return rb;
5913 }
5914 
5915 void ring_buffer_put(struct perf_buffer *rb)
5916 {
5917         if (!refcount_dec_and_test(&rb->refcount))
5918                 return;
5919 
5920         WARN_ON_ONCE(!list_empty(&rb->event_list));
5921 
5922         call_rcu(&rb->rcu_head, rb_free_rcu);
5923 }
5924 
5925 static void perf_mmap_open(struct vm_area_struct *vma)
5926 {
5927         struct perf_event *event = vma->vm_file->private_data;
5928 
5929         atomic_inc(&event->mmap_count);
5930         atomic_inc(&event->rb->mmap_count);
5931 
5932         if (vma->vm_pgoff)
5933                 atomic_inc(&event->rb->aux_mmap_count);
5934 
5935         if (event->pmu->event_mapped)
5936                 event->pmu->event_mapped(event, vma->vm_mm);
5937 }
5938 
5939 static void perf_pmu_output_stop(struct perf_event *event);
5940 
5941 /*
5942  * A buffer can be mmap()ed multiple times; either directly through the same
5943  * event, or through other events by use of perf_event_set_output().
5944  *
5945  * In order to undo the VM accounting done by perf_mmap() we need to destroy
5946  * the buffer here, where we still have a VM context. This means we need
5947  * to detach all events redirecting to us.
5948  */
5949 static void perf_mmap_close(struct vm_area_struct *vma)
5950 {
5951         struct perf_event *event = vma->vm_file->private_data;
5952         struct perf_buffer *rb = ring_buffer_get(event);
5953         struct user_struct *mmap_user = rb->mmap_user;
5954         int mmap_locked = rb->mmap_locked;
5955         unsigned long size = perf_data_size(rb);
5956         bool detach_rest = false;
5957 
5958         if (event->pmu->event_unmapped)
5959                 event->pmu->event_unmapped(event, vma->vm_mm);
5960 
5961         /*
5962          * rb->aux_mmap_count will always drop before rb->mmap_count and
5963          * event->mmap_count, so it is ok to use event->mmap_mutex to
5964          * serialize with perf_mmap here.
5965          */
5966         if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
5967             atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
5968                 /*
5969                  * Stop all AUX events that are writing to this buffer,
5970                  * so that we can free its AUX pages and corresponding PMU
5971                  * data. Note that after rb::aux_mmap_count dropped to zero,
5972                  * they won't start any more (see perf_aux_output_begin()).
5973                  */
5974                 perf_pmu_output_stop(event);
5975 
5976                 /* now it's safe to free the pages */
5977                 atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
5978                 atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
5979 
5980                 /* this has to be the last one */
5981                 rb_free_aux(rb);
5982                 WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
5983 
5984                 mutex_unlock(&event->mmap_mutex);
5985         }
5986 
5987         if (atomic_dec_and_test(&rb->mmap_count))
5988                 detach_rest = true;
5989 
5990         if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
5991                 goto out_put;
5992 
5993         ring_buffer_attach(event, NULL);
5994         mutex_unlock(&event->mmap_mutex);
5995 
5996         /* If there's still other mmap()s of this buffer, we're done. */
5997         if (!detach_rest)
5998                 goto out_put;
5999 
6000         /*
6001          * No other mmap()s, detach from all other events that might redirect
6002          * into the now unreachable buffer. Somewhat complicated by the
6003          * fact that rb::event_lock otherwise nests inside mmap_mutex.
6004          */
6005 again:
6006         rcu_read_lock();
6007         list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
6008                 if (!atomic_long_inc_not_zero(&event->refcount)) {
6009                         /*
6010                          * This event is en-route to free_event() which will
6011                          * detach it and remove it from the list.
6012                          */
6013                         continue;
6014                 }
6015                 rcu_read_unlock();
6016 
6017                 mutex_lock(&event->mmap_mutex);
6018                 /*
6019                  * Check we didn't race with perf_event_set_output() which can
6020                  * swizzle the rb from under us while we were waiting to
6021                  * acquire mmap_mutex.
6022                  *
6023                  * If we find a different rb, ignore this event; the next
6024                  * iteration will no longer find it on the list. We have to
6025                  * still restart the iteration to make sure we're not now
6026                  * iterating the wrong list.
6027                  */
6028                 if (event->rb == rb)
6029                         ring_buffer_attach(event, NULL);
6030 
6031                 mutex_unlock(&event->mmap_mutex);
6032                 put_event(event);
6033 
6034                 /*
6035                  * Restart the iteration; either we're on the wrong list or we
6036                  * destroyed its integrity by doing a deletion.
6037                  */
6038                 goto again;
6039         }
6040         rcu_read_unlock();
6041 
6042         /*
6043          * It could be there are still a few 0-ref events on the list; they'll
6044          * get cleaned up by free_event() -- they'll also still have their
6045          * ref on the rb and will free it whenever they are done with it.
6046          *
6047          * Aside from that, this buffer is 'fully' detached and unmapped,
6048          * undo the VM accounting.
6049          */
6050 
6051         atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
6052                         &mmap_user->locked_vm);
6053         atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
6054         free_uid(mmap_user);
6055 
6056 out_put:
6057         ring_buffer_put(rb); /* could be last */
6058 }
6059 
6060 static const struct vm_operations_struct perf_mmap_vmops = {
6061         .open           = perf_mmap_open,
6062         .close          = perf_mmap_close, /* non mergeable */
6063         .fault          = perf_mmap_fault,
6064         .page_mkwrite   = perf_mmap_fault,
6065 };
6066 
6067 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
6068 {
6069         struct perf_event *event = file->private_data;
6070         unsigned long user_locked, user_lock_limit;
6071         struct user_struct *user = current_user();
6072         struct perf_buffer *rb = NULL;
6073         unsigned long locked, lock_limit;
6074         unsigned long vma_size;
6075         unsigned long nr_pages;
6076         long user_extra = 0, extra = 0;
6077         int ret = 0, flags = 0;
6078 
6079         /*
6080          * Don't allow mmap() of inherited per-task counters. This would
6081          * create a performance issue due to all children writing to the
6082          * same rb.
6083          */
6084         if (event->cpu == -1 && event->attr.inherit)
6085                 return -EINVAL;
6086 
6087         if (!(vma->vm_flags & VM_SHARED))
6088                 return -EINVAL;
6089 
6090         ret = security_perf_event_read(event);
6091         if (ret)
6092                 return ret;
6093 
6094         vma_size = vma->vm_end - vma->vm_start;
6095 
6096         if (vma->vm_pgoff == 0) {
6097                 nr_pages = (vma_size / PAGE_SIZE) - 1;
6098         } else {
6099                 /*
6100                  * AUX area mapping: if rb->aux_nr_pages != 0, it's already
6101                  * mapped; all subsequent mappings should have the same size
6102                  * and offset. Must be above the normal perf buffer.
6103                  */
6104                 u64 aux_offset, aux_size;
6105 
6106                 if (!event->rb)
6107                         return -EINVAL;
6108 
6109                 nr_pages = vma_size / PAGE_SIZE;
6110 
6111                 mutex_lock(&event->mmap_mutex);
6112                 ret = -EINVAL;
6113 
6114                 rb = event->rb;
6115                 if (!rb)
6116                         goto aux_unlock;
6117 
6118                 aux_offset = READ_ONCE(rb->user_page->aux_offset);
6119                 aux_size = READ_ONCE(rb->user_page->aux_size);
6120 
6121                 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
6122                         goto aux_unlock;
6123 
6124                 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
6125                         goto aux_unlock;
6126 
6127                 /* already mapped with a different offset */
6128                 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
6129                         goto aux_unlock;
6130 
6131                 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
6132                         goto aux_unlock;
6133 
6134                 /* already mapped with a different size */
6135                 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
6136                         goto aux_unlock;
6137 
6138                 if (!is_power_of_2(nr_pages))
6139                         goto aux_unlock;
6140 
6141                 if (!atomic_inc_not_zero(&rb->mmap_count))
6142                         goto aux_unlock;
6143 
6144                 if (rb_has_aux(rb)) {
6145                         atomic_inc(&rb->aux_mmap_count);
6146                         ret = 0;
6147                         goto unlock;
6148                 }
6149 
6150                 atomic_set(&rb->aux_mmap_count, 1);
6151                 user_extra = nr_pages;
6152 
6153                 goto accounting;
6154         }
6155 
6156         /*
6157          * If we have rb pages ensure they're a power-of-two number, so we
6158          * can do bitmasks instead of modulo.
6159          */
6160         if (nr_pages != 0 && !is_power_of_2(nr_pages))
6161                 return -EINVAL;
6162 
6163         if (vma_size != PAGE_SIZE * (1 + nr_pages))
6164                 return -EINVAL;
6165 
6166         WARN_ON_ONCE(event->ctx->parent_ctx);
6167 again:
6168         mutex_lock(&event->mmap_mutex);
6169         if (event->rb) {
6170                 if (event->rb->nr_pages != nr_pages) {
6171                         ret = -EINVAL;
6172                         goto unlock;
6173                 }
6174 
6175                 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
6176                         /*
6177                          * Raced against perf_mmap_close() through
6178                          * perf_event_set_output(). Try again, hope for better
6179                          * luck.
6180                          */
6181                         mutex_unlock(&event->mmap_mutex);
6182                         goto again;
6183                 }
6184 
6185                 goto unlock;
6186         }
6187 
6188         user_extra = nr_pages + 1;
6189 
6190 accounting:
6191         user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
6192 
6193         /*
6194          * Increase the limit linearly with more CPUs:
6195          */
6196         user_lock_limit *= num_online_cpus();
6197 
6198         user_locked = atomic_long_read(&user->locked_vm);
6199 
6200         /*
6201          * sysctl_perf_event_mlock may have changed, so that
6202          *     user->locked_vm > user_lock_limit
6203          */
6204         if (user_locked > user_lock_limit)
6205                 user_locked = user_lock_limit;
6206         user_locked += user_extra;
6207 
6208         if (user_locked > user_lock_limit) {
6209                 /*
6210                  * charge locked_vm until it hits user_lock_limit;
6211                  * charge the rest from pinned_vm
6212                  */
6213                 extra = user_locked - user_lock_limit;
6214                 user_extra -= extra;
6215         }
6216 
6217         lock_limit = rlimit(RLIMIT_MEMLOCK);
6218         lock_limit >>= PAGE_SHIFT;
6219         locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
6220 
6221         if ((locked > lock_limit) && perf_is_paranoid() &&
6222                 !capable(CAP_IPC_LOCK)) {
6223                 ret = -EPERM;
6224                 goto unlock;
6225         }
6226 
6227         WARN_ON(!rb && event->rb);
6228 
6229         if (vma->vm_flags & VM_WRITE)
6230                 flags |= RING_BUFFER_WRITABLE;
6231 
6232         if (!rb) {
6233                 rb = rb_alloc(nr_pages,
6234                               event->attr.watermark ? event->attr.wakeup_watermark : 0,
6235                               event->cpu, flags);
6236 
6237                 if (!rb) {
6238                         ret = -ENOMEM;
6239                         goto unlock;
6240                 }
6241 
6242                 atomic_set(&rb->mmap_count, 1);
6243                 rb->mmap_user = get_current_user();
6244                 rb->mmap_locked = extra;
6245 
6246                 ring_buffer_attach(event, rb);
6247 
6248                 perf_event_init_userpage(event);
6249                 perf_event_update_userpage(event);
6250         } else {
6251                 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
6252                                    event->attr.aux_watermark, flags);
6253                 if (!ret)
6254                         rb->aux_mmap_locked = extra;
6255         }
6256 
6257 unlock:
6258         if (!ret) {
6259                 atomic_long_add(user_extra, &user->locked_vm);
6260                 atomic64_add(extra, &vma->vm_mm->pinned_vm);
6261 
6262                 atomic_inc(&event->mmap_count);
6263         } else if (rb) {
6264                 atomic_dec(&rb->mmap_count);
6265         }
6266 aux_unlock:
6267         mutex_unlock(&event->mmap_mutex);
6268 
6269         /*
6270          * Since pinned accounting is per-mm, we cannot allow fork() to copy
6271          * our vma.
6272          */
6273         vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
6274         vma->vm_ops = &perf_mmap_vmops;
6275 
6276         if (event->pmu->event_mapped)
6277                 event->pmu->event_mapped(event, vma->vm_mm);
6278 
6279         return ret;
6280 }
6281 
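/*
 * Worked example (illustrative only, not from the kernel source) of the
 * charging split done under the "accounting:" label above. Assume 4K pages,
 * the default sysctl_perf_event_mlock of 516 KiB and one online CPU, so
 * user_lock_limit == 129 pages. For a mapping of 64 data pages plus the
 * metadata page, user_extra == 65; if user->locked_vm already holds 100
 * pages:
 *
 *	user_locked = min(100, 129) + 65 = 165  (> 129)
 *	extra       = 165 - 129          =  36  -> charged to mm->pinned_vm
 *	user_extra  = 65 - 36            =  29  -> charged to user->locked_vm
 *
 * pinned_vm is then checked against RLIMIT_MEMLOCK, unless the caller has
 * CAP_IPC_LOCK or perf_event_paranoid is permissive (perf_is_paranoid()
 * returns false).
 */
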
6282 static int perf_fasync(int fd, struct file *filp, int on)
6283 {
6284         struct inode *inode = file_inode(filp);
6285         struct perf_event *event = filp->private_data;
6286         int retval;
6287 
6288         inode_lock(inode);
6289         retval = fasync_helper(fd, filp, on, &event->fasync);
6290         inode_unlock(inode);
6291 
6292         if (retval < 0)
6293                 return retval;
6294 
6295         return 0;
6296 }
6297 
6298 static const struct file_operations perf_fops = {
6299         .llseek                 = no_llseek,
6300         .release                = perf_release,
6301         .read                   = perf_read,
6302         .poll                   = perf_poll,
6303         .unlocked_ioctl         = perf_ioctl,
6304         .compat_ioctl           = perf_compat_ioctl,
6305         .mmap                   = perf_mmap,
6306         .fasync                 = perf_fasync,
6307 };
6308 
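/*
 * Illustrative userspace sketch (not part of core.c) of how the file
 * operations above are reached: perf_event_open() returns the fd backed by
 * perf_fops, mmap() of one metadata page plus a power-of-two number of data
 * pages ends up in perf_mmap(), and poll() in perf_poll(). Assumes
 * <linux/perf_event.h>, <sys/syscall.h>, <sys/mman.h>, <poll.h> and
 * <unistd.h>; the event type and sizes are example choices, error handling
 * is omitted.
 *
 *	struct perf_event_attr attr = {
 *		.size          = sizeof(attr),
 *		.type          = PERF_TYPE_SOFTWARE,
 *		.config        = PERF_COUNT_SW_CPU_CLOCK,
 *		.sample_period = 100000,
 *		.sample_type   = PERF_SAMPLE_IP,
 *		.wakeup_events = 1,
 *	};
 *	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 *	long ps = sysconf(_SC_PAGESIZE);
 *
 *	// one metadata page + 8 data pages -> perf_mmap()
 *	void *base = mmap(NULL, (1 + 8) * ps, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	poll(&pfd, 1, -1);	// -> perf_poll(), woken via perf_event_wakeup()
 */
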
6309 /*
6310  * Perf event wakeup
6311  *
6312  * If there's data, ensure we set the poll() state and publish everything
6313  * to user-space before waking everybody up.
6314  */
6315 
6316 static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
6317 {
6318         /* only the parent has fasync state */
6319         if (event->parent)
6320                 event = event->parent;
6321         return &event->fasync;
6322 }
6323 
6324 void perf_event_wakeup(struct perf_event *event)
6325 {
6326         ring_buffer_wakeup(event);
6327 
6328         if (event->pending_kill) {
6329                 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
6330                 event->pending_kill = 0;
6331         }
6332 }
6333 
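/*
 * Illustrative userspace sketch (not part of core.c): the kill_fasync() in
 * perf_event_wakeup() only reaches tasks that enabled signal-driven I/O on
 * the event fd, e.g.:
 *
 *	fcntl(fd, F_SETOWN, getpid());
 *	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC);	// -> perf_fasync()
 *
 * After that, an overflow that sets pending_kill delivers SIGIO to the
 * owner in addition to waking any poll()/epoll waiters.
 */
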
6334 static void perf_pending_event_disable(struct perf_event *event)
6335 {
6336         int cpu = READ_ONCE(event->pending_disable);
6337 
6338         if (cpu < 0)
6339                 return;
6340 
6341         if (cpu == smp_processor_id()) {
6342                 WRITE_ONCE(event->pending_disable, -1);
6343                 perf_event_disable_local(event);
6344                 return;
6345         }
6346 
6347         /*
6348          *  CPU-A                       CPU-B
6349          *
6350          *  perf_event_disable_inatomic()
6351          *    @pending_disable = CPU-A;
6352          *    irq_work_queue();
6353          *
6354          *  sched-out
6355          *    @pending_disable = -1;
6356          *
6357          *                              sched-in
6358          *                              perf_event_disable_inatomic()
6359          *                                @pending_disable = CPU-B;
6360          *                                irq_work_queue(); // FAILS
6361          *
6362          *  irq_work_run()
6363          *    perf_pending_event()
6364          *
6365          * But the event runs on CPU-B and wants disabling there.
6366          */
6367         irq_work_queue_on(&event->pending, cpu);
6368 }
6369 
6370 static void perf_pending_event(struct irq_work *entry)
6371 {
6372         struct perf_event *event = container_of(entry, struct perf_event, pending);
6373         int rctx;
6374 
6375         rctx = perf_swevent_get_recursion_context();
6376         /*
6377          * If we 'fail' here, that's OK, it means recursion is already disabled
6378          * and we won't recurse 'further'.
6379          */
6380 
6381         perf_pending_event_disable(event);
6382 
6383         if (event->pending_wakeup) {
6384                 event->pending_wakeup = 0;
6385                 perf_event_wakeup(event);
6386         }
6387 
6388         if (rctx >= 0)
6389                 perf_swevent_put_recursion_context(rctx);
6390 }
6391 
6392 /*
6393  * We assume KVM is the only user of these callbacks.
6394  * This could be changed to a list later if another
6395  * virtualization implementation needs to support them.
6396  */
6397 struct perf_guest_info_callbacks *perf_guest_cbs;
6398 
6399 int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
6400 {
6401         perf_guest_cbs = cbs;
6402         return 0;
6403 }
6404 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
6405 
6406 int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
6407 {
6408         perf_guest_cbs = NULL;
6409         return 0;
6410 }
6411 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
6412 
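/*
 * Sketch (not part of core.c) of how a hypervisor module is expected to use
 * the two registration hooks above. The stub callbacks are made up for
 * illustration and the exact field set of struct perf_guest_info_callbacks
 * varies across kernel versions.
 *
 *	static int example_is_in_guest(void)             { return 0; }
 *	static int example_is_user_mode(void)            { return 0; }
 *	static unsigned long example_get_guest_ip(void)  { return 0; }
 *
 *	static struct perf_guest_info_callbacks example_guest_cbs = {
 *		.is_in_guest	= example_is_in_guest,
 *		.is_user_mode	= example_is_user_mode,
 *		.get_guest_ip	= example_get_guest_ip,
 *	};
 *
 *	// module init:
 *	perf_register_guest_info_callbacks(&example_guest_cbs);
 *	// module exit:
 *	perf_unregister_guest_info_callbacks(&example_guest_cbs);
 */
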
6413 static void
6414 perf_output_sample_regs(struct perf_output_handle *handle,
6415                         struct pt_regs *regs, u64 mask)
6416 {
6417         int bit;
6418         DECLARE_BITMAP(_mask, 64);
6419 
6420         bitmap_from_u64(_mask, mask);
6421         for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
6422                 u64 val;
6423 
6424                 val = perf_reg_value(regs, bit);
6425                 perf_output_put(handle, val);
6426         }
6427 }
6428 
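/*
 * Illustrative note (not part of core.c): the mask walked above comes from
 * attr.sample_regs_user / attr.sample_regs_intr and the register values are
 * emitted in ascending bit order. E.g. on x86:
 *
 *	attr.sample_type      |= PERF_SAMPLE_REGS_USER;
 *	attr.sample_regs_user  = (1ULL << PERF_REG_X86_SP) |
 *				 (1ULL << PERF_REG_X86_IP);
 *
 * yields two u64 values per sample, SP (bit 7) before IP (bit 8), preceded
 * by the u64 ABI word that the sample-output code writes first.
 */
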
6429 static void perf_sample_regs_user(struct perf_regs *regs_user,
6430                                   struct pt_regs *regs)
6431 {
6432         if (user_mode(regs)) {
6433                 regs_user->abi = perf_reg_abi(current);
6434                 regs_user->regs = regs;
6435         } else if (!(current->flags & PF_KTHREAD)) {
6436                 perf_get_regs_user(regs_user, regs);
6437         } else {
6438                 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
6439                 regs_user->regs = NULL;
6440         }
6441 }
6442 
6443 static void perf_sample_regs_intr(struct perf_regs *regs_intr,
6444                                   struct pt_regs *regs)
6445 {
6446         regs_intr->regs = regs;
6447         regs_intr->abi  = perf_reg_abi(current);
6448 }
6449 
6450 
6451 /*
6452  * Get remaining task size from user stack pointer.
6453  *
6454  * It would be better to look up the stack VMA and bound this more
6455  * precisely, but there is no way to do that safely in interrupt
6456  * context, so use TASK_SIZE as the limit.
6457  */
6458 static u64 perf_ustack_task_size(struct pt_regs *regs)
6459 {
6460         unsigned long addr = perf_user_stack_pointer(regs);
6461 
6462         if (!addr || addr >= TASK_SIZE)
6463                 return 0;
6464 
6465         return TASK_SIZE - addr;
6466 }
6467 
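/*
 * Illustrative note (not part of core.c): the stack dump is copied starting
 * at the sampled user stack pointer, so TASK_SIZE - sp is a safe upper
 * bound on how much can be read. E.g. with TASK_SIZE == 0x7ffffffff000 and
 * sp == 0x7ffffffed000 the bound is 0x12000 bytes; perf_sample_ustack_size()
 * below further clamps it to USHRT_MAX and to the room left in the sample
 * record.
 */
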
6468 static u16
6469 perf_sample_ustack_size(u16 stack_size, u16 header_size,
6470                         struct pt_regs *regs)
6471 {
6472         u64 task_size;
6473 
6474         /* No regs, no stack pointer, no dump. */
6475         if (!regs)
6476                 return 0;
6477 
6478         /*
6479          * Check whether the requested stack size fits into:
6480          * - TASK_SIZE
6481          *   If it doesn't, limit the dump to what remains below TASK_SIZE.
6482          *
6483          * - the remaining sample size
6484          *   If it doesn't, shrink the stack dump to fit into the
6485          *   remaining sample size.
6486          */
6487 
6488         task_size  = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
6489         stack_size = min(stack_size, (u16) task_size);