
TOMOYO Linux Cross Reference
Linux/kernel/events/core.c


  1 // SPDX-License-Identifier: GPL-2.0
  2 /*
  3  * Performance events core code:
  4  *
  5  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
  6  *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
  7  *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
  8  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  9  */
 10 
 11 #include <linux/fs.h>
 12 #include <linux/mm.h>
 13 #include <linux/cpu.h>
 14 #include <linux/smp.h>
 15 #include <linux/idr.h>
 16 #include <linux/file.h>
 17 #include <linux/poll.h>
 18 #include <linux/slab.h>
 19 #include <linux/hash.h>
 20 #include <linux/tick.h>
 21 #include <linux/sysfs.h>
 22 #include <linux/dcache.h>
 23 #include <linux/percpu.h>
 24 #include <linux/ptrace.h>
 25 #include <linux/reboot.h>
 26 #include <linux/vmstat.h>
 27 #include <linux/device.h>
 28 #include <linux/export.h>
 29 #include <linux/vmalloc.h>
 30 #include <linux/hardirq.h>
 31 #include <linux/hugetlb.h>
 32 #include <linux/rculist.h>
 33 #include <linux/uaccess.h>
 34 #include <linux/syscalls.h>
 35 #include <linux/anon_inodes.h>
 36 #include <linux/kernel_stat.h>
 37 #include <linux/cgroup.h>
 38 #include <linux/perf_event.h>
 39 #include <linux/trace_events.h>
 40 #include <linux/hw_breakpoint.h>
 41 #include <linux/mm_types.h>
 42 #include <linux/module.h>
 43 #include <linux/mman.h>
 44 #include <linux/compat.h>
 45 #include <linux/bpf.h>
 46 #include <linux/filter.h>
 47 #include <linux/namei.h>
 48 #include <linux/parser.h>
 49 #include <linux/sched/clock.h>
 50 #include <linux/sched/mm.h>
 51 #include <linux/proc_ns.h>
 52 #include <linux/mount.h>
 53 #include <linux/min_heap.h>
 54 #include <linux/highmem.h>
 55 #include <linux/pgtable.h>
 56 #include <linux/buildid.h>
 57 
 58 #include "internal.h"
 59 
 60 #include <asm/irq_regs.h>
 61 
 62 typedef int (*remote_function_f)(void *);
 63 
 64 struct remote_function_call {
 65         struct task_struct      *p;
 66         remote_function_f       func;
 67         void                    *info;
 68         int                     ret;
 69 };
 70 
 71 static void remote_function(void *data)
 72 {
 73         struct remote_function_call *tfc = data;
 74         struct task_struct *p = tfc->p;
 75 
 76         if (p) {
 77                 /* -EAGAIN */
 78                 if (task_cpu(p) != smp_processor_id())
 79                         return;
 80 
 81                 /*
 82                  * Now that we're on the right CPU with IRQs disabled, we can test
 83                  * if we hit the right task without races.
 84                  */
 85 
 86                 tfc->ret = -ESRCH; /* No such (running) process */
 87                 if (p != current)
 88                         return;
 89         }
 90 
 91         tfc->ret = tfc->func(tfc->info);
 92 }
 93 
 94 /**
 95  * task_function_call - call a function on the cpu on which a task runs
 96  * @p:          the task to evaluate
 97  * @func:       the function to be called
 98  * @info:       the function call argument
 99  *
100  * Calls the function @func when the task is currently running. This might
101  * be on the current CPU, which just calls the function directly.  This will
102  * retry due to any failures in smp_call_function_single(), such as if the
103  * task_cpu() goes offline concurrently.
104  *
105  * returns @func return value or -ESRCH or -ENXIO when the process isn't running
106  */
107 static int
108 task_function_call(struct task_struct *p, remote_function_f func, void *info)
109 {
110         struct remote_function_call data = {
111                 .p      = p,
112                 .func   = func,
113                 .info   = info,
114                 .ret    = -EAGAIN,
115         };
116         int ret;
117 
118         for (;;) {
119                 ret = smp_call_function_single(task_cpu(p), remote_function,
120                                                &data, 1);
121                 if (!ret)
122                         ret = data.ret;
123 
124                 if (ret != -EAGAIN)
125                         break;
126 
127                 cond_resched();
128         }
129 
130         return ret;
131 }
132 
133 /**
134  * cpu_function_call - call a function on the cpu
135  * @cpu:        target cpu to queue this function
136  * @func:       the function to be called
137  * @info:       the function call argument
138  *
139  * Calls the function @func on the remote cpu.
140  *
141  * returns: @func return value or -ENXIO when the cpu is offline
142  */
143 static int cpu_function_call(int cpu, remote_function_f func, void *info)
144 {
145         struct remote_function_call data = {
146                 .p      = NULL,
147                 .func   = func,
148                 .info   = info,
149                 .ret    = -ENXIO, /* No such CPU */
150         };
151 
152         smp_call_function_single(cpu, remote_function, &data, 1);
153 
154         return data.ret;
155 }
156 
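/*
 * Illustrative sketch only (not part of the original file): how the
 * remote-call helpers above are meant to be used.  The callback and the
 * per-cpu counter are hypothetical; the contract is that the callback runs
 * with IRQs disabled on the target CPU and returns an errno-style int.
 */
static DEFINE_PER_CPU(u64, example_counter);	/* hypothetical */

static int __example_read_counter(void *info)
{
        u64 *val = info;

        *val = __this_cpu_read(example_counter);
        return 0;
}

static u64 example_read_on(int cpu)
{
        u64 val = 0;

        /* cpu_function_call() returns -ENXIO if @cpu is offline, else the callback's result. */
        cpu_function_call(cpu, __example_read_counter, &val);
        return val;
}
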
157 static inline struct perf_cpu_context *
158 __get_cpu_context(struct perf_event_context *ctx)
159 {
160         return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
161 }
162 
163 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
164                           struct perf_event_context *ctx)
165 {
166         raw_spin_lock(&cpuctx->ctx.lock);
167         if (ctx)
168                 raw_spin_lock(&ctx->lock);
169 }
170 
171 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
172                             struct perf_event_context *ctx)
173 {
174         if (ctx)
175                 raw_spin_unlock(&ctx->lock);
176         raw_spin_unlock(&cpuctx->ctx.lock);
177 }
178 
179 #define TASK_TOMBSTONE ((void *)-1L)
180 
181 static bool is_kernel_event(struct perf_event *event)
182 {
183         return READ_ONCE(event->owner) == TASK_TOMBSTONE;
184 }
185 
186 /*
187  * On task ctx scheduling...
188  *
189  * When !ctx->nr_events a task context will not be scheduled. This means
190  * we can disable the scheduler hooks (for performance) without leaving
191  * pending task ctx state.
192  *
193  * This however results in two special cases:
194  *
195  *  - removing the last event from a task ctx; this is relatively
196  *    straightforward and is done in __perf_remove_from_context.
197  *
198  *  - adding the first event to a task ctx; this is tricky because we cannot
199  *    rely on ctx->is_active and therefore cannot use event_function_call().
200  *    See perf_install_in_context().
201  *
202  * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
203  */
204 
205 typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
206                         struct perf_event_context *, void *);
207 
208 struct event_function_struct {
209         struct perf_event *event;
210         event_f func;
211         void *data;
212 };
213 
214 static int event_function(void *info)
215 {
216         struct event_function_struct *efs = info;
217         struct perf_event *event = efs->event;
218         struct perf_event_context *ctx = event->ctx;
219         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
220         struct perf_event_context *task_ctx = cpuctx->task_ctx;
221         int ret = 0;
222 
223         lockdep_assert_irqs_disabled();
224 
225         perf_ctx_lock(cpuctx, task_ctx);
226         /*
227          * Since we do the IPI call without holding ctx->lock things can have
228          * changed, double check we hit the task we set out to hit.
229          */
230         if (ctx->task) {
231                 if (ctx->task != current) {
232                         ret = -ESRCH;
233                         goto unlock;
234                 }
235 
236                 /*
237                  * We only use event_function_call() on established contexts,
238                  * and event_function() is only ever called when active (or
239                  * rather, we'll have bailed in task_function_call() or the
240                  * above ctx->task != current test), therefore we must have
241                  * ctx->is_active here.
242                  */
243                 WARN_ON_ONCE(!ctx->is_active);
244                 /*
245                  * And since we have ctx->is_active, cpuctx->task_ctx must
246                  * match.
247                  */
248                 WARN_ON_ONCE(task_ctx != ctx);
249         } else {
250                 WARN_ON_ONCE(&cpuctx->ctx != ctx);
251         }
252 
253         efs->func(event, cpuctx, ctx, efs->data);
254 unlock:
255         perf_ctx_unlock(cpuctx, task_ctx);
256 
257         return ret;
258 }
259 
260 static void event_function_call(struct perf_event *event, event_f func, void *data)
261 {
262         struct perf_event_context *ctx = event->ctx;
263         struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
264         struct event_function_struct efs = {
265                 .event = event,
266                 .func = func,
267                 .data = data,
268         };
269 
270         if (!event->parent) {
271                 /*
272                  * If this is a !child event, we must hold ctx::mutex to
273                  * stabilize the event->ctx relation. See
274                  * perf_event_ctx_lock().
275                  */
276                 lockdep_assert_held(&ctx->mutex);
277         }
278 
279         if (!task) {
280                 cpu_function_call(event->cpu, event_function, &efs);
281                 return;
282         }
283 
284         if (task == TASK_TOMBSTONE)
285                 return;
286 
287 again:
288         if (!task_function_call(task, event_function, &efs))
289                 return;
290 
291         raw_spin_lock_irq(&ctx->lock);
292         /*
293          * Reload the task pointer, it might have been changed by
294          * a concurrent perf_event_context_sched_out().
295          */
296         task = ctx->task;
297         if (task == TASK_TOMBSTONE) {
298                 raw_spin_unlock_irq(&ctx->lock);
299                 return;
300         }
301         if (ctx->is_active) {
302                 raw_spin_unlock_irq(&ctx->lock);
303                 goto again;
304         }
305         func(event, NULL, ctx, data);
306         raw_spin_unlock_irq(&ctx->lock);
307 }
308 
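/*
 * Illustrative sketch only (not part of the original file): the event_f
 * contract used with event_function_call().  The names are hypothetical.
 * The callback runs with ctx->lock held; @cpuctx is NULL when the callback
 * is invoked locally on an inactive task context.  For a non-child event
 * the caller must hold ctx->mutex (see the lockdep assertion above).
 */
static void __example_event_op(struct perf_event *event,
                               struct perf_cpu_context *cpuctx,
                               struct perf_event_context *ctx,
                               void *info)
{
        int *arg = info;

        /* operate on @event here, under ctx->lock */
        (void)*arg;
}

static void example_event_op(struct perf_event *event, int arg)
{
        event_function_call(event, __example_event_op, &arg);
}
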
309 /*
310  * Similar to event_function_call() + event_function(), but hard assumes IRQs
311  * are already disabled and we're on the right CPU.
312  */
313 static void event_function_local(struct perf_event *event, event_f func, void *data)
314 {
315         struct perf_event_context *ctx = event->ctx;
316         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
317         struct task_struct *task = READ_ONCE(ctx->task);
318         struct perf_event_context *task_ctx = NULL;
319 
320         lockdep_assert_irqs_disabled();
321 
322         if (task) {
323                 if (task == TASK_TOMBSTONE)
324                         return;
325 
326                 task_ctx = ctx;
327         }
328 
329         perf_ctx_lock(cpuctx, task_ctx);
330 
331         task = ctx->task;
332         if (task == TASK_TOMBSTONE)
333                 goto unlock;
334 
335         if (task) {
336                 /*
337                  * We must be either inactive or active and the right task,
338                  * otherwise we're screwed, since we cannot IPI to somewhere
339                  * else.
340                  */
341                 if (ctx->is_active) {
342                         if (WARN_ON_ONCE(task != current))
343                                 goto unlock;
344 
345                         if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
346                                 goto unlock;
347                 }
348         } else {
349                 WARN_ON_ONCE(&cpuctx->ctx != ctx);
350         }
351 
352         func(event, cpuctx, ctx, data);
353 unlock:
354         perf_ctx_unlock(cpuctx, task_ctx);
355 }
356 
357 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
358                        PERF_FLAG_FD_OUTPUT  |\
359                        PERF_FLAG_PID_CGROUP |\
360                        PERF_FLAG_FD_CLOEXEC)
361 
362 /*
363  * branch priv levels that need permission checks
364  */
365 #define PERF_SAMPLE_BRANCH_PERM_PLM \
366         (PERF_SAMPLE_BRANCH_KERNEL |\
367          PERF_SAMPLE_BRANCH_HV)
368 
369 enum event_type_t {
370         EVENT_FLEXIBLE = 0x1,
371         EVENT_PINNED = 0x2,
372         EVENT_TIME = 0x4,
373         /* see ctx_resched() for details */
374         EVENT_CPU = 0x8,
375         EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
376 };
377 
378 /*
379  * perf_sched_events : >0 events exist
380  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
381  */
382 
383 static void perf_sched_delayed(struct work_struct *work);
384 DEFINE_STATIC_KEY_FALSE(perf_sched_events);
385 static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
386 static DEFINE_MUTEX(perf_sched_mutex);
387 static atomic_t perf_sched_count;
388 
389 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
390 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
391 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
392 
393 static atomic_t nr_mmap_events __read_mostly;
394 static atomic_t nr_comm_events __read_mostly;
395 static atomic_t nr_namespaces_events __read_mostly;
396 static atomic_t nr_task_events __read_mostly;
397 static atomic_t nr_freq_events __read_mostly;
398 static atomic_t nr_switch_events __read_mostly;
399 static atomic_t nr_ksymbol_events __read_mostly;
400 static atomic_t nr_bpf_events __read_mostly;
401 static atomic_t nr_cgroup_events __read_mostly;
402 static atomic_t nr_text_poke_events __read_mostly;
403 static atomic_t nr_build_id_events __read_mostly;
404 
405 static LIST_HEAD(pmus);
406 static DEFINE_MUTEX(pmus_lock);
407 static struct srcu_struct pmus_srcu;
408 static cpumask_var_t perf_online_mask;
409 static struct kmem_cache *perf_event_cache;
410 
411 /*
412  * perf event paranoia level:
413  *  -1 - not paranoid at all
414  *   0 - disallow raw tracepoint access for unpriv
415  *   1 - disallow cpu events for unpriv
416  *   2 - disallow kernel profiling for unpriv
417  */
418 int sysctl_perf_event_paranoid __read_mostly = 2;
419 
420 /* Minimum for 512 kiB + 1 user control page */
421 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
422 
423 /*
424  * max perf event sample rate
425  */
426 #define DEFAULT_MAX_SAMPLE_RATE         100000
427 #define DEFAULT_SAMPLE_PERIOD_NS        (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
428 #define DEFAULT_CPU_TIME_MAX_PERCENT    25
429 
430 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
431 
432 static int max_samples_per_tick __read_mostly   = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
433 static int perf_sample_period_ns __read_mostly  = DEFAULT_SAMPLE_PERIOD_NS;
434 
435 static int perf_sample_allowed_ns __read_mostly =
436         DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
437 
438 static void update_perf_cpu_limits(void)
439 {
440         u64 tmp = perf_sample_period_ns;
441 
442         tmp *= sysctl_perf_cpu_time_max_percent;
443         tmp = div_u64(tmp, 100);
444         if (!tmp)
445                 tmp = 1;
446 
447         WRITE_ONCE(perf_sample_allowed_ns, tmp);
448 }
449 
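/*
 * Worked example (added for clarity, not in the original): with the
 * defaults above, perf_sample_period_ns = NSEC_PER_SEC / 100000 = 10000ns
 * and sysctl_perf_cpu_time_max_percent = 25, so
 *
 *      tmp = 10000 * 25 / 100 = 2500
 *
 * i.e. perf_sample_allowed_ns becomes 2500ns: each sample may consume at
 * most 25% of its 10us period before perf_sample_event_took() below starts
 * lowering the sample rate.
 */
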
450 static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
451 
452 int perf_proc_update_handler(struct ctl_table *table, int write,
453                 void *buffer, size_t *lenp, loff_t *ppos)
454 {
455         int ret;
456         int perf_cpu = sysctl_perf_cpu_time_max_percent;
457         /*
458          * If throttling is disabled don't allow the write:
459          */
460         if (write && (perf_cpu == 100 || perf_cpu == 0))
461                 return -EINVAL;
462 
463         ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
464         if (ret || !write)
465                 return ret;
466 
467         max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
468         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
469         update_perf_cpu_limits();
470 
471         return 0;
472 }
473 
474 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
475 
476 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
477                 void *buffer, size_t *lenp, loff_t *ppos)
478 {
479         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
480 
481         if (ret || !write)
482                 return ret;
483 
484         if (sysctl_perf_cpu_time_max_percent == 100 ||
485             sysctl_perf_cpu_time_max_percent == 0) {
486                 printk(KERN_WARNING
487                        "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
488                 WRITE_ONCE(perf_sample_allowed_ns, 0);
489         } else {
490                 update_perf_cpu_limits();
491         }
492 
493         return 0;
494 }
495 
496 /*
497  * perf samples are done in some very critical code paths (NMIs).
498  * If they take too much CPU time, the system can lock up and not
499  * get any real work done.  This will drop the sample rate when
500  * we detect that events are taking too long.
501  */
502 #define NR_ACCUMULATED_SAMPLES 128
503 static DEFINE_PER_CPU(u64, running_sample_length);
504 
505 static u64 __report_avg;
506 static u64 __report_allowed;
507 
508 static void perf_duration_warn(struct irq_work *w)
509 {
510         printk_ratelimited(KERN_INFO
511                 "perf: interrupt took too long (%lld > %lld), lowering "
512                 "kernel.perf_event_max_sample_rate to %d\n",
513                 __report_avg, __report_allowed,
514                 sysctl_perf_event_sample_rate);
515 }
516 
517 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
518 
519 void perf_sample_event_took(u64 sample_len_ns)
520 {
521         u64 max_len = READ_ONCE(perf_sample_allowed_ns);
522         u64 running_len;
523         u64 avg_len;
524         u32 max;
525 
526         if (max_len == 0)
527                 return;
528 
529         /* Decay the counter by 1 average sample. */
530         running_len = __this_cpu_read(running_sample_length);
531         running_len -= running_len/NR_ACCUMULATED_SAMPLES;
532         running_len += sample_len_ns;
533         __this_cpu_write(running_sample_length, running_len);
534 
535         /*
536          * Note: this will be biased artificially low until we have
537          * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
538          * from having to maintain a count.
539          */
540         avg_len = running_len/NR_ACCUMULATED_SAMPLES;
541         if (avg_len <= max_len)
542                 return;
543 
544         __report_avg = avg_len;
545         __report_allowed = max_len;
546 
547         /*
548          * Compute a throttle threshold 25% above the current average duration.
549          */
550         avg_len += avg_len / 4;
551         max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
552         if (avg_len < max)
553                 max /= (u32)avg_len;
554         else
555                 max = 1;
556 
557         WRITE_ONCE(perf_sample_allowed_ns, avg_len);
558         WRITE_ONCE(max_samples_per_tick, max);
559 
560         sysctl_perf_event_sample_rate = max * HZ;
561         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
562 
563         if (!irq_work_queue(&perf_duration_work)) {
564                 early_printk("perf: interrupt took too long (%lld > %lld), lowering "
565                              "kernel.perf_event_max_sample_rate to %d\n",
566                              __report_avg, __report_allowed,
567                              sysctl_perf_event_sample_rate);
568         }
569 }
570 
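/*
 * Worked example (added for clarity, not in the original), assuming
 * HZ=1000 (TICK_NSEC = 1000000ns) and the default 25% CPU limit.  If the
 * decayed average sample length reaches 4000ns (above the 2500ns allowed):
 *
 *      avg_len += avg_len / 4;                 avg_len = 5000ns
 *      max = (1000000 / 100) * 25;             max = 250000ns of budget per tick
 *      max /= avg_len;                         max = 50 samples per tick
 *
 * so perf_sample_allowed_ns is raised to 5000ns and the sample rate drops
 * to max * HZ = 50000 Hz.
 */
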
571 static atomic64_t perf_event_id;
572 
573 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
574                               enum event_type_t event_type);
575 
576 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
577                              enum event_type_t event_type);
578 
579 static void update_context_time(struct perf_event_context *ctx);
580 static u64 perf_event_time(struct perf_event *event);
581 
582 void __weak perf_event_print_debug(void)        { }
583 
584 static inline u64 perf_clock(void)
585 {
586         return local_clock();
587 }
588 
589 static inline u64 perf_event_clock(struct perf_event *event)
590 {
591         return event->clock();
592 }
593 
594 /*
595  * State based event timekeeping...
596  *
597  * The basic idea is to use event->state to determine which (if any) time
598  * fields to increment with the current delta. This means we only need to
599  * update timestamps when we change state or when they are explicitly requested
600  * (read).
601  *
602  * Event groups make things a little more complicated, but not terribly so. The
603  * rules for a group are that if the group leader is OFF the entire group is
604  * OFF, irrespective of what the group member states are. This results in
605  * __perf_effective_state().
606  *
607  * A further ramification is that when a group leader flips between OFF and
608  * !OFF, we need to update all group member times.
609  *
610  *
611  * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
612  * need to make sure the relevant context time is updated before we try and
613  * update our timestamps.
614  */
615 
616 static __always_inline enum perf_event_state
617 __perf_effective_state(struct perf_event *event)
618 {
619         struct perf_event *leader = event->group_leader;
620 
621         if (leader->state <= PERF_EVENT_STATE_OFF)
622                 return leader->state;
623 
624         return event->state;
625 }
626 
627 static __always_inline void
628 __perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
629 {
630         enum perf_event_state state = __perf_effective_state(event);
631         u64 delta = now - event->tstamp;
632 
633         *enabled = event->total_time_enabled;
634         if (state >= PERF_EVENT_STATE_INACTIVE)
635                 *enabled += delta;
636 
637         *running = event->total_time_running;
638         if (state >= PERF_EVENT_STATE_ACTIVE)
639                 *running += delta;
640 }
641 
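/*
 * Worked example (added for clarity, not in the original): suppose an
 * event was last stamped at tstamp = 100us and is read at now = 150us, so
 * delta = 50us.  If the effective state is ACTIVE, both total_time_enabled
 * and total_time_running grow by 50us; if it is INACTIVE (e.g. the group
 * lost the PMU), only total_time_enabled grows; OFF and below advance
 * neither.
 */
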
642 static void perf_event_update_time(struct perf_event *event)
643 {
644         u64 now = perf_event_time(event);
645 
646         __perf_update_times(event, now, &event->total_time_enabled,
647                                         &event->total_time_running);
648         event->tstamp = now;
649 }
650 
651 static void perf_event_update_sibling_time(struct perf_event *leader)
652 {
653         struct perf_event *sibling;
654 
655         for_each_sibling_event(sibling, leader)
656                 perf_event_update_time(sibling);
657 }
658 
659 static void
660 perf_event_set_state(struct perf_event *event, enum perf_event_state state)
661 {
662         if (event->state == state)
663                 return;
664 
665         perf_event_update_time(event);
666         /*
667          * If a group leader gets enabled/disabled all its siblings
668          * are affected too.
669          */
670         if ((event->state < 0) ^ (state < 0))
671                 perf_event_update_sibling_time(event);
672 
673         WRITE_ONCE(event->state, state);
674 }
675 
676 /*
677  * UP store-release, load-acquire
678  */
679 
680 #define __store_release(ptr, val)                                       \
681 do {                                                                    \
682         barrier();                                                      \
683         WRITE_ONCE(*(ptr), (val));                                      \
684 } while (0)
685 
686 #define __load_acquire(ptr)                                             \
687 ({                                                                      \
688         __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr));        \
689         barrier();                                                      \
690         ___p;                                                           \
691 })
692 
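/*
 * Illustrative sketch only (not part of the original file): the intended
 * pairing of the UP-safe release/acquire helpers above, mirroring the
 * info->active / info->timeoffset handshake in the cgroup time code below.
 * The struct and functions are hypothetical.
 */
struct example_published {
        u64     value;
        int     active;
};

static inline void example_publish(struct example_published *p, u64 v)
{
        p->value = v;                           /* write the payload first */
        __store_release(&p->active, 1);         /* then make it visible */
}

static inline u64 example_consume(struct example_published *p)
{
        if (!__load_acquire(&p->active))        /* check the flag first */
                return 0;
        return READ_ONCE(p->value);             /* then read the payload */
}
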
693 #ifdef CONFIG_CGROUP_PERF
694 
695 static inline bool
696 perf_cgroup_match(struct perf_event *event)
697 {
698         struct perf_event_context *ctx = event->ctx;
699         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
700 
701         /* @event doesn't care about cgroup */
702         if (!event->cgrp)
703                 return true;
704 
705         /* wants specific cgroup scope but @cpuctx isn't associated with any */
706         if (!cpuctx->cgrp)
707                 return false;
708 
709         /*
710          * Cgroup scoping is recursive.  An event enabled for a cgroup is
711          * also enabled for all its descendant cgroups.  If @cpuctx's
712          * cgroup is a descendant of @event's (the test covers identity
713          * case), it's a match.
714          */
715         return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
716                                     event->cgrp->css.cgroup);
717 }
718 
719 static inline void perf_detach_cgroup(struct perf_event *event)
720 {
721         css_put(&event->cgrp->css);
722         event->cgrp = NULL;
723 }
724 
725 static inline int is_cgroup_event(struct perf_event *event)
726 {
727         return event->cgrp != NULL;
728 }
729 
730 static inline u64 perf_cgroup_event_time(struct perf_event *event)
731 {
732         struct perf_cgroup_info *t;
733 
734         t = per_cpu_ptr(event->cgrp->info, event->cpu);
735         return t->time;
736 }
737 
738 static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
739 {
740         struct perf_cgroup_info *t;
741 
742         t = per_cpu_ptr(event->cgrp->info, event->cpu);
743         if (!__load_acquire(&t->active))
744                 return t->time;
745         now += READ_ONCE(t->timeoffset);
746         return now;
747 }
748 
749 static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv)
750 {
751         if (adv)
752                 info->time += now - info->timestamp;
753         info->timestamp = now;
754         /*
755          * see update_context_time()
756          */
757         WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
758 }
759 
760 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
761 {
762         struct perf_cgroup *cgrp = cpuctx->cgrp;
763         struct cgroup_subsys_state *css;
764         struct perf_cgroup_info *info;
765 
766         if (cgrp) {
767                 u64 now = perf_clock();
768 
769                 for (css = &cgrp->css; css; css = css->parent) {
770                         cgrp = container_of(css, struct perf_cgroup, css);
771                         info = this_cpu_ptr(cgrp->info);
772 
773                         __update_cgrp_time(info, now, true);
774                         if (final)
775                                 __store_release(&info->active, 0);
776                 }
777         }
778 }
779 
780 static inline void update_cgrp_time_from_event(struct perf_event *event)
781 {
782         struct perf_cgroup_info *info;
783 
784         /*
785          * ensure we access cgroup data only when needed and
786          * when we know the cgroup is pinned (css_get)
787          */
788         if (!is_cgroup_event(event))
789                 return;
790 
791         info = this_cpu_ptr(event->cgrp->info);
792         /*
793          * Do not update time when cgroup is not active
794          */
795         if (info->active)
796                 __update_cgrp_time(info, perf_clock(), true);
797 }
798 
799 static inline void
800 perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
801 {
802         struct perf_event_context *ctx = &cpuctx->ctx;
803         struct perf_cgroup *cgrp = cpuctx->cgrp;
804         struct perf_cgroup_info *info;
805         struct cgroup_subsys_state *css;
806 
807         /*
808          * ctx->lock held by caller
809          * ensure we do not access cgroup data
810          * unless we have the cgroup pinned (css_get)
811          */
812         if (!cgrp)
813                 return;
814 
815         WARN_ON_ONCE(!ctx->nr_cgroups);
816 
817         for (css = &cgrp->css; css; css = css->parent) {
818                 cgrp = container_of(css, struct perf_cgroup, css);
819                 info = this_cpu_ptr(cgrp->info);
820                 __update_cgrp_time(info, ctx->timestamp, false);
821                 __store_release(&info->active, 1);
822         }
823 }
824 
825 static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
826 
827 /*
828  * reschedule events based on the cgroup constraint of the task.
829  */
830 static void perf_cgroup_switch(struct task_struct *task)
831 {
832         struct perf_cgroup *cgrp;
833         struct perf_cpu_context *cpuctx, *tmp;
834         struct list_head *list;
835         unsigned long flags;
836 
837         /*
838          * Disable interrupts and preemption to prevent this CPU's
839          * cgrp_cpuctx_entry from changing under us.
840          */
841         local_irq_save(flags);
842 
843         cgrp = perf_cgroup_from_task(task, NULL);
844 
845         list = this_cpu_ptr(&cgrp_cpuctx_list);
846         list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
847                 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
848                 if (READ_ONCE(cpuctx->cgrp) == cgrp)
849                         continue;
850 
851                 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
852                 perf_pmu_disable(cpuctx->ctx.pmu);
853 
854                 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
855                 /*
856                  * must not be done before ctxswout due
857                  * to update_cgrp_time_from_cpuctx() in
858                  * ctx_sched_out()
859                  */
860                 cpuctx->cgrp = cgrp;
861                 /*
862                  * set cgrp before ctxsw in to allow
863                  * perf_cgroup_set_timestamp() in ctx_sched_in()
864                  * to not have to pass task around
865                  */
866                 cpu_ctx_sched_in(cpuctx, EVENT_ALL);
867 
868                 perf_pmu_enable(cpuctx->ctx.pmu);
869                 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
870         }
871 
872         local_irq_restore(flags);
873 }
874 
875 static int perf_cgroup_ensure_storage(struct perf_event *event,
876                                 struct cgroup_subsys_state *css)
877 {
878         struct perf_cpu_context *cpuctx;
879         struct perf_event **storage;
880         int cpu, heap_size, ret = 0;
881 
882         /*
883          * Ensure storage has sufficient space for an iterator for each
884          * possibly nested cgroup plus an iterator for events with no cgroup.
885          */
886         for (heap_size = 1; css; css = css->parent)
887                 heap_size++;
888 
889         for_each_possible_cpu(cpu) {
890                 cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
891                 if (heap_size <= cpuctx->heap_size)
892                         continue;
893 
894                 storage = kmalloc_node(heap_size * sizeof(struct perf_event *),
895                                        GFP_KERNEL, cpu_to_node(cpu));
896                 if (!storage) {
897                         ret = -ENOMEM;
898                         break;
899                 }
900 
901                 raw_spin_lock_irq(&cpuctx->ctx.lock);
902                 if (cpuctx->heap_size < heap_size) {
903                         swap(cpuctx->heap, storage);
904                         if (storage == cpuctx->heap_default)
905                                 storage = NULL;
906                         cpuctx->heap_size = heap_size;
907                 }
908                 raw_spin_unlock_irq(&cpuctx->ctx.lock);
909 
910                 kfree(storage);
911         }
912 
913         return ret;
914 }
915 
916 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
917                                       struct perf_event_attr *attr,
918                                       struct perf_event *group_leader)
919 {
920         struct perf_cgroup *cgrp;
921         struct cgroup_subsys_state *css;
922         struct fd f = fdget(fd);
923         int ret = 0;
924 
925         if (!f.file)
926                 return -EBADF;
927 
928         css = css_tryget_online_from_dir(f.file->f_path.dentry,
929                                          &perf_event_cgrp_subsys);
930         if (IS_ERR(css)) {
931                 ret = PTR_ERR(css);
932                 goto out;
933         }
934 
935         ret = perf_cgroup_ensure_storage(event, css);
936         if (ret)
937                 goto out;
938 
939         cgrp = container_of(css, struct perf_cgroup, css);
940         event->cgrp = cgrp;
941 
942         /*
943          * all events in a group must monitor
944          * the same cgroup because a task belongs
945          * to only one perf cgroup at a time
946          */
947         if (group_leader && group_leader->cgrp != cgrp) {
948                 perf_detach_cgroup(event);
949                 ret = -EINVAL;
950         }
951 out:
952         fdput(f);
953         return ret;
954 }
955 
956 static inline void
957 perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
958 {
959         struct perf_cpu_context *cpuctx;
960 
961         if (!is_cgroup_event(event))
962                 return;
963 
964         /*
965          * Because cgroup events are always per-cpu events,
966          * @ctx == &cpuctx->ctx.
967          */
968         cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
969 
970         if (ctx->nr_cgroups++)
971                 return;
972 
973         cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
974         list_add(&cpuctx->cgrp_cpuctx_entry,
975                         per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
976 }
977 
978 static inline void
979 perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
980 {
981         struct perf_cpu_context *cpuctx;
982 
983         if (!is_cgroup_event(event))
984                 return;
985 
986         /*
987          * Because cgroup events are always per-cpu events,
988          * @ctx == &cpuctx->ctx.
989          */
990         cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
991 
992         if (--ctx->nr_cgroups)
993                 return;
994 
995         cpuctx->cgrp = NULL;
996         list_del(&cpuctx->cgrp_cpuctx_entry);
997 }
998 
999 #else /* !CONFIG_CGROUP_PERF */
1000 
1001 static inline bool
1002 perf_cgroup_match(struct perf_event *event)
1003 {
1004         return true;
1005 }
1006 
1007 static inline void perf_detach_cgroup(struct perf_event *event)
1008 {}
1009 
1010 static inline int is_cgroup_event(struct perf_event *event)
1011 {
1012         return 0;
1013 }
1014 
1015 static inline void update_cgrp_time_from_event(struct perf_event *event)
1016 {
1017 }
1018 
1019 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
1020                                                 bool final)
1021 {
1022 }
1023 
1024 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
1025                                       struct perf_event_attr *attr,
1026                                       struct perf_event *group_leader)
1027 {
1028         return -EINVAL;
1029 }
1030 
1031 static inline void
1032 perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
1033 {
1034 }
1035 
1036 static inline u64 perf_cgroup_event_time(struct perf_event *event)
1037 {
1038         return 0;
1039 }
1040 
1041 static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
1042 {
1043         return 0;
1044 }
1045 
1046 static inline void
1047 perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
1048 {
1049 }
1050 
1051 static inline void
1052 perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1053 {
1054 }
1055 
1056 static void perf_cgroup_switch(struct task_struct *task)
1057 {
1058 }
1059 #endif
1060 
1061 /*
1062  * set default to be dependent on timer tick just
1063  * like original code
1064  */
1065 #define PERF_CPU_HRTIMER (1000 / HZ)
1066 /*
1067  * function must be called with interrupts disabled
1068  */
1069 static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
1070 {
1071         struct perf_cpu_context *cpuctx;
1072         bool rotations;
1073 
1074         lockdep_assert_irqs_disabled();
1075 
1076         cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
1077         rotations = perf_rotate_context(cpuctx);
1078 
1079         raw_spin_lock(&cpuctx->hrtimer_lock);
1080         if (rotations)
1081                 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
1082         else
1083                 cpuctx->hrtimer_active = 0;
1084         raw_spin_unlock(&cpuctx->hrtimer_lock);
1085 
1086         return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
1087 }
1088 
1089 static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
1090 {
1091         struct hrtimer *timer = &cpuctx->hrtimer;
1092         struct pmu *pmu = cpuctx->ctx.pmu;
1093         u64 interval;
1094 
1095         /* no multiplexing needed for SW PMU */
1096         if (pmu->task_ctx_nr == perf_sw_context)
1097                 return;
1098 
1099         /*
1100          * check default is sane, if not set then force to
1101          * default interval (1/tick)
1102          */
1103         interval = pmu->hrtimer_interval_ms;
1104         if (interval < 1)
1105                 interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
1106 
1107         cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
1108 
1109         raw_spin_lock_init(&cpuctx->hrtimer_lock);
1110         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
1111         timer->function = perf_mux_hrtimer_handler;
1112 }
1113 
1114 static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
1115 {
1116         struct hrtimer *timer = &cpuctx->hrtimer;
1117         struct pmu *pmu = cpuctx->ctx.pmu;
1118         unsigned long flags;
1119 
1120         /* not for SW PMU */
1121         if (pmu->task_ctx_nr == perf_sw_context)
1122                 return 0;
1123 
1124         raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
1125         if (!cpuctx->hrtimer_active) {
1126                 cpuctx->hrtimer_active = 1;
1127                 hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1128                 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
1129         }
1130         raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
1131 
1132         return 0;
1133 }
1134 
1135 void perf_pmu_disable(struct pmu *pmu)
1136 {
1137         int *count = this_cpu_ptr(pmu->pmu_disable_count);
1138         if (!(*count)++)
1139                 pmu->pmu_disable(pmu);
1140 }
1141 
1142 void perf_pmu_enable(struct pmu *pmu)
1143 {
1144         int *count = this_cpu_ptr(pmu->pmu_disable_count);
1145         if (!--(*count))
1146                 pmu->pmu_enable(pmu);
1147 }
1148 
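/*
 * Sketch of the nesting contract above (added for clarity, not in the
 * original): disable/enable calls may nest, the PMU is only really
 * disabled on the 0->1 transition and re-enabled on the matching 1->0
 * transition, so a typical critical section (as in perf_cgroup_switch())
 * looks like:
 *
 *      perf_pmu_disable(pmu);
 *      ... reprogram or reschedule events ...
 *      perf_pmu_enable(pmu);
 */
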
1149 static DEFINE_PER_CPU(struct list_head, active_ctx_list);
1150 
1151 /*
1152  * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
1153  * perf_event_task_tick() are fully serialized because they're strictly cpu
1154  * affine and perf_event_ctx{activate,deactivate} are called with IRQs
1155  * disabled, while perf_event_task_tick is called from IRQ context.
1156  */
1157 static void perf_event_ctx_activate(struct perf_event_context *ctx)
1158 {
1159         struct list_head *head = this_cpu_ptr(&active_ctx_list);
1160 
1161         lockdep_assert_irqs_disabled();
1162 
1163         WARN_ON(!list_empty(&ctx->active_ctx_list));
1164 
1165         list_add(&ctx->active_ctx_list, head);
1166 }
1167 
1168 static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
1169 {
1170         lockdep_assert_irqs_disabled();
1171 
1172         WARN_ON(list_empty(&ctx->active_ctx_list));
1173 
1174         list_del_init(&ctx->active_ctx_list);
1175 }
1176 
1177 static void get_ctx(struct perf_event_context *ctx)
1178 {
1179         refcount_inc(&ctx->refcount);
1180 }
1181 
1182 static void *alloc_task_ctx_data(struct pmu *pmu)
1183 {
1184         if (pmu->task_ctx_cache)
1185                 return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
1186 
1187         return NULL;
1188 }
1189 
1190 static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
1191 {
1192         if (pmu->task_ctx_cache && task_ctx_data)
1193                 kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
1194 }
1195 
1196 static void free_ctx(struct rcu_head *head)
1197 {
1198         struct perf_event_context *ctx;
1199 
1200         ctx = container_of(head, struct perf_event_context, rcu_head);
1201         free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
1202         kfree(ctx);
1203 }
1204 
1205 static void put_ctx(struct perf_event_context *ctx)
1206 {
1207         if (refcount_dec_and_test(&ctx->refcount)) {
1208                 if (ctx->parent_ctx)
1209                         put_ctx(ctx->parent_ctx);
1210                 if (ctx->task && ctx->task != TASK_TOMBSTONE)
1211                         put_task_struct(ctx->task);
1212                 call_rcu(&ctx->rcu_head, free_ctx);
1213         }
1214 }
1215 
1216 /*
1217  * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
1218  * perf_pmu_migrate_context() we need some magic.
1219  *
1220  * Those places that change perf_event::ctx will hold both
1221  * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
1222  *
1223  * Lock ordering is by mutex address. There are two other sites where
1224  * perf_event_context::mutex nests and those are:
1225  *
1226  *  - perf_event_exit_task_context()    [ child , 0 ]
1227  *      perf_event_exit_event()
1228  *        put_event()                   [ parent, 1 ]
1229  *
1230  *  - perf_event_init_context()         [ parent, 0 ]
1231  *      inherit_task_group()
1232  *        inherit_group()
1233  *          inherit_event()
1234  *            perf_event_alloc()
1235  *              perf_init_event()
1236  *                perf_try_init_event() [ child , 1 ]
1237  *
1238  * While it appears there is an obvious deadlock here (the parent and child
1239  * nesting levels are inverted between the two), this is in fact safe because
1240  * life-time rules separate them: an exiting task cannot fork, and a
1241  * spawning task cannot (yet) exit.
1242  *
1243  * But remember that these are parent<->child context relations, and
1244  * migration does not affect children, therefore these two orderings should not
1245  * interact.
1246  *
1247  * The change in perf_event::ctx does not affect children (as claimed above)
1248  * because the sys_perf_event_open() case will install a new event and break
1249  * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
1250  * concerned with cpuctx and that doesn't have children.
1251  *
1252  * The places that change perf_event::ctx will issue:
1253  *
1254  *   perf_remove_from_context();
1255  *   synchronize_rcu();
1256  *   perf_install_in_context();
1257  *
1258  * to effect the change. The remove_from_context() + synchronize_rcu() should
1259  * quiesce the event, after which we can install it in the new location. This
1260  * means that only external vectors (perf_fops, prctl) can perturb the event
1261  * while in transit. Therefore all such accessors should also acquire
1262  * perf_event_context::mutex to serialize against this.
1263  *
1264  * However; because event->ctx can change while we're waiting to acquire
1265  * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
1266  * function.
1267  *
1268  * Lock order:
1269  *    exec_update_lock
1270  *      task_struct::perf_event_mutex
1271  *        perf_event_context::mutex
1272  *          perf_event::child_mutex;
1273  *            perf_event_context::lock
1274  *          perf_event::mmap_mutex
1275  *          mmap_lock
1276  *            perf_addr_filters_head::lock
1277  *
1278  *    cpu_hotplug_lock
1279  *      pmus_lock
1280  *        cpuctx->mutex / perf_event_context::mutex
1281  */
1282 static struct perf_event_context *
1283 perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
1284 {
1285         struct perf_event_context *ctx;
1286 
1287 again:
1288         rcu_read_lock();
1289         ctx = READ_ONCE(event->ctx);
1290         if (!refcount_inc_not_zero(&ctx->refcount)) {
1291                 rcu_read_unlock();
1292                 goto again;
1293         }
1294         rcu_read_unlock();
1295 
1296         mutex_lock_nested(&ctx->mutex, nesting);
1297         if (event->ctx != ctx) {
1298                 mutex_unlock(&ctx->mutex);
1299                 put_ctx(ctx);
1300                 goto again;
1301         }
1302 
1303         return ctx;
1304 }
1305 
1306 static inline struct perf_event_context *
1307 perf_event_ctx_lock(struct perf_event *event)
1308 {
1309         return perf_event_ctx_lock_nested(event, 0);
1310 }
1311 
1312 static void perf_event_ctx_unlock(struct perf_event *event,
1313                                   struct perf_event_context *ctx)
1314 {
1315         mutex_unlock(&ctx->mutex);
1316         put_ctx(ctx);
1317 }
1318 
1319 /*
1320  * This must be done under the ctx->lock, such as to serialize against
1321  * context_equiv(), therefore we cannot call put_ctx() since that might end up
1322  * calling scheduler related locks and ctx->lock nests inside those.
1323  */
1324 static __must_check struct perf_event_context *
1325 unclone_ctx(struct perf_event_context *ctx)
1326 {
1327         struct perf_event_context *parent_ctx = ctx->parent_ctx;
1328 
1329         lockdep_assert_held(&ctx->lock);
1330 
1331         if (parent_ctx)
1332                 ctx->parent_ctx = NULL;
1333         ctx->generation++;
1334 
1335         return parent_ctx;
1336 }
1337 
1338 static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
1339                                 enum pid_type type)
1340 {
1341         u32 nr;
1342         /*
1343          * only top level events have the pid namespace they were created in
1344          */
1345         if (event->parent)
1346                 event = event->parent;
1347 
1348         nr = __task_pid_nr_ns(p, type, event->ns);
1349         /* avoid -1 if it is idle thread or runs in another ns */
1350         if (!nr && !pid_alive(p))
1351                 nr = -1;
1352         return nr;
1353 }
1354 
1355 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1356 {
1357         return perf_event_pid_type(event, p, PIDTYPE_TGID);
1358 }
1359 
1360 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1361 {
1362         return perf_event_pid_type(event, p, PIDTYPE_PID);
1363 }
1364 
1365 /*
1366  * If we inherit events we want to return the parent event id
1367  * to userspace.
1368  */
1369 static u64 primary_event_id(struct perf_event *event)
1370 {
1371         u64 id = event->id;
1372 
1373         if (event->parent)
1374                 id = event->parent->id;
1375 
1376         return id;
1377 }
1378 
1379 /*
1380  * Get the perf_event_context for a task and lock it.
1381  *
1382  * This has to cope with the fact that until it is locked,
1383  * the context could get moved to another task.
1384  */
1385 static struct perf_event_context *
1386 perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1387 {
1388         struct perf_event_context *ctx;
1389 
1390 retry:
1391         /*
1392          * One of the few rules of preemptible RCU is that one cannot do
1393          * rcu_read_unlock() while holding a scheduler (or nested) lock when
1394          * part of the read side critical section was irqs-enabled -- see
1395          * rcu_read_unlock_special().
1396          *
1397          * Since ctx->lock nests under rq->lock we must ensure the entire read
1398          * side critical section has interrupts disabled.
1399          */
1400         local_irq_save(*flags);
1401         rcu_read_lock();
1402         ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1403         if (ctx) {
1404                 /*
1405                  * If this context is a clone of another, it might
1406                  * get swapped for another underneath us by
1407                  * perf_event_task_sched_out, though the
1408                  * rcu_read_lock() protects us from any context
1409                  * getting freed.  Lock the context and check if it
1410                  * got swapped before we could get the lock, and retry
1411                  * if so.  If we locked the right context, then it
1412                  * can't get swapped on us any more.
1413                  */
1414                 raw_spin_lock(&ctx->lock);
1415                 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1416                         raw_spin_unlock(&ctx->lock);
1417                         rcu_read_unlock();
1418                         local_irq_restore(*flags);
1419                         goto retry;
1420                 }
1421 
1422                 if (ctx->task == TASK_TOMBSTONE ||
1423                     !refcount_inc_not_zero(&ctx->refcount)) {
1424                         raw_spin_unlock(&ctx->lock);
1425                         ctx = NULL;
1426                 } else {
1427                         WARN_ON_ONCE(ctx->task != task);
1428                 }
1429         }
1430         rcu_read_unlock();
1431         if (!ctx)
1432                 local_irq_restore(*flags);
1433         return ctx;
1434 }
1435 
1436 /*
1437  * Get the context for a task and increment its pin_count so it
1438  * can't get swapped to another task.  This also increments its
1439  * reference count so that the context can't get freed.
1440  */
1441 static struct perf_event_context *
1442 perf_pin_task_context(struct task_struct *task, int ctxn)
1443 {
1444         struct perf_event_context *ctx;
1445         unsigned long flags;
1446 
1447         ctx = perf_lock_task_context(task, ctxn, &flags);
1448         if (ctx) {
1449                 ++ctx->pin_count;
1450                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1451         }
1452         return ctx;
1453 }
1454 
1455 static void perf_unpin_context(struct perf_event_context *ctx)
1456 {
1457         unsigned long flags;
1458 
1459         raw_spin_lock_irqsave(&ctx->lock, flags);
1460         --ctx->pin_count;
1461         raw_spin_unlock_irqrestore(&ctx->lock, flags);
1462 }
1463 
1464 /*
1465  * Update the record of the current time in a context.
1466  */
1467 static void __update_context_time(struct perf_event_context *ctx, bool adv)
1468 {
1469         u64 now = perf_clock();
1470 
1471         if (adv)
1472                 ctx->time += now - ctx->timestamp;
1473         ctx->timestamp = now;
1474 
1475         /*
1476          * The above: time' = time + (now - timestamp), can be re-arranged
1477          * into: time' = now + (time - timestamp), which gives a single offset
1478          * value with which to compute a future time without taking locks.
1479          *
1480          * See perf_event_time_now(), which can be used from NMI context where
1481          * it's (obviously) not possible to acquire ctx->lock in order to read
1482          * both the above values in a consistent manner.
1483          */
1484         WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
1485 }
1486 
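/*
 * Worked example (added for clarity, not in the original): if the last
 * locked update left time = 400ns at timestamp = 1000ns, then
 * timeoffset = 400 - 1000 = -600.  A lockless reader at now = 1250ns
 * computes now + timeoffset = 1250 - 600 = 650ns, the same value the
 * locked path would produce: 400 + (1250 - 1000) = 650ns.
 */
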
1487 static void update_context_time(struct perf_event_context *ctx)
1488 {
1489         __update_context_time(ctx, true);
1490 }
1491 
1492 static u64 perf_event_time(struct perf_event *event)
1493 {
1494         struct perf_event_context *ctx = event->ctx;
1495 
1496         if (unlikely(!ctx))
1497                 return 0;
1498 
1499         if (is_cgroup_event(event))
1500                 return perf_cgroup_event_time(event);
1501 
1502         return ctx->time;
1503 }
1504 
1505 static u64 perf_event_time_now(struct perf_event *event, u64 now)
1506 {
1507         struct perf_event_context *ctx = event->ctx;
1508 
1509         if (unlikely(!ctx))
1510                 return 0;
1511 
1512         if (is_cgroup_event(event))
1513                 return perf_cgroup_event_time_now(event, now);
1514 
1515         if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
1516                 return ctx->time;
1517 
1518         now += READ_ONCE(ctx->timeoffset);
1519         return now;
1520 }
1521 
1522 static enum event_type_t get_event_type(struct perf_event *event)
1523 {
1524         struct perf_event_context *ctx = event->ctx;
1525         enum event_type_t event_type;
1526 
1527         lockdep_assert_held(&ctx->lock);
1528 
1529         /*
1530          * It's 'group type', really, because if our group leader is
1531          * pinned, so are we.
1532          */
1533         if (event->group_leader != event)
1534                 event = event->group_leader;
1535 
1536         event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
1537         if (!ctx->task)
1538                 event_type |= EVENT_CPU;
1539 
1540         return event_type;
1541 }
1542 
1543 /*
1544  * Helper function to initialize event group nodes.
1545  */
1546 static void init_event_group(struct perf_event *event)
1547 {
1548         RB_CLEAR_NODE(&event->group_node);
1549         event->group_index = 0;
1550 }
1551 
1552 /*
1553  * Extract pinned or flexible groups from the context
1554  * based on event attrs bits.
1555  */
1556 static struct perf_event_groups *
1557 get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
1558 {
1559         if (event->attr.pinned)
1560                 return &ctx->pinned_groups;
1561         else
1562                 return &ctx->flexible_groups;
1563 }
1564 
1565 /*
1566  * Helper function to initialize perf_event_groups trees.
1567  */
1568 static void perf_event_groups_init(struct perf_event_groups *groups)
1569 {
1570         groups->tree = RB_ROOT;
1571         groups->index = 0;
1572 }
1573 
1574 static inline struct cgroup *event_cgroup(const struct perf_event *event)
1575 {
1576         struct cgroup *cgroup = NULL;
1577 
1578 #ifdef CONFIG_CGROUP_PERF
1579         if (event->cgrp)
1580                 cgroup = event->cgrp->css.cgroup;
1581 #endif
1582 
1583         return cgroup;
1584 }
1585 
1586 /*
1587  * Compare function for event groups:
1588  *
1589  * Implements a composite key that sorts by CPU, then by cgroup id (under
1590  * CONFIG_CGROUP_PERF), then by a virtual index that orders rotation within a CPU.
1591  */
1592 static __always_inline int
1593 perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
1594                       const u64 left_group_index, const struct perf_event *right)
1595 {
1596         if (left_cpu < right->cpu)
1597                 return -1;
1598         if (left_cpu > right->cpu)
1599                 return 1;
1600 
1601 #ifdef CONFIG_CGROUP_PERF
1602         {
1603                 const struct cgroup *right_cgroup = event_cgroup(right);
1604 
1605                 if (left_cgroup != right_cgroup) {
1606                         if (!left_cgroup) {
1607                                 /*
1608                                  * Left has no cgroup but right does;
1609                                  * events without a cgroup come first.
1610                                  */
1611                                 return -1;
1612                         }
1613                         if (!right_cgroup) {
1614                                 /*
1615                                  * Right has no cgroup but left does;
1616                                  * events without a cgroup come first.
1617                                  */
1618                                 return 1;
1619                         }
1620                         /* Two dissimilar cgroups, order by id. */
1621                         if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup))
1622                                 return -1;
1623 
1624                         return 1;
1625                 }
1626         }
1627 #endif
1628 
1629         if (left_group_index < right->group_index)
1630                 return -1;
1631         if (left_group_index > right->group_index)
1632                 return 1;
1633 
1634         return 0;
1635 }
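
     /*
      * Ordering example (cgroup ids purely illustrative): with the comparator
      * above, {cpu=0, no cgroup, index=7} sorts before {cpu=0, cgroup id=5,
      * index=3}, which sorts before anything on cpu=1; CPU dominates, then
      * cgroup (events without a cgroup first, then by id), then insertion index.
      */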
1636 
1637 #define __node_2_pe(node) \
1638         rb_entry((node), struct perf_event, group_node)
1639 
1640 static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
1641 {
1642         struct perf_event *e = __node_2_pe(a);
1643         return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index,
1644                                      __node_2_pe(b)) < 0;
1645 }
1646 
1647 struct __group_key {
1648         int cpu;
1649         struct cgroup *cgroup;
1650 };
1651 
1652 static inline int __group_cmp(const void *key, const struct rb_node *node)
1653 {
1654         const struct __group_key *a = key;
1655         const struct perf_event *b = __node_2_pe(node);
1656 
1657         /* partial/subtree match: @cpu, @cgroup; ignore: @group_index */
1658         return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b);
1659 }
1660 
1661 /*
1662  * Insert @event into @groups' tree; using {@event->cpu, event_cgroup(@event),
1663  * ++@groups->index} as the key (see __group_less()). This places it last
1664  * inside its {cpu, cgroup} subtree.
1665  */
1666 static void
1667 perf_event_groups_insert(struct perf_event_groups *groups,
1668                          struct perf_event *event)
1669 {
1670         event->group_index = ++groups->index;
1671 
1672         rb_add(&event->group_node, &groups->tree, __group_less);
1673 }
1674 
1675 /*
1676  * Helper function to insert event into the pinned or flexible groups.
1677  */
1678 static void
1679 add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
1680 {
1681         struct perf_event_groups *groups;
1682 
1683         groups = get_event_groups(event, ctx);
1684         perf_event_groups_insert(groups, event);
1685 }
1686 
1687 /*
1688  * Delete a group from a tree.
1689  */
1690 static void
1691 perf_event_groups_delete(struct perf_event_groups *groups,
1692                          struct perf_event *event)
1693 {
1694         WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
1695                      RB_EMPTY_ROOT(&groups->tree));
1696 
1697         rb_erase(&event->group_node, &groups->tree);
1698         init_event_group(event);
1699 }
1700 
1701 /*
1702  * Helper function to delete event from its groups.
1703  */
1704 static void
1705 del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
1706 {
1707         struct perf_event_groups *groups;
1708 
1709         groups = get_event_groups(event, ctx);
1710         perf_event_groups_delete(groups, event);
1711 }
1712 
1713 /*
1714  * Get the leftmost event in the cpu/cgroup subtree.
1715  */
1716 static struct perf_event *
1717 perf_event_groups_first(struct perf_event_groups *groups, int cpu,
1718                         struct cgroup *cgrp)
1719 {
1720         struct __group_key key = {
1721                 .cpu = cpu,
1722                 .cgroup = cgrp,
1723         };
1724         struct rb_node *node;
1725 
1726         node = rb_find_first(&key, &groups->tree, __group_cmp);
1727         if (node)
1728                 return __node_2_pe(node);
1729 
1730         return NULL;
1731 }
1732 
1733 /*
1734  * Like rb_entry_next_safe() for the @cpu/@cgroup subtree.
1735  */
1736 static struct perf_event *
1737 perf_event_groups_next(struct perf_event *event)
1738 {
1739         struct __group_key key = {
1740                 .cpu = event->cpu,
1741                 .cgroup = event_cgroup(event),
1742         };
1743         struct rb_node *next;
1744 
1745         next = rb_next_match(&key, &event->group_node, __group_cmp);
1746         if (next)
1747                 return __node_2_pe(next);
1748 
1749         return NULL;
1750 }
1751 
1752 /*
1753  * Iterate through the whole groups tree.
1754  */
1755 #define perf_event_groups_for_each(event, groups)                       \
1756         for (event = rb_entry_safe(rb_first(&((groups)->tree)),         \
1757                                 typeof(*event), group_node); event;     \
1758                 event = rb_entry_safe(rb_next(&event->group_node),      \
1759                                 typeof(*event), group_node))
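
     /*
      * Minimal usage sketch of the iterator above; the function name is
      * illustrative only and not used elsewhere. Assumes ctx->lock is held,
      * as required for accessing the group trees.
      */
     static inline void perf_event_groups_print_pinned(struct perf_event_context *ctx)
     {
             struct perf_event *event;

             lockdep_assert_held(&ctx->lock);

             perf_event_groups_for_each(event, &ctx->pinned_groups)
                     pr_debug("pinned leader: cpu=%d group_index=%llu\n",
                              event->cpu, event->group_index);
     }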
1760 
1761 /*
1762  * Add an event to the lists for its context.
1763  * Must be called with ctx->mutex and ctx->lock held.
1764  */
1765 static void
1766 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1767 {
1768         lockdep_assert_held(&ctx->lock);
1769 
1770         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1771         event->attach_state |= PERF_ATTACH_CONTEXT;
1772 
1773         event->tstamp = perf_event_time(event);
1774 
1775         /*
1776          * If we're a standalone event or group leader, we go onto the context
1777          * list; group events are kept attached to the group so that
1778          * perf_group_detach() can, at all times, locate all siblings.
1779          */
1780         if (event->group_leader == event) {
1781                 event->group_caps = event->event_caps;
1782                 add_event_to_groups(event, ctx);
1783         }
1784 
1785         list_add_rcu(&event->event_entry, &ctx->event_list);
1786         ctx->nr_events++;
1787         if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
1788                 ctx->nr_user++;
1789         if (event->attr.inherit_stat)
1790                 ctx->nr_stat++;
1791 
1792         if (event->state > PERF_EVENT_STATE_OFF)
1793                 perf_cgroup_event_enable(event, ctx);
1794 
1795         ctx->generation++;
1796 }
1797 
1798 /*
1799  * Initialize event state based on the perf_event_attr::disabled.
1800  */
1801 static inline void perf_event__state_init(struct perf_event *event)
1802 {
1803         event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1804                                               PERF_EVENT_STATE_INACTIVE;
1805 }
1806 
1807 static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1808 {
1809         int entry = sizeof(u64); /* value */
1810         int size = 0;
1811         int nr = 1;
1812 
1813         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1814                 size += sizeof(u64);
1815 
1816         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1817                 size += sizeof(u64);
1818 
1819         if (event->attr.read_format & PERF_FORMAT_ID)
1820                 entry += sizeof(u64);
1821 
1822         if (event->attr.read_format & PERF_FORMAT_GROUP) {
1823                 nr += nr_siblings;
1824                 size += sizeof(u64);
1825         }
1826 
1827         size += entry * nr;
1828         event->read_size = size;
1829 }
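
     /*
      * Worked example: read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
      * PERF_FORMAT_ID | PERF_FORMAT_GROUP with two siblings gives
      * entry = 8 (value) + 8 (id) = 16, nr = 1 + 2 = 3, and
      * read_size = 8 (time_enabled) + 8 (nr) + 16 * 3 = 64 bytes.
      */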
1830 
1831 static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1832 {
1833         struct perf_sample_data *data;
1834         u16 size = 0;
1835 
1836         if (sample_type & PERF_SAMPLE_IP)
1837                 size += sizeof(data->ip);
1838 
1839         if (sample_type & PERF_SAMPLE_ADDR)
1840                 size += sizeof(data->addr);
1841 
1842         if (sample_type & PERF_SAMPLE_PERIOD)
1843                 size += sizeof(data->period);
1844 
1845         if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
1846                 size += sizeof(data->weight.full);
1847 
1848         if (sample_type & PERF_SAMPLE_READ)
1849                 size += event->read_size;
1850 
1851         if (sample_type & PERF_SAMPLE_DATA_SRC)
1852                 size += sizeof(data->data_src.val);
1853 
1854         if (sample_type & PERF_SAMPLE_TRANSACTION)
1855                 size += sizeof(data->txn);
1856 
1857         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
1858                 size += sizeof(data->phys_addr);
1859 
1860         if (sample_type & PERF_SAMPLE_CGROUP)
1861                 size += sizeof(data->cgroup);
1862 
1863         if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
1864                 size += sizeof(data->data_page_size);
1865 
1866         if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
1867                 size += sizeof(data->code_page_size);
1868 
1869         event->header_size = size;
1870 }
1871 
1872 /*
1873  * Called at perf_event creation and when events are attached/detached from a
1874  * group.
1875  */
1876 static void perf_event__header_size(struct perf_event *event)
1877 {
1878         __perf_event_read_size(event,
1879                                event->group_leader->nr_siblings);
1880         __perf_event_header_size(event, event->attr.sample_type);
1881 }
1882 
1883 static void perf_event__id_header_size(struct perf_event *event)
1884 {
1885         struct perf_sample_data *data;
1886         u64 sample_type = event->attr.sample_type;
1887         u16 size = 0;
1888 
1889         if (sample_type & PERF_SAMPLE_TID)
1890                 size += sizeof(data->tid_entry);
1891 
1892         if (sample_type & PERF_SAMPLE_TIME)
1893                 size += sizeof(data->time);
1894 
1895         if (sample_type & PERF_SAMPLE_IDENTIFIER)
1896                 size += sizeof(data->id);
1897 
1898         if (sample_type & PERF_SAMPLE_ID)
1899                 size += sizeof(data->id);
1900 
1901         if (sample_type & PERF_SAMPLE_STREAM_ID)
1902                 size += sizeof(data->stream_id);
1903 
1904         if (sample_type & PERF_SAMPLE_CPU)
1905                 size += sizeof(data->cpu_entry);
1906 
1907         event->id_header_size = size;
1908 }
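
     /*
      * Worked example: sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_TIME |
      * PERF_SAMPLE_IDENTIFIER gives 8 (tid_entry) + 8 (time) + 8 (id) = 24
      * bytes of id header per record.
      */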
1909 
1910 static bool perf_event_validate_size(struct perf_event *event)
1911 {
1912         /*
1913          * The values computed here will be overwritten when we actually
1914          * attach the event.
1915          */
1916         __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1917         __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1918         perf_event__id_header_size(event);
1919 
1920         /*
1921          * Sum the lot; should not exceed the 64k limit we have on records.
1922          * Conservative limit to allow for callchains and other variable fields.
1923          */
1924         if (event->read_size + event->header_size +
1925             event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1926                 return false;
1927 
1928         return true;
1929 }
1930 
1931 static void perf_group_attach(struct perf_event *event)
1932 {
1933         struct perf_event *group_leader = event->group_leader, *pos;
1934 
1935         lockdep_assert_held(&event->ctx->lock);
1936 
1937         /*
1938          * We can have double attach due to group movement in perf_event_open.
1939          */
1940         if (event->attach_state & PERF_ATTACH_GROUP)
1941                 return;
1942 
1943         event->attach_state |= PERF_ATTACH_GROUP;
1944 
1945         if (group_leader == event)
1946                 return;
1947 
1948         WARN_ON_ONCE(group_leader->ctx != event->ctx);
1949 
1950         group_leader->group_caps &= event->event_caps;
1951 
1952         list_add_tail(&event->sibling_list, &group_leader->sibling_list);
1953         group_leader->nr_siblings++;
1954 
1955         perf_event__header_size(group_leader);
1956 
1957         for_each_sibling_event(pos, group_leader)
1958                 perf_event__header_size(pos);
1959 }
1960 
1961 /*
1962  * Remove an event from the lists for its context.
1963  * Must be called with ctx->mutex and ctx->lock held.
1964  */
1965 static void
1966 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1967 {
1968         WARN_ON_ONCE(event->ctx != ctx);
1969         lockdep_assert_held(&ctx->lock);
1970 
1971         /*
1972          * We can have double detach due to exit/hot-unplug + close.
1973          */
1974         if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1975                 return;
1976 
1977         event->attach_state &= ~PERF_ATTACH_CONTEXT;
1978 
1979         ctx->nr_events--;
1980         if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
1981                 ctx->nr_user--;
1982         if (event->attr.inherit_stat)
1983                 ctx->nr_stat--;
1984 
1985         list_del_rcu(&event->event_entry);
1986 
1987         if (event->group_leader == event)
1988                 del_event_from_groups(event, ctx);
1989 
1990         /*
1991          * If event was in error state, then keep it
1992          * that way, otherwise bogus counts will be
1993          * returned on read(). The only way to get out
1994          * of error state is by explicit re-enabling
1995          * of the event
1996          */
1997         if (event->state > PERF_EVENT_STATE_OFF) {
1998                 perf_cgroup_event_disable(event, ctx);
1999                 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2000         }
2001 
2002         ctx->generation++;
2003 }
2004 
2005 static int
2006 perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
2007 {
2008         if (!has_aux(aux_event))
2009                 return 0;
2010 
2011         if (!event->pmu->aux_output_match)
2012                 return 0;
2013 
2014         return event->pmu->aux_output_match(aux_event);
2015 }
2016 
2017 static void put_event(struct perf_event *event);
2018 static void event_sched_out(struct perf_event *event,
2019                             struct perf_cpu_context *cpuctx,
2020                             struct perf_event_context *ctx);
2021 
2022 static void perf_put_aux_event(struct perf_event *event)
2023 {
2024         struct perf_event_context *ctx = event->ctx;
2025         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2026         struct perf_event *iter;
2027 
2028         /*
2029          * If the event uses an aux_event, tear down the link.
2030          */
2031         if (event->aux_event) {
2032                 iter = event->aux_event;
2033                 event->aux_event = NULL;
2034                 put_event(iter);
2035                 return;
2036         }
2037 
2038         /*
2039          * If the event is an aux_event, tear down all links to
2040          * it from other events.
2041          */
2042         for_each_sibling_event(iter, event->group_leader) {
2043                 if (iter->aux_event != event)
2044                         continue;
2045 
2046                 iter->aux_event = NULL;
2047                 put_event(event);
2048 
2049                 /*
2050                  * If it's ACTIVE, schedule it out and put it into ERROR
2051                  * state so that we don't try to schedule it again. Note
2052                  * that perf_event_enable() will clear the ERROR status.
2053                  */
2054                 event_sched_out(iter, cpuctx, ctx);
2055                 perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2056         }
2057 }
2058 
2059 static bool perf_need_aux_event(struct perf_event *event)
2060 {
2061         return !!event->attr.aux_output || !!event->attr.aux_sample_size;
2062 }
2063 
2064 static int perf_get_aux_event(struct perf_event *event,
2065                               struct perf_event *group_leader)
2066 {
2067         /*
2068          * Our group leader must be an aux event if we want to be
2069          * an aux_output. This way, the aux event will precede its
2070          * aux_output events in the group, and therefore will always
2071          * schedule first.
2072          */
2073         if (!group_leader)
2074                 return 0;
2075 
2076         /*
2077          * aux_output and aux_sample_size are mutually exclusive.
2078          */
2079         if (event->attr.aux_output && event->attr.aux_sample_size)
2080                 return 0;
2081 
2082         if (event->attr.aux_output &&
2083             !perf_aux_output_match(event, group_leader))
2084                 return 0;
2085 
2086         if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
2087                 return 0;
2088 
2089         if (!atomic_long_inc_not_zero(&group_leader->refcount))
2090                 return 0;
2091 
2092         /*
2093          * Link aux_outputs to their aux event; this is undone in
2094          * perf_group_detach() by perf_put_aux_event(). When the
2095          * group is torn down, the aux_output events lose their
2096          * link to the aux_event and can't schedule any more.
2097          */
2098         event->aux_event = group_leader;
2099 
2100         return 1;
2101 }
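
     /*
      * A concrete example of this pairing (hardware-specific, for illustration
      * only): a PEBS event with attr.aux_output set can direct its records into
      * the AUX buffer of an Intel PT group leader, which is exactly the
      * aux_event link established above.
      */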
2102 
2103 static inline struct list_head *get_event_list(struct perf_event *event)
2104 {
2105         struct perf_event_context *ctx = event->ctx;
2106         return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
2107 }
2108 
2109 /*
2110  * Events that have PERF_EV_CAP_SIBLING require being part of a group and
2111  * cannot exist on their own; schedule them out and move them into the ERROR
2112  * state. Also see _perf_event_enable(); it will not be able to recover
2113  * this ERROR state.
2114  */
2115 static inline void perf_remove_sibling_event(struct perf_event *event)
2116 {
2117         struct perf_event_context *ctx = event->ctx;
2118         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2119 
2120         event_sched_out(event, cpuctx, ctx);
2121         perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2122 }
2123 
2124 static void perf_group_detach(struct perf_event *event)
2125 {
2126         struct perf_event *leader = event->group_leader;
2127         struct perf_event *sibling, *tmp;
2128         struct perf_event_context *ctx = event->ctx;
2129 
2130         lockdep_assert_held(&ctx->lock);
2131 
2132         /*
2133          * We can have double detach due to exit/hot-unplug + close.
2134          */
2135         if (!(event->attach_state & PERF_ATTACH_GROUP))
2136                 return;
2137 
2138         event->attach_state &= ~PERF_ATTACH_GROUP;
2139 
2140         perf_put_aux_event(event);
2141 
2142         /*
2143          * If this is a sibling, remove it from its group.
2144          */
2145         if (leader != event) {
2146                 list_del_init(&event->sibling_list);
2147                 event->group_leader->nr_siblings--;
2148                 goto out;
2149         }
2150 
2151         /*
2152          * If this was a group event with sibling events then
2153          * upgrade the siblings to singleton events by adding them
2154          * to whatever list we are on.
2155          */
2156         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
2157 
2158                 if (sibling->event_caps & PERF_EV_CAP_SIBLING)
2159                         perf_remove_sibling_event(sibling);
2160 
2161                 sibling->group_leader = sibling;
2162                 list_del_init(&sibling->sibling_list);
2163 
2164                 /* Inherit group flags from the previous leader */
2165                 sibling->group_caps = event->group_caps;
2166 
2167                 if (!RB_EMPTY_NODE(&event->group_node)) {
2168                         add_event_to_groups(sibling, event->ctx);
2169 
2170                         if (sibling->state == PERF_EVENT_STATE_ACTIVE)
2171                                 list_add_tail(&sibling->active_list, get_event_list(sibling));
2172                 }
2173 
2174                 WARN_ON_ONCE(sibling->ctx != event->ctx);
2175         }
2176 
2177 out:
2178         for_each_sibling_event(tmp, leader)
2179                 perf_event__header_size(tmp);
2180 
2181         perf_event__header_size(leader);
2182 }
2183 
2184 static void sync_child_event(struct perf_event *child_event);
2185 
2186 static void perf_child_detach(struct perf_event *event)
2187 {
2188         struct perf_event *parent_event = event->parent;
2189 
2190         if (!(event->attach_state & PERF_ATTACH_CHILD))
2191                 return;
2192 
2193         event->attach_state &= ~PERF_ATTACH_CHILD;
2194 
2195         if (WARN_ON_ONCE(!parent_event))
2196                 return;
2197 
2198         lockdep_assert_held(&parent_event->child_mutex);
2199 
2200         sync_child_event(event);
2201         list_del_init(&event->child_list);
2202 }
2203 
2204 static bool is_orphaned_event(struct perf_event *event)
2205 {
2206         return event->state == PERF_EVENT_STATE_DEAD;
2207 }
2208 
2209 static inline int __pmu_filter_match(struct perf_event *event)
2210 {
2211         struct pmu *pmu = event->pmu;
2212         return pmu->filter_match ? pmu->filter_match(event) : 1;
2213 }
2214 
2215 /*
2216  * Check whether we should attempt to schedule an event group based on
2217  * PMU-specific filtering. An event group can consist of HW and SW events,
2218  * potentially with a SW leader, so we must check all the filters to
2219  * determine whether a group is schedulable.
2220  */
2221 static inline int pmu_filter_match(struct perf_event *event)
2222 {
2223         struct perf_event *sibling;
2224 
2225         if (!__pmu_filter_match(event))
2226                 return 0;
2227 
2228         for_each_sibling_event(sibling, event) {
2229                 if (!__pmu_filter_match(sibling))
2230                         return 0;
2231         }
2232 
2233         return 1;
2234 }
2235 
2236 static inline int
2237 event_filter_match(struct perf_event *event)
2238 {
2239         return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
2240                perf_cgroup_match(event) && pmu_filter_match(event);
2241 }
2242 
2243 static void
2244 event_sched_out(struct perf_event *event,
2245                   struct perf_cpu_context *cpuctx,
2246                   struct perf_event_context *ctx)
2247 {
2248         enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
2249 
2250         WARN_ON_ONCE(event->ctx != ctx);
2251         lockdep_assert_held(&ctx->lock);
2252 
2253         if (event->state != PERF_EVENT_STATE_ACTIVE)
2254                 return;
2255 
2256         /*
2257          * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
2258          * we can schedule events _OUT_ individually through things like
2259          * __perf_remove_from_context().
2260          */
2261         list_del_init(&event->active_list);
2262 
2263         perf_pmu_disable(event->pmu);
2264 
2265         event->pmu->del(event, 0);
2266         event->oncpu = -1;
2267 
2268         if (READ_ONCE(event->pending_disable) >= 0) {
2269                 WRITE_ONCE(event->pending_disable, -1);
2270                 perf_cgroup_event_disable(event, ctx);
2271                 state = PERF_EVENT_STATE_OFF;
2272         }
2273         perf_event_set_state(event, state);
2274 
2275         if (!is_software_event(event))
2276                 cpuctx->active_oncpu--;
2277         if (!--ctx->nr_active)
2278                 perf_event_ctx_deactivate(ctx);
2279         if (event->attr.freq && event->attr.sample_freq)
2280                 ctx->nr_freq--;
2281         if (event->attr.exclusive || !cpuctx->active_oncpu)
2282                 cpuctx->exclusive = 0;
2283 
2284         perf_pmu_enable(event->pmu);
2285 }
2286 
2287 static void
2288 group_sched_out(struct perf_event *group_event,
2289                 struct perf_cpu_context *cpuctx,
2290                 struct perf_event_context *ctx)
2291 {
2292         struct perf_event *event;
2293 
2294         if (group_event->state != PERF_EVENT_STATE_ACTIVE)
2295                 return;
2296 
2297         perf_pmu_disable(ctx->pmu);
2298 
2299         event_sched_out(group_event, cpuctx, ctx);
2300 
2301         /*
2302          * Schedule out siblings (if any):
2303          */
2304         for_each_sibling_event(event, group_event)
2305                 event_sched_out(event, cpuctx, ctx);
2306 
2307         perf_pmu_enable(ctx->pmu);
2308 }
2309 
2310 #define DETACH_GROUP    0x01UL
2311 #define DETACH_CHILD    0x02UL
2312 
2313 /*
2314  * Cross CPU call to remove a performance event
2315  *
2316  * We disable the event on the hardware level first. After that we
2317  * remove it from the context list.
2318  */
2319 static void
2320 __perf_remove_from_context(struct perf_event *event,
2321                            struct perf_cpu_context *cpuctx,
2322                            struct perf_event_context *ctx,
2323                            void *info)
2324 {
2325         unsigned long flags = (unsigned long)info;
2326 
2327         if (ctx->is_active & EVENT_TIME) {
2328                 update_context_time(ctx);
2329                 update_cgrp_time_from_cpuctx(cpuctx, false);
2330         }
2331 
2332         event_sched_out(event, cpuctx, ctx);
2333         if (flags & DETACH_GROUP)
2334                 perf_group_detach(event);
2335         if (flags & DETACH_CHILD)
2336                 perf_child_detach(event);
2337         list_del_event(event, ctx);
2338 
2339         if (!ctx->nr_events && ctx->is_active) {
2340                 if (ctx == &cpuctx->ctx)
2341                         update_cgrp_time_from_cpuctx(cpuctx, true);
2342 
2343                 ctx->is_active = 0;
2344                 ctx->rotate_necessary = 0;
2345                 if (ctx->task) {
2346                         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2347                         cpuctx->task_ctx = NULL;
2348                 }
2349         }
2350 }
2351 
2352 /*
2353  * Remove the event from a task's (or a CPU's) list of events.
2354  *
2355  * If event->ctx is a cloned context, callers must make sure that
2356  * every task struct that event->ctx->task could possibly point to
2357  * remains valid.  This is OK when called from perf_release since
2358  * that only calls us on the top-level context, which can't be a clone.
2359  * When called from perf_event_exit_task, it's OK because the
2360  * context has been detached from its task.
2361  */
2362 static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
2363 {
2364         struct perf_event_context *ctx = event->ctx;
2365 
2366         lockdep_assert_held(&ctx->mutex);
2367 
2368         /*
2369          * Because of perf_event_exit_task(), perf_remove_from_context() ought
2370          * to work in the face of TASK_TOMBSTONE, unlike every other
2371          * event_function_call() user.
2372          */
2373         raw_spin_lock_irq(&ctx->lock);
2374         /*
2375          * Cgroup events are per-cpu events, and must IPI because of
2376          * cgrp_cpuctx_list.
2377          */
2378         if (!ctx->is_active && !is_cgroup_event(event)) {
2379                 __perf_remove_from_context(event, __get_cpu_context(ctx),
2380                                            ctx, (void *)flags);
2381                 raw_spin_unlock_irq(&ctx->lock);
2382                 return;
2383         }
2384         raw_spin_unlock_irq(&ctx->lock);
2385 
2386         event_function_call(event, __perf_remove_from_context, (void *)flags);
2387 }
2388 
2389 /*
2390  * Cross CPU call to disable a performance event
2391  */
2392 static void __perf_event_disable(struct perf_event *event,
2393                                  struct perf_cpu_context *cpuctx,
2394                                  struct perf_event_context *ctx,
2395                                  void *info)
2396 {
2397         if (event->state < PERF_EVENT_STATE_INACTIVE)
2398                 return;
2399 
2400         if (ctx->is_active & EVENT_TIME) {
2401                 update_context_time(ctx);
2402                 update_cgrp_time_from_event(event);
2403         }
2404 
2405         if (event == event->group_leader)
2406                 group_sched_out(event, cpuctx, ctx);
2407         else
2408                 event_sched_out(event, cpuctx, ctx);
2409 
2410         perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2411         perf_cgroup_event_disable(event, ctx);
2412 }
2413 
2414 /*
2415  * Disable an event.
2416  *
2417  * If event->ctx is a cloned context, callers must make sure that
2418  * every task struct that event->ctx->task could possibly point to
2419  * remains valid.  This condition is satisfied when called through
2420  * perf_event_for_each_child or perf_event_for_each because they
2421  * hold the top-level event's child_mutex, so any descendant that
2422  * goes to exit will block in perf_event_exit_event().
2423  *
2424  * When called from perf_pending_event it's OK because event->ctx
2425  * is the current context on this CPU and preemption is disabled,
2426  * hence we can't get into perf_event_task_sched_out for this context.
2427  */
2428 static void _perf_event_disable(struct perf_event *event)
2429 {
2430         struct perf_event_context *ctx = event->ctx;
2431 
2432         raw_spin_lock_irq(&ctx->lock);
2433         if (event->state <= PERF_EVENT_STATE_OFF) {
2434                 raw_spin_unlock_irq(&ctx->lock);
2435                 return;
2436         }
2437         raw_spin_unlock_irq(&ctx->lock);
2438 
2439         event_function_call(event, __perf_event_disable, NULL);
2440 }
2441 
2442 void perf_event_disable_local(struct perf_event *event)
2443 {
2444         event_function_local(event, __perf_event_disable, NULL);
2445 }
2446 
2447 /*
2448  * Strictly speaking kernel users cannot create groups and therefore this
2449  * interface does not need the perf_event_ctx_lock() magic.
2450  */
2451 void perf_event_disable(struct perf_event *event)
2452 {
2453         struct perf_event_context *ctx;
2454 
2455         ctx = perf_event_ctx_lock(event);
2456         _perf_event_disable(event);
2457         perf_event_ctx_unlock(event, ctx);
2458 }
2459 EXPORT_SYMBOL_GPL(perf_event_disable);
2460 
2461 void perf_event_disable_inatomic(struct perf_event *event)
2462 {
2463         WRITE_ONCE(event->pending_disable, smp_processor_id());
2464         /* can fail, see perf_pending_event_disable() */
2465         irq_work_queue(&event->pending);
2466 }
2467 
2468 #define MAX_INTERRUPTS (~0ULL)
2469 
2470 static void perf_log_throttle(struct perf_event *event, int enable);
2471 static void perf_log_itrace_start(struct perf_event *event);
2472 
2473 static int
2474 event_sched_in(struct perf_event *event,
2475                  struct perf_cpu_context *cpuctx,
2476                  struct perf_event_context *ctx)
2477 {
2478         int ret = 0;
2479 
2480         WARN_ON_ONCE(event->ctx != ctx);
2481 
2482         lockdep_assert_held(&ctx->lock);
2483 
2484         if (event->state <= PERF_EVENT_STATE_OFF)
2485                 return 0;
2486 
2487         WRITE_ONCE(event->oncpu, smp_processor_id());
2488         /*
2489          * Order event::oncpu write to happen before the ACTIVE state is
2490          * visible. This allows perf_event_{stop,read}() to observe the correct
2491          * ->oncpu if it sees ACTIVE.
2492          */
2493         smp_wmb();
2494         perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
2495 
2496         /*
2497          * Unthrottle events: since we were scheduled we might have missed
2498          * several ticks already, and for a heavily scheduling task there is
2499          * little guarantee it'll get a tick in a timely manner.
2500          */
2501         if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
2502                 perf_log_throttle(event, 1);
2503                 event->hw.interrupts = 0;
2504         }
2505 
2506         perf_pmu_disable(event->pmu);
2507 
2508         perf_log_itrace_start(event);
2509 
2510         if (event->pmu->add(event, PERF_EF_START)) {
2511                 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2512                 event->oncpu = -1;
2513                 ret = -EAGAIN;
2514                 goto out;
2515         }
2516 
2517         if (!is_software_event(event))
2518                 cpuctx->active_oncpu++;
2519         if (!ctx->nr_active++)
2520                 perf_event_ctx_activate(ctx);
2521         if (event->attr.freq && event->attr.sample_freq)
2522                 ctx->nr_freq++;
2523 
2524         if (event->attr.exclusive)
2525                 cpuctx->exclusive = 1;
2526 
2527 out:
2528         perf_pmu_enable(event->pmu);
2529 
2530         return ret;
2531 }
2532 
2533 static int
2534 group_sched_in(struct perf_event *group_event,
2535                struct perf_cpu_context *cpuctx,
2536                struct perf_event_context *ctx)
2537 {
2538         struct perf_event *event, *partial_group = NULL;
2539         struct pmu *pmu = ctx->pmu;
2540 
2541         if (group_event->state == PERF_EVENT_STATE_OFF)
2542                 return 0;
2543 
2544         pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2545 
2546         if (event_sched_in(group_event, cpuctx, ctx))
2547                 goto error;
2548 
2549         /*
2550          * Schedule in siblings as one group (if any):
2551          */
2552         for_each_sibling_event(event, group_event) {
2553                 if (event_sched_in(event, cpuctx, ctx)) {
2554                         partial_group = event;
2555                         goto group_error;
2556                 }
2557         }
2558 
2559         if (!pmu->commit_txn(pmu))
2560                 return 0;
2561 
2562 group_error:
2563         /*
2564          * Groups can be scheduled in as one unit only, so undo any
2565          * partial group before returning; the events up to the failed
2566          * event are scheduled out normally.
2567          */
2568         for_each_sibling_event(event, group_event) {
2569                 if (event == partial_group)
2570                         break;
2571 
2572                 event_sched_out(event, cpuctx, ctx);
2573         }
2574         event_sched_out(group_event, cpuctx, ctx);
2575 
2576 error:
2577         pmu->cancel_txn(pmu);
2578         return -EAGAIN;
2579 }
2580 
2581 /*
2582  * Work out whether we can put this event group on the CPU now.
2583  */
2584 static int group_can_go_on(struct perf_event *event,
2585                            struct perf_cpu_context *cpuctx,
2586                            int can_add_hw)
2587 {
2588         /*
2589          * Groups consisting entirely of software events can always go on.
2590          */
2591         if (event->group_caps & PERF_EV_CAP_SOFTWARE)
2592                 return 1;
2593         /*
2594          * If an exclusive group is already on, no other hardware
2595          * events can go on.
2596          */
2597         if (cpuctx->exclusive)
2598                 return 0;
2599         /*
2600          * If this group is exclusive and there are already
2601          * events on the CPU, it can't go on.
2602          */
2603         if (event->attr.exclusive && !list_empty(get_event_list(event)))
2604                 return 0;
2605         /*
2606          * Otherwise, try to add it if all previous groups were able
2607          * to go on.
2608          */
2609         return can_add_hw;
2610 }
2611 
2612 static void add_event_to_ctx(struct perf_event *event,
2613                                struct perf_event_context *ctx)
2614 {
2615         list_add_event(event, ctx);
2616         perf_group_attach(event);
2617 }
2618 
2619 static void ctx_sched_out(struct perf_event_context *ctx,
2620                           struct perf_cpu_context *cpuctx,
2621                           enum event_type_t event_type);
2622 static void
2623 ctx_sched_in(struct perf_event_context *ctx,
2624              struct perf_cpu_context *cpuctx,
2625              enum event_type_t event_type);
2626 
2627 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2628                                struct perf_event_context *ctx,
2629                                enum event_type_t event_type)
2630 {
2631         if (!cpuctx->task_ctx)
2632                 return;
2633 
2634         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2635                 return;
2636 
2637         ctx_sched_out(ctx, cpuctx, event_type);
2638 }
2639 
2640 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2641                                 struct perf_event_context *ctx)
2642 {
2643         cpu_ctx_sched_in(cpuctx, EVENT_PINNED);
2644         if (ctx)
2645                 ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
2646         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
2647         if (ctx)
2648                 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
2649 }
2650 
2651 /*
2652  * We want to maintain the following priority of scheduling:
2653  *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
2654  *  - task pinned (EVENT_PINNED)
2655  *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
2656  *  - task flexible (EVENT_FLEXIBLE).
2657  *
2658  * In order to avoid unscheduling and scheduling back in everything every
2659  * time an event is added, only do it for the groups of equal priority and
2660  * below.
2661  *
2662  * This can be called after a batch operation on task events, in which case
2663  * event_type is a bit mask of the types of events involved. For CPU events,
2664  * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
2665  */
2666 static void ctx_resched(struct perf_cpu_context *cpuctx,
2667                         struct perf_event_context *task_ctx,
2668                         enum event_type_t event_type)
2669 {
2670         enum event_type_t ctx_event_type;
2671         bool cpu_event = !!(event_type & EVENT_CPU);
2672 
2673         /*
2674          * If pinned groups are involved, flexible groups also need to be
2675          * scheduled out.
2676          */
2677         if (event_type & EVENT_PINNED)
2678                 event_type |= EVENT_FLEXIBLE;
2679 
2680         ctx_event_type = event_type & EVENT_ALL;
2681 
2682         perf_pmu_disable(cpuctx->ctx.pmu);
2683         if (task_ctx)
2684                 task_ctx_sched_out(cpuctx, task_ctx, event_type);
2685 
2686         /*
2687          * Decide which cpu ctx groups to schedule out based on the types
2688          * of events that caused rescheduling:
2689          *  - EVENT_CPU: schedule out corresponding groups;
2690          *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
2691          *  - otherwise, do nothing more.
2692          */
2693         if (cpu_event)
2694                 cpu_ctx_sched_out(cpuctx, ctx_event_type);
2695         else if (ctx_event_type & EVENT_PINNED)
2696                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2697 
2698         perf_event_sched_in(cpuctx, task_ctx);
2699         perf_pmu_enable(cpuctx->ctx.pmu);
2700 }
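
     /*
      * Example of the above policy: installing a task pinned event passes
      * event_type == EVENT_PINNED, so the task's pinned and flexible groups and
      * the CPU's flexible groups are scheduled out, while CPU pinned groups are
      * left alone; perf_event_sched_in() then re-adds everything in priority
      * order.
      */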
2701 
2702 void perf_pmu_resched(struct pmu *pmu)
2703 {
2704         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2705         struct perf_event_context *task_ctx = cpuctx->task_ctx;
2706 
2707         perf_ctx_lock(cpuctx, task_ctx);
2708         ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
2709         perf_ctx_unlock(cpuctx, task_ctx);
2710 }
2711 
2712 /*
2713  * Cross CPU call to install and enable a performance event
2714  *
2715  * Very similar to remote_function() + event_function() but cannot assume that
2716  * things like ctx->is_active and cpuctx->task_ctx are set.
2717  */
2718 static int  __perf_install_in_context(void *info)
2719 {
2720         struct perf_event *event = info;
2721         struct perf_event_context *ctx = event->ctx;
2722         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2723         struct perf_event_context *task_ctx = cpuctx->task_ctx;
2724         bool reprogram = true;
2725         int ret = 0;
2726 
2727         raw_spin_lock(&cpuctx->ctx.lock);
2728         if (ctx->task) {
2729                 raw_spin_lock(&ctx->lock);
2730                 task_ctx = ctx;
2731 
2732                 reprogram = (ctx->task == current);
2733 
2734                 /*
2735                  * If the task is running, it must be running on this CPU,
2736                  * otherwise we cannot reprogram things.
2737                  *
2738                  * If it's not running, we don't care; ctx->lock will
2739                  * serialize against it becoming runnable.
2740                  */
2741                 if (task_curr(ctx->task) && !reprogram) {
2742                         ret = -ESRCH;
2743                         goto unlock;
2744                 }
2745 
2746                 WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2747         } else if (task_ctx) {
2748                 raw_spin_lock(&task_ctx->lock);
2749         }
2750 
2751 #ifdef CONFIG_CGROUP_PERF
2752         if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
2753                 /*
2754                  * If the current cgroup doesn't match the event's
2755                  * cgroup, we should not try to schedule it.
2756                  */
2757                 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
2758                 reprogram = cgroup_is_descendant(cgrp->css.cgroup,
2759                                         event->cgrp->css.cgroup);
2760         }
2761 #endif
2762 
2763         if (reprogram) {
2764                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2765                 add_event_to_ctx(event, ctx);
2766                 ctx_resched(cpuctx, task_ctx, get_event_type(event));
2767         } else {
2768                 add_event_to_ctx(event, ctx);
2769         }
2770 
2771 unlock:
2772         perf_ctx_unlock(cpuctx, task_ctx);
2773 
2774         return ret;
2775 }
2776 
2777 static bool exclusive_event_installable(struct perf_event *event,
2778                                         struct perf_event_context *ctx);
2779 
2780 /*
2781  * Attach a performance event to a context.
2782  *
2783  * Very similar to event_function_call, see comment there.
2784  */
2785 static void
2786 perf_install_in_context(struct perf_event_context *ctx,
2787                         struct perf_event *event,
2788                         int cpu)
2789 {
2790         struct task_struct *task = READ_ONCE(ctx->task);
2791 
2792         lockdep_assert_held(&ctx->mutex);
2793 
2794         WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
2795 
2796         if (event->cpu != -1)
2797                 event->cpu = cpu;
2798 
2799         /*
2800          * Ensures that if we can observe event->ctx, both the event and ctx
2801          * will be 'complete'. See perf_iterate_sb_cpu().
2802          */
2803         smp_store_release(&event->ctx, ctx);
2804 
2805         /*
2806          * perf_event_attr::disabled events will not run and can be initialized
2807          * without an IPI, except when this is the first event for the context;
2808          * in that case we need the magic of the IPI to set ctx->is_active.
2809          * Similarly, cgroup events for the context also need the IPI to
2810          * manipulate the cgrp_cpuctx_list.
2811          *
2812          * The IOC_ENABLE that is sure to follow the creation of a disabled
2813          * event will issue the IPI and reprogram the hardware.
2814          */
2815         if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF &&
2816             ctx->nr_events && !is_cgroup_event(event)) {
2817                 raw_spin_lock_irq(&ctx->lock);
2818                 if (ctx->task == TASK_TOMBSTONE) {
2819                         raw_spin_unlock_irq(&ctx->lock);
2820                         return;
2821                 }
2822                 add_event_to_ctx(event, ctx);
2823                 raw_spin_unlock_irq(&ctx->lock);
2824                 return;
2825         }
2826 
2827         if (!task) {
2828                 cpu_function_call(cpu, __perf_install_in_context, event);
2829                 return;
2830         }
2831 
2832         /*
2833          * Should not happen, we validate the ctx is still alive before calling.
2834          */
2835         if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2836                 return;
2837 
2838         /*
2839          * Installing events is tricky because we cannot rely on ctx->is_active
2840          * to be set in case this is the nr_events 0 -> 1 transition.
2841          *
2842          * Instead we use task_curr(), which tells us if the task is running.
2843          * However, since we use task_curr() outside of rq::lock, we can race
2844          * against the actual state. This means the result can be wrong.
2845          *
2846          * If we get a false positive, we retry, this is harmless.
2847          *
2848          * If we get a false negative, things are complicated. If we are after
2849          * perf_event_context_sched_in() ctx::lock will serialize us, and the
2850          * value must be correct. If we're before, it doesn't matter since
2851          * perf_event_context_sched_in() will program the counter.
2852          *
2853          * However, this hinges on the remote context switch having observed
2854          * our task->perf_event_ctxp[] store, such that it will in fact take
2855          * ctx::lock in perf_event_context_sched_in().
2856          *
2857          * We do this by task_function_call(); if the IPI fails to hit the task
2858          * we know any future context switch of the task must see the
2859          * perf_event_ctxp[] store.
2860          */
2861 
2862         /*
2863          * This smp_mb() orders the task->perf_event_ctxp[] store with the
2864          * task_cpu() load, such that if the IPI then does not find the task
2865          * running, a future context switch of that task must observe the
2866          * store.
2867          */
2868         smp_mb();
2869 again:
2870         if (!task_function_call(task, __perf_install_in_context, event))
2871                 return;
2872 
2873         raw_spin_lock_irq(&ctx->lock);
2874         task = ctx->task;
2875         if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
2876                 /*
2877                  * Cannot happen because we already checked above (which also
2878                  * cannot happen), and we hold ctx->mutex, which serializes us
2879                  * against perf_event_exit_task_context().
2880                  */
2881                 raw_spin_unlock_irq(&ctx->lock);
2882                 return;
2883         }
2884         /*
2885          * If the task is not running, ctx->lock will prevent it from becoming so,
2886          * thus we can safely install the event.
2887          */
2888         if (task_curr(task)) {
2889                 raw_spin_unlock_irq(&ctx->lock);
2890                 goto again;
2891         }
2892         add_event_to_ctx(event, ctx);
2893         raw_spin_unlock_irq(&ctx->lock);
2894 }
2895 
2896 /*
2897  * Cross CPU call to enable a performance event
2898  */
2899 static void __perf_event_enable(struct perf_event *event,
2900                                 struct perf_cpu_context *cpuctx,
2901                                 struct perf_event_context *ctx,
2902                                 void *info)
2903 {
2904         struct perf_event *leader = event->group_leader;
2905         struct perf_event_context *task_ctx;
2906 
2907         if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2908             event->state <= PERF_EVENT_STATE_ERROR)
2909                 return;
2910 
2911         if (ctx->is_active)
2912                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2913 
2914         perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2915         perf_cgroup_event_enable(event, ctx);
2916 
2917         if (!ctx->is_active)
2918                 return;
2919 
2920         if (!event_filter_match(event)) {
2921                 ctx_sched_in(ctx, cpuctx, EVENT_TIME);
2922                 return;
2923         }
2924 
2925         /*
2926          * If the event is in a group and isn't the group leader,
2927          * then don't put it on unless the group is on.
2928          */
2929         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
2930                 ctx_sched_in(ctx, cpuctx, EVENT_TIME);
2931                 return;
2932         }
2933 
2934         task_ctx = cpuctx->task_ctx;
2935         if (ctx->task)
2936                 WARN_ON_ONCE(task_ctx != ctx);
2937 
2938         ctx_resched(cpuctx, task_ctx, get_event_type(event));
2939 }
2940 
2941 /*
2942  * Enable an event.
2943  *
2944  * If event->ctx is a cloned context, callers must make sure that
2945  * every task struct that event->ctx->task could possibly point to
2946  * remains valid.  This condition is satisfied when called through
2947  * perf_event_for_each_child or perf_event_for_each as described
2948  * for perf_event_disable.
2949  */
2950 static void _perf_event_enable(struct perf_event *event)
2951 {
2952         struct perf_event_context *ctx = event->ctx;
2953 
2954         raw_spin_lock_irq(&ctx->lock);
2955         if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2956             event->state <  PERF_EVENT_STATE_ERROR) {
2957 out:
2958                 raw_spin_unlock_irq(&ctx->lock);
2959                 return;
2960         }
2961 
2962         /*
2963          * If the event is in error state, clear that first.
2964          *
2965          * That way, if we see the event in error state below, we know that it
2966          * has gone back into error state, as distinct from the task having
2967          * been scheduled away before the cross-call arrived.
2968          */
2969         if (event->state == PERF_EVENT_STATE_ERROR) {
2970                 /*
2971                  * Detached SIBLING events cannot leave ERROR state.
2972                  */
2973                 if (event->event_caps & PERF_EV_CAP_SIBLING &&
2974                     event->group_leader == event)
2975                         goto out;
2976 
2977                 event->state = PERF_EVENT_STATE_OFF;
2978         }
2979         raw_spin_unlock_irq(&ctx->lock);
2980 
2981         event_function_call(event, __perf_event_enable, NULL);
2982 }
2983 
2984 /*
2985  * See perf_event_disable();
2986  */
2987 void perf_event_enable(struct perf_event *event)
2988 {
2989         struct perf_event_context *ctx;
2990 
2991         ctx = perf_event_ctx_lock(event);
2992         _perf_event_enable(event);
2993         perf_event_ctx_unlock(event, ctx);
2994 }
2995 EXPORT_SYMBOL_GPL(perf_event_enable);
2996 
2997 struct stop_event_data {
2998         struct perf_event       *event;
2999         unsigned int            restart;
3000 };
3001 
3002 static int __perf_event_stop(void *info)
3003 {
3004         struct stop_event_data *sd = info;
3005         struct perf_event *event = sd->event;
3006 
3007         /* if it's already INACTIVE, do nothing */
3008         if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3009                 return 0;
3010 
3011         /* matches smp_wmb() in event_sched_in() */
3012         smp_rmb();
3013 
3014         /*
3015          * There is a window with interrupts enabled before we get here,
3016          * so we need to check again lest we try to stop another CPU's event.
3017          */
3018         if (READ_ONCE(event->oncpu) != smp_processor_id())
3019                 return -EAGAIN;
3020 
3021         event->pmu->stop(event, PERF_EF_UPDATE);
3022 
3023         /*
3024          * May race with the actual stop (through perf_pmu_output_stop()),
3025          * but it is only used for events with AUX ring buffer, and such
3026          * events will refuse to restart because of rb::aux_mmap_count==0,
3027          * see comments in perf_aux_output_begin().
3028          *
3029          * Since this is happening on an event-local CPU, no trace is lost
3030          * while restarting.
3031          */
3032         if (sd->restart)
3033                 event->pmu->start(event, 0);
3034 
3035         return 0;
3036 }
3037 
3038 static int perf_event_stop(struct perf_event *event, int restart)
3039 {
3040         struct stop_event_data sd = {
3041                 .event          = event,
3042                 .restart        = restart,
3043         };
3044         int ret = 0;
3045 
3046         do {
3047                 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3048                         return 0;
3049 
3050                 /* matches smp_wmb() in event_sched_in() */
3051                 smp_rmb();
3052 
3053                 /*
3054                  * We only want to restart ACTIVE events, so if the event goes
3055                  * inactive here (event->oncpu==-1), there's nothing more to do;
3056                  * fall through with ret==-ENXIO.
3057                  */
3058                 ret = cpu_function_call(READ_ONCE(event->oncpu),
3059                                         __perf_event_stop, &sd);
3060         } while (ret == -EAGAIN);
3061 
3062         return ret;
3063 }
3064 
3065 /*
3066  * In order to contain the amount of raciness and trickiness in the address
3067  * filter configuration management, it is a two-part process:
3068  *
3069  * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
3070  *      we update the addresses of corresponding vmas in
3071  *      event::addr_filter_ranges array and bump the event::addr_filters_gen;
3072  * (p2) when an event is scheduled in (pmu::add), it calls
3073  *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
3074  *      if the generation has changed since the previous call.
3075  *
3076  * If (p1) happens while the event is active, we restart it to force (p2).
3077  *
3078  * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
3079  *     pre-existing mappings, called once when new filters arrive via SET_FILTER
3080  *     ioctl;
3081  * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
3082  *     registered mapping, called for every new mmap(), with mm::mmap_lock down
3083  *     for reading;
3084  * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
3085  *     of exec.
3086  */
3087 void perf_event_addr_filters_sync(struct perf_event *event)
3088 {
3089         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
3090 
3091         if (!has_addr_filter(event))
3092                 return;
3093 
3094         raw_spin_lock(&ifh->lock);
3095         if (event->addr_filters_gen != event->hw.addr_filters_gen) {
3096                 event->pmu->addr_filters_sync(event);
3097                 event->hw.addr_filters_gen = event->addr_filters_gen;
3098         }
3099         raw_spin_unlock(&ifh->lock);
3100 }
3101 EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
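
     /*
      * Sketch of the (p2) half above as a hypothetical PMU driver's pmu::add
      * callback; example_pmu_add() is illustrative only and not part of any
      * in-tree driver.
      */
     static inline int example_pmu_add(struct perf_event *event, int flags)
     {
             /* Pick up any address filter changes published under (p1). */
             perf_event_addr_filters_sync(event);

             event->hw.state = (flags & PERF_EF_START) ? 0 : PERF_HES_STOPPED;
             return 0;
     }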
3102 
3103 static int _perf_event_refresh(struct perf_event *event, int refresh)
3104 {
3105         /*
3106          * not supported on inherited events
3107          */
3108         if (event->attr.inherit || !is_sampling_event(event))
3109                 return -EINVAL;
3110 
3111         atomic_add(refresh, &event->event_limit);
3112         _perf_event_enable(event);
3113 
3114         return 0;
3115 }
3116 
3117 /*
3118  * See perf_event_disable()
3119  */
3120 int perf_event_refresh(struct perf_event *event, int refresh)
3121 {
3122         struct perf_event_context *ctx;
3123         int ret;
3124 
3125         ctx = perf_event_ctx_lock(event);
3126         ret = _perf_event_refresh(event, refresh);
3127         perf_event_ctx_unlock(event, ctx);
3128 
3129         return ret;
3130 }
3131 EXPORT_SYMBOL_GPL(perf_event_refresh);
3132 
3133 static int perf_event_modify_breakpoint(struct perf_event *bp,
3134                                          struct perf_event_attr *attr)
3135 {
3136         int err;
3137 
3138         _perf_event_disable(bp);
3139 
3140         err = modify_user_hw_breakpoint_check(bp, attr, true);
3141 
3142         if (!bp->attr.disabled)
3143                 _perf_event_enable(bp);
3144 
3145         return err;
3146 }
3147 
3148 /*
3149  * Copy event-type-independent attributes that may be modified.
3150  */
3151 static void perf_event_modify_copy_attr(struct perf_event_attr *to,
3152                                         const struct perf_event_attr *from)
3153 {
3154         to->sig_data = from->sig_data;
3155 }
3156 
3157 static int perf_event_modify_attr(struct perf_event *event,
3158                                   struct perf_event_attr *attr)
3159 {
3160         int (*func)(struct perf_event *, struct perf_event_attr *);
3161         struct perf_event *child;
3162         int err;
3163 
3164         if (event->attr.type != attr->type)
3165                 return -EINVAL;
3166 
3167         switch (event->attr.type) {
3168         case PERF_TYPE_BREAKPOINT:
3169                 func = perf_event_modify_breakpoint;
3170                 break;
3171         default:
3172         /* Placeholder for future additions. */
3173                 return -EOPNOTSUPP;
3174         }
3175 
3176         WARN_ON_ONCE(event->ctx->parent_ctx);
3177 
3178         mutex_lock(&event->child_mutex);
3179         /*
3180          * Event-type-independent attributes must be copied before event-type
3181          * modification, which will validate that final attributes match the
3182          * source attributes after all relevant attributes have been copied.
3183          */
3184         perf_event_modify_copy_attr(&event->attr, attr);
3185         err = func(event, attr);
3186         if (err)
3187                 goto out;
3188         list_for_each_entry(child, &event->child_list, child_list) {
3189                 perf_event_modify_copy_attr(&child->attr, attr);
3190                 err = func(child, attr);
3191                 if (err)
3192                         goto out;
3193         }
3194 out:
3195         mutex_unlock(&event->child_mutex);
3196         return err;
3197 }
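/*
 * [Editorial illustration, not part of core.c] A hedged userspace sketch of
 * perf_event_modify_attr(): only PERF_TYPE_BREAKPOINT events can currently be
 * modified, via PERF_EVENT_IOC_MODIFY_ATTRIBUTES, and the new attr must keep
 * the same type as the original. The watched variables are hypothetical
 * placeholders.
 */
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>

static long watched_a, watched_b;	/* placeholder variables to watch */

int example_move_breakpoint(void)
{
	struct perf_event_attr attr;
	int ret = -1;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_BREAKPOINT;
	attr.bp_type = HW_BREAKPOINT_W;
	attr.bp_addr = (unsigned long)&watched_a;
	attr.bp_len = HW_BREAKPOINT_LEN_8;
	attr.sample_period = 1;

	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		return -1;

	/* Re-point the existing breakpoint at a different address. */
	attr.bp_addr = (unsigned long)&watched_b;
	if (ioctl(fd, PERF_EVENT_IOC_MODIFY_ATTRIBUTES, &attr) == 0)
		ret = 0;

	close(fd);
	return ret;
}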
3198 
3199 static void ctx_sched_out(struct perf_event_context *ctx,
3200                           struct perf_cpu_context *cpuctx,
3201                           enum event_type_t event_type)
3202 {
3203         struct perf_event *event, *tmp;
3204         int is_active = ctx->is_active;
3205 
3206         lockdep_assert_held(&ctx->lock);
3207 
3208         if (likely(!ctx->nr_events)) {
3209                 /*
3210                  * See __perf_remove_from_context().
3211                  */
3212                 WARN_ON_ONCE(ctx->is_active);
3213                 if (ctx->task)
3214                         WARN_ON_ONCE(cpuctx->task_ctx);
3215                 return;
3216         }
3217 
3218         /*
3219          * Always update time if it was set, not only when it changes.
3220          * Otherwise we can 'forget' to update time for any but the last
3221          * context we sched out. For example:
3222          *
3223          *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
3224          *   ctx_sched_out(.event_type = EVENT_PINNED)
3225          *
3226          * would only update time for the pinned events.
3227          */
3228         if (is_active & EVENT_TIME) {
3229                 /* update (and stop) ctx time */
3230                 update_context_time(ctx);
3231                 update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
3232                 /*
3233                  * CPU-release for the below ->is_active store,
3234                  * see __load_acquire() in perf_event_time_now()
3235                  */
3236                 barrier();
3237         }
3238 
3239         ctx->is_active &= ~event_type;
3240         if (!(ctx->is_active & EVENT_ALL))
3241                 ctx->is_active = 0;
3242 
3243         if (ctx->task) {
3244                 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3245                 if (!ctx->is_active)
3246                         cpuctx->task_ctx = NULL;
3247         }
3248 
3249         is_active ^= ctx->is_active; /* changed bits */
3250 
3251         if (!ctx->nr_active || !(is_active & EVENT_ALL))
3252                 return;
3253 
3254         perf_pmu_disable(ctx->pmu);
3255         if (is_active & EVENT_PINNED) {
3256                 list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
3257                         group_sched_out(event, cpuctx, ctx);
3258         }
3259 
3260         if (is_active & EVENT_FLEXIBLE) {
3261                 list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
3262                         group_sched_out(event, cpuctx, ctx);
3263 
3264                 /*
3265                  * Since we cleared EVENT_FLEXIBLE, also clear
3266                  * rotate_necessary; it will be reset by
3267                  * ctx_flexible_sched_in() when needed.
3268                  */
3269                 ctx->rotate_necessary = 0;
3270         }
3271         perf_pmu_enable(ctx->pmu);
3272 }
3273 
3274 /*
3275  * Test whether two contexts are equivalent, i.e. whether they have both been
3276  * cloned from the same version of the same context.
3277  *
3278  * Equivalence is measured using a generation number in the context that is
3279  * incremented on each modification to it; see unclone_ctx(), list_add_event()
3280  * and list_del_event().
3281  */
3282 static int context_equiv(struct perf_event_context *ctx1,
3283                          struct perf_event_context *ctx2)
3284 {
3285         lockdep_assert_held(&ctx1->lock);
3286         lockdep_assert_held(&ctx2->lock);
3287 
3288         /* Pinning disables the swap optimization */
3289         if (ctx1->pin_count || ctx2->pin_count)
3290                 return 0;
3291 
3292         /* If ctx1 is the parent of ctx2 */
3293         if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
3294                 return 1;
3295 
3296         /* If ctx2 is the parent of ctx1 */
3297         if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
3298                 return 1;
3299 
3300         /*
3301          * If ctx1 and ctx2 have the same parent, we flatten the parent
3302          * hierarchy; see perf_event_init_context().
3303          */
3304         if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
3305                         ctx1->parent_gen == ctx2->parent_gen)
3306                 return 1;
3307 
3308         /* Unmatched */
3309         return 0;
3310 }
3311 
3312 static void __perf_event_sync_stat(struct perf_event *event,
3313                                      struct perf_event *next_event)
3314 {
3315         u64 value;
3316 
3317         if (!event->attr.inherit_stat)
3318                 return;
3319 
3320         /*
3321          * Update the event value, we cannot use perf_event_read()
3322          * because we're in the middle of a context switch and have IRQs
3323          * disabled, which upsets smp_call_function_single(); however,
3324          * we know the event must be on the current CPU, therefore we
3325          * don't need to use it.
3326          */
3327         if (event->state == PERF_EVENT_STATE_ACTIVE)
3328                 event->pmu->read(event);
3329 
3330         perf_event_update_time(event);
3331 
3332         /*
3333          * In order to keep per-task stats reliable we need to flip the event
3334          * values when we flip the contexts.
3335          */
3336         value = local64_read(&next_event->count);
3337         value = local64_xchg(&event->count, value);
3338         local64_set(&next_event->count, value);
3339 
3340         swap(event->total_time_enabled, next_event->total_time_enabled);
3341         swap(event->total_time_running, next_event->total_time_running);
3342 
3343         /*
3344          * Since we swizzled the values, update the user visible data too.
3345          */
3346         perf_event_update_userpage(event);
3347         perf_event_update_userpage(next_event);
3348 }
3349 
3350 static void perf_event_sync_stat(struct perf_event_context *ctx,
3351                                    struct perf_event_context *next_ctx)
3352 {
3353         struct perf_event *event, *next_event;
3354 
3355         if (!ctx->nr_stat)
3356                 return;
3357 
3358         update_context_time(ctx);
3359 
3360         event = list_first_entry(&ctx->event_list,
3361                                    struct perf_event, event_entry);
3362 
3363         next_event = list_first_entry(&next_ctx->event_list,
3364                                         struct perf_event, event_entry);
3365 
3366         while (&event->event_entry != &ctx->event_list &&
3367                &next_event->event_entry != &next_ctx->event_list) {
3368 
3369                 __perf_event_sync_stat(event, next_event);
3370 
3371                 event = list_next_entry(event, event_entry);
3372                 next_event = list_next_entry(next_event, event_entry);
3373         }
3374 }
3375 
3376 static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
3377                                          struct task_struct *next)
3378 {
3379         struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
3380         struct perf_event_context *next_ctx;
3381         struct perf_event_context *parent, *next_parent;
3382         struct perf_cpu_context *cpuctx;
3383         int do_switch = 1;
3384         struct pmu *pmu;
3385 
3386         if (likely(!ctx))
3387                 return;
3388 
3389         pmu = ctx->pmu;
3390         cpuctx = __get_cpu_context(ctx);
3391         if (!cpuctx->task_ctx)
3392                 return;
3393 
3394         rcu_read_lock();
3395         next_ctx = next->perf_event_ctxp[ctxn];
3396         if (!next_ctx)
3397                 goto unlock;
3398 
3399         parent = rcu_dereference(ctx->parent_ctx);
3400         next_parent = rcu_dereference(next_ctx->parent_ctx);
3401 
3402         /* If neither context has a parent context, they cannot be clones. */
3403         if (!parent && !next_parent)
3404                 goto unlock;
3405 
3406         if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
3407                 /*
3408                  * Looks like the two contexts are clones, so we might be
3409                  * able to optimize the context switch.  We lock both
3410                  * contexts and check that they are clones under the
3411                  * lock (including re-checking that neither has been
3412                  * uncloned in the meantime).  It doesn't matter which
3413                  * order we take the locks because no other cpu could
3414                  * be trying to lock both of these tasks.
3415                  */
3416                 raw_spin_lock(&ctx->lock);
3417                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
3418                 if (context_equiv(ctx, next_ctx)) {
3419 
3420                         WRITE_ONCE(ctx->task, next);
3421                         WRITE_ONCE(next_ctx->task, task);
3422 
3423                         perf_pmu_disable(pmu);
3424 
3425                         if (cpuctx->sched_cb_usage && pmu->sched_task)
3426                                 pmu->sched_task(ctx, false);
3427 
3428                         /*
3429                  * PMU-specific parts of the task perf context can require
3430                  * additional synchronization. As an example of such
3431                  * synchronization, see the implementation details of Intel
3432                  * LBR call stack data profiling.
3433                          */
3434                         if (pmu->swap_task_ctx)
3435                                 pmu->swap_task_ctx(ctx, next_ctx);
3436                         else
3437                                 swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
3438 
3439                         perf_pmu_enable(pmu);
3440 
3441                         /*
3442                          * RCU_INIT_POINTER here is safe because we've not
3443                  * modified the ctx and the above modifications of
3444                  * ctx->task and ctx->task_ctx_data are immaterial
3445                          * since those values are always verified under
3446                          * ctx->lock which we're now holding.
3447                          */
3448                         RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
3449                         RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
3450 
3451                         do_switch = 0;
3452 
3453                         perf_event_sync_stat(ctx, next_ctx);
3454                 }
3455                 raw_spin_unlock(&next_ctx->lock);
3456                 raw_spin_unlock(&ctx->lock);
3457         }
3458 unlock:
3459         rcu_read_unlock();
3460 
3461         if (do_switch) {
3462                 raw_spin_lock(&ctx->lock);
3463                 perf_pmu_disable(pmu);
3464 
3465                 if (cpuctx->sched_cb_usage && pmu->sched_task)
3466                         pmu->sched_task(ctx, false);
3467                 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
3468 
3469                 perf_pmu_enable(pmu);
3470                 raw_spin_unlock(&ctx->lock);
3471         }
3472 }
3473 
3474 static DEFINE_PER_CPU(struct list_head, sched_cb_list);
3475 
3476 void perf_sched_cb_dec(struct pmu *pmu)
3477 {
3478         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3479 
3480         this_cpu_dec(perf_sched_cb_usages);
3481 
3482         if (!--cpuctx->sched_cb_usage)
3483                 list_del(&cpuctx->sched_cb_entry);
3484 }
3485 
3486 
3487 void perf_sched_cb_inc(struct pmu *pmu)
3488 {
3489         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3490 
3491         if (!cpuctx->sched_cb_usage++)
3492                 list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
3493 
3494         this_cpu_inc(perf_sched_cb_usages);
3495 }
3496 
3497 /*
3498  * This function provides the context switch callback to the lower code
3499  * layer. It is invoked ONLY when the context switch callback is enabled.
3500  *
3501  * This callback is relevant even to per-CPU events; for example, multi-event
3502  * PEBS requires this to provide PID/TID information. This requires that we flush
3503  * all queued PEBS records before we context switch to a new task.
3504  */
3505 static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
3506 {
3507         struct pmu *pmu;
3508 
3509         pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
3510 
3511         if (WARN_ON_ONCE(!pmu->sched_task))
3512                 return;
3513 
3514         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3515         perf_pmu_disable(pmu);
3516 
3517         pmu->sched_task(cpuctx->task_ctx, sched_in);
3518 
3519         perf_pmu_enable(pmu);
3520         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3521 }
3522 
3523 static void perf_pmu_sched_task(struct task_struct *prev,
3524                                 struct task_struct *next,
3525                                 bool sched_in)
3526 {
3527         struct perf_cpu_context *cpuctx;
3528 
3529         if (prev == next)
3530                 return;
3531 
3532         list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
3533                 /* will be handled in perf_event_context_sched_in/out */
3534                 if (cpuctx->task_ctx)
3535                         continue;
3536 
3537                 __perf_pmu_sched_task(cpuctx, sched_in);
3538         }
3539 }
3540 
3541 static void perf_event_switch(struct task_struct *task,
3542                               struct task_struct *next_prev, bool sched_in);
3543 
3544 #define for_each_task_context_nr(ctxn)                                  \
3545         for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
3546 
3547 /*
3548  * Called from scheduler to remove the events of the current task,
3549  * with interrupts disabled.
3550  *
3551  * We stop each event and update the event value in event->count.
3552  *
3553  * This does not protect us against NMI, but disable()
3554  * sets the disabled bit in the control field of event _before_
3555  * accessing the event control register. If an NMI hits, then it will
3556  * not restart the event.
3557  */
3558 void __perf_event_task_sched_out(struct task_struct *task,
3559                                  struct task_struct *next)
3560 {
3561         int ctxn;
3562 
3563         if (__this_cpu_read(perf_sched_cb_usages))
3564                 perf_pmu_sched_task(task, next, false);
3565 
3566         if (atomic_read(&nr_switch_events))
3567                 perf_event_switch(task, next, false);
3568 
3569         for_each_task_context_nr(ctxn)
3570                 perf_event_context_sched_out(task, ctxn, next);
3571 
3572         /*
3573          * if cgroup events exist on this CPU, then we need
3574          * to check if we have to switch out PMU state.
3575          * cgroup events are system-wide mode only
3576          */
3577         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3578                 perf_cgroup_switch(next);
3579 }
3580 
3581 /*
3582  * Called with IRQs disabled
3583  */
3584 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
3585                               enum event_type_t event_type)
3586 {
3587         ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
3588 }
3589 
3590 static bool perf_less_group_idx(const void *l, const void *r)
3591 {
3592         const struct perf_event *le = *(const struct perf_event **)l;
3593         const struct perf_event *re = *(const struct perf_event **)r;
3594 
3595         return le->group_index < re->group_index;
3596 }
3597 
3598 static void swap_ptr(void *l, void *r)
3599 {
3600         void **lp = l, **rp = r;
3601 
3602         swap(*lp, *rp);
3603 }
3604 
3605 static const struct min_heap_callbacks perf_min_heap = {
3606         .elem_size = sizeof(struct perf_event *),
3607         .less = perf_less_group_idx,
3608         .swp = swap_ptr,
3609 };
3610 
3611 static void __heap_add(struct min_heap *heap, struct perf_event *event)
3612 {
3613         struct perf_event **itrs = heap->data;
3614 
3615         if (event) {
3616                 itrs[heap->nr] = event;
3617                 heap->nr++;
3618         }
3619 }
3620 
3621 static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
3622                                 struct perf_event_groups *groups, int cpu,
3623                                 int (*func)(struct perf_event *, void *),
3624                                 void *data)
3625 {
3626 #ifdef CONFIG_CGROUP_PERF
3627         struct cgroup_subsys_state *css = NULL;
3628 #endif
3629         /* Space for per CPU and/or any CPU event iterators. */
3630         struct perf_event *itrs[2];
3631         struct min_heap event_heap;
3632         struct perf_event **evt;
3633         int ret;
3634 
3635         if (cpuctx) {
3636                 event_heap = (struct min_heap){
3637                         .data = cpuctx->heap,
3638                         .nr = 0,
3639                         .size = cpuctx->heap_size,
3640                 };
3641 
3642                 lockdep_assert_held(&cpuctx->ctx.lock);
3643 
3644 #ifdef CONFIG_CGROUP_PERF
3645                 if (cpuctx->cgrp)
3646                         css = &cpuctx->cgrp->css;
3647 #endif
3648         } else {
3649                 event_heap = (struct min_heap){
3650                         .data = itrs,
3651                         .nr = 0,
3652                         .size = ARRAY_SIZE(itrs),
3653                 };
3654                 /* Events not within a CPU context may be on any CPU. */
3655                 __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
3656         }
3657         evt = event_heap.data;
3658 
3659         __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
3660 
3661 #ifdef CONFIG_CGROUP_PERF
3662         for (; css; css = css->parent)
3663                 __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
3664 #endif
3665 
3666         min_heapify_all(&event_heap, &perf_min_heap);
3667 
3668         while (event_heap.nr) {
3669                 ret = func(*evt, data);
3670                 if (ret)
3671                         return ret;
3672 
3673                 *evt = perf_event_groups_next(*evt);
3674                 if (*evt)
3675                         min_heapify(&event_heap, 0, &perf_min_heap);
3676                 else
3677                         min_heap_pop(&event_heap, &perf_min_heap);
3678         }
3679 
3680         return 0;
3681 }
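/*
 * [Editorial illustration, not part of core.c] visit_groups_merge() above is a
 * k-way merge: each iterator (the any-CPU tree, the per-CPU tree and one tree
 * per cgroup ancestor) yields events in group_index order, and the min-heap
 * repeatedly hands out the globally smallest head. A hedged, self-contained
 * sketch of the same idea over plain sorted arrays standing in for the trees:
 */
#include <stddef.h>
#include <stdio.h>

struct iter { const int *pos, *end; };	/* one sorted source of group_index values */

static void sift_down(struct iter *h, size_t n, size_t i)
{
	for (;;) {
		size_t l = 2 * i + 1, r = l + 1, m = i;

		if (l < n && *h[l].pos < *h[m].pos)
			m = l;
		if (r < n && *h[r].pos < *h[m].pos)
			m = r;
		if (m == i)
			return;
		struct iter tmp = h[i]; h[i] = h[m]; h[m] = tmp;
		i = m;
	}
}

int main(void)
{
	static const int any_cpu[]  = { 1, 7, 9 };	/* stand-in for the CPU==-1 groups */
	static const int this_cpu[] = { 2, 3, 8 };	/* stand-in for this CPU's groups */
	static const int cgroup[]   = { 4, 6 };		/* stand-in for one cgroup's groups */
	struct iter heap[] = {
		{ any_cpu,  any_cpu  + 3 },
		{ this_cpu, this_cpu + 3 },
		{ cgroup,   cgroup   + 2 },
	};
	size_t nr = 3;

	for (size_t i = nr / 2; i-- > 0; )		/* min_heapify_all() */
		sift_down(heap, nr, i);

	while (nr) {
		printf("%d ", *heap[0].pos);		/* func(*evt, data) */
		if (++heap[0].pos == heap[0].end)	/* iterator exhausted */
			heap[0] = heap[--nr];		/* min_heap_pop() */
		sift_down(heap, nr, 0);			/* min_heapify() */
	}
	printf("\n");	/* prints: 1 2 3 4 6 7 8 9 */
	return 0;
}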
3682 
3683 /*
3684  * Because the userpage is strictly per-event (there is no concept of context,
3685  * so there cannot be a context indirection), every userpage must be updated
3686  * when context time starts :-(
3687  *
3688  * IOW, we must not miss EVENT_TIME edges.
3689  */
3690 static inline bool event_update_userpage(struct perf_event *event)
3691 {
3692         if (likely(!atomic_read(&event->mmap_count)))
3693                 return false;
3694 
3695         perf_event_update_time(event);
3696         perf_event_update_userpage(event);
3697 
3698         return true;
3699 }
3700 
3701 static inline void group_update_userpage(struct perf_event *group_event)
3702 {
3703         struct perf_event *event;
3704 
3705         if (!event_update_userpage(group_event))
3706                 return;
3707 
3708         for_each_sibling_event(event, group_event)
3709                 event_update_userpage(event);
3710 }
3711 
3712 static int merge_sched_in(struct perf_event *event, void *data)
3713 {
3714         struct perf_event_context *ctx = event->ctx;
3715         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3716         int *can_add_hw = data;
3717 
3718         if (event->state <= PERF_EVENT_STATE_OFF)
3719                 return 0;
3720 
3721         if (!event_filter_match(event))
3722                 return 0;
3723 
3724         if (group_can_go_on(event, cpuctx, *can_add_hw)) {
3725                 if (!group_sched_in(event, cpuctx, ctx))
3726                         list_add_tail(&event->active_list, get_event_list(event));
3727         }
3728 
3729         if (event->state == PERF_EVENT_STATE_INACTIVE) {
3730                 *can_add_hw = 0;
3731                 if (event->attr.pinned) {
3732                         perf_cgroup_event_disable(event, ctx);
3733                         perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3734                 } else {
3735                         ctx->rotate_necessary = 1;
3736                         perf_mux_hrtimer_restart(cpuctx);
3737                         group_update_userpage(event);
3738                 }
3739         }
3740 
3741         return 0;
3742 }
3743 
3744 static void
3745 ctx_pinned_sched_in(struct perf_event_context *ctx,
3746                     struct perf_cpu_context *cpuctx)
3747 {
3748         int can_add_hw = 1;
3749 
3750         if (ctx != &cpuctx->ctx)
3751                 cpuctx = NULL;
3752 
3753         visit_groups_merge(cpuctx, &ctx->pinned_groups,
3754                            smp_processor_id(),
3755                            merge_sched_in, &can_add_hw);
3756 }
3757 
3758 static void
3759 ctx_flexible_sched_in(struct perf_event_context *ctx,
3760                       struct perf_cpu_context *cpuctx)
3761 {
3762         int can_add_hw = 1;
3763 
3764         if (ctx != &cpuctx->ctx)
3765                 cpuctx = NULL;
3766 
3767         visit_groups_merge(cpuctx, &ctx->flexible_groups,
3768                            smp_processor_id(),
3769                            merge_sched_in, &can_add_hw);
3770 }
3771 
3772 static void
3773 ctx_sched_in(struct perf_event_context *ctx,
3774              struct perf_cpu_context *cpuctx,
3775              enum event_type_t event_type)
3776 {
3777         int is_active = ctx->is_active;
3778 
3779         lockdep_assert_held(&ctx->lock);
3780 
3781         if (likely(!ctx->nr_events))
3782                 return;
3783 
3784         if (is_active ^ EVENT_TIME) {
3785                 /* start ctx time */
3786                 __update_context_time(ctx, false);
3787                 perf_cgroup_set_timestamp(cpuctx);
3788                 /*
3789                  * CPU-release for the below ->is_active store,
3790                  * see __load_acquire() in perf_event_time_now()
3791                  */
3792                 barrier();
3793         }
3794 
3795         ctx->is_active |= (event_type | EVENT_TIME);
3796         if (ctx->task) {
3797                 if (!is_active)
3798                         cpuctx->task_ctx = ctx;
3799                 else
3800                         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3801         }
3802 
3803         is_active ^= ctx->is_active; /* changed bits */
3804 
3805         /*
3806          * First go through the list and put on any pinned groups
3807          * in order to give them the best chance of going on.
3808          */
3809         if (is_active & EVENT_PINNED)
3810                 ctx_pinned_sched_in(ctx, cpuctx);
3811 
3812         /* Then walk through the lower prio flexible groups */
3813         if (is_active & EVENT_FLEXIBLE)
3814                 ctx_flexible_sched_in(ctx, cpuctx);
3815 }
3816 
3817 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
3818                              enum event_type_t event_type)
3819 {
3820         struct perf_event_context *ctx = &cpuctx->ctx;
3821 
3822         ctx_sched_in(ctx, cpuctx, event_type);
3823 }
3824 
3825 static void perf_event_context_sched_in(struct perf_event_context *ctx,
3826                                         struct task_struct *task)
3827 {
3828         struct perf_cpu_context *cpuctx;
3829         struct pmu *pmu;
3830 
3831         cpuctx = __get_cpu_context(ctx);
3832 
3833         /*
3834          * HACK: for HETEROGENEOUS the task context might have switched to a
3835          * different PMU, so force (re)set the context.
3836          */
3837         pmu = ctx->pmu = cpuctx->ctx.pmu;
3838 
3839         if (cpuctx->task_ctx == ctx) {
3840                 if (cpuctx->sched_cb_usage)
3841                         __perf_pmu_sched_task(cpuctx, true);
3842                 return;
3843         }
3844 
3845         perf_ctx_lock(cpuctx, ctx);
3846         /*
3847          * We must check ctx->nr_events while holding ctx->lock, such
3848          * that we serialize against perf_install_in_context().
3849          */
3850         if (!ctx->nr_events)
3851                 goto unlock;
3852 
3853         perf_pmu_disable(pmu);
3854         /*
3855          * We want to keep the following priority order:
3856          * cpu pinned (that don't need to move), task pinned,
3857          * cpu flexible, task flexible.
3858          *
3859          * However, if the task's ctx is not carrying any pinned
3860          * events, there is no need to flip the cpuctx's events around.
3861          */
3862         if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
3863                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3864         perf_event_sched_in(cpuctx, ctx);
3865 
3866         if (cpuctx->sched_cb_usage && pmu->sched_task)
3867                 pmu->sched_task(cpuctx->task_ctx, true);
3868 
3869         perf_pmu_enable(pmu);
3870 
3871 unlock:
3872         perf_ctx_unlock(cpuctx, ctx);
3873 }
3874 
3875 /*
3876  * Called from scheduler to add the events of the current task
3877  * with interrupts disabled.
3878  *
3879  * We restore the event value and then enable it.
3880  *
3881  * This does not protect us against NMI, but enable()
3882  * sets the enabled bit in the control field of event _before_
3883  * accessing the event control register. If an NMI hits, then it will
3884  * keep the event running.
3885  */
3886 void __perf_event_task_sched_in(struct task_struct *prev,
3887                                 struct task_struct *task)
3888 {
3889         struct perf_event_context *ctx;
3890         int ctxn;
3891 
3892         for_each_task_context_nr(ctxn) {
3893                 ctx = task->perf_event_ctxp[ctxn];
3894                 if (likely(!ctx))
3895                         continue;
3896 
3897                 perf_event_context_sched_in(ctx, task);
3898         }
3899 
3900         if (atomic_read(&nr_switch_events))
3901                 perf_event_switch(task, prev, true);
3902 
3903         if (__this_cpu_read(perf_sched_cb_usages))
3904                 perf_pmu_sched_task(prev, task, true);
3905 }
3906 
3907 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
3908 {
3909         u64 frequency = event->attr.sample_freq;
3910         u64 sec = NSEC_PER_SEC;
3911         u64 divisor, dividend;
3912 
3913         int count_fls, nsec_fls, frequency_fls, sec_fls;
3914 
3915         count_fls = fls64(count);
3916         nsec_fls = fls64(nsec);
3917         frequency_fls = fls64(frequency);
3918         sec_fls = 30;
3919 
3920         /*
3921          * We got @count in @nsec, with a target of sample_freq HZ
3922          * the target period becomes:
3923          *
3924          *             @count * 10^9
3925          * period = -------------------
3926          *          @nsec * sample_freq
3927          *
3928          */
3929 
3930         /*
3931          * Reduce accuracy by one bit such that @a and @b converge
3932          * to a similar magnitude.
3933          */
3934 #define REDUCE_FLS(a, b)                \
3935 do {                                    \
3936         if (a##_fls > b##_fls) {        \
3937                 a >>= 1;                \
3938                 a##_fls--;              \
3939         } else {                        \
3940                 b >>= 1;                \
3941                 b##_fls--;              \
3942         }                               \
3943 } while (0)
3944 
3945         /*
3946          * Reduce accuracy until either term fits in a u64, then proceed with
3947          * the other, so that finally we can do a u64/u64 division.
3948          */
3949         while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
3950                 REDUCE_FLS(nsec, frequency);
3951                 REDUCE_FLS(sec, count);
3952         }
3953 
3954         if (count_fls + sec_fls > 64) {
3955                 divisor = nsec * frequency;
3956 
3957                 while (count_fls + sec_fls > 64) {
3958                         REDUCE_FLS(count, sec);
3959                         divisor >>= 1;
3960                 }
3961 
3962                 dividend = count * sec;
3963         } else {
3964                 dividend = count * sec;
3965 
3966                 while (nsec_fls + frequency_fls > 64) {
3967                         REDUCE_FLS(nsec, frequency);
3968                         dividend >>= 1;
3969                 }
3970 
3971                 divisor = nsec * frequency;
3972         }
3973 
3974         if (!divisor)
3975                 return dividend;
3976 
3977         return div64_u64(dividend, divisor);
3978 }
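/*
 * [Editorial illustration, not part of core.c] A worked instance of the
 * formula in the comment above, with made-up numbers: suppose the event
 * counted count = 2,000,000 events in nsec = 4,000,000 ns (one 4 ms tick)
 * and the user asked for sample_freq = 1000 Hz. Then
 *
 *               count * 10^9         2e6 * 1e9
 *   period = ------------------- = ------------- = 500,000 events per sample,
 *            nsec * sample_freq     4e6 * 1e3
 *
 * i.e. at ~5e8 events/s, sampling every 500,000 events yields ~1000
 * samples/s. The REDUCE_FLS() bit-shifting above only exists to keep the
 * intermediate products inside 64 bits; it approximates exactly this value.
 */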
3979 
3980 static DEFINE_PER_CPU(int, perf_throttled_count);
3981 static DEFINE_PER_CPU(u64, perf_throttled_seq);
3982 
3983 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
3984 {
3985         struct hw_perf_event *hwc = &event->hw;
3986         s64 period, sample_period;
3987         s64 delta;
3988 
3989         period = perf_calculate_period(event, nsec, count);
3990 
3991         delta = (s64)(period - hwc->sample_period);
3992         delta = (delta + 7) / 8; /* low pass filter */
3993 
3994         sample_period = hwc->sample_period + delta;
3995 
3996         if (!sample_period)
3997                 sample_period = 1;
3998 
3999         hwc->sample_period = sample_period;
4000 
4001         if (local64_read(&hwc->period_left) > 8*sample_period) {
4002                 if (disable)
4003                         event->pmu->stop(event, PERF_EF_UPDATE);
4004 
4005                 local64_set(&hwc->period_left, 0);
4006 
4007                 if (disable)
4008                         event->pmu->start(event, PERF_EF_RELOAD);
4009         }
4010 }
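/*
 * [Editorial illustration, not part of core.c] The "(delta + 7) / 8" above is
 * a simple low-pass filter: each call moves sample_period only 1/8 of the way
 * towards the newly computed target. With made-up numbers: if
 * hwc->sample_period is 100,000 and perf_calculate_period() returns 500,000,
 * then delta = 400,000, the applied step is (400,000 + 7) / 8 = 50,000 and the
 * new sample_period is 150,000. Repeated ticks converge on the target without
 * overreacting to a single noisy measurement, and the "period_left >
 * 8*sample_period" check above resets events that are wildly over budget.
 */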
4011 
4012 /*
4013  * combine freq adjustment with unthrottling to avoid two passes over the
4014  * events. At the same time, make sure that having freq events does not change
4015  * the rate of unthrottling, as that would introduce bias.
4016  */
4017 static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
4018                                            int needs_unthr)
4019 {
4020         struct perf_event *event;
4021         struct hw_perf_event *hwc;
4022         u64 now, period = TICK_NSEC;
4023         s64 delta;
4024 
4025         /*
4026          * We only need to iterate over all events if either:
4027          * - the context has events in frequency mode (needs freq adjust), or
4028          * - there are events to unthrottle on this CPU
4029          */
4030         if (!(ctx->nr_freq || needs_unthr))
4031                 return;
4032 
4033         raw_spin_lock(&ctx->lock);
4034         perf_pmu_disable(ctx->pmu);
4035 
4036         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4037                 if (event->state != PERF_EVENT_STATE_ACTIVE)
4038                         continue;
4039 
4040                 if (!event_filter_match(event))
4041                         continue;
4042 
4043                 perf_pmu_disable(event->pmu);
4044 
4045                 hwc = &event->hw;
4046 
4047                 if (hwc->interrupts == MAX_INTERRUPTS) {
4048                         hwc->interrupts = 0;
4049                         perf_log_throttle(event, 1);
4050                         event->pmu->start(event, 0);
4051                 }
4052 
4053                 if (!event->attr.freq || !event->attr.sample_freq)
4054                         goto next;
4055 
4056                 /*
4057                  * stop the event and update event->count
4058                  */
4059                 event->pmu->stop(event, PERF_EF_UPDATE);
4060 
4061                 now = local64_read(&event->count);
4062                 delta = now - hwc->freq_count_stamp;
4063                 hwc->freq_count_stamp = now;
4064 
4065                 /*
4066                  * Restart the event and reload only if the value has
4067                  * changed since the last adjustment. We have already
4068                  * stopped the event above, so tell that to
4069                  * perf_adjust_period() so that it does not stop the
4070                  * event a second time.
4071                  */
4072                 if (delta > 0)
4073                         perf_adjust_period(event, period, delta, false);
4074 
4075                 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
4076         next:
4077                 perf_pmu_enable(event->pmu);
4078         }
4079 
4080         perf_pmu_enable(ctx->pmu);
4081         raw_spin_unlock(&ctx->lock);
4082 }
4083 
4084 /*
4085  * Move @event to the tail of the @ctx's eligible events.
4086  */
4087 static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
4088 {
4089         /*
4090          * Rotate the first entry of the non-pinned groups to the tail. Rotation
4091          * might be disabled by the inheritance code.
4092          */
4093         if (ctx->rotate_disable)
4094                 return;
4095 
4096         perf_event_groups_delete(&ctx->flexible_groups, event);
4097         perf_event_groups_insert(&ctx->flexible_groups, event);
4098 }
4099 
4100 /* pick an event from the flexible_groups to rotate */
4101 static inline struct perf_event *
4102 ctx_event_to_rotate(struct perf_event_context *ctx)
4103 {
4104         struct perf_event *event;
4105 
4106         /* pick the first active flexible event */
4107         event = list_first_entry_or_null(&ctx->flexible_active,
4108                                          struct perf_event, active_list);
4109 
4110         /* if no active flexible event, pick the first event */
4111         if (!event) {
4112                 event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
4113                                       typeof(*event), group_node);
4114         }
4115 
4116         /*
4117          * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
4118          * finds there are unschedulable events, it will set it again.
4119          */
4120         ctx->rotate_necessary = 0;
4121 
4122         return event;
4123 }
4124 
4125 static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
4126 {
4127         struct perf_event *cpu_event = NULL, *task_event = NULL;
4128         struct perf_event_context *task_ctx = NULL;
4129         int cpu_rotate, task_rotate;
4130 
4131         /*
4132          * Since we run this from IRQ context, nobody can install new
4133          * events, thus the event count values are stable.
4134          */
4135 
4136         cpu_rotate = cpuctx->ctx.rotate_necessary;
4137         task_ctx = cpuctx->task_ctx;
4138         task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
4139 
4140         if (!(cpu_rotate || task_rotate))
4141                 return false;
4142 
4143         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
4144         perf_pmu_disable(cpuctx->ctx.pmu);
4145 
4146         if (task_rotate)
4147                 task_event = ctx_event_to_rotate(task_ctx);
4148         if (cpu_rotate)
4149                 cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
4150 
4151         /*
4152          * As per the order given at ctx_resched(), first 'pop' the task flexible
4153          * events and then, if needed, the CPU flexible events.
4154          */
4155         if (task_event || (task_ctx && cpu_event))
4156                 ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
4157         if (cpu_event)
4158                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
4159 
4160         if (task_event)
4161                 rotate_ctx(task_ctx, task_event);
4162         if (cpu_event)
4163                 rotate_ctx(&cpuctx->ctx, cpu_event);
4164 
4165         perf_event_sched_in(cpuctx, task_ctx);
4166 
4167         perf_pmu_enable(cpuctx->ctx.pmu);
4168         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
4169 
4170         return true;
4171 }
4172 
4173 void perf_event_task_tick(void)
4174 {
4175         struct list_head *head = this_cpu_ptr(&active_ctx_list);
4176         struct perf_event_context *ctx, *tmp;
4177         int throttled;
4178 
4179         lockdep_assert_irqs_disabled();
4180 
4181         __this_cpu_inc(perf_throttled_seq);
4182         throttled = __this_cpu_xchg(perf_throttled_count, 0);
4183         tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
4184 
4185         list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
4186                 perf_adjust_freq_unthr_context(ctx, throttled);
4187 }
4188 
4189 static int event_enable_on_exec(struct perf_event *event,
4190                                 struct perf_event_context *ctx)
4191 {
4192         if (!event->attr.enable_on_exec)
4193                 return 0;
4194 
4195         event->attr.enable_on_exec = 0;
4196         if (event->state >= PERF_EVENT_STATE_INACTIVE)
4197                 return 0;
4198 
4199         perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
4200 
4201         return 1;
4202 }
4203 
4204 /*
4205  * Enable all of a task's events that have been marked enable-on-exec.
4206  * This expects task == current.
4207  */
4208 static void perf_event_enable_on_exec(int ctxn)
4209 {
4210         struct perf_event_context *ctx, *clone_ctx = NULL;
4211         enum event_type_t event_type = 0;
4212         struct perf_cpu_context *cpuctx;
4213         struct perf_event *event;
4214         unsigned long flags;
4215         int enabled = 0;
4216 
4217         local_irq_save(flags);
4218         ctx = current->perf_event_ctxp[ctxn];
4219         if (!ctx || !ctx->nr_events)
4220                 goto out;
4221 
4222         cpuctx = __get_cpu_context(ctx);
4223         perf_ctx_lock(cpuctx, ctx);
4224         ctx_sched_out(ctx, cpuctx, EVENT_TIME);
4225         list_for_each_entry(event, &ctx->event_list, event_entry) {
4226                 enabled |= event_enable_on_exec(event, ctx);
4227                 event_type |= get_event_type(event);
4228         }
4229 
4230         /*
4231          * Unclone and reschedule this context if we enabled any event.
4232          */
4233         if (enabled) {
4234                 clone_ctx = unclone_ctx(ctx);
4235                 ctx_resched(cpuctx, ctx, event_type);
4236         } else {
4237                 ctx_sched_in(ctx, cpuctx, EVENT_TIME);
4238         }
4239         perf_ctx_unlock(cpuctx, ctx);
4240 
4241 out:
4242         local_irq_restore(flags);
4243 
4244         if (clone_ctx)
4245                 put_ctx(clone_ctx);
4246 }
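/*
 * [Editorial illustration, not part of core.c] A hedged userspace sketch of the
 * enable_on_exec path above (roughly what "perf stat -- <cmd>" does): the
 * parent opens a disabled counter on the child with enable_on_exec set, so
 * counting starts exactly at the child's execve() and excludes the fork/exec
 * setup. The workload path is a placeholder and error handling is elided.
 */
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <linux/perf_event.h>

int example_count_exec(void)
{
	struct perf_event_attr attr;
	long long count = 0;
	int go[2];

	pipe(go);

	pid_t child = fork();
	if (child == 0) {
		char c;

		close(go[1]);
		read(go[0], &c, 1);	/* wait until the counter is attached */
		execl("/bin/true", "true", (char *)NULL);	/* placeholder workload */
		_exit(127);
	}

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	attr.disabled = 1;
	attr.enable_on_exec = 1;	/* consumed by perf_event_enable_on_exec() */

	int fd = syscall(__NR_perf_event_open, &attr, child, -1, -1, 0);

	close(go[0]);
	write(go[1], "x", 1);		/* let the child exec now */
	waitpid(child, NULL, 0);

	read(fd, &count, sizeof(count));
	close(fd);
	return count > 0 ? 0 : -1;
}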
4247 
4248 static void perf_remove_from_owner(struct perf_event *event);
4249 static void perf_event_exit_event(struct perf_event *event,
4250                                   struct perf_event_context *ctx);
4251 
4252 /*
4253  * Removes all events from the current task that have been marked
4254  * remove-on-exec, and feeds their values back to parent events.
4255  */
4256 static void perf_event_remove_on_exec(int ctxn)
4257 {
4258         struct perf_event_context *ctx, *clone_ctx = NULL;
4259         struct perf_event *event, *next;
4260         LIST_HEAD(free_list);
4261         unsigned long flags;
4262         bool modified = false;
4263 
4264         ctx = perf_pin_task_context(current, ctxn);
4265         if (!ctx)
4266                 return;
4267 
4268         mutex_lock(&ctx->mutex);
4269 
4270         if (WARN_ON_ONCE(ctx->task != current))
4271                 goto unlock;
4272 
4273         list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) {
4274                 if (!event->attr.remove_on_exec)
4275                         continue;
4276 
4277                 if (!is_kernel_event(event))
4278                         perf_remove_from_owner(event);
4279 
4280                 modified = true;
4281 
4282                 perf_event_exit_event(event, ctx);
4283         }
4284 
4285         raw_spin_lock_irqsave(&ctx->lock, flags);
4286         if (modified)
4287                 clone_ctx = unclone_ctx(ctx);
4288         --ctx->pin_count;
4289         raw_spin_unlock_irqrestore(&ctx->lock, flags);
4290 
4291 unlock:
4292         mutex_unlock(&ctx->mutex);
4293 
4294         put_ctx(ctx);
4295         if (clone_ctx)
4296                 put_ctx(clone_ctx);
4297 }
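/*
 * [Editorial illustration, not part of core.c] Userspace opts into the path
 * above simply by setting the attribute bit when opening the event; across a
 * subsequent execve() the event is then torn down instead of surviving into
 * the new program image. A hedged fragment (attr setup and error handling
 * elided):
 *
 *	struct perf_event_attr attr = { .size = sizeof(attr) };
 *	attr.remove_on_exec = 1;
 *	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 */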
4298 
4299 struct perf_read_data {
4300         struct perf_event *event;
4301         bool group;
4302         int ret;
4303 };
4304 
4305 static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
4306 {
4307         u16 local_pkg, event_pkg;
4308 
4309         if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
4310                 int local_cpu = smp_processor_id();
4311 
4312                 event_pkg = topology_physical_package_id(event_cpu);
4313                 local_pkg = topology_physical_package_id(local_cpu);
4314 
4315                 if (event_pkg == local_pkg)
4316                         return local_cpu;
4317         }
4318 
4319         return event_cpu;
4320 }
4321 
4322 /*
4323  * Cross CPU call to read the hardware event
4324  */
4325 static void __perf_event_read(void *info)
4326 {
4327         struct perf_read_data *data = info;
4328         struct perf_event *sub, *event = data->event;
4329         struct perf_event_context *ctx = event->ctx;
4330         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
4331         struct pmu *pmu = event->pmu;
4332 
4333         /*
4334          * If this is a task context, we need to check whether it is
4335          * the current task context of this CPU.  If not, it has been
4336          * scheduled out before the smp call arrived.  In that case
4337          * event->count would have been updated to a recent sample
4338          * when the event was scheduled out.
4339          */
4340         if (ctx->task && cpuctx->task_ctx != ctx)
4341                 return;
4342 
4343         raw_spin_lock(&ctx->lock);
4344         if (ctx->is_active & EVENT_TIME) {
4345                 update_context_time(ctx);
4346                 update_cgrp_time_from_event(event);
4347         }
4348 
4349         perf_event_update_time(event);
4350         if (data->group)
4351                 perf_event_update_sibling_time(event);
4352 
4353         if (event->state != PERF_EVENT_STATE_ACTIVE)
4354                 goto unlock;
4355 
4356         if (!data->group) {
4357                 pmu->read(event);
4358                 data->ret = 0;
4359                 goto unlock;
4360         }
4361 
4362         pmu->start_txn(pmu, PERF_PMU_TXN_READ);
4363 
4364         pmu->read(event);
4365 
4366         for_each_sibling_event(sub, event) {
4367                 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
4368                         /*
4369                          * Use sibling's PMU rather than @event's since
4370                          * the sibling could be on a different (e.g. software) PMU.
4371                          */
4372                         sub->pmu->read(sub);
4373                 }
4374         }
4375 
4376         data->ret = pmu->commit_txn(pmu);
4377 
4378 unlock:
4379         raw_spin_unlock(&ctx->lock);
4380 }
4381 
4382 static inline u64 perf_event_count(struct perf_event *event)
4383 {
4384         return local64_read(&event->count) + atomic64_read(&event->child_count);
4385 }
4386 
4387 static void calc_timer_values(struct perf_event *event,
4388                                 u64 *now,
4389                                 u64 *enabled,
4390                                 u64 *running)
4391 {
4392         u64 ctx_time;
4393 
4394         *now = perf_clock();
4395         ctx_time = perf_event_time_now(event, *now);
4396         __perf_update_times(event, ctx_time, enabled, running);
4397 }
4398 
4399 /*
4400  * NMI-safe method to read a local event, that is an event that
4401  * is:
4402  *   - either for the current task, or for this CPU
4403  *   - does not have inherit set, for inherited task events
4404  *     will not be local and we cannot read them atomically
4405  *   - must not have a pmu::count method
4406  */
4407 int perf_event_read_local(struct perf_event *event, u64 *value,
4408                           u64 *enabled, u64 *running)
4409 {
4410         unsigned long flags;
4411         int ret = 0;
4412 
4413         /*
4414          * Disabling interrupts avoids all counter scheduling (context
4415          * switches, timer based rotation and IPIs).
4416          */
4417         local_irq_save(flags);
4418 
4419         /*
4420          * It must not be an event with inherit set, we cannot read
4421          * all child counters from atomic context.
4422          */
4423         if (event->attr.inherit) {
4424                 ret = -EOPNOTSUPP;
4425                 goto out;
4426         }
4427 
4428         /* If this is a per-task event, it must be for current */
4429         if ((event->attach_state & PERF_ATTACH_TASK) &&
4430             event->hw.target != current) {
4431                 ret = -EINVAL;
4432                 goto out;
4433         }
4434 
4435         /* If this is a per-CPU event, it must be for this CPU */
4436         if (!(event->attach_state & PERF_ATTACH_TASK) &&
4437             event->cpu != smp_processor_id()) {
4438                 ret = -EINVAL;
4439                 goto out;
4440         }
4441 
4442         /* If this is a pinned event it must be running on this CPU */
4443         if (event->attr.pinned && event->oncpu != smp_processor_id()) {
4444                 ret = -EBUSY;
4445                 goto out;
4446         }
4447 
4448         /*
4449          * If the event is currently on this CPU, it's either a per-task event
4450          * or local to this CPU. Furthermore, it means it's ACTIVE (otherwise
4451          * oncpu == -1).
4452          */
4453         if (event->oncpu == smp_processor_id())
4454                 event->pmu->read(event);
4455 
4456         *value = local64_read(&event->count);
4457         if (enabled || running) {
4458                 u64 __enabled, __running, __now;
4459 
4460                 calc_timer_values(event, &__now, &__enabled, &__running);
4461                 if (enabled)
4462                         *enabled = __enabled;
4463                 if (running)
4464                         *running = __running;
4465         }
4466 out:
4467         local_irq_restore(flags);
4468 
4469         return ret;
4470 }
4471 
4472 static int perf_event_read(struct perf_event *event, bool group)
4473 {
4474         enum perf_event_state state = READ_ONCE(event->state);
4475         int event_cpu, ret = 0;
4476 
4477         /*
4478          * If event is enabled and currently active on a CPU, update the
4479          * value in the event structure:
4480          */
4481 again:
4482         if (state == PERF_EVENT_STATE_ACTIVE) {
4483                 struct perf_read_data data;
4484 
4485                 /*
4486                  * Orders the ->state and ->oncpu loads such that if we see
4487                  * ACTIVE we must also see the right ->oncpu.
4488                  *
4489                  * Matches the smp_wmb() from event_sched_in().
4490                  */
4491                 smp_rmb();
4492 
4493                 event_cpu = READ_ONCE(event->oncpu);
4494                 if ((unsigned)event_cpu >= nr_cpu_ids)
4495                         return 0;
4496 
4497                 data = (struct perf_read_data){
4498                         .event = event,
4499                         .group = group,
4500                         .ret = 0,
4501                 };
4502 
4503                 preempt_disable();
4504                 event_cpu = __perf_event_read_cpu(event, event_cpu);
4505 
4506                 /*
4507                  * Purposely ignore the smp_call_function_single() return
4508                  * value.
4509                  *
4510                  * If event_cpu isn't a valid CPU it means the event got
4511                  * scheduled out and that will have updated the event count.
4512                  *
4513                  * Therefore, either way, we'll have an up-to-date event count
4514                  * after this.
4515                  */
4516                 (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
4517                 preempt_enable();
4518                 ret = data.ret;
4519 
4520         } else if (state == PERF_EVENT_STATE_INACTIVE) {
4521                 struct perf_event_context *ctx = event->ctx;
4522                 unsigned long flags;
4523 
4524                 raw_spin_lock_irqsave(&ctx->lock, flags);
4525                 state = event->state;
4526                 if (state != PERF_EVENT_STATE_INACTIVE) {
4527                         raw_spin_unlock_irqrestore(&ctx->lock, flags);
4528                         goto again;
4529                 }
4530 
4531                 /*
4532                  * May read while the context is not active (e.g., the thread is
4533                  * blocked); in that case we cannot update the context time.
4534                  */
4535                 if (ctx->is_active & EVENT_TIME) {
4536                         update_context_time(ctx);
4537                         update_cgrp_time_from_event(event);
4538                 }
4539 
4540                 perf_event_update_time(event);
4541                 if (group)
4542                         perf_event_update_sibling_time(event);
4543                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4544         }
4545 
4546         return ret;
4547 }
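/*
 * [Editorial illustration, not part of core.c] The enabled/running times kept
 * up to date by perf_event_read() above are what userspace uses to scale the
 * counts of multiplexed events. A hedged sketch (the counter choice is
 * arbitrary and error handling is elided):
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/perf_event.h>

int example_scaled_read(void)
{
	struct {
		uint64_t value;
		uint64_t time_enabled;
		uint64_t time_running;
	} rf = { 0, 0, 0 };
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CACHE_MISSES;
	attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
			   PERF_FORMAT_TOTAL_TIME_RUNNING;

	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		return -1;

	/* ... run the workload of interest ... */

	read(fd, &rf, sizeof(rf));
	close(fd);

	/* If the event was multiplexed, extrapolate to the full enabled time. */
	double scaled = rf.time_running ?
		(double)rf.value * rf.time_enabled / rf.time_running :
		(double)rf.value;
	printf("~%.0f cache misses\n", scaled);
	return 0;
}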
4548 
4549 /*
4550  * Initialize the perf_event context in a task_struct:
4551  */
4552 static void __perf_event_init_context(struct perf_event_context *ctx)
4553 {
4554         raw_spin_lock_init(&ctx->lock);
4555         mutex_init(&ctx->mutex);
4556         INIT_LIST_HEAD(&ctx->active_ctx_list);
4557         perf_event_groups_init(&ctx->pinned_groups);
4558         perf_event_groups_init(&ctx->flexible_groups);
4559         INIT_LIST_HEAD(&ctx->event_list);
4560         INIT_LIST_HEAD(&ctx->pinned_active);
4561         INIT_LIST_HEAD(&ctx->flexible_active);
4562         refcount_set(&ctx->refcount, 1);
4563 }
4564 
4565 static struct perf_event_context *
4566 alloc_perf_context(struct pmu *pmu, struct task_struct *task)
4567 {
4568         struct perf_event_context *ctx;
4569 
4570         ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4571         if (!ctx)
4572                 return NULL;
4573 
4574         __perf_event_init_context(ctx);
4575         if (task)
4576                 ctx->task = get_task_struct(task);
4577         ctx->pmu = pmu;
4578 
4579         return ctx;
4580 }
4581 
4582 static struct task_struct *
4583 find_lively_task_by_vpid(pid_t vpid)
4584 {
4585         struct task_struct *task;
4586 
4587         rcu_read_lock();
4588         if (!vpid)
4589                 task = current;
4590         else
4591                 task = find_task_by_vpid(vpid);
4592         if (task)
4593                 get_task_struct(task);
4594         rcu_read_unlock();
4595 
4596         if (!task)
4597                 return ERR_PTR(-ESRCH);
4598 
4599         return task;
4600 }
4601 
4602 /*
4603  * Returns a matching context with refcount and pincount.
4604  */
4605 static struct perf_event_context *
4606 find_get_context(struct pmu *pmu, struct task_struct *task,
4607                 struct perf_event *event)
4608 {
4609         struct perf_event_context *ctx, *clone_ctx = NULL;
4610         struct perf_cpu_context *cpuctx;
4611         void *task_ctx_data = NULL;
4612         unsigned long flags;
4613         int ctxn, err;
4614         int cpu = event->cpu;
4615 
4616         if (!task) {
4617                 /* Must be root to operate on a CPU event: */
4618                 err = perf_allow_cpu(&event->attr);
4619                 if (err)
4620                         return ERR_PTR(err);
4621 
4622                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
4623                 ctx = &cpuctx->ctx;
4624                 get_ctx(ctx);
4625                 raw_spin_lock_irqsave(&ctx->lock, flags);
4626                 ++ctx->pin_count;
4627                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4628 
4629                 return ctx;
4630         }
4631 
4632         err = -EINVAL;
4633         ctxn = pmu->task_ctx_nr;
4634         if (ctxn < 0)
4635                 goto errout;
4636 
4637         if (event->attach_state & PERF_ATTACH_TASK_DATA) {
4638                 task_ctx_data = alloc_task_ctx_data(pmu);
4639                 if (!task_ctx_data) {
4640                         err = -ENOMEM;
4641                         goto errout;
4642                 }
4643         }
4644 
4645 retry:
4646         ctx = perf_lock_task_context(task, ctxn, &flags);
4647         if (ctx) {
4648                 clone_ctx = unclone_ctx(ctx);
4649                 ++ctx->pin_count;
4650 
4651                 if (task_ctx_data && !ctx->task_ctx_data) {
4652                         ctx->task_ctx_data = task_ctx_data;
4653                         task_ctx_data = NULL;
4654                 }
4655                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4656 
4657                 if (clone_ctx)
4658                         put_ctx(clone_ctx);
4659         } else {
4660                 ctx = alloc_perf_context(pmu, task);
4661                 err = -ENOMEM;
4662                 if (!ctx)
4663                         goto errout;
4664 
4665                 if (task_ctx_data) {
4666                         ctx->task_ctx_data = task_ctx_data;
4667                         task_ctx_data = NULL;
4668                 }
4669 
4670                 err = 0;
4671                 mutex_lock(&task->perf_event_mutex);
4672                 /*
4673                  * If it has already passed perf_event_exit_task(),
4674                  * we must see PF_EXITING; it takes this mutex too.
4675                  */
4676                 if (task->flags & PF_EXITING)
4677                         err = -ESRCH;
4678                 else if (task->perf_event_ctxp[ctxn])
4679                         err = -EAGAIN;
4680                 else {
4681                         get_ctx(ctx);
4682                         ++ctx->pin_count;
4683                         rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
4684                 }
4685                 mutex_unlock(&task->perf_event_mutex);
4686 
4687                 if (unlikely(err)) {
4688                         put_ctx(ctx);
4689 
4690                         if (err == -EAGAIN)
4691                                 goto retry;
4692                         goto errout;
4693                 }
4694         }
4695 
4696         free_task_ctx_data(pmu, task_ctx_data);
4697         return ctx;
4698 
4699 errout:
4700         free_task_ctx_data(pmu, task_ctx_data);
4701         return ERR_PTR(err);
4702 }
4703 
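A rough sketch of the calling convention (not from this file; names and error
handling abbreviated): find_get_context() hands back the context with both a
reference (get_ctx()) and a pin (pin_count) held, or an ERR_PTR() on failure.

        ctx = find_get_context(pmu, task, event);
        if (IS_ERR(ctx))
                return PTR_ERR(ctx);    /* e.g. -EACCES, -ENOMEM, -ESRCH */

        /* ... attach the event under ctx->mutex ... */

        perf_unpin_context(ctx);        /* drop the pin_count taken above */
        /* error paths additionally put_ctx(ctx) to drop the reference */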
4704 static void perf_event_free_filter(struct perf_event *event);
4705 
4706 static void free_event_rcu(struct rcu_head *head)
4707 {
4708         struct perf_event *event;
4709 
4710         event = container_of(head, struct perf_event, rcu_head);
4711         if (event->ns)
4712                 put_pid_ns(event->ns);
4713         perf_event_free_filter(event);
4714         kmem_cache_free(perf_event_cache, event);
4715 }
4716 
4717 static void ring_buffer_attach(struct perf_event *event,
4718                                struct perf_buffer *rb);
4719 
4720 static void detach_sb_event(struct perf_event *event)
4721 {
4722         struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
4723 
4724         raw_spin_lock(&pel->lock);
4725         list_del_rcu(&event->sb_list);
4726         raw_spin_unlock(&pel->lock);
4727 }
4728 
4729 static bool is_sb_event(struct perf_event *event)
4730 {
4731         struct perf_event_attr *attr = &event->attr;
4732 
4733         if (event->parent)
4734                 return false;
4735 
4736         if (event->attach_state & PERF_ATTACH_TASK)
4737                 return false;
4738 
4739         if (attr->mmap || attr->mmap_data || attr->mmap2 ||
4740             attr->comm || attr->comm_exec ||
4741             attr->task || attr->ksymbol ||
4742             attr->context_switch || attr->text_poke ||
4743             attr->bpf_event)
4744                 return true;
4745         return false;
4746 }
4747 
4748 static void unaccount_pmu_sb_event(struct perf_event *event)
4749 {
4750         if (is_sb_event(event))
4751                 detach_sb_event(event);
4752 }
4753 
4754 static void unaccount_event_cpu(struct perf_event *event, int cpu)
4755 {
4756         if (event->parent)
4757                 return;
4758 
4759         if (is_cgroup_event(event))
4760                 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
4761 }
4762 
4763 #ifdef CONFIG_NO_HZ_FULL
4764 static DEFINE_SPINLOCK(nr_freq_lock);
4765 #endif
4766 
4767 static void unaccount_freq_event_nohz(void)
4768 {
4769 #ifdef CONFIG_NO_HZ_FULL
4770         spin_lock(&nr_freq_lock);
4771         if (atomic_dec_and_test(&nr_freq_events))
4772                 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
4773         spin_unlock(&nr_freq_lock);
4774 #endif
4775 }
4776 
4777 static void unaccount_freq_event(void)
4778 {
4779         if (tick_nohz_full_enabled())
4780                 unaccount_freq_event_nohz();
4781         else
4782                 atomic_dec(&nr_freq_events);
4783 }
4784 
4785 static void unaccount_event(struct perf_event *event)
4786 {
4787         bool dec = false;
4788 
4789         if (event->parent)
4790                 return;
4791 
4792         if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
4793                 dec = true;
4794         if (event->attr.mmap || event->attr.mmap_data)
4795                 atomic_dec(&nr_mmap_events);
4796         if (event->attr.build_id)
4797                 atomic_dec(&nr_build_id_events);
4798         if (event->attr.comm)
4799                 atomic_dec(&nr_comm_events);
4800         if (event->attr.namespaces)
4801                 atomic_dec(&nr_namespaces_events);
4802         if (event->attr.cgroup)
4803                 atomic_dec(&nr_cgroup_events);
4804         if (event->attr.task)
4805                 atomic_dec(&nr_task_events);
4806         if (event->attr.freq)
4807                 unaccount_freq_event();
4808         if (event->attr.context_switch) {
4809                 dec = true;
4810                 atomic_dec(&nr_switch_events);
4811         }
4812         if (is_cgroup_event(event))
4813                 dec = true;
4814         if (has_branch_stack(event))
4815                 dec = true;
4816         if (event->attr.ksymbol)
4817                 atomic_dec(&nr_ksymbol_events);
4818         if (event->attr.bpf_event)
4819                 atomic_dec(&nr_bpf_events);
4820         if (event->attr.text_poke)
4821                 atomic_dec(&nr_text_poke_events);
4822 
4823         if (dec) {
4824                 if (!atomic_add_unless(&perf_sched_count, -1, 1))
4825                         schedule_delayed_work(&perf_sched_work, HZ);
4826         }
4827 
4828         unaccount_event_cpu(event, event->cpu);
4829 
4830         unaccount_pmu_sb_event(event);
4831 }
4832 
4833 static void perf_sched_delayed(struct work_struct *work)
4834 {
4835         mutex_lock(&perf_sched_mutex);
4836         if (atomic_dec_and_test(&perf_sched_count))
4837                 static_branch_disable(&perf_sched_events);
4838         mutex_unlock(&perf_sched_mutex);
4839 }
4840 
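The dec path above deliberately never drops perf_sched_count below 1 directly;
the final decrement and the static-key flip are deferred by about a second so
short-lived events do not toggle perf_sched_events back and forth. Roughly
(illustration only, not kernel source):

        /* perf_sched_count == 1, this event is the key's last user */
        atomic_add_unless(&perf_sched_count, -1, 1);    /* fails, count stays 1 */
        schedule_delayed_work(&perf_sched_work, HZ);    /* ~1 second later ...  */

        /* ... perf_sched_delayed() then runs: */
        if (atomic_dec_and_test(&perf_sched_count))     /* 1 -> 0 */
                static_branch_disable(&perf_sched_events);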
4841 /*
4842  * The following implement mutual exclusion of events on "exclusive" pmus
4843  * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
4844  * at a time, so we disallow creating events that might conflict, namely:
4845  *
4846  *  1) cpu-wide events in the presence of per-task events,
4847  *  2) per-task events in the presence of cpu-wide events,
4848  *  3) two matching events on the same context.
4849  *
4850  * The former two cases are handled in the allocation path (perf_event_alloc(),
4851  * _free_event()), the latter -- before the first perf_install_in_context().
4852  */
4853 static int exclusive_event_init(struct perf_event *event)
4854 {
4855         struct pmu *pmu = event->pmu;
4856 
4857         if (!is_exclusive_pmu(pmu))
4858                 return 0;
4859 
4860         /*
4861          * Prevent co-existence of per-task and cpu-wide events on the
4862          * same exclusive pmu.
4863          *
4864          * Negative pmu::exclusive_cnt means there are cpu-wide
4865          * events on this "exclusive" pmu, positive means there are
4866          * per-task events.
4867          *
4868          * Since this is called in perf_event_alloc() path, event::ctx
4869          * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
4870          * to mean "per-task event", because unlike other attach states it
4871          * never gets cleared.
4872          */
4873         if (event->attach_state & PERF_ATTACH_TASK) {
4874                 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
4875                         return -EBUSY;
4876         } else {
4877                 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
4878                         return -EBUSY;
4879         }
4880 
4881         return 0;
4882 }
4883 
4884 static void exclusive_event_destroy(struct perf_event *event)
4885 {
4886         struct pmu *pmu = event->pmu;
4887 
4888         if (!is_exclusive_pmu(pmu))
4889                 return;
4890 
4891         /* see comment in exclusive_event_init() */
4892         if (event->attach_state & PERF_ATTACH_TASK)
4893                 atomic_dec(&pmu->exclusive_cnt);
4894         else
4895                 atomic_inc(&pmu->exclusive_cnt);
4896 }
4897 
4898 static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
4899 {
4900         if ((e1->pmu == e2->pmu) &&
4901             (e1->cpu == e2->cpu ||
4902              e1->cpu == -1 ||
4903              e2->cpu == -1))
4904                 return true;
4905         return false;
4906 }
4907 
4908 static bool exclusive_event_installable(struct perf_event *event,
4909                                         struct perf_event_context *ctx)
4910 {
4911         struct perf_event *iter_event;
4912         struct pmu *pmu = event->pmu;
4913 
4914         lockdep_assert_held(&ctx->mutex);
4915 
4916         if (!is_exclusive_pmu(pmu))
4917                 return true;
4918 
4919         list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
4920                 if (exclusive_event_match(iter_event, event))
4921                         return false;
4922         }
4923 
4924         return true;
4925 }
4926 
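pmu->exclusive_cnt therefore behaves like a signed ownership counter: positive
while only per-task events exist, negative while only cpu-wide events exist.
A worked example with hypothetical events (illustration only):

        exclusive_event_init(task_ev1);    /* cnt:  0 -> +1                  */
        exclusive_event_init(task_ev2);    /* cnt: +1 -> +2                  */
        exclusive_event_init(cpu_ev);      /* dec_unless_positive() fails:   */
                                           /* -EBUSY, cnt stays +2           */
        exclusive_event_destroy(task_ev2); /* cnt: +2 -> +1                  */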
4927 static void perf_addr_filters_splice(struct perf_event *event,
4928                                        struct list_head *head);
4929 
4930 static void _free_event(struct perf_event *event)
4931 {
4932         irq_work_sync(&event->pending);
4933 
4934         unaccount_event(event);
4935 
4936         security_perf_event_free(event);
4937 
4938         if (event->rb) {
4939                 /*
4940                  * Can happen when we close an event with re-directed output.
4941                  *
4942                  * Since we have a 0 refcount, perf_mmap_close() will skip
4943                  * over us; possibly making our ring_buffer_put() the last.
4944                  */
4945                 mutex_lock(&event->mmap_mutex);
4946                 ring_buffer_attach(event, NULL);
4947                 mutex_unlock(&event->mmap_mutex);
4948         }
4949 
4950         if (is_cgroup_event(event))
4951                 perf_detach_cgroup(event);
4952 
4953         if (!event->parent) {
4954                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
4955                         put_callchain_buffers();
4956         }
4957 
4958         perf_event_free_bpf_prog(event);
4959         perf_addr_filters_splice(event, NULL);
4960         kfree(event->addr_filter_ranges);
4961 
4962         if (event->destroy)
4963                 event->destroy(event);
4964 
4965         /*
4966          * Must be after ->destroy(), due to uprobe_perf_close() using
4967          * hw.target.
4968          */
4969         if (event->hw.target)
4970                 put_task_struct(event->hw.target);
4971 
4972         /*
4973          * perf_event_free_task() relies on put_ctx() being 'last', in particular
4974          * all task references must be cleaned up.
4975          */
4976         if (event->ctx)
4977                 put_ctx(event->ctx);
4978 
4979         exclusive_event_destroy(event);
4980         module_put(event->pmu->module);
4981 
4982         call_rcu(&event->rcu_head, free_event_rcu);
4983 }
4984 
4985 /*
4986  * Used to free events which have a known refcount of 1, such as in error paths
4987  * where the event isn't exposed yet, and for inherited events.
4988  */
4989 static void free_event(struct perf_event *event)
4990 {
4991         if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
4992                                 "unexpected event refcount: %ld; ptr=%p\n",
4993                                 atomic_long_read(&event->refcount), event)) {
4994                 /* leak to avoid use-after-free */
4995                 return;
4996         }
4997 
4998         _free_event(event);
4999 }
5000 
5001 /*
5002  * Remove user event from the owner task.
5003  */
5004 static void perf_remove_from_owner(struct perf_event *event)
5005 {
5006         struct task_struct *owner;
5007 
5008         rcu_read_lock();
5009         /*
5010          * Matches the smp_store_release() in perf_event_exit_task(). If we
5011          * observe !owner it means the list deletion is complete and we can
5012          * indeed free this event, otherwise we need to serialize on
5013          * owner->perf_event_mutex.
5014          */
5015         owner = READ_ONCE(event->owner);
5016         if (owner) {
5017                 /*
5018                  * Since delayed_put_task_struct() also drops the last
5019                  * task reference we can safely take a new reference
5020                  * while holding the rcu_read_lock().
5021                  */
5022                 get_task_struct(owner);
5023         }
5024         rcu_read_unlock();
5025 
5026         if (owner) {
5027                 /*
5028                  * If we're here through perf_event_exit_task() we're already
5029                  * holding ctx->mutex which would be an inversion wrt. the
5030                  * normal lock order.
5031                  *
5032                  * However, we can safely take this lock because it's the child
5033                  * ctx->mutex.
5034                  */
5035                 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
5036 
5037                 /*
5038                  * We have to re-check the event->owner field, if it is cleared
5039                  * we raced with perf_event_exit_task(), acquiring the mutex
5040                  * ensured they're done, and we can proceed with freeing the
5041                  * event.
5042                  */
5043                 if (event->owner) {
5044                         list_del_init(&event->owner_entry);
5045                         smp_store_release(&event->owner, NULL);
5046                 }
5047                 mutex_unlock(&owner->perf_event_mutex);
5048                 put_task_struct(owner);
5049         }
5050 }
5051 
5052 static void put_event(struct perf_event *event)
5053 {
5054         if (!atomic_long_dec_and_test(&event->refcount))
5055                 return;
5056 
5057         _free_event(event);
5058 }
5059 
5060 /*
5061  * Kill an event dead; while event::refcount will preserve the event
5062  * object, it will not preserve its functionality. Once the last 'user'
5063  * gives up the object, we'll destroy the thing.
5064  */
5065 int perf_event_release_kernel(struct perf_event *event)
5066 {
5067         struct perf_event_context *ctx = event->ctx;
5068         struct perf_event *child, *tmp;
5069         LIST_HEAD(free_list);
5070 
5071         /*
5072          * If we got here through err_file: fput(event_file); we will not have
5073          * attached to a context yet.
5074          */
5075         if (!ctx) {
5076                 WARN_ON_ONCE(event->attach_state &
5077                                 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
5078                 goto no_ctx;
5079         }
5080 
5081         if (!is_kernel_event(event))
5082                 perf_remove_from_owner(event);
5083 
5084         ctx = perf_event_ctx_lock(event);
5085         WARN_ON_ONCE(ctx->parent_ctx);
5086         perf_remove_from_context(event, DETACH_GROUP);
5087 
5088         raw_spin_lock_irq(&ctx->lock);
5089         /*
5090          * Mark this event as STATE_DEAD, there is no external reference to it
5091          * anymore.
5092          *
5093          * Anybody acquiring event->child_mutex after the below loop _must_
5094          * also see this, most importantly inherit_event() which will avoid
5095          * placing more children on the list.
5096          *
5097          * Thus this guarantees that we will in fact observe and kill _ALL_
5098          * child events.
5099          */
5100         event->state = PERF_EVENT_STATE_DEAD;
5101         raw_spin_unlock_irq(&ctx->lock);
5102 
5103         perf_event_ctx_unlock(event, ctx);
5104 
5105 again:
5106         mutex_lock(&event->child_mutex);
5107         list_for_each_entry(child, &event->child_list, child_list) {
5108 
5109                 /*
5110                  * Cannot change, child events are not migrated, see the
5111                  * comment with perf_event_ctx_lock_nested().
5112                  */
5113                 ctx = READ_ONCE(child->ctx);
5114                 /*
5115                  * Since child_mutex nests inside ctx::mutex, we must jump
5116                  * through hoops. We start by grabbing a reference on the ctx.
5117                  *
5118                  * Since the event cannot get freed while we hold the
5119                  * child_mutex, the context must also exist and have a !0
5120                  * reference count.
5121                  */
5122                 get_ctx(ctx);
5123 
5124                 /*
5125                  * Now that we have a ctx ref, we can drop child_mutex, and
5126                  * acquire ctx::mutex without fear of it going away. Then we
5127                  * can re-acquire child_mutex.
5128                  */
5129                 mutex_unlock(&event->child_mutex);
5130                 mutex_lock(&ctx->mutex);
5131                 mutex_lock(&event->child_mutex);
5132 
5133                 /*
5134                  * Now that we hold ctx::mutex and child_mutex, revalidate our
5135                  * state, if child is still the first entry, it didn't get freed
5136                  * and we can continue doing so.
5137                  */
5138                 tmp = list_first_entry_or_null(&event->child_list,
5139                                                struct perf_event, child_list);
5140                 if (tmp == child) {
5141                         perf_remove_from_context(child, DETACH_GROUP);
5142                         list_move(&child->child_list, &free_list);
5143                         /*
5144                          * This matches the refcount bump in inherit_event();
5145                          * this can't be the last reference.
5146                          */
5147                         put_event(event);
5148                 }
5149 
5150                 mutex_unlock(&event->child_mutex);
5151                 mutex_unlock(&ctx->mutex);
5152                 put_ctx(ctx);
5153                 goto again;
5154         }
5155         mutex_unlock(&event->child_mutex);
5156 
5157         list_for_each_entry_safe(child, tmp, &free_list, child_list) {
5158                 void *var = &child->ctx->refcount;
5159 
5160                 list_del(&child->child_list);
5161                 free_event(child);
5162 
5163                 /*
5164                  * Wake any perf_event_free_task() waiting for this event to be
5165                  * freed.
5166                  */
5167                 smp_mb(); /* pairs with wait_var_event() */
5168                 wake_up_var(var);
5169         }
5170 
5171 no_ctx:
5172         put_event(event); /* Must be the 'last' reference */
5173         return 0;
5174 }
5175 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
5176 
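In-kernel users normally pair this with perf_event_create_kernel_counter(); a
hedged sketch (simplified attr, error handling trimmed, not taken from core.c):

        struct perf_event_attr attr = {
                .type     = PERF_TYPE_HARDWARE,
                .config   = PERF_COUNT_HW_CPU_CYCLES,
                .size     = sizeof(attr),
                .disabled = 1,
        };
        struct perf_event *ev;

        ev = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL);
        if (IS_ERR(ev))
                return PTR_ERR(ev);
        /* ... use the counter ... */
        perf_event_release_kernel(ev);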
5177 /*
5178  * Called when the last reference to the file is gone.
5179  */
5180 static int perf_release(struct inode *inode, struct file *file)
5181 {
5182         perf_event_release_kernel(file->private_data);
5183         return 0;
5184 }
5185 
5186 static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5187 {
5188         struct perf_event *child;
5189         u64 total = 0;
5190 
5191         *enabled = 0;
5192         *running = 0;
5193 
5194         mutex_lock(&event->child_mutex);
5195 
5196         (void)perf_event_read(event, false);
5197         total += perf_event_count(event);
5198 
5199         *enabled += event->total_time_enabled +
5200                         atomic64_read(&event->child_total_time_enabled);
5201         *running += event->total_time_running +
5202                         atomic64_read(&event->child_total_time_running);
5203 
5204         list_for_each_entry(child, &event->child_list, child_list) {
5205                 (void)perf_event_read(child, false);
5206                 total += perf_event_count(child);
5207                 *enabled += child->total_time_enabled;
5208                 *running += child->total_time_running;
5209         }
5210         mutex_unlock(&event->child_mutex);
5211 
5212         return total;
5213 }
5214 
5215 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5216 {
5217         struct perf_event_context *ctx;
5218         u64 count;
5219 
5220         ctx = perf_event_ctx_lock(event);
5221         count = __perf_event_read_value(event, enabled, running);
5222         perf_event_ctx_unlock(event, ctx);
5223 
5224         return count;
5225 }
5226 EXPORT_SYMBOL_GPL(perf_event_read_value);
5227 
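A brief usage sketch (assumed in-kernel caller, 'ev' is a placeholder): the
enabled/running pair lets callers scale the count when the event was
multiplexed.

        u64 enabled, running, count;

        count = perf_event_read_value(ev, &enabled, &running);
        if (running && running < enabled)
                count = div64_u64(count * enabled, running);    /* scaled estimate */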
5228 static int __perf_read_group_add(struct perf_event *leader,
5229                                         u64 read_format, u64 *values)
5230 {
5231         struct perf_event_context *ctx = leader->ctx;
5232         struct perf_event *sub;
5233         unsigned long flags;
5234         int n = 1; /* skip @nr */
5235         int ret;
5236 
5237         ret = perf_event_read(leader, true);
5238         if (ret)
5239                 return ret;
5240 
5241         raw_spin_lock_irqsave(&ctx->lock, flags);
5242 
5243         /*
5244          * Since we co-schedule groups, {enabled,running} times of siblings
5245          * will be identical to those of the leader, so we only publish one
5246          * set.
5247          */
5248         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5249                 values[n++] += leader->total_time_enabled +
5250                         atomic64_read(&leader->child_total_time_enabled);
5251         }
5252 
5253         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5254                 values[n++] += leader->total_time_running +
5255                         atomic64_read(&leader->child_total_time_running);
5256         }
5257 
5258         /*
5259          * Write {count,id} tuples for every sibling.
5260          */
5261         values[n++] += perf_event_count(leader);
5262         if (read_format & PERF_FORMAT_ID)
5263                 values[n++] = primary_event_id(leader);
5264 
5265         for_each_sibling_event(sub, leader) {
5266                 values[n++] += perf_event_count(sub);
5267                 if (read_format & PERF_FORMAT_ID)
5268                         values[n++] = primary_event_id(sub);
5269         }
5270 
5271         raw_spin_unlock_irqrestore(&ctx->lock, flags);
5272         return 0;
5273 }
5274 
5275 static int perf_read_group(struct perf_event *event,
5276                                    u64 read_format, char __user *buf)
5277 {
5278         struct perf_event *leader = event->group_leader, *child;
5279         struct perf_event_context *ctx = leader->ctx;
5280         int ret;
5281         u64 *values;
5282 
5283         lockdep_assert_held(&ctx->mutex);
5284 
5285         values = kzalloc(event->read_size, GFP_KERNEL);
5286         if (!values)
5287                 return -ENOMEM;
5288 
5289         values[0] = 1 + leader->nr_siblings;
5290 
5291         /*
5292          * By locking the child_mutex of the leader we effectively
5293          * lock the child list of all siblings. XXX: explain how.
5294          */
5295         mutex_lock(&leader->child_mutex);
5296 
5297         ret = __perf_read_group_add(leader, read_format, values);
5298         if (ret)
5299                 goto unlock;
5300 
5301         list_for_each_entry(child, &leader->child_list, child_list) {
5302                 ret = __perf_read_group_add(child, read_format, values);
5303                 if (ret)
5304                         goto unlock;
5305         }
5306 
5307         mutex_unlock(&leader->child_mutex);
5308 
5309         ret = event->read_size;
5310         if (copy_to_user(buf, values, event->read_size))
5311                 ret = -EFAULT;
5312         goto out;
5313 
5314 unlock:
5315         mutex_unlock(&leader->child_mutex);
5316 out:
5317         kfree(values);
5318         return ret;
5319 }
5320 
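The values[] array filled above matches the PERF_FORMAT_GROUP read layout that
userspace sees, as documented in perf_event_open(2) (optional fields depend on
read_format):

        struct read_format {
                u64 nr;                 /* number of events in the group      */
                u64 time_enabled;       /* if PERF_FORMAT_TOTAL_TIME_ENABLED  */
                u64 time_running;       /* if PERF_FORMAT_TOTAL_TIME_RUNNING  */
                struct {
                        u64 value;
                        u64 id;         /* if PERF_FORMAT_ID                  */
                } cntr[];               /* nr entries                         */
        };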
5321 static int perf_read_one(struct perf_event *event,
5322                                  u64 read_format, char __user *buf)
5323 {
5324         u64 enabled, running;
5325         u64 values[4];
5326         int n = 0;
5327 
5328         values[n++] = __perf_event_read_value(event, &enabled, &running);
5329         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5330                 values[n++] = enabled;
5331         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5332                 values[n++] = running;
5333         if (read_format & PERF_FORMAT_ID)
5334                 values[n++] = primary_event_id(event);
5335 
5336         if (copy_to_user(buf, values, n * sizeof(u64)))
5337                 return -EFAULT;
5338 
5339         return n * sizeof(u64);
5340 }
5341 
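For comparison, the non-group layout written by perf_read_one() above (again
per perf_event_open(2), optional fields depending on read_format):

        struct read_format {
                u64 value;
                u64 time_enabled;       /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
                u64 time_running;       /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
                u64 id;                 /* if PERF_FORMAT_ID                 */
        };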
5342 static bool is_event_hup(struct perf_event *event)
5343 {
5344         bool no_children;
5345 
5346         if (event->state > PERF_EVENT_STATE_EXIT)
5347                 return false;
5348 
5349         mutex_lock(&event->child_mutex);
5350         no_children = list_empty(&event->child_list);
5351         mutex_unlock(&event->child_mutex);
5352         return no_children;
5353 }
5354 
5355 /*
5356  * Read the performance event - simple non-blocking version for now
5357  */
5358 static ssize_t
5359 __perf_read(struct perf_event *event, char __user *buf, size_t count)
5360 {
5361         u64 read_format = event->attr.read_format;
5362         int ret;
5363 
5364         /*
5365          * Return end-of-file for a read on an event that is in
5366          * error state (i.e. because it was pinned but it couldn't be
5367          * scheduled on to the CPU at some point).
5368          */
5369         if (event->state == PERF_EVENT_STATE_ERROR)
5370                 return 0;
5371 
5372         if (count < event->read_size)
5373                 return -ENOSPC;
5374 
5375         WARN_ON_ONCE(event->ctx->parent_ctx);
5376         if (read_format & PERF_FORMAT_GROUP)
5377                 ret = perf_read_group(event, read_format, buf);
5378         else
5379                 ret = perf_read_one(event, read_format, buf);
5380 
5381         return ret;
5382 }
5383 
5384 static ssize_t
5385 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
5386 {
5387         struct perf_event *event = file->private_data;
5388         struct perf_event_context *ctx;
5389         int ret;
5390 
5391         ret = security_perf_event_read(event);
5392         if (ret)
5393                 return ret;
5394 
5395         ctx = perf_event_ctx_lock(event);
5396         ret = __perf_read(event, buf, count);
5397         perf_event_ctx_unlock(event, ctx);
5398 
5399         return ret;
5400 }
5401 
5402 static __poll_t perf_poll(struct file *file, poll_table *wait)
5403 {
5404         struct perf_event *event = file->private_data;
5405         struct perf_buffer *rb;
5406         __poll_t events = EPOLLHUP;
5407 
5408         poll_wait(file, &event->waitq, wait);
5409 
5410         if (is_event_hup(event))
5411                 return events;
5412 
5413         /*
5414          * Pin the event->rb by taking event->mmap_mutex; otherwise
5415          * perf_event_set_output() can swizzle our rb and make us miss wakeups.
5416          */
5417         mutex_lock(&event->mmap_mutex);
5418         rb = event->rb;
5419         if (rb)
5420                 events = atomic_xchg(&rb->poll, 0);
5421         mutex_unlock(&event->mmap_mutex);
5422         return events;
5423 }
5424 
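From userspace this surfaces as POLLIN once the ring buffer crosses the wakeup
threshold (attr.wakeup_events / wakeup_watermark) and as POLLHUP once the event
and all its children have exited. A minimal sketch (assumed userspace code,
perf_fd from perf_event_open(2)):

        struct pollfd pfd = { .fd = perf_fd, .events = POLLIN };

        if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLHUP))
                ;       /* event is dead, stop reading */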
5425 static void _perf_event_reset(struct perf_event *event)
5426 {
5427         (void)perf_event_read(event, false);
5428         local64_set(&event->count, 0);
5429         perf_event_update_userpage(event);
5430 }
5431 
5432 /* Assume it's not an event with inherit set. */
5433 u64 perf_event_pause(struct perf_event *event, bool reset)
5434 {
5435         struct perf_event_context *ctx;
5436         u64 count;
5437 
5438         ctx = perf_event_ctx_lock(event);
5439         WARN_ON_ONCE(event->attr.inherit);
5440         _perf_event_disable(event);
5441         count = local64_read(&event->count);
5442         if (reset)
5443                 local64_set(&event->count, 0);
5444         perf_event_ctx_unlock(event, ctx);
5445 
5446         return count;
5447 }
5448 EXPORT_SYMBOL_GPL(perf_event_pause);
5449 
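Intended use, roughly (assumed caller, 'ev' is a placeholder): read and
optionally zero the count while leaving the event disabled.

        u64 count = perf_event_pause(ev, true); /* disable, read, reset to 0 */
        /* ... consume count; re-enable later with perf_event_enable(ev) ... */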
5450 /*
5451  * Holding the top-level event's child_mutex means that any
5452  * descendant process that has inherited this event will block
5453  * in perf_event_exit_event() if it goes to exit, thus satisfying the
5454  * task existence requirements of perf_event_enable/disable.
5455  */
5456 static void perf_event_for_each_child(struct perf_event *event,
5457                                         void (*func)(struct perf_event *))
5458 {
5459         struct perf_event *child;
5460 
5461         WARN_ON_ONCE(event->ctx->parent_ctx);
5462 
5463         mutex_lock(&event->child_mutex);
5464         func(event);
5465         list_for_each_entry(child, &event->child_list, child_list)
5466                 func(child);
5467         mutex_unlock(&event->child_mutex);
5468 }
5469 
5470 static void perf_event_for_each(struct perf_event *event,
5471                                   void (*func)(struct perf_event *))
5472 {
5473         struct perf_event_context *ctx = event->ctx;
5474         struct perf_event *sibling;
5475 
5476         lockdep_assert_held(&ctx->mutex);
5477 
5478         event = event->group_leader;
5479 
5480         perf_event_for_each_child(event, func);
5481         for_each_sibling_event(sibling, event)
5482                 perf_event_for_each_child(sibling, func);
5483 }
5484 
5485 static void __perf_event_period(struct perf_event *event,
5486                                 struct perf_cpu_context *cpuctx,
5487                                 struct perf_event_context *ctx,
5488                                 void *info)
5489 {
5490         u64 value = *((u64 *)info);
5491         bool active;
5492 
5493         if (event->attr.freq) {
5494                 event->attr.sample_freq = value;
5495         } else {
5496                 event->attr.sample_period = value;
5497                 event->hw.sample_period = value;
5498         }
5499 
5500         active = (event->state == PERF_EVENT_STATE_ACTIVE);
5501         if (active) {
5502                 perf_pmu_disable(ctx->pmu);
5503                 /*
5504                  * We could be throttled; unthrottle now to avoid the tick
5505                  * trying to unthrottle while we already re-started the event.
5506                  */
5507                 if (event->hw.interrupts == MAX_INTERRUPTS) {
5508                         event->hw.interrupts = 0;
5509                         perf_log_throttle(event, 1);
5510                 }
5511                 event->pmu->stop(event, PERF_EF_UPDATE);
5512         }
5513 
5514         local64_set(&event->hw.period_left, 0);
5515 
5516         if (active) {
5517                 event->pmu->start(event, PERF_EF_RELOAD);
5518                 perf_pmu_enable(ctx->pmu);
5519         }
5520 }
5521 
5522 static int perf_event_check_period(struct perf_event *event, u64 value)
5523 {
5524         return event->pmu->check_period(event, value);
5525 }
5526 
5527 static int _perf_event_period(struct perf_event *event, u64 value)
5528 {
5529         if (!is_sampling_event(event))
5530                 return -EINVAL;
5531 
5532         if (!value)
5533                 return -EINVAL;
5534 
5535         if (event->attr.freq && value > sysctl_perf_event_sample_rate)
5536                 return -EINVAL;
5537 
5538         if (perf_event_check_period(event, value))
5539                 return -EINVAL;
5540 
5541         if (!event->attr.freq && (value & (1ULL << 63)))
5542                 return -EINVAL;
5543 
5544         event_function_call(event, __perf_event_period, &value);
5545 
5546         return 0;
5547 }
5548 
5549 int perf_event_period(struct perf_event *event, u64 value)
5550 {
5551         struct perf_event_context *ctx;
5552         int ret;
5553 
5554         ctx = perf_event_ctx_lock(event);
5555         ret = _perf_event_period(event, value);
5556         perf_event_ctx_unlock(event, ctx);
5557 
5558         return ret;
5559 }
5560 EXPORT_SYMBOL_GPL(perf_event_period);
5561 
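This is the in-kernel counterpart of the PERF_EVENT_IOC_PERIOD ioctl handled
further down, e.g. (assumed caller):

        err = perf_event_period(ev, 100000);    /* sample once per 100000 events */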
5562 static const struct file_operations perf_fops;
5563 
5564 static inline int perf_fget_light(int fd, struct fd *p)
5565 {
5566         struct fd f = fdget(fd);
5567         if (!f.file)
5568                 return -EBADF;
5569 
5570         if (f.file->f_op != &perf_fops) {
5571                 fdput(f);
5572                 return -EBADF;
5573         }
5574         *p = f;
5575         return 0;
5576 }
5577 
5578 static int perf_event_set_output(struct perf_event *event,
5579                                  struct perf_event *output_event);
5580 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
5581 static int perf_copy_attr(struct perf_event_attr __user *uattr,
5582                           struct perf_event_attr *attr);
5583 
5584 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
5585 {
5586         void (*func)(struct perf_event *);
5587         u32 flags = arg;
5588 
5589         switch (cmd) {
5590         case PERF_EVENT_IOC_ENABLE:
5591                 func = _perf_event_enable;
5592                 break;
5593         case PERF_EVENT_IOC_DISABLE:
5594                 func = _perf_event_disable;
5595                 break;
5596         case PERF_EVENT_IOC_RESET:
5597                 func = _perf_event_reset;
5598                 break;
5599 
5600         case PERF_EVENT_IOC_REFRESH:
5601                 return _perf_event_refresh(event, arg);
5602 
5603         case PERF_EVENT_IOC_PERIOD:
5604         {
5605                 u64 value;
5606 
5607                 if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
5608                         return -EFAULT;
5609 
5610                 return _perf_event_period(event, value);
5611         }
5612         case PERF_EVENT_IOC_ID:
5613         {
5614                 u64 id = primary_event_id(event);
5615 
5616                 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
5617                         return -EFAULT;
5618                 return 0;
5619         }
5620 
5621         case PERF_EVENT_IOC_SET_OUTPUT:
5622         {
5623                 int ret;
5624                 if (arg != -1) {
5625                         struct perf_event *output_event;
5626                         struct fd output;
5627                         ret = perf_fget_light(arg, &output);
5628                         if (ret)
5629                                 return ret;
5630                         output_event = output.file->private_data;
5631                         ret = perf_event_set_output(event, output_event);
5632                         fdput(output);
5633                 } else {
5634                         ret = perf_event_set_output(event, NULL);
5635                 }
5636                 return ret;
5637         }
5638 
5639         case PERF_EVENT_IOC_SET_FILTER:
5640                 return perf_event_set_filter(event, (void __user *)arg);
5641 
5642         case PERF_EVENT_IOC_SET_BPF:
5643         {
5644                 struct bpf_prog *prog;
5645                 int err;
5646 
5647                 prog = bpf_prog_get(arg);
5648                 if (IS_ERR(prog))
5649                         return PTR_ERR(prog);
5650 
5651                 err = perf_event_set_bpf_prog(event, prog, 0);
5652                 if (err) {
5653                         bpf_prog_put(prog);
5654                         return err;
5655                 }
5656 
5657                 return 0;
5658         }
5659 
5660         case PERF_EVENT_IOC_PAUSE_OUTPUT: {
5661                 struct perf_buffer *rb;
5662 
5663                 rcu_read_lock();
5664                 rb = rcu_dereference(event->rb);
5665                 if (!rb || !rb->nr_pages) {
5666                         rcu_read_unlock();
5667                         return -EINVAL;
5668                 }
5669                 rb_toggle_paused(rb, !!arg);
5670                 rcu_read_unlock();
5671                 return 0;
5672         }
5673 
5674         case PERF_EVENT_IOC_QUERY_BPF:
5675                 return perf_event_query_prog_array(event, (void __user *)arg);
5676 
5677         case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
5678                 struct perf_event_attr new_attr;
5679                 int err = perf_copy_attr((struct perf_event_attr __user *)arg,
5680                                          &new_attr);
5681 
5682                 if (err)
5683                         return err;
5684 
5685                 return perf_event_modify_attr(event,  &new_attr);
5686         }
5687         default:
5688                 return -ENOTTY;
5689         }
5690 
5691         if (flags & PERF_IOC_FLAG_GROUP)
5692                 perf_event_for_each(event, func);
5693         else
5694                 perf_event_for_each_child(event, func);
5695 
5696         return 0;
5697 }
5698 
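Typical userspace driving of these ioctls is the classic reset/enable/run/
disable/read sequence (assumed userspace code, fd from perf_event_open(2)):

        ioctl(fd, PERF_EVENT_IOC_RESET, 0);
        ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
        /* ... workload ... */
        ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP); /* whole group */
        read(fd, &value, sizeof(value));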
5699 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
5700 {
5701         struct perf_event *event = file->private_data;
5702         struct perf_event_context *ctx;
5703         long ret;
5704 
5705         /* Treat ioctl like a write, as it is likely a mutating operation. */
5706         ret = security_perf_event_write(event);
5707         if (ret)
5708                 return ret;
5709 
5710         ctx = perf_event_ctx_lock(event);
5711         ret = _perf_ioctl(event, cmd, arg);
5712         perf_event_ctx_unlock(event, ctx);
5713 
5714         return ret;
5715 }
5716 
5717 #ifdef CONFIG_COMPAT
5718 static long perf_compat_ioctl(struct file *file, unsigned int cmd,
5719                                 unsigned long arg)
5720 {
5721         switch (_IOC_NR(cmd)) {
5722         case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
5723         case _IOC_NR(PERF_EVENT_IOC_ID):
5724         case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
5725         case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
5726                 /* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case) */
5727                 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
5728                         cmd &= ~IOCSIZE_MASK;
5729                         cmd |= sizeof(void *) << IOCSIZE_SHIFT;
5730                 }
5731                 break;
5732         }
5733         return perf_ioctl(file, cmd, arg);
5734 }
5735 #else
5736 # define perf_compat_ioctl NULL
5737 #endif
5738 
5739 int perf_event_task_enable(void)
5740 {
5741         struct perf_event_context *ctx;
5742         struct perf_event *event;
5743 
5744         mutex_lock(&current->perf_event_mutex);
5745         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5746                 ctx = perf_event_ctx_lock(event);
5747                 perf_event_for_each_child(event, _perf_event_enable);
5748                 perf_event_ctx_unlock(event, ctx);
5749         }
5750         mutex_unlock(&current->perf_event_mutex);
5751 
5752         return 0;
5753 }
5754 
5755 int perf_event_task_disable(void)
5756 {
5757         struct perf_event_context *ctx;
5758         struct perf_event *event;
5759 
5760         mutex_lock(&current->perf_event_mutex);
5761         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5762                 ctx = perf_event_ctx_lock(event);
5763                 perf_event_for_each_child(event, _perf_event_disable);
5764                 perf_event_ctx_unlock(event, ctx);
5765         }
5766         mutex_unlock(&current->perf_event_mutex);
5767 
5768         return 0;
5769 }
5770 
5771 static int perf_event_index(struct perf_event *event)
5772 {
5773         if (event->hw.state & PERF_HES_STOPPED)
5774                 return 0;
5775 
5776         if (event->state != PERF_EVENT_STATE_ACTIVE)
5777                 return 0;
5778 
5779         return event->pmu->event_idx(event);
5780 }
5781 
5782 static void perf_event_init_userpage(struct perf_event *event)
5783 {
5784         struct perf_event_mmap_page *userpg;
5785         struct perf_buffer *rb;
5786 
5787         rcu_read_lock();
5788         rb = rcu_dereference(event->rb);
5789         if (!rb)
5790                 goto unlock;
5791 
5792         userpg = rb->user_page;
5793 
5794         /* Allow new userspace to detect that bit 0 is deprecated */
5795         userpg->cap_bit0_is_deprecated = 1;
5796         userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
5797         userpg->data_offset = PAGE_SIZE;
5798         userpg->data_size = perf_data_size(rb);
5799 
5800 unlock:
5801         rcu_read_unlock();
5802 }
5803 
5804 void __weak arch_perf_update_userpage(
5805         struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
5806 {
5807 }
5808 
5809 /*
5810  * Callers need to ensure there can be no nesting of this function, otherwise
5811  * the seqlock logic goes bad. We cannot serialize this because the arch
5812  * code calls this from NMI context.
5813  */
5814 void perf_event_update_userpage(struct perf_event *event)
5815 {
5816         struct perf_event_mmap_page *userpg;
5817         struct perf_buffer *rb;
5818         u64 enabled, running, now;
5819 
5820         rcu_read_lock();
5821         rb = rcu_dereference(event->rb);
5822         if (!rb)
5823                 goto unlock;
5824 
5825         /*
5826          * compute total_time_enabled, total_time_running
5827          * based on snapshot values taken when the event
5828          * was last scheduled in.
5829          *
5830          * we cannot simply call update_context_time()
5831          * because of locking issues, as we can be called in
5832          * NMI context
5833          */
5834         calc_timer_values(event, &now, &enabled, &running);
5835 
5836         userpg = rb->user_page;
5837         /*
5838          * Disable preemption to guarantee consistent time stamps are stored to
5839          * the user page.
5840          */
5841         preempt_disable();
5842         ++userpg->lock;
5843         barrier();
5844         userpg->index = perf_event_index(event);
5845         userpg->offset = perf_event_count(event);
5846         if (userpg->index)
5847                 userpg->offset -= local64_read(&event->hw.prev_count);
5848 
5849         userpg->time_enabled = enabled +
5850                         atomic64_read(&event->child_total_time_enabled);
5851 
5852         userpg->time_running = running +
5853                         atomic64_read(&event->child_total_time_running);
5854 
5855         arch_perf_update_userpage(event, userpg, now);
5856 
5857         barrier();
5858         ++userpg->lock;
5859         preempt_enable();
5860 unlock:
5861         rcu_read_unlock();
5862 }
5863 EXPORT_SYMBOL_GPL(perf_event_update_userpage);
5864 
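The ++userpg->lock / barrier() pairs above form a seqcount; the matching
userspace read loop looks roughly like the one documented in
include/uapi/linux/perf_event.h (rdpmc() stands for the arch counter read):

        u32 seq, idx;
        u64 count;

        do {
                seq = pc->lock;
                barrier();
                idx = pc->index;
                count = pc->offset;                     /* kernel-side count */
                if (pc->cap_user_rdpmc && idx)
                        count += rdpmc(idx - 1);        /* add hw delta      */
                barrier();
        } while (pc->lock != seq);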
5865 static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
5866 {
5867         struct perf_event *event = vmf->vma->vm_file->private_data;
5868         struct perf_buffer *rb;
5869         vm_fault_t ret = VM_FAULT_SIGBUS;
5870 
5871         if (vmf->flags & FAULT_FLAG_MKWRITE) {
5872                 if (vmf->pgoff == 0)
5873                         ret = 0;
5874                 return ret;
5875         }
5876 
5877         rcu_read_lock();
5878         rb = rcu_dereference(event->rb);
5879         if (!rb)
5880                 goto unlock;
5881 
5882