~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/kernel/events/core.c

Version: ~ [ linux-5.11 ] ~ [ linux-5.10.17 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.99 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.176 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.221 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.257 ] ~ [ linux-4.8.17 ] ~ [ linux-4.7.10 ] ~ [ linux-4.6.7 ] ~ [ linux-4.5.7 ] ~ [ linux-4.4.257 ] ~ [ linux-4.3.6 ] ~ [ linux-4.2.8 ] ~ [ linux-4.1.52 ] ~ [ linux-4.0.9 ] ~ [ linux-3.19.8 ] ~ [ linux-3.18.140 ] ~ [ linux-3.17.8 ] ~ [ linux-3.16.85 ] ~ [ linux-3.15.10 ] ~ [ linux-3.14.79 ] ~ [ linux-3.13.11 ] ~ [ linux-3.12.74 ] ~ [ linux-3.11.10 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.5 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 // SPDX-License-Identifier: GPL-2.0
  2 /*
  3  * Performance events core code:
  4  *
  5  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
  6  *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
  7  *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
  8  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  9  */
 10 
 11 #include <linux/fs.h>
 12 #include <linux/mm.h>
 13 #include <linux/cpu.h>
 14 #include <linux/smp.h>
 15 #include <linux/idr.h>
 16 #include <linux/file.h>
 17 #include <linux/poll.h>
 18 #include <linux/slab.h>
 19 #include <linux/hash.h>
 20 #include <linux/tick.h>
 21 #include <linux/sysfs.h>
 22 #include <linux/dcache.h>
 23 #include <linux/percpu.h>
 24 #include <linux/ptrace.h>
 25 #include <linux/reboot.h>
 26 #include <linux/vmstat.h>
 27 #include <linux/device.h>
 28 #include <linux/export.h>
 29 #include <linux/vmalloc.h>
 30 #include <linux/hardirq.h>
 31 #include <linux/rculist.h>
 32 #include <linux/uaccess.h>
 33 #include <linux/syscalls.h>
 34 #include <linux/anon_inodes.h>
 35 #include <linux/kernel_stat.h>
 36 #include <linux/cgroup.h>
 37 #include <linux/perf_event.h>
 38 #include <linux/trace_events.h>
 39 #include <linux/hw_breakpoint.h>
 40 #include <linux/mm_types.h>
 41 #include <linux/module.h>
 42 #include <linux/mman.h>
 43 #include <linux/compat.h>
 44 #include <linux/bpf.h>
 45 #include <linux/filter.h>
 46 #include <linux/namei.h>
 47 #include <linux/parser.h>
 48 #include <linux/sched/clock.h>
 49 #include <linux/sched/mm.h>
 50 #include <linux/proc_ns.h>
 51 #include <linux/mount.h>
 52 
 53 #include "internal.h"
 54 
 55 #include <asm/irq_regs.h>
 56 
 57 typedef int (*remote_function_f)(void *);
 58 
 59 struct remote_function_call {
 60         struct task_struct      *p;
 61         remote_function_f       func;
 62         void                    *info;
 63         int                     ret;
 64 };
 65 
 66 static void remote_function(void *data)
 67 {
 68         struct remote_function_call *tfc = data;
 69         struct task_struct *p = tfc->p;
 70 
 71         if (p) {
 72                 /* -EAGAIN */
 73                 if (task_cpu(p) != smp_processor_id())
 74                         return;
 75 
 76                 /*
 77                  * Now that we're on right CPU with IRQs disabled, we can test
 78                  * if we hit the right task without races.
 79                  */
 80 
 81                 tfc->ret = -ESRCH; /* No such (running) process */
 82                 if (p != current)
 83                         return;
 84         }
 85 
 86         tfc->ret = tfc->func(tfc->info);
 87 }
 88 
 89 /**
 90  * task_function_call - call a function on the cpu on which a task runs
 91  * @p:          the task to evaluate
 92  * @func:       the function to be called
 93  * @info:       the function call argument
 94  *
 95  * Calls the function @func when the task is currently running. This might
 96  * be on the current CPU, which just calls the function directly
 97  *
 98  * returns: @func return value, or
 99  *          -ESRCH  - when the process isn't running
100  *          -EAGAIN - when the process moved away
101  */
102 static int
103 task_function_call(struct task_struct *p, remote_function_f func, void *info)
104 {
105         struct remote_function_call data = {
106                 .p      = p,
107                 .func   = func,
108                 .info   = info,
109                 .ret    = -EAGAIN,
110         };
111         int ret;
112 
113         do {
114                 ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
115                 if (!ret)
116                         ret = data.ret;
117         } while (ret == -EAGAIN);
118 
119         return ret;
120 }
121 
122 /**
123  * cpu_function_call - call a function on the cpu
124  * @func:       the function to be called
125  * @info:       the function call argument
126  *
127  * Calls the function @func on the remote cpu.
128  *
129  * returns: @func return value or -ENXIO when the cpu is offline
130  */
131 static int cpu_function_call(int cpu, remote_function_f func, void *info)
132 {
133         struct remote_function_call data = {
134                 .p      = NULL,
135                 .func   = func,
136                 .info   = info,
137                 .ret    = -ENXIO, /* No such CPU */
138         };
139 
140         smp_call_function_single(cpu, remote_function, &data, 1);
141 
142         return data.ret;
143 }
144 
145 static inline struct perf_cpu_context *
146 __get_cpu_context(struct perf_event_context *ctx)
147 {
148         return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
149 }
150 
151 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
152                           struct perf_event_context *ctx)
153 {
154         raw_spin_lock(&cpuctx->ctx.lock);
155         if (ctx)
156                 raw_spin_lock(&ctx->lock);
157 }
158 
159 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
160                             struct perf_event_context *ctx)
161 {
162         if (ctx)
163                 raw_spin_unlock(&ctx->lock);
164         raw_spin_unlock(&cpuctx->ctx.lock);
165 }
166 
167 #define TASK_TOMBSTONE ((void *)-1L)
168 
169 static bool is_kernel_event(struct perf_event *event)
170 {
171         return READ_ONCE(event->owner) == TASK_TOMBSTONE;
172 }
173 
174 /*
175  * On task ctx scheduling...
176  *
177  * When !ctx->nr_events a task context will not be scheduled. This means
178  * we can disable the scheduler hooks (for performance) without leaving
179  * pending task ctx state.
180  *
181  * This however results in two special cases:
182  *
183  *  - removing the last event from a task ctx; this is relatively straight
184  *    forward and is done in __perf_remove_from_context.
185  *
186  *  - adding the first event to a task ctx; this is tricky because we cannot
187  *    rely on ctx->is_active and therefore cannot use event_function_call().
188  *    See perf_install_in_context().
189  *
190  * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
191  */
192 
193 typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
194                         struct perf_event_context *, void *);
195 
196 struct event_function_struct {
197         struct perf_event *event;
198         event_f func;
199         void *data;
200 };
201 
202 static int event_function(void *info)
203 {
204         struct event_function_struct *efs = info;
205         struct perf_event *event = efs->event;
206         struct perf_event_context *ctx = event->ctx;
207         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
208         struct perf_event_context *task_ctx = cpuctx->task_ctx;
209         int ret = 0;
210 
211         lockdep_assert_irqs_disabled();
212 
213         perf_ctx_lock(cpuctx, task_ctx);
214         /*
215          * Since we do the IPI call without holding ctx->lock things can have
216          * changed, double check we hit the task we set out to hit.
217          */
218         if (ctx->task) {
219                 if (ctx->task != current) {
220                         ret = -ESRCH;
221                         goto unlock;
222                 }
223 
224                 /*
225                  * We only use event_function_call() on established contexts,
226                  * and event_function() is only ever called when active (or
227                  * rather, we'll have bailed in task_function_call() or the
228                  * above ctx->task != current test), therefore we must have
229                  * ctx->is_active here.
230                  */
231                 WARN_ON_ONCE(!ctx->is_active);
232                 /*
233                  * And since we have ctx->is_active, cpuctx->task_ctx must
234                  * match.
235                  */
236                 WARN_ON_ONCE(task_ctx != ctx);
237         } else {
238                 WARN_ON_ONCE(&cpuctx->ctx != ctx);
239         }
240 
241         efs->func(event, cpuctx, ctx, efs->data);
242 unlock:
243         perf_ctx_unlock(cpuctx, task_ctx);
244 
245         return ret;
246 }
247 
248 static void event_function_call(struct perf_event *event, event_f func, void *data)
249 {
250         struct perf_event_context *ctx = event->ctx;
251         struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
252         struct event_function_struct efs = {
253                 .event = event,
254                 .func = func,
255                 .data = data,
256         };
257 
258         if (!event->parent) {
259                 /*
260                  * If this is a !child event, we must hold ctx::mutex to
261                  * stabilize the the event->ctx relation. See
262                  * perf_event_ctx_lock().
263                  */
264                 lockdep_assert_held(&ctx->mutex);
265         }
266 
267         if (!task) {
268                 cpu_function_call(event->cpu, event_function, &efs);
269                 return;
270         }
271 
272         if (task == TASK_TOMBSTONE)
273                 return;
274 
275 again:
276         if (!task_function_call(task, event_function, &efs))
277                 return;
278 
279         raw_spin_lock_irq(&ctx->lock);
280         /*
281          * Reload the task pointer, it might have been changed by
282          * a concurrent perf_event_context_sched_out().
283          */
284         task = ctx->task;
285         if (task == TASK_TOMBSTONE) {
286                 raw_spin_unlock_irq(&ctx->lock);
287                 return;
288         }
289         if (ctx->is_active) {
290                 raw_spin_unlock_irq(&ctx->lock);
291                 goto again;
292         }
293         func(event, NULL, ctx, data);
294         raw_spin_unlock_irq(&ctx->lock);
295 }
296 
297 /*
298  * Similar to event_function_call() + event_function(), but hard assumes IRQs
299  * are already disabled and we're on the right CPU.
300  */
301 static void event_function_local(struct perf_event *event, event_f func, void *data)
302 {
303         struct perf_event_context *ctx = event->ctx;
304         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
305         struct task_struct *task = READ_ONCE(ctx->task);
306         struct perf_event_context *task_ctx = NULL;
307 
308         lockdep_assert_irqs_disabled();
309 
310         if (task) {
311                 if (task == TASK_TOMBSTONE)
312                         return;
313 
314                 task_ctx = ctx;
315         }
316 
317         perf_ctx_lock(cpuctx, task_ctx);
318 
319         task = ctx->task;
320         if (task == TASK_TOMBSTONE)
321                 goto unlock;
322 
323         if (task) {
324                 /*
325                  * We must be either inactive or active and the right task,
326                  * otherwise we're screwed, since we cannot IPI to somewhere
327                  * else.
328                  */
329                 if (ctx->is_active) {
330                         if (WARN_ON_ONCE(task != current))
331                                 goto unlock;
332 
333                         if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
334                                 goto unlock;
335                 }
336         } else {
337                 WARN_ON_ONCE(&cpuctx->ctx != ctx);
338         }
339 
340         func(event, cpuctx, ctx, data);
341 unlock:
342         perf_ctx_unlock(cpuctx, task_ctx);
343 }
344 
345 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
346                        PERF_FLAG_FD_OUTPUT  |\
347                        PERF_FLAG_PID_CGROUP |\
348                        PERF_FLAG_FD_CLOEXEC)
349 
350 /*
351  * branch priv levels that need permission checks
352  */
353 #define PERF_SAMPLE_BRANCH_PERM_PLM \
354         (PERF_SAMPLE_BRANCH_KERNEL |\
355          PERF_SAMPLE_BRANCH_HV)
356 
357 enum event_type_t {
358         EVENT_FLEXIBLE = 0x1,
359         EVENT_PINNED = 0x2,
360         EVENT_TIME = 0x4,
361         /* see ctx_resched() for details */
362         EVENT_CPU = 0x8,
363         EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
364 };
365 
366 /*
367  * perf_sched_events : >0 events exist
368  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
369  */
370 
371 static void perf_sched_delayed(struct work_struct *work);
372 DEFINE_STATIC_KEY_FALSE(perf_sched_events);
373 static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
374 static DEFINE_MUTEX(perf_sched_mutex);
375 static atomic_t perf_sched_count;
376 
377 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
378 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
379 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
380 
381 static atomic_t nr_mmap_events __read_mostly;
382 static atomic_t nr_comm_events __read_mostly;
383 static atomic_t nr_namespaces_events __read_mostly;
384 static atomic_t nr_task_events __read_mostly;
385 static atomic_t nr_freq_events __read_mostly;
386 static atomic_t nr_switch_events __read_mostly;
387 static atomic_t nr_ksymbol_events __read_mostly;
388 static atomic_t nr_bpf_events __read_mostly;
389 
390 static LIST_HEAD(pmus);
391 static DEFINE_MUTEX(pmus_lock);
392 static struct srcu_struct pmus_srcu;
393 static cpumask_var_t perf_online_mask;
394 
395 /*
396  * perf event paranoia level:
397  *  -1 - not paranoid at all
398  *   0 - disallow raw tracepoint access for unpriv
399  *   1 - disallow cpu events for unpriv
400  *   2 - disallow kernel profiling for unpriv
401  */
402 int sysctl_perf_event_paranoid __read_mostly = 2;
403 
404 /* Minimum for 512 kiB + 1 user control page */
405 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
406 
407 /*
408  * max perf event sample rate
409  */
410 #define DEFAULT_MAX_SAMPLE_RATE         100000
411 #define DEFAULT_SAMPLE_PERIOD_NS        (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
412 #define DEFAULT_CPU_TIME_MAX_PERCENT    25
413 
414 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
415 
416 static int max_samples_per_tick __read_mostly   = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
417 static int perf_sample_period_ns __read_mostly  = DEFAULT_SAMPLE_PERIOD_NS;
418 
419 static int perf_sample_allowed_ns __read_mostly =
420         DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
421 
422 static void update_perf_cpu_limits(void)
423 {
424         u64 tmp = perf_sample_period_ns;
425 
426         tmp *= sysctl_perf_cpu_time_max_percent;
427         tmp = div_u64(tmp, 100);
428         if (!tmp)
429                 tmp = 1;
430 
431         WRITE_ONCE(perf_sample_allowed_ns, tmp);
432 }
433 
434 static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
435 
436 int perf_proc_update_handler(struct ctl_table *table, int write,
437                 void __user *buffer, size_t *lenp,
438                 loff_t *ppos)
439 {
440         int ret;
441         int perf_cpu = sysctl_perf_cpu_time_max_percent;
442         /*
443          * If throttling is disabled don't allow the write:
444          */
445         if (write && (perf_cpu == 100 || perf_cpu == 0))
446                 return -EINVAL;
447 
448         ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
449         if (ret || !write)
450                 return ret;
451 
452         max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
453         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
454         update_perf_cpu_limits();
455 
456         return 0;
457 }
458 
459 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
460 
461 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
462                                 void __user *buffer, size_t *lenp,
463                                 loff_t *ppos)
464 {
465         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
466 
467         if (ret || !write)
468                 return ret;
469 
470         if (sysctl_perf_cpu_time_max_percent == 100 ||
471             sysctl_perf_cpu_time_max_percent == 0) {
472                 printk(KERN_WARNING
473                        "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
474                 WRITE_ONCE(perf_sample_allowed_ns, 0);
475         } else {
476                 update_perf_cpu_limits();
477         }
478 
479         return 0;
480 }
481 
482 /*
483  * perf samples are done in some very critical code paths (NMIs).
484  * If they take too much CPU time, the system can lock up and not
485  * get any real work done.  This will drop the sample rate when
486  * we detect that events are taking too long.
487  */
488 #define NR_ACCUMULATED_SAMPLES 128
489 static DEFINE_PER_CPU(u64, running_sample_length);
490 
491 static u64 __report_avg;
492 static u64 __report_allowed;
493 
494 static void perf_duration_warn(struct irq_work *w)
495 {
496         printk_ratelimited(KERN_INFO
497                 "perf: interrupt took too long (%lld > %lld), lowering "
498                 "kernel.perf_event_max_sample_rate to %d\n",
499                 __report_avg, __report_allowed,
500                 sysctl_perf_event_sample_rate);
501 }
502 
503 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
504 
505 void perf_sample_event_took(u64 sample_len_ns)
506 {
507         u64 max_len = READ_ONCE(perf_sample_allowed_ns);
508         u64 running_len;
509         u64 avg_len;
510         u32 max;
511 
512         if (max_len == 0)
513                 return;
514 
515         /* Decay the counter by 1 average sample. */
516         running_len = __this_cpu_read(running_sample_length);
517         running_len -= running_len/NR_ACCUMULATED_SAMPLES;
518         running_len += sample_len_ns;
519         __this_cpu_write(running_sample_length, running_len);
520 
521         /*
522          * Note: this will be biased artifically low until we have
523          * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
524          * from having to maintain a count.
525          */
526         avg_len = running_len/NR_ACCUMULATED_SAMPLES;
527         if (avg_len <= max_len)
528                 return;
529 
530         __report_avg = avg_len;
531         __report_allowed = max_len;
532 
533         /*
534          * Compute a throttle threshold 25% below the current duration.
535          */
536         avg_len += avg_len / 4;
537         max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
538         if (avg_len < max)
539                 max /= (u32)avg_len;
540         else
541                 max = 1;
542 
543         WRITE_ONCE(perf_sample_allowed_ns, avg_len);
544         WRITE_ONCE(max_samples_per_tick, max);
545 
546         sysctl_perf_event_sample_rate = max * HZ;
547         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
548 
549         if (!irq_work_queue(&perf_duration_work)) {
550                 early_printk("perf: interrupt took too long (%lld > %lld), lowering "
551                              "kernel.perf_event_max_sample_rate to %d\n",
552                              __report_avg, __report_allowed,
553                              sysctl_perf_event_sample_rate);
554         }
555 }
556 
557 static atomic64_t perf_event_id;
558 
559 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
560                               enum event_type_t event_type);
561 
562 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
563                              enum event_type_t event_type,
564                              struct task_struct *task);
565 
566 static void update_context_time(struct perf_event_context *ctx);
567 static u64 perf_event_time(struct perf_event *event);
568 
569 void __weak perf_event_print_debug(void)        { }
570 
571 extern __weak const char *perf_pmu_name(void)
572 {
573         return "pmu";
574 }
575 
576 static inline u64 perf_clock(void)
577 {
578         return local_clock();
579 }
580 
581 static inline u64 perf_event_clock(struct perf_event *event)
582 {
583         return event->clock();
584 }
585 
586 /*
587  * State based event timekeeping...
588  *
589  * The basic idea is to use event->state to determine which (if any) time
590  * fields to increment with the current delta. This means we only need to
591  * update timestamps when we change state or when they are explicitly requested
592  * (read).
593  *
594  * Event groups make things a little more complicated, but not terribly so. The
595  * rules for a group are that if the group leader is OFF the entire group is
596  * OFF, irrespecive of what the group member states are. This results in
597  * __perf_effective_state().
598  *
599  * A futher ramification is that when a group leader flips between OFF and
600  * !OFF, we need to update all group member times.
601  *
602  *
603  * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
604  * need to make sure the relevant context time is updated before we try and
605  * update our timestamps.
606  */
607 
608 static __always_inline enum perf_event_state
609 __perf_effective_state(struct perf_event *event)
610 {
611         struct perf_event *leader = event->group_leader;
612 
613         if (leader->state <= PERF_EVENT_STATE_OFF)
614                 return leader->state;
615 
616         return event->state;
617 }
618 
619 static __always_inline void
620 __perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
621 {
622         enum perf_event_state state = __perf_effective_state(event);
623         u64 delta = now - event->tstamp;
624 
625         *enabled = event->total_time_enabled;
626         if (state >= PERF_EVENT_STATE_INACTIVE)
627                 *enabled += delta;
628 
629         *running = event->total_time_running;
630         if (state >= PERF_EVENT_STATE_ACTIVE)
631                 *running += delta;
632 }
633 
634 static void perf_event_update_time(struct perf_event *event)
635 {
636         u64 now = perf_event_time(event);
637 
638         __perf_update_times(event, now, &event->total_time_enabled,
639                                         &event->total_time_running);
640         event->tstamp = now;
641 }
642 
643 static void perf_event_update_sibling_time(struct perf_event *leader)
644 {
645         struct perf_event *sibling;
646 
647         for_each_sibling_event(sibling, leader)
648                 perf_event_update_time(sibling);
649 }
650 
651 static void
652 perf_event_set_state(struct perf_event *event, enum perf_event_state state)
653 {
654         if (event->state == state)
655                 return;
656 
657         perf_event_update_time(event);
658         /*
659          * If a group leader gets enabled/disabled all its siblings
660          * are affected too.
661          */
662         if ((event->state < 0) ^ (state < 0))
663                 perf_event_update_sibling_time(event);
664 
665         WRITE_ONCE(event->state, state);
666 }
667 
668 #ifdef CONFIG_CGROUP_PERF
669 
670 static inline bool
671 perf_cgroup_match(struct perf_event *event)
672 {
673         struct perf_event_context *ctx = event->ctx;
674         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
675 
676         /* @event doesn't care about cgroup */
677         if (!event->cgrp)
678                 return true;
679 
680         /* wants specific cgroup scope but @cpuctx isn't associated with any */
681         if (!cpuctx->cgrp)
682                 return false;
683 
684         /*
685          * Cgroup scoping is recursive.  An event enabled for a cgroup is
686          * also enabled for all its descendant cgroups.  If @cpuctx's
687          * cgroup is a descendant of @event's (the test covers identity
688          * case), it's a match.
689          */
690         return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
691                                     event->cgrp->css.cgroup);
692 }
693 
694 static inline void perf_detach_cgroup(struct perf_event *event)
695 {
696         css_put(&event->cgrp->css);
697         event->cgrp = NULL;
698 }
699 
700 static inline int is_cgroup_event(struct perf_event *event)
701 {
702         return event->cgrp != NULL;
703 }
704 
705 static inline u64 perf_cgroup_event_time(struct perf_event *event)
706 {
707         struct perf_cgroup_info *t;
708 
709         t = per_cpu_ptr(event->cgrp->info, event->cpu);
710         return t->time;
711 }
712 
713 static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
714 {
715         struct perf_cgroup_info *info;
716         u64 now;
717 
718         now = perf_clock();
719 
720         info = this_cpu_ptr(cgrp->info);
721 
722         info->time += now - info->timestamp;
723         info->timestamp = now;
724 }
725 
726 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
727 {
728         struct perf_cgroup *cgrp = cpuctx->cgrp;
729         struct cgroup_subsys_state *css;
730 
731         if (cgrp) {
732                 for (css = &cgrp->css; css; css = css->parent) {
733                         cgrp = container_of(css, struct perf_cgroup, css);
734                         __update_cgrp_time(cgrp);
735                 }
736         }
737 }
738 
739 static inline void update_cgrp_time_from_event(struct perf_event *event)
740 {
741         struct perf_cgroup *cgrp;
742 
743         /*
744          * ensure we access cgroup data only when needed and
745          * when we know the cgroup is pinned (css_get)
746          */
747         if (!is_cgroup_event(event))
748                 return;
749 
750         cgrp = perf_cgroup_from_task(current, event->ctx);
751         /*
752          * Do not update time when cgroup is not active
753          */
754         if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
755                 __update_cgrp_time(event->cgrp);
756 }
757 
758 static inline void
759 perf_cgroup_set_timestamp(struct task_struct *task,
760                           struct perf_event_context *ctx)
761 {
762         struct perf_cgroup *cgrp;
763         struct perf_cgroup_info *info;
764         struct cgroup_subsys_state *css;
765 
766         /*
767          * ctx->lock held by caller
768          * ensure we do not access cgroup data
769          * unless we have the cgroup pinned (css_get)
770          */
771         if (!task || !ctx->nr_cgroups)
772                 return;
773 
774         cgrp = perf_cgroup_from_task(task, ctx);
775 
776         for (css = &cgrp->css; css; css = css->parent) {
777                 cgrp = container_of(css, struct perf_cgroup, css);
778                 info = this_cpu_ptr(cgrp->info);
779                 info->timestamp = ctx->timestamp;
780         }
781 }
782 
783 static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
784 
785 #define PERF_CGROUP_SWOUT       0x1 /* cgroup switch out every event */
786 #define PERF_CGROUP_SWIN        0x2 /* cgroup switch in events based on task */
787 
788 /*
789  * reschedule events based on the cgroup constraint of task.
790  *
791  * mode SWOUT : schedule out everything
792  * mode SWIN : schedule in based on cgroup for next
793  */
794 static void perf_cgroup_switch(struct task_struct *task, int mode)
795 {
796         struct perf_cpu_context *cpuctx;
797         struct list_head *list;
798         unsigned long flags;
799 
800         /*
801          * Disable interrupts and preemption to avoid this CPU's
802          * cgrp_cpuctx_entry to change under us.
803          */
804         local_irq_save(flags);
805 
806         list = this_cpu_ptr(&cgrp_cpuctx_list);
807         list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
808                 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
809 
810                 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
811                 perf_pmu_disable(cpuctx->ctx.pmu);
812 
813                 if (mode & PERF_CGROUP_SWOUT) {
814                         cpu_ctx_sched_out(cpuctx, EVENT_ALL);
815                         /*
816                          * must not be done before ctxswout due
817                          * to event_filter_match() in event_sched_out()
818                          */
819                         cpuctx->cgrp = NULL;
820                 }
821 
822                 if (mode & PERF_CGROUP_SWIN) {
823                         WARN_ON_ONCE(cpuctx->cgrp);
824                         /*
825                          * set cgrp before ctxsw in to allow
826                          * event_filter_match() to not have to pass
827                          * task around
828                          * we pass the cpuctx->ctx to perf_cgroup_from_task()
829                          * because cgorup events are only per-cpu
830                          */
831                         cpuctx->cgrp = perf_cgroup_from_task(task,
832                                                              &cpuctx->ctx);
833                         cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
834                 }
835                 perf_pmu_enable(cpuctx->ctx.pmu);
836                 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
837         }
838 
839         local_irq_restore(flags);
840 }
841 
842 static inline void perf_cgroup_sched_out(struct task_struct *task,
843                                          struct task_struct *next)
844 {
845         struct perf_cgroup *cgrp1;
846         struct perf_cgroup *cgrp2 = NULL;
847 
848         rcu_read_lock();
849         /*
850          * we come here when we know perf_cgroup_events > 0
851          * we do not need to pass the ctx here because we know
852          * we are holding the rcu lock
853          */
854         cgrp1 = perf_cgroup_from_task(task, NULL);
855         cgrp2 = perf_cgroup_from_task(next, NULL);
856 
857         /*
858          * only schedule out current cgroup events if we know
859          * that we are switching to a different cgroup. Otherwise,
860          * do no touch the cgroup events.
861          */
862         if (cgrp1 != cgrp2)
863                 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
864 
865         rcu_read_unlock();
866 }
867 
868 static inline void perf_cgroup_sched_in(struct task_struct *prev,
869                                         struct task_struct *task)
870 {
871         struct perf_cgroup *cgrp1;
872         struct perf_cgroup *cgrp2 = NULL;
873 
874         rcu_read_lock();
875         /*
876          * we come here when we know perf_cgroup_events > 0
877          * we do not need to pass the ctx here because we know
878          * we are holding the rcu lock
879          */
880         cgrp1 = perf_cgroup_from_task(task, NULL);
881         cgrp2 = perf_cgroup_from_task(prev, NULL);
882 
883         /*
884          * only need to schedule in cgroup events if we are changing
885          * cgroup during ctxsw. Cgroup events were not scheduled
886          * out of ctxsw out if that was not the case.
887          */
888         if (cgrp1 != cgrp2)
889                 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
890 
891         rcu_read_unlock();
892 }
893 
894 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
895                                       struct perf_event_attr *attr,
896                                       struct perf_event *group_leader)
897 {
898         struct perf_cgroup *cgrp;
899         struct cgroup_subsys_state *css;
900         struct fd f = fdget(fd);
901         int ret = 0;
902 
903         if (!f.file)
904                 return -EBADF;
905 
906         css = css_tryget_online_from_dir(f.file->f_path.dentry,
907                                          &perf_event_cgrp_subsys);
908         if (IS_ERR(css)) {
909                 ret = PTR_ERR(css);
910                 goto out;
911         }
912 
913         cgrp = container_of(css, struct perf_cgroup, css);
914         event->cgrp = cgrp;
915 
916         /*
917          * all events in a group must monitor
918          * the same cgroup because a task belongs
919          * to only one perf cgroup at a time
920          */
921         if (group_leader && group_leader->cgrp != cgrp) {
922                 perf_detach_cgroup(event);
923                 ret = -EINVAL;
924         }
925 out:
926         fdput(f);
927         return ret;
928 }
929 
930 static inline void
931 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
932 {
933         struct perf_cgroup_info *t;
934         t = per_cpu_ptr(event->cgrp->info, event->cpu);
935         event->shadow_ctx_time = now - t->timestamp;
936 }
937 
938 /*
939  * Update cpuctx->cgrp so that it is set when first cgroup event is added and
940  * cleared when last cgroup event is removed.
941  */
942 static inline void
943 list_update_cgroup_event(struct perf_event *event,
944                          struct perf_event_context *ctx, bool add)
945 {
946         struct perf_cpu_context *cpuctx;
947         struct list_head *cpuctx_entry;
948 
949         if (!is_cgroup_event(event))
950                 return;
951 
952         /*
953          * Because cgroup events are always per-cpu events,
954          * this will always be called from the right CPU.
955          */
956         cpuctx = __get_cpu_context(ctx);
957 
958         /*
959          * Since setting cpuctx->cgrp is conditional on the current @cgrp
960          * matching the event's cgroup, we must do this for every new event,
961          * because if the first would mismatch, the second would not try again
962          * and we would leave cpuctx->cgrp unset.
963          */
964         if (add && !cpuctx->cgrp) {
965                 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
966 
967                 if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
968                         cpuctx->cgrp = cgrp;
969         }
970 
971         if (add && ctx->nr_cgroups++)
972                 return;
973         else if (!add && --ctx->nr_cgroups)
974                 return;
975 
976         /* no cgroup running */
977         if (!add)
978                 cpuctx->cgrp = NULL;
979 
980         cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
981         if (add)
982                 list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
983         else
984                 list_del(cpuctx_entry);
985 }
986 
987 #else /* !CONFIG_CGROUP_PERF */
988 
989 static inline bool
990 perf_cgroup_match(struct perf_event *event)
991 {
992         return true;
993 }
994 
995 static inline void perf_detach_cgroup(struct perf_event *event)
996 {}
997 
998 static inline int is_cgroup_event(struct perf_event *event)
999 {
1000         return 0;
1001 }
1002 
1003 static inline void update_cgrp_time_from_event(struct perf_event *event)
1004 {
1005 }
1006 
1007 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
1008 {
1009 }
1010 
1011 static inline void perf_cgroup_sched_out(struct task_struct *task,
1012                                          struct task_struct *next)
1013 {
1014 }
1015 
1016 static inline void perf_cgroup_sched_in(struct task_struct *prev,
1017                                         struct task_struct *task)
1018 {
1019 }
1020 
1021 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
1022                                       struct perf_event_attr *attr,
1023                                       struct perf_event *group_leader)
1024 {
1025         return -EINVAL;
1026 }
1027 
1028 static inline void
1029 perf_cgroup_set_timestamp(struct task_struct *task,
1030                           struct perf_event_context *ctx)
1031 {
1032 }
1033 
1034 void
1035 perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
1036 {
1037 }
1038 
1039 static inline void
1040 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
1041 {
1042 }
1043 
1044 static inline u64 perf_cgroup_event_time(struct perf_event *event)
1045 {
1046         return 0;
1047 }
1048 
1049 static inline void
1050 list_update_cgroup_event(struct perf_event *event,
1051                          struct perf_event_context *ctx, bool add)
1052 {
1053 }
1054 
1055 #endif
1056 
1057 /*
1058  * set default to be dependent on timer tick just
1059  * like original code
1060  */
1061 #define PERF_CPU_HRTIMER (1000 / HZ)
1062 /*
1063  * function must be called with interrupts disabled
1064  */
1065 static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
1066 {
1067         struct perf_cpu_context *cpuctx;
1068         bool rotations;
1069 
1070         lockdep_assert_irqs_disabled();
1071 
1072         cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
1073         rotations = perf_rotate_context(cpuctx);
1074 
1075         raw_spin_lock(&cpuctx->hrtimer_lock);
1076         if (rotations)
1077                 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
1078         else
1079                 cpuctx->hrtimer_active = 0;
1080         raw_spin_unlock(&cpuctx->hrtimer_lock);
1081 
1082         return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
1083 }
1084 
1085 static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
1086 {
1087         struct hrtimer *timer = &cpuctx->hrtimer;
1088         struct pmu *pmu = cpuctx->ctx.pmu;
1089         u64 interval;
1090 
1091         /* no multiplexing needed for SW PMU */
1092         if (pmu->task_ctx_nr == perf_sw_context)
1093                 return;
1094 
1095         /*
1096          * check default is sane, if not set then force to
1097          * default interval (1/tick)
1098          */
1099         interval = pmu->hrtimer_interval_ms;
1100         if (interval < 1)
1101                 interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
1102 
1103         cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
1104 
1105         raw_spin_lock_init(&cpuctx->hrtimer_lock);
1106         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1107         timer->function = perf_mux_hrtimer_handler;
1108 }
1109 
1110 static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
1111 {
1112         struct hrtimer *timer = &cpuctx->hrtimer;
1113         struct pmu *pmu = cpuctx->ctx.pmu;
1114         unsigned long flags;
1115 
1116         /* not for SW PMU */
1117         if (pmu->task_ctx_nr == perf_sw_context)
1118                 return 0;
1119 
1120         raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
1121         if (!cpuctx->hrtimer_active) {
1122                 cpuctx->hrtimer_active = 1;
1123                 hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1124                 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
1125         }
1126         raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
1127 
1128         return 0;
1129 }
1130 
1131 void perf_pmu_disable(struct pmu *pmu)
1132 {
1133         int *count = this_cpu_ptr(pmu->pmu_disable_count);
1134         if (!(*count)++)
1135                 pmu->pmu_disable(pmu);
1136 }
1137 
1138 void perf_pmu_enable(struct pmu *pmu)
1139 {
1140         int *count = this_cpu_ptr(pmu->pmu_disable_count);
1141         if (!--(*count))
1142                 pmu->pmu_enable(pmu);
1143 }
1144 
1145 static DEFINE_PER_CPU(struct list_head, active_ctx_list);
1146 
1147 /*
1148  * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
1149  * perf_event_task_tick() are fully serialized because they're strictly cpu
1150  * affine and perf_event_ctx{activate,deactivate} are called with IRQs
1151  * disabled, while perf_event_task_tick is called from IRQ context.
1152  */
1153 static void perf_event_ctx_activate(struct perf_event_context *ctx)
1154 {
1155         struct list_head *head = this_cpu_ptr(&active_ctx_list);
1156 
1157         lockdep_assert_irqs_disabled();
1158 
1159         WARN_ON(!list_empty(&ctx->active_ctx_list));
1160 
1161         list_add(&ctx->active_ctx_list, head);
1162 }
1163 
1164 static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
1165 {
1166         lockdep_assert_irqs_disabled();
1167 
1168         WARN_ON(list_empty(&ctx->active_ctx_list));
1169 
1170         list_del_init(&ctx->active_ctx_list);
1171 }
1172 
1173 static void get_ctx(struct perf_event_context *ctx)
1174 {
1175         refcount_inc(&ctx->refcount);
1176 }
1177 
1178 static void free_ctx(struct rcu_head *head)
1179 {
1180         struct perf_event_context *ctx;
1181 
1182         ctx = container_of(head, struct perf_event_context, rcu_head);
1183         kfree(ctx->task_ctx_data);
1184         kfree(ctx);
1185 }
1186 
1187 static void put_ctx(struct perf_event_context *ctx)
1188 {
1189         if (refcount_dec_and_test(&ctx->refcount)) {
1190                 if (ctx->parent_ctx)
1191                         put_ctx(ctx->parent_ctx);
1192                 if (ctx->task && ctx->task != TASK_TOMBSTONE)
1193                         put_task_struct(ctx->task);
1194                 call_rcu(&ctx->rcu_head, free_ctx);
1195         }
1196 }
1197 
1198 /*
1199  * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
1200  * perf_pmu_migrate_context() we need some magic.
1201  *
1202  * Those places that change perf_event::ctx will hold both
1203  * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
1204  *
1205  * Lock ordering is by mutex address. There are two other sites where
1206  * perf_event_context::mutex nests and those are:
1207  *
1208  *  - perf_event_exit_task_context()    [ child , 0 ]
1209  *      perf_event_exit_event()
1210  *        put_event()                   [ parent, 1 ]
1211  *
1212  *  - perf_event_init_context()         [ parent, 0 ]
1213  *      inherit_task_group()
1214  *        inherit_group()
1215  *          inherit_event()
1216  *            perf_event_alloc()
1217  *              perf_init_event()
1218  *                perf_try_init_event() [ child , 1 ]
1219  *
1220  * While it appears there is an obvious deadlock here -- the parent and child
1221  * nesting levels are inverted between the two. This is in fact safe because
1222  * life-time rules separate them. That is an exiting task cannot fork, and a
1223  * spawning task cannot (yet) exit.
1224  *
1225  * But remember that that these are parent<->child context relations, and
1226  * migration does not affect children, therefore these two orderings should not
1227  * interact.
1228  *
1229  * The change in perf_event::ctx does not affect children (as claimed above)
1230  * because the sys_perf_event_open() case will install a new event and break
1231  * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
1232  * concerned with cpuctx and that doesn't have children.
1233  *
1234  * The places that change perf_event::ctx will issue:
1235  *
1236  *   perf_remove_from_context();
1237  *   synchronize_rcu();
1238  *   perf_install_in_context();
1239  *
1240  * to affect the change. The remove_from_context() + synchronize_rcu() should
1241  * quiesce the event, after which we can install it in the new location. This
1242  * means that only external vectors (perf_fops, prctl) can perturb the event
1243  * while in transit. Therefore all such accessors should also acquire
1244  * perf_event_context::mutex to serialize against this.
1245  *
1246  * However; because event->ctx can change while we're waiting to acquire
1247  * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
1248  * function.
1249  *
1250  * Lock order:
1251  *    cred_guard_mutex
1252  *      task_struct::perf_event_mutex
1253  *        perf_event_context::mutex
1254  *          perf_event::child_mutex;
1255  *            perf_event_context::lock
1256  *          perf_event::mmap_mutex
1257  *          mmap_sem
1258  *            perf_addr_filters_head::lock
1259  *
1260  *    cpu_hotplug_lock
1261  *      pmus_lock
1262  *        cpuctx->mutex / perf_event_context::mutex
1263  */
1264 static struct perf_event_context *
1265 perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
1266 {
1267         struct perf_event_context *ctx;
1268 
1269 again:
1270         rcu_read_lock();
1271         ctx = READ_ONCE(event->ctx);
1272         if (!refcount_inc_not_zero(&ctx->refcount)) {
1273                 rcu_read_unlock();
1274                 goto again;
1275         }
1276         rcu_read_unlock();
1277 
1278         mutex_lock_nested(&ctx->mutex, nesting);
1279         if (event->ctx != ctx) {
1280                 mutex_unlock(&ctx->mutex);
1281                 put_ctx(ctx);
1282                 goto again;
1283         }
1284 
1285         return ctx;
1286 }
1287 
1288 static inline struct perf_event_context *
1289 perf_event_ctx_lock(struct perf_event *event)
1290 {
1291         return perf_event_ctx_lock_nested(event, 0);
1292 }
1293 
1294 static void perf_event_ctx_unlock(struct perf_event *event,
1295                                   struct perf_event_context *ctx)
1296 {
1297         mutex_unlock(&ctx->mutex);
1298         put_ctx(ctx);
1299 }
1300 
1301 /*
1302  * This must be done under the ctx->lock, such as to serialize against
1303  * context_equiv(), therefore we cannot call put_ctx() since that might end up
1304  * calling scheduler related locks and ctx->lock nests inside those.
1305  */
1306 static __must_check struct perf_event_context *
1307 unclone_ctx(struct perf_event_context *ctx)
1308 {
1309         struct perf_event_context *parent_ctx = ctx->parent_ctx;
1310 
1311         lockdep_assert_held(&ctx->lock);
1312 
1313         if (parent_ctx)
1314                 ctx->parent_ctx = NULL;
1315         ctx->generation++;
1316 
1317         return parent_ctx;
1318 }
1319 
1320 static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
1321                                 enum pid_type type)
1322 {
1323         u32 nr;
1324         /*
1325          * only top level events have the pid namespace they were created in
1326          */
1327         if (event->parent)
1328                 event = event->parent;
1329 
1330         nr = __task_pid_nr_ns(p, type, event->ns);
1331         /* avoid -1 if it is idle thread or runs in another ns */
1332         if (!nr && !pid_alive(p))
1333                 nr = -1;
1334         return nr;
1335 }
1336 
1337 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1338 {
1339         return perf_event_pid_type(event, p, PIDTYPE_TGID);
1340 }
1341 
1342 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1343 {
1344         return perf_event_pid_type(event, p, PIDTYPE_PID);
1345 }
1346 
1347 /*
1348  * If we inherit events we want to return the parent event id
1349  * to userspace.
1350  */
1351 static u64 primary_event_id(struct perf_event *event)
1352 {
1353         u64 id = event->id;
1354 
1355         if (event->parent)
1356                 id = event->parent->id;
1357 
1358         return id;
1359 }
1360 
1361 /*
1362  * Get the perf_event_context for a task and lock it.
1363  *
1364  * This has to cope with with the fact that until it is locked,
1365  * the context could get moved to another task.
1366  */
1367 static struct perf_event_context *
1368 perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1369 {
1370         struct perf_event_context *ctx;
1371 
1372 retry:
1373         /*
1374          * One of the few rules of preemptible RCU is that one cannot do
1375          * rcu_read_unlock() while holding a scheduler (or nested) lock when
1376          * part of the read side critical section was irqs-enabled -- see
1377          * rcu_read_unlock_special().
1378          *
1379          * Since ctx->lock nests under rq->lock we must ensure the entire read
1380          * side critical section has interrupts disabled.
1381          */
1382         local_irq_save(*flags);
1383         rcu_read_lock();
1384         ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1385         if (ctx) {
1386                 /*
1387                  * If this context is a clone of another, it might
1388                  * get swapped for another underneath us by
1389                  * perf_event_task_sched_out, though the
1390                  * rcu_read_lock() protects us from any context
1391                  * getting freed.  Lock the context and check if it
1392                  * got swapped before we could get the lock, and retry
1393                  * if so.  If we locked the right context, then it
1394                  * can't get swapped on us any more.
1395                  */
1396                 raw_spin_lock(&ctx->lock);
1397                 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1398                         raw_spin_unlock(&ctx->lock);
1399                         rcu_read_unlock();
1400                         local_irq_restore(*flags);
1401                         goto retry;
1402                 }
1403 
1404                 if (ctx->task == TASK_TOMBSTONE ||
1405                     !refcount_inc_not_zero(&ctx->refcount)) {
1406                         raw_spin_unlock(&ctx->lock);
1407                         ctx = NULL;
1408                 } else {
1409                         WARN_ON_ONCE(ctx->task != task);
1410                 }
1411         }
1412         rcu_read_unlock();
1413         if (!ctx)
1414                 local_irq_restore(*flags);
1415         return ctx;
1416 }
1417 
1418 /*
1419  * Get the context for a task and increment its pin_count so it
1420  * can't get swapped to another task.  This also increments its
1421  * reference count so that the context can't get freed.
1422  */
1423 static struct perf_event_context *
1424 perf_pin_task_context(struct task_struct *task, int ctxn)
1425 {
1426         struct perf_event_context *ctx;
1427         unsigned long flags;
1428 
1429         ctx = perf_lock_task_context(task, ctxn, &flags);
1430         if (ctx) {
1431                 ++ctx->pin_count;
1432                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1433         }
1434         return ctx;
1435 }
1436 
1437 static void perf_unpin_context(struct perf_event_context *ctx)
1438 {
1439         unsigned long flags;
1440 
1441         raw_spin_lock_irqsave(&ctx->lock, flags);
1442         --ctx->pin_count;
1443         raw_spin_unlock_irqrestore(&ctx->lock, flags);
1444 }
1445 
1446 /*
1447  * Update the record of the current time in a context.
1448  */
1449 static void update_context_time(struct perf_event_context *ctx)
1450 {
1451         u64 now = perf_clock();
1452 
1453         ctx->time += now - ctx->timestamp;
1454         ctx->timestamp = now;
1455 }
1456 
1457 static u64 perf_event_time(struct perf_event *event)
1458 {
1459         struct perf_event_context *ctx = event->ctx;
1460 
1461         if (is_cgroup_event(event))
1462                 return perf_cgroup_event_time(event);
1463 
1464         return ctx ? ctx->time : 0;
1465 }
1466 
1467 static enum event_type_t get_event_type(struct perf_event *event)
1468 {
1469         struct perf_event_context *ctx = event->ctx;
1470         enum event_type_t event_type;
1471 
1472         lockdep_assert_held(&ctx->lock);
1473 
1474         /*
1475          * It's 'group type', really, because if our group leader is
1476          * pinned, so are we.
1477          */
1478         if (event->group_leader != event)
1479                 event = event->group_leader;
1480 
1481         event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
1482         if (!ctx->task)
1483                 event_type |= EVENT_CPU;
1484 
1485         return event_type;
1486 }
1487 
1488 /*
1489  * Helper function to initialize event group nodes.
1490  */
1491 static void init_event_group(struct perf_event *event)
1492 {
1493         RB_CLEAR_NODE(&event->group_node);
1494         event->group_index = 0;
1495 }
1496 
1497 /*
1498  * Extract pinned or flexible groups from the context
1499  * based on event attrs bits.
1500  */
1501 static struct perf_event_groups *
1502 get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
1503 {
1504         if (event->attr.pinned)
1505                 return &ctx->pinned_groups;
1506         else
1507                 return &ctx->flexible_groups;
1508 }
1509 
1510 /*
1511  * Helper function to initializes perf_event_group trees.
1512  */
1513 static void perf_event_groups_init(struct perf_event_groups *groups)
1514 {
1515         groups->tree = RB_ROOT;
1516         groups->index = 0;
1517 }
1518 
1519 /*
1520  * Compare function for event groups;
1521  *
1522  * Implements complex key that first sorts by CPU and then by virtual index
1523  * which provides ordering when rotating groups for the same CPU.
1524  */
1525 static bool
1526 perf_event_groups_less(struct perf_event *left, struct perf_event *right)
1527 {
1528         if (left->cpu < right->cpu)
1529                 return true;
1530         if (left->cpu > right->cpu)
1531                 return false;
1532 
1533         if (left->group_index < right->group_index)
1534                 return true;
1535         if (left->group_index > right->group_index)
1536                 return false;
1537 
1538         return false;
1539 }
1540 
1541 /*
1542  * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
1543  * key (see perf_event_groups_less). This places it last inside the CPU
1544  * subtree.
1545  */
1546 static void
1547 perf_event_groups_insert(struct perf_event_groups *groups,
1548                          struct perf_event *event)
1549 {
1550         struct perf_event *node_event;
1551         struct rb_node *parent;
1552         struct rb_node **node;
1553 
1554         event->group_index = ++groups->index;
1555 
1556         node = &groups->tree.rb_node;
1557         parent = *node;
1558 
1559         while (*node) {
1560                 parent = *node;
1561                 node_event = container_of(*node, struct perf_event, group_node);
1562 
1563                 if (perf_event_groups_less(event, node_event))
1564                         node = &parent->rb_left;
1565                 else
1566                         node = &parent->rb_right;
1567         }
1568 
1569         rb_link_node(&event->group_node, parent, node);
1570         rb_insert_color(&event->group_node, &groups->tree);
1571 }
1572 
1573 /*
1574  * Helper function to insert event into the pinned or flexible groups.
1575  */
1576 static void
1577 add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
1578 {
1579         struct perf_event_groups *groups;
1580 
1581         groups = get_event_groups(event, ctx);
1582         perf_event_groups_insert(groups, event);
1583 }
1584 
1585 /*
1586  * Delete a group from a tree.
1587  */
1588 static void
1589 perf_event_groups_delete(struct perf_event_groups *groups,
1590                          struct perf_event *event)
1591 {
1592         WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
1593                      RB_EMPTY_ROOT(&groups->tree));
1594 
1595         rb_erase(&event->group_node, &groups->tree);
1596         init_event_group(event);
1597 }
1598 
1599 /*
1600  * Helper function to delete event from its groups.
1601  */
1602 static void
1603 del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
1604 {
1605         struct perf_event_groups *groups;
1606 
1607         groups = get_event_groups(event, ctx);
1608         perf_event_groups_delete(groups, event);
1609 }
1610 
1611 /*
1612  * Get the leftmost event in the @cpu subtree.
1613  */
1614 static struct perf_event *
1615 perf_event_groups_first(struct perf_event_groups *groups, int cpu)
1616 {
1617         struct perf_event *node_event = NULL, *match = NULL;
1618         struct rb_node *node = groups->tree.rb_node;
1619 
1620         while (node) {
1621                 node_event = container_of(node, struct perf_event, group_node);
1622 
1623                 if (cpu < node_event->cpu) {
1624                         node = node->rb_left;
1625                 } else if (cpu > node_event->cpu) {
1626                         node = node->rb_right;
1627                 } else {
1628                         match = node_event;
1629                         node = node->rb_left;
1630                 }
1631         }
1632 
1633         return match;
1634 }
1635 
1636 /*
1637  * Like rb_entry_next_safe() for the @cpu subtree.
1638  */
1639 static struct perf_event *
1640 perf_event_groups_next(struct perf_event *event)
1641 {
1642         struct perf_event *next;
1643 
1644         next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
1645         if (next && next->cpu == event->cpu)
1646                 return next;
1647 
1648         return NULL;
1649 }
1650 
1651 /*
1652  * Iterate through the whole groups tree.
1653  */
1654 #define perf_event_groups_for_each(event, groups)                       \
1655         for (event = rb_entry_safe(rb_first(&((groups)->tree)),         \
1656                                 typeof(*event), group_node); event;     \
1657                 event = rb_entry_safe(rb_next(&event->group_node),      \
1658                                 typeof(*event), group_node))
1659 
1660 /*
1661  * Add an event from the lists for its context.
1662  * Must be called with ctx->mutex and ctx->lock held.
1663  */
1664 static void
1665 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1666 {
1667         lockdep_assert_held(&ctx->lock);
1668 
1669         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1670         event->attach_state |= PERF_ATTACH_CONTEXT;
1671 
1672         event->tstamp = perf_event_time(event);
1673 
1674         /*
1675          * If we're a stand alone event or group leader, we go to the context
1676          * list, group events are kept attached to the group so that
1677          * perf_group_detach can, at all times, locate all siblings.
1678          */
1679         if (event->group_leader == event) {
1680                 event->group_caps = event->event_caps;
1681                 add_event_to_groups(event, ctx);
1682         }
1683 
1684         list_update_cgroup_event(event, ctx, true);
1685 
1686         list_add_rcu(&event->event_entry, &ctx->event_list);
1687         ctx->nr_events++;
1688         if (event->attr.inherit_stat)
1689                 ctx->nr_stat++;
1690 
1691         ctx->generation++;
1692 }
1693 
1694 /*
1695  * Initialize event state based on the perf_event_attr::disabled.
1696  */
1697 static inline void perf_event__state_init(struct perf_event *event)
1698 {
1699         event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1700                                               PERF_EVENT_STATE_INACTIVE;
1701 }
1702 
1703 static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1704 {
1705         int entry = sizeof(u64); /* value */
1706         int size = 0;
1707         int nr = 1;
1708 
1709         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1710                 size += sizeof(u64);
1711 
1712         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1713                 size += sizeof(u64);
1714 
1715         if (event->attr.read_format & PERF_FORMAT_ID)
1716                 entry += sizeof(u64);
1717 
1718         if (event->attr.read_format & PERF_FORMAT_GROUP) {
1719                 nr += nr_siblings;
1720                 size += sizeof(u64);
1721         }
1722 
1723         size += entry * nr;
1724         event->read_size = size;
1725 }
1726 
1727 static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1728 {
1729         struct perf_sample_data *data;
1730         u16 size = 0;
1731 
1732         if (sample_type & PERF_SAMPLE_IP)
1733                 size += sizeof(data->ip);
1734 
1735         if (sample_type & PERF_SAMPLE_ADDR)
1736                 size += sizeof(data->addr);
1737 
1738         if (sample_type & PERF_SAMPLE_PERIOD)
1739                 size += sizeof(data->period);
1740 
1741         if (sample_type & PERF_SAMPLE_WEIGHT)
1742                 size += sizeof(data->weight);
1743 
1744         if (sample_type & PERF_SAMPLE_READ)
1745                 size += event->read_size;
1746 
1747         if (sample_type & PERF_SAMPLE_DATA_SRC)
1748                 size += sizeof(data->data_src.val);
1749 
1750         if (sample_type & PERF_SAMPLE_TRANSACTION)
1751                 size += sizeof(data->txn);
1752 
1753         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
1754                 size += sizeof(data->phys_addr);
1755 
1756         event->header_size = size;
1757 }
1758 
1759 /*
1760  * Called at perf_event creation and when events are attached/detached from a
1761  * group.
1762  */
1763 static void perf_event__header_size(struct perf_event *event)
1764 {
1765         __perf_event_read_size(event,
1766                                event->group_leader->nr_siblings);
1767         __perf_event_header_size(event, event->attr.sample_type);
1768 }
1769 
1770 static void perf_event__id_header_size(struct perf_event *event)
1771 {
1772         struct perf_sample_data *data;
1773         u64 sample_type = event->attr.sample_type;
1774         u16 size = 0;
1775 
1776         if (sample_type & PERF_SAMPLE_TID)
1777                 size += sizeof(data->tid_entry);
1778 
1779         if (sample_type & PERF_SAMPLE_TIME)
1780                 size += sizeof(data->time);
1781 
1782         if (sample_type & PERF_SAMPLE_IDENTIFIER)
1783                 size += sizeof(data->id);
1784 
1785         if (sample_type & PERF_SAMPLE_ID)
1786                 size += sizeof(data->id);
1787 
1788         if (sample_type & PERF_SAMPLE_STREAM_ID)
1789                 size += sizeof(data->stream_id);
1790 
1791         if (sample_type & PERF_SAMPLE_CPU)
1792                 size += sizeof(data->cpu_entry);
1793 
1794         event->id_header_size = size;
1795 }
1796 
1797 static bool perf_event_validate_size(struct perf_event *event)
1798 {
1799         /*
1800          * The values computed here will be over-written when we actually
1801          * attach the event.
1802          */
1803         __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1804         __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1805         perf_event__id_header_size(event);
1806 
1807         /*
1808          * Sum the lot; should not exceed the 64k limit we have on records.
1809          * Conservative limit to allow for callchains and other variable fields.
1810          */
1811         if (event->read_size + event->header_size +
1812             event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1813                 return false;
1814 
1815         return true;
1816 }
1817 
1818 static void perf_group_attach(struct perf_event *event)
1819 {
1820         struct perf_event *group_leader = event->group_leader, *pos;
1821 
1822         lockdep_assert_held(&event->ctx->lock);
1823 
1824         /*
1825          * We can have double attach due to group movement in perf_event_open.
1826          */
1827         if (event->attach_state & PERF_ATTACH_GROUP)
1828                 return;
1829 
1830         event->attach_state |= PERF_ATTACH_GROUP;
1831 
1832         if (group_leader == event)
1833                 return;
1834 
1835         WARN_ON_ONCE(group_leader->ctx != event->ctx);
1836 
1837         group_leader->group_caps &= event->event_caps;
1838 
1839         list_add_tail(&event->sibling_list, &group_leader->sibling_list);
1840         group_leader->nr_siblings++;
1841 
1842         perf_event__header_size(group_leader);
1843 
1844         for_each_sibling_event(pos, group_leader)
1845                 perf_event__header_size(pos);
1846 }
1847 
1848 /*
1849  * Remove an event from the lists for its context.
1850  * Must be called with ctx->mutex and ctx->lock held.
1851  */
1852 static void
1853 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1854 {
1855         WARN_ON_ONCE(event->ctx != ctx);
1856         lockdep_assert_held(&ctx->lock);
1857 
1858         /*
1859          * We can have double detach due to exit/hot-unplug + close.
1860          */
1861         if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1862                 return;
1863 
1864         event->attach_state &= ~PERF_ATTACH_CONTEXT;
1865 
1866         list_update_cgroup_event(event, ctx, false);
1867 
1868         ctx->nr_events--;
1869         if (event->attr.inherit_stat)
1870                 ctx->nr_stat--;
1871 
1872         list_del_rcu(&event->event_entry);
1873 
1874         if (event->group_leader == event)
1875                 del_event_from_groups(event, ctx);
1876 
1877         /*
1878          * If event was in error state, then keep it
1879          * that way, otherwise bogus counts will be
1880          * returned on read(). The only way to get out
1881          * of error state is by explicit re-enabling
1882          * of the event
1883          */
1884         if (event->state > PERF_EVENT_STATE_OFF)
1885                 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
1886 
1887         ctx->generation++;
1888 }
1889 
1890 static void perf_group_detach(struct perf_event *event)
1891 {
1892         struct perf_event *sibling, *tmp;
1893         struct perf_event_context *ctx = event->ctx;
1894 
1895         lockdep_assert_held(&ctx->lock);
1896 
1897         /*
1898          * We can have double detach due to exit/hot-unplug + close.
1899          */
1900         if (!(event->attach_state & PERF_ATTACH_GROUP))
1901                 return;
1902 
1903         event->attach_state &= ~PERF_ATTACH_GROUP;
1904 
1905         /*
1906          * If this is a sibling, remove it from its group.
1907          */
1908         if (event->group_leader != event) {
1909                 list_del_init(&event->sibling_list);
1910                 event->group_leader->nr_siblings--;
1911                 goto out;
1912         }
1913 
1914         /*
1915          * If this was a group event with sibling events then
1916          * upgrade the siblings to singleton events by adding them
1917          * to whatever list we are on.
1918          */
1919         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
1920 
1921                 sibling->group_leader = sibling;
1922                 list_del_init(&sibling->sibling_list);
1923 
1924                 /* Inherit group flags from the previous leader */
1925                 sibling->group_caps = event->group_caps;
1926 
1927                 if (!RB_EMPTY_NODE(&event->group_node)) {
1928                         add_event_to_groups(sibling, event->ctx);
1929 
1930                         if (sibling->state == PERF_EVENT_STATE_ACTIVE) {
1931                                 struct list_head *list = sibling->attr.pinned ?
1932                                         &ctx->pinned_active : &ctx->flexible_active;
1933 
1934                                 list_add_tail(&sibling->active_list, list);
1935                         }
1936                 }
1937 
1938                 WARN_ON_ONCE(sibling->ctx != event->ctx);
1939         }
1940 
1941 out:
1942         perf_event__header_size(event->group_leader);
1943 
1944         for_each_sibling_event(tmp, event->group_leader)
1945                 perf_event__header_size(tmp);
1946 }
1947 
1948 static bool is_orphaned_event(struct perf_event *event)
1949 {
1950         return event->state == PERF_EVENT_STATE_DEAD;
1951 }
1952 
1953 static inline int __pmu_filter_match(struct perf_event *event)
1954 {
1955         struct pmu *pmu = event->pmu;
1956         return pmu->filter_match ? pmu->filter_match(event) : 1;
1957 }
1958 
1959 /*
1960  * Check whether we should attempt to schedule an event group based on
1961  * PMU-specific filtering. An event group can consist of HW and SW events,
1962  * potentially with a SW leader, so we must check all the filters, to
1963  * determine whether a group is schedulable:
1964  */
1965 static inline int pmu_filter_match(struct perf_event *event)
1966 {
1967         struct perf_event *sibling;
1968 
1969         if (!__pmu_filter_match(event))
1970                 return 0;
1971 
1972         for_each_sibling_event(sibling, event) {
1973                 if (!__pmu_filter_match(sibling))
1974                         return 0;
1975         }
1976 
1977         return 1;
1978 }
1979 
1980 static inline int
1981 event_filter_match(struct perf_event *event)
1982 {
1983         return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
1984                perf_cgroup_match(event) && pmu_filter_match(event);
1985 }
1986 
1987 static void
1988 event_sched_out(struct perf_event *event,
1989                   struct perf_cpu_context *cpuctx,
1990                   struct perf_event_context *ctx)
1991 {
1992         enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
1993 
1994         WARN_ON_ONCE(event->ctx != ctx);
1995         lockdep_assert_held(&ctx->lock);
1996 
1997         if (event->state != PERF_EVENT_STATE_ACTIVE)
1998                 return;
1999 
2000         /*
2001          * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
2002          * we can schedule events _OUT_ individually through things like
2003          * __perf_remove_from_context().
2004          */
2005         list_del_init(&event->active_list);
2006 
2007         perf_pmu_disable(event->pmu);
2008 
2009         event->pmu->del(event, 0);
2010         event->oncpu = -1;
2011 
2012         if (READ_ONCE(event->pending_disable) >= 0) {
2013                 WRITE_ONCE(event->pending_disable, -1);
2014                 state = PERF_EVENT_STATE_OFF;
2015         }
2016         perf_event_set_state(event, state);
2017 
2018         if (!is_software_event(event))
2019                 cpuctx->active_oncpu--;
2020         if (!--ctx->nr_active)
2021                 perf_event_ctx_deactivate(ctx);
2022         if (event->attr.freq && event->attr.sample_freq)
2023                 ctx->nr_freq--;
2024         if (event->attr.exclusive || !cpuctx->active_oncpu)
2025                 cpuctx->exclusive = 0;
2026 
2027         perf_pmu_enable(event->pmu);
2028 }
2029 
2030 static void
2031 group_sched_out(struct perf_event *group_event,
2032                 struct perf_cpu_context *cpuctx,
2033                 struct perf_event_context *ctx)
2034 {
2035         struct perf_event *event;
2036 
2037         if (group_event->state != PERF_EVENT_STATE_ACTIVE)
2038                 return;
2039 
2040         perf_pmu_disable(ctx->pmu);
2041 
2042         event_sched_out(group_event, cpuctx, ctx);
2043 
2044         /*
2045          * Schedule out siblings (if any):
2046          */
2047         for_each_sibling_event(event, group_event)
2048                 event_sched_out(event, cpuctx, ctx);
2049 
2050         perf_pmu_enable(ctx->pmu);
2051 
2052         if (group_event->attr.exclusive)
2053                 cpuctx->exclusive = 0;
2054 }
2055 
2056 #define DETACH_GROUP    0x01UL
2057 
2058 /*
2059  * Cross CPU call to remove a performance event
2060  *
2061  * We disable the event on the hardware level first. After that we
2062  * remove it from the context list.
2063  */
2064 static void
2065 __perf_remove_from_context(struct perf_event *event,
2066                            struct perf_cpu_context *cpuctx,
2067                            struct perf_event_context *ctx,
2068                            void *info)
2069 {
2070         unsigned long flags = (unsigned long)info;
2071 
2072         if (ctx->is_active & EVENT_TIME) {
2073                 update_context_time(ctx);
2074                 update_cgrp_time_from_cpuctx(cpuctx);
2075         }
2076 
2077         event_sched_out(event, cpuctx, ctx);
2078         if (flags & DETACH_GROUP)
2079                 perf_group_detach(event);
2080         list_del_event(event, ctx);
2081 
2082         if (!ctx->nr_events && ctx->is_active) {
2083                 ctx->is_active = 0;
2084                 if (ctx->task) {
2085                         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2086                         cpuctx->task_ctx = NULL;
2087                 }
2088         }
2089 }
2090 
2091 /*
2092  * Remove the event from a task's (or a CPU's) list of events.
2093  *
2094  * If event->ctx is a cloned context, callers must make sure that
2095  * every task struct that event->ctx->task could possibly point to
2096  * remains valid.  This is OK when called from perf_release since
2097  * that only calls us on the top-level context, which can't be a clone.
2098  * When called from perf_event_exit_task, it's OK because the
2099  * context has been detached from its task.
2100  */
2101 static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
2102 {
2103         struct perf_event_context *ctx = event->ctx;
2104 
2105         lockdep_assert_held(&ctx->mutex);
2106 
2107         event_function_call(event, __perf_remove_from_context, (void *)flags);
2108 
2109         /*
2110          * The above event_function_call() can NO-OP when it hits
2111          * TASK_TOMBSTONE. In that case we must already have been detached
2112          * from the context (by perf_event_exit_event()) but the grouping
2113          * might still be in-tact.
2114          */
2115         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
2116         if ((flags & DETACH_GROUP) &&
2117             (event->attach_state & PERF_ATTACH_GROUP)) {
2118                 /*
2119                  * Since in that case we cannot possibly be scheduled, simply
2120                  * detach now.
2121                  */
2122                 raw_spin_lock_irq(&ctx->lock);
2123                 perf_group_detach(event);
2124                 raw_spin_unlock_irq(&ctx->lock);
2125         }
2126 }
2127 
2128 /*
2129  * Cross CPU call to disable a performance event
2130  */
2131 static void __perf_event_disable(struct perf_event *event,
2132                                  struct perf_cpu_context *cpuctx,
2133                                  struct perf_event_context *ctx,
2134                                  void *info)
2135 {
2136         if (event->state < PERF_EVENT_STATE_INACTIVE)
2137                 return;
2138 
2139         if (ctx->is_active & EVENT_TIME) {
2140                 update_context_time(ctx);
2141                 update_cgrp_time_from_event(event);
2142         }
2143 
2144         if (event == event->group_leader)
2145                 group_sched_out(event, cpuctx, ctx);
2146         else
2147                 event_sched_out(event, cpuctx, ctx);
2148 
2149         perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2150 }
2151 
2152 /*
2153  * Disable an event.
2154  *
2155  * If event->ctx is a cloned context, callers must make sure that
2156  * every task struct that event->ctx->task could possibly point to
2157  * remains valid.  This condition is satisifed when called through
2158  * perf_event_for_each_child or perf_event_for_each because they
2159  * hold the top-level event's child_mutex, so any descendant that
2160  * goes to exit will block in perf_event_exit_event().
2161  *
2162  * When called from perf_pending_event it's OK because event->ctx
2163  * is the current context on this CPU and preemption is disabled,
2164  * hence we can't get into perf_event_task_sched_out for this context.
2165  */
2166 static void _perf_event_disable(struct perf_event *event)
2167 {
2168         struct perf_event_context *ctx = event->ctx;
2169 
2170         raw_spin_lock_irq(&ctx->lock);
2171         if (event->state <= PERF_EVENT_STATE_OFF) {
2172                 raw_spin_unlock_irq(&ctx->lock);
2173                 return;
2174         }
2175         raw_spin_unlock_irq(&ctx->lock);
2176 
2177         event_function_call(event, __perf_event_disable, NULL);
2178 }
2179 
2180 void perf_event_disable_local(struct perf_event *event)
2181 {
2182         event_function_local(event, __perf_event_disable, NULL);
2183 }
2184 
2185 /*
2186  * Strictly speaking kernel users cannot create groups and therefore this
2187  * interface does not need the perf_event_ctx_lock() magic.
2188  */
2189 void perf_event_disable(struct perf_event *event)
2190 {
2191         struct perf_event_context *ctx;
2192 
2193         ctx = perf_event_ctx_lock(event);
2194         _perf_event_disable(event);
2195         perf_event_ctx_unlock(event, ctx);
2196 }
2197 EXPORT_SYMBOL_GPL(perf_event_disable);
2198 
2199 void perf_event_disable_inatomic(struct perf_event *event)
2200 {
2201         WRITE_ONCE(event->pending_disable, smp_processor_id());
2202         /* can fail, see perf_pending_event_disable() */
2203         irq_work_queue(&event->pending);
2204 }
2205 
2206 static void perf_set_shadow_time(struct perf_event *event,
2207                                  struct perf_event_context *ctx)
2208 {
2209         /*
2210          * use the correct time source for the time snapshot
2211          *
2212          * We could get by without this by leveraging the
2213          * fact that to get to this function, the caller
2214          * has most likely already called update_context_time()
2215          * and update_cgrp_time_xx() and thus both timestamp
2216          * are identical (or very close). Given that tstamp is,
2217          * already adjusted for cgroup, we could say that:
2218          *    tstamp - ctx->timestamp
2219          * is equivalent to
2220          *    tstamp - cgrp->timestamp.
2221          *
2222          * Then, in perf_output_read(), the calculation would
2223          * work with no changes because:
2224          * - event is guaranteed scheduled in
2225          * - no scheduled out in between
2226          * - thus the timestamp would be the same
2227          *
2228          * But this is a bit hairy.
2229          *
2230          * So instead, we have an explicit cgroup call to remain
2231          * within the time time source all along. We believe it
2232          * is cleaner and simpler to understand.
2233          */
2234         if (is_cgroup_event(event))
2235                 perf_cgroup_set_shadow_time(event, event->tstamp);
2236         else
2237                 event->shadow_ctx_time = event->tstamp - ctx->timestamp;
2238 }
2239 
2240 #define MAX_INTERRUPTS (~0ULL)
2241 
2242 static void perf_log_throttle(struct perf_event *event, int enable);
2243 static void perf_log_itrace_start(struct perf_event *event);
2244 
2245 static int
2246 event_sched_in(struct perf_event *event,
2247                  struct perf_cpu_context *cpuctx,
2248                  struct perf_event_context *ctx)
2249 {
2250         int ret = 0;
2251 
2252         lockdep_assert_held(&ctx->lock);
2253 
2254         if (event->state <= PERF_EVENT_STATE_OFF)
2255                 return 0;
2256 
2257         WRITE_ONCE(event->oncpu, smp_processor_id());
2258         /*
2259          * Order event::oncpu write to happen before the ACTIVE state is
2260          * visible. This allows perf_event_{stop,read}() to observe the correct
2261          * ->oncpu if it sees ACTIVE.
2262          */
2263         smp_wmb();
2264         perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
2265 
2266         /*
2267          * Unthrottle events, since we scheduled we might have missed several
2268          * ticks already, also for a heavily scheduling task there is little
2269          * guarantee it'll get a tick in a timely manner.
2270          */
2271         if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
2272                 perf_log_throttle(event, 1);
2273                 event->hw.interrupts = 0;
2274         }
2275 
2276         perf_pmu_disable(event->pmu);
2277 
2278         perf_set_shadow_time(event, ctx);
2279 
2280         perf_log_itrace_start(event);
2281 
2282         if (event->pmu->add(event, PERF_EF_START)) {
2283                 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2284                 event->oncpu = -1;
2285                 ret = -EAGAIN;
2286                 goto out;
2287         }
2288 
2289         if (!is_software_event(event))
2290                 cpuctx->active_oncpu++;
2291         if (!ctx->nr_active++)
2292                 perf_event_ctx_activate(ctx);
2293         if (event->attr.freq && event->attr.sample_freq)
2294                 ctx->nr_freq++;
2295 
2296         if (event->attr.exclusive)
2297                 cpuctx->exclusive = 1;
2298 
2299 out:
2300         perf_pmu_enable(event->pmu);
2301 
2302         return ret;
2303 }
2304 
2305 static int
2306 group_sched_in(struct perf_event *group_event,
2307                struct perf_cpu_context *cpuctx,
2308                struct perf_event_context *ctx)
2309 {
2310         struct perf_event *event, *partial_group = NULL;
2311         struct pmu *pmu = ctx->pmu;
2312 
2313         if (group_event->state == PERF_EVENT_STATE_OFF)
2314                 return 0;
2315 
2316         pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2317 
2318         if (event_sched_in(group_event, cpuctx, ctx)) {
2319                 pmu->cancel_txn(pmu);
2320                 perf_mux_hrtimer_restart(cpuctx);
2321                 return -EAGAIN;
2322         }
2323 
2324         /*
2325          * Schedule in siblings as one group (if any):
2326          */
2327         for_each_sibling_event(event, group_event) {
2328                 if (event_sched_in(event, cpuctx, ctx)) {
2329                         partial_group = event;
2330                         goto group_error;
2331                 }
2332         }
2333 
2334         if (!pmu->commit_txn(pmu))
2335                 return 0;
2336 
2337 group_error:
2338         /*
2339          * Groups can be scheduled in as one unit only, so undo any
2340          * partial group before returning:
2341          * The events up to the failed event are scheduled out normally.
2342          */
2343         for_each_sibling_event(event, group_event) {
2344                 if (event == partial_group)
2345                         break;
2346 
2347                 event_sched_out(event, cpuctx, ctx);
2348         }
2349         event_sched_out(group_event, cpuctx, ctx);
2350 
2351         pmu->cancel_txn(pmu);
2352 
2353         perf_mux_hrtimer_restart(cpuctx);
2354 
2355         return -EAGAIN;
2356 }
2357 
2358 /*
2359  * Work out whether we can put this event group on the CPU now.
2360  */
2361 static int group_can_go_on(struct perf_event *event,
2362                            struct perf_cpu_context *cpuctx,
2363                            int can_add_hw)
2364 {
2365         /*
2366          * Groups consisting entirely of software events can always go on.
2367          */
2368         if (event->group_caps & PERF_EV_CAP_SOFTWARE)
2369                 return 1;
2370         /*
2371          * If an exclusive group is already on, no other hardware
2372          * events can go on.
2373          */
2374         if (cpuctx->exclusive)
2375                 return 0;
2376         /*
2377          * If this group is exclusive and there are already
2378          * events on the CPU, it can't go on.
2379          */
2380         if (event->attr.exclusive && cpuctx->active_oncpu)
2381                 return 0;
2382         /*
2383          * Otherwise, try to add it if all previous groups were able
2384          * to go on.
2385          */
2386         return can_add_hw;
2387 }
2388 
2389 static void add_event_to_ctx(struct perf_event *event,
2390                                struct perf_event_context *ctx)
2391 {
2392         list_add_event(event, ctx);
2393         perf_group_attach(event);
2394 }
2395 
2396 static void ctx_sched_out(struct perf_event_context *ctx,
2397                           struct perf_cpu_context *cpuctx,
2398                           enum event_type_t event_type);
2399 static void
2400 ctx_sched_in(struct perf_event_context *ctx,
2401              struct perf_cpu_context *cpuctx,
2402              enum event_type_t event_type,
2403              struct task_struct *task);
2404 
2405 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2406                                struct perf_event_context *ctx,
2407                                enum event_type_t event_type)
2408 {
2409         if (!cpuctx->task_ctx)
2410                 return;
2411 
2412         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2413                 return;
2414 
2415         ctx_sched_out(ctx, cpuctx, event_type);
2416 }
2417 
2418 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2419                                 struct perf_event_context *ctx,
2420                                 struct task_struct *task)
2421 {
2422         cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2423         if (ctx)
2424                 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2425         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2426         if (ctx)
2427                 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2428 }
2429 
2430 /*
2431  * We want to maintain the following priority of scheduling:
2432  *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
2433  *  - task pinned (EVENT_PINNED)
2434  *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
2435  *  - task flexible (EVENT_FLEXIBLE).
2436  *
2437  * In order to avoid unscheduling and scheduling back in everything every
2438  * time an event is added, only do it for the groups of equal priority and
2439  * below.
2440  *
2441  * This can be called after a batch operation on task events, in which case
2442  * event_type is a bit mask of the types of events involved. For CPU events,
2443  * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
2444  */
2445 static void ctx_resched(struct perf_cpu_context *cpuctx,
2446                         struct perf_event_context *task_ctx,
2447                         enum event_type_t event_type)
2448 {
2449         enum event_type_t ctx_event_type;
2450         bool cpu_event = !!(event_type & EVENT_CPU);
2451 
2452         /*
2453          * If pinned groups are involved, flexible groups also need to be
2454          * scheduled out.
2455          */
2456         if (event_type & EVENT_PINNED)
2457                 event_type |= EVENT_FLEXIBLE;
2458 
2459         ctx_event_type = event_type & EVENT_ALL;
2460 
2461         perf_pmu_disable(cpuctx->ctx.pmu);
2462         if (task_ctx)
2463                 task_ctx_sched_out(cpuctx, task_ctx, event_type);
2464 
2465         /*
2466          * Decide which cpu ctx groups to schedule out based on the types
2467          * of events that caused rescheduling:
2468          *  - EVENT_CPU: schedule out corresponding groups;
2469          *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
2470          *  - otherwise, do nothing more.
2471          */
2472         if (cpu_event)
2473                 cpu_ctx_sched_out(cpuctx, ctx_event_type);
2474         else if (ctx_event_type & EVENT_PINNED)
2475                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2476 
2477         perf_event_sched_in(cpuctx, task_ctx, current);
2478         perf_pmu_enable(cpuctx->ctx.pmu);
2479 }
2480 
2481 void perf_pmu_resched(struct pmu *pmu)
2482 {
2483         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2484         struct perf_event_context *task_ctx = cpuctx->task_ctx;
2485 
2486         perf_ctx_lock(cpuctx, task_ctx);
2487         ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
2488         perf_ctx_unlock(cpuctx, task_ctx);
2489 }
2490 
2491 /*
2492  * Cross CPU call to install and enable a performance event
2493  *
2494  * Very similar to remote_function() + event_function() but cannot assume that
2495  * things like ctx->is_active and cpuctx->task_ctx are set.
2496  */
2497 static int  __perf_install_in_context(void *info)
2498 {
2499         struct perf_event *event = info;
2500         struct perf_event_context *ctx = event->ctx;
2501         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2502         struct perf_event_context *task_ctx = cpuctx->task_ctx;
2503         bool reprogram = true;
2504         int ret = 0;
2505 
2506         raw_spin_lock(&cpuctx->ctx.lock);
2507         if (ctx->task) {
2508                 raw_spin_lock(&ctx->lock);
2509                 task_ctx = ctx;
2510 
2511                 reprogram = (ctx->task == current);
2512 
2513                 /*
2514                  * If the task is running, it must be running on this CPU,
2515                  * otherwise we cannot reprogram things.
2516                  *
2517                  * If its not running, we don't care, ctx->lock will
2518                  * serialize against it becoming runnable.
2519                  */
2520                 if (task_curr(ctx->task) && !reprogram) {
2521                         ret = -ESRCH;
2522                         goto unlock;
2523                 }
2524 
2525                 WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2526         } else if (task_ctx) {
2527                 raw_spin_lock(&task_ctx->lock);
2528         }
2529 
2530 #ifdef CONFIG_CGROUP_PERF
2531         if (is_cgroup_event(event)) {
2532                 /*
2533                  * If the current cgroup doesn't match the event's
2534                  * cgroup, we should not try to schedule it.
2535                  */
2536                 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
2537                 reprogram = cgroup_is_descendant(cgrp->css.cgroup,
2538                                         event->cgrp->css.cgroup);
2539         }
2540 #endif
2541 
2542         if (reprogram) {
2543                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2544                 add_event_to_ctx(event, ctx);
2545                 ctx_resched(cpuctx, task_ctx, get_event_type(event));
2546         } else {
2547                 add_event_to_ctx(event, ctx);
2548         }
2549 
2550 unlock:
2551         perf_ctx_unlock(cpuctx, task_ctx);
2552 
2553         return ret;
2554 }
2555 
2556 static bool exclusive_event_installable(struct perf_event *event,
2557                                         struct perf_event_context *ctx);
2558 
2559 /*
2560  * Attach a performance event to a context.
2561  *
2562  * Very similar to event_function_call, see comment there.
2563  */
2564 static void
2565 perf_install_in_context(struct perf_event_context *ctx,
2566                         struct perf_event *event,
2567                         int cpu)
2568 {
2569         struct task_struct *task = READ_ONCE(ctx->task);
2570 
2571         lockdep_assert_held(&ctx->mutex);
2572 
2573         WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
2574 
2575         if (event->cpu != -1)
2576                 event->cpu = cpu;
2577 
2578         /*
2579          * Ensures that if we can observe event->ctx, both the event and ctx
2580          * will be 'complete'. See perf_iterate_sb_cpu().
2581          */
2582         smp_store_release(&event->ctx, ctx);
2583 
2584         if (!task) {
2585                 cpu_function_call(cpu, __perf_install_in_context, event);
2586                 return;
2587         }
2588 
2589         /*
2590          * Should not happen, we validate the ctx is still alive before calling.
2591          */
2592         if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2593                 return;
2594 
2595         /*
2596          * Installing events is tricky because we cannot rely on ctx->is_active
2597          * to be set in case this is the nr_events 0 -> 1 transition.
2598          *
2599          * Instead we use task_curr(), which tells us if the task is running.
2600          * However, since we use task_curr() outside of rq::lock, we can race
2601          * against the actual state. This means the result can be wrong.
2602          *
2603          * If we get a false positive, we retry, this is harmless.
2604          *
2605          * If we get a false negative, things are complicated. If we are after
2606          * perf_event_context_sched_in() ctx::lock will serialize us, and the
2607          * value must be correct. If we're before, it doesn't matter since
2608          * perf_event_context_sched_in() will program the counter.
2609          *
2610          * However, this hinges on the remote context switch having observed
2611          * our task->perf_event_ctxp[] store, such that it will in fact take
2612          * ctx::lock in perf_event_context_sched_in().
2613          *
2614          * We do this by task_function_call(), if the IPI fails to hit the task
2615          * we know any future context switch of task must see the
2616          * perf_event_ctpx[] store.
2617          */
2618 
2619         /*
2620          * This smp_mb() orders the task->perf_event_ctxp[] store with the
2621          * task_cpu() load, such that if the IPI then does not find the task
2622          * running, a future context switch of that task must observe the
2623          * store.
2624          */
2625         smp_mb();
2626 again:
2627         if (!task_function_call(task, __perf_install_in_context, event))
2628                 return;
2629 
2630         raw_spin_lock_irq(&ctx->lock);
2631         task = ctx->task;
2632         if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
2633                 /*
2634                  * Cannot happen because we already checked above (which also
2635                  * cannot happen), and we hold ctx->mutex, which serializes us
2636                  * against perf_event_exit_task_context().
2637                  */
2638                 raw_spin_unlock_irq(&ctx->lock);
2639                 return;
2640         }
2641         /*
2642          * If the task is not running, ctx->lock will avoid it becoming so,
2643          * thus we can safely install the event.
2644          */
2645         if (task_curr(task)) {
2646                 raw_spin_unlock_irq(&ctx->lock);
2647                 goto again;
2648         }
2649         add_event_to_ctx(event, ctx);
2650         raw_spin_unlock_irq(&ctx->lock);
2651 }
2652 
2653 /*
2654  * Cross CPU call to enable a performance event
2655  */
2656 static void __perf_event_enable(struct perf_event *event,
2657                                 struct perf_cpu_context *cpuctx,
2658                                 struct perf_event_context *ctx,
2659                                 void *info)
2660 {
2661         struct perf_event *leader = event->group_leader;
2662         struct perf_event_context *task_ctx;
2663 
2664         if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2665             event->state <= PERF_EVENT_STATE_ERROR)
2666                 return;
2667 
2668         if (ctx->is_active)
2669                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2670 
2671         perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2672 
2673         if (!ctx->is_active)
2674                 return;
2675 
2676         if (!event_filter_match(event)) {
2677                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2678                 return;
2679         }
2680 
2681         /*
2682          * If the event is in a group and isn't the group leader,
2683          * then don't put it on unless the group is on.
2684          */
2685         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
2686                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2687                 return;
2688         }
2689 
2690         task_ctx = cpuctx->task_ctx;
2691         if (ctx->task)
2692                 WARN_ON_ONCE(task_ctx != ctx);
2693 
2694         ctx_resched(cpuctx, task_ctx, get_event_type(event));
2695 }
2696 
2697 /*
2698  * Enable an event.
2699  *
2700  * If event->ctx is a cloned context, callers must make sure that
2701  * every task struct that event->ctx->task could possibly point to
2702  * remains valid.  This condition is satisfied when called through
2703  * perf_event_for_each_child or perf_event_for_each as described
2704  * for perf_event_disable.
2705  */
2706 static void _perf_event_enable(struct perf_event *event)
2707 {
2708         struct perf_event_context *ctx = event->ctx;
2709 
2710         raw_spin_lock_irq(&ctx->lock);
2711         if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2712             event->state <  PERF_EVENT_STATE_ERROR) {
2713                 raw_spin_unlock_irq(&ctx->lock);
2714                 return;
2715         }
2716 
2717         /*
2718          * If the event is in error state, clear that first.
2719          *
2720          * That way, if we see the event in error state below, we know that it
2721          * has gone back into error state, as distinct from the task having
2722          * been scheduled away before the cross-call arrived.
2723          */
2724         if (event->state == PERF_EVENT_STATE_ERROR)
2725                 event->state = PERF_EVENT_STATE_OFF;
2726         raw_spin_unlock_irq(&ctx->lock);
2727 
2728         event_function_call(event, __perf_event_enable, NULL);
2729 }
2730 
2731 /*
2732  * See perf_event_disable();
2733  */
2734 void perf_event_enable(struct perf_event *event)
2735 {
2736         struct perf_event_context *ctx;
2737 
2738         ctx = perf_event_ctx_lock(event);
2739         _perf_event_enable(event);
2740         perf_event_ctx_unlock(event, ctx);
2741 }
2742 EXPORT_SYMBOL_GPL(perf_event_enable);
2743 
2744 struct stop_event_data {
2745         struct perf_event       *event;
2746         unsigned int            restart;
2747 };
2748 
2749 static int __perf_event_stop(void *info)
2750 {
2751         struct stop_event_data *sd = info;
2752         struct perf_event *event = sd->event;
2753 
2754         /* if it's already INACTIVE, do nothing */
2755         if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2756                 return 0;
2757 
2758         /* matches smp_wmb() in event_sched_in() */
2759         smp_rmb();
2760 
2761         /*
2762          * There is a window with interrupts enabled before we get here,
2763          * so we need to check again lest we try to stop another CPU's event.
2764          */
2765         if (READ_ONCE(event->oncpu) != smp_processor_id())
2766                 return -EAGAIN;
2767 
2768         event->pmu->stop(event, PERF_EF_UPDATE);
2769 
2770         /*
2771          * May race with the actual stop (through perf_pmu_output_stop()),
2772          * but it is only used for events with AUX ring buffer, and such
2773          * events will refuse to restart because of rb::aux_mmap_count==0,
2774          * see comments in perf_aux_output_begin().
2775          *
2776          * Since this is happening on an event-local CPU, no trace is lost
2777          * while restarting.
2778          */
2779         if (sd->restart)
2780                 event->pmu->start(event, 0);
2781 
2782         return 0;
2783 }
2784 
2785 static int perf_event_stop(struct perf_event *event, int restart)
2786 {
2787         struct stop_event_data sd = {
2788                 .event          = event,
2789                 .restart        = restart,
2790         };
2791         int ret = 0;
2792 
2793         do {
2794                 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2795                         return 0;
2796 
2797                 /* matches smp_wmb() in event_sched_in() */
2798                 smp_rmb();
2799 
2800                 /*
2801                  * We only want to restart ACTIVE events, so if the event goes
2802                  * inactive here (event->oncpu==-1), there's nothing more to do;
2803                  * fall through with ret==-ENXIO.
2804                  */
2805                 ret = cpu_function_call(READ_ONCE(event->oncpu),
2806                                         __perf_event_stop, &sd);
2807         } while (ret == -EAGAIN);
2808 
2809         return ret;
2810 }
2811 
2812 /*
2813  * In order to contain the amount of racy and tricky in the address filter
2814  * configuration management, it is a two part process:
2815  *
2816  * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
2817  *      we update the addresses of corresponding vmas in
2818  *      event::addr_filter_ranges array and bump the event::addr_filters_gen;
2819  * (p2) when an event is scheduled in (pmu::add), it calls
2820  *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
2821  *      if the generation has changed since the previous call.
2822  *
2823  * If (p1) happens while the event is active, we restart it to force (p2).
2824  *
2825  * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
2826  *     pre-existing mappings, called once when new filters arrive via SET_FILTER
2827  *     ioctl;
2828  * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
2829  *     registered mapping, called for every new mmap(), with mm::mmap_sem down
2830  *     for reading;
2831  * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
2832  *     of exec.
2833  */
2834 void perf_event_addr_filters_sync(struct perf_event *event)
2835 {
2836         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
2837 
2838         if (!has_addr_filter(event))
2839                 return;
2840 
2841         raw_spin_lock(&ifh->lock);
2842         if (event->addr_filters_gen != event->hw.addr_filters_gen) {
2843                 event->pmu->addr_filters_sync(event);
2844                 event->hw.addr_filters_gen = event->addr_filters_gen;
2845         }
2846         raw_spin_unlock(&ifh->lock);
2847 }
2848 EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
2849 
2850 static int _perf_event_refresh(struct perf_event *event, int refresh)
2851 {
2852         /*
2853          * not supported on inherited events
2854          */
2855         if (event->attr.inherit || !is_sampling_event(event))
2856                 return -EINVAL;
2857 
2858         atomic_add(refresh, &event->event_limit);
2859         _perf_event_enable(event);
2860 
2861         return 0;
2862 }
2863 
2864 /*
2865  * See perf_event_disable()
2866  */
2867 int perf_event_refresh(struct perf_event *event, int refresh)
2868 {
2869         struct perf_event_context *ctx;
2870         int ret;
2871 
2872         ctx = perf_event_ctx_lock(event);
2873         ret = _perf_event_refresh(event, refresh);
2874         perf_event_ctx_unlock(event, ctx);
2875 
2876         return ret;
2877 }
2878 EXPORT_SYMBOL_GPL(perf_event_refresh);
2879 
2880 static int perf_event_modify_breakpoint(struct perf_event *bp,
2881                                          struct perf_event_attr *attr)
2882 {
2883         int err;
2884 
2885         _perf_event_disable(bp);
2886 
2887         err = modify_user_hw_breakpoint_check(bp, attr, true);
2888 
2889         if (!bp->attr.disabled)
2890                 _perf_event_enable(bp);
2891 
2892         return err;
2893 }
2894 
2895 static int perf_event_modify_attr(struct perf_event *event,
2896                                   struct perf_event_attr *attr)
2897 {
2898         if (event->attr.type != attr->type)
2899                 return -EINVAL;
2900 
2901         switch (event->attr.type) {
2902         case PERF_TYPE_BREAKPOINT:
2903                 return perf_event_modify_breakpoint(event, attr);
2904         default:
2905                 /* Place holder for future additions. */
2906                 return -EOPNOTSUPP;
2907         }
2908 }
2909 
2910 static void ctx_sched_out(struct perf_event_context *ctx,
2911                           struct perf_cpu_context *cpuctx,
2912                           enum event_type_t event_type)
2913 {
2914         struct perf_event *event, *tmp;
2915         int is_active = ctx->is_active;
2916 
2917         lockdep_assert_held(&ctx->lock);
2918 
2919         if (likely(!ctx->nr_events)) {
2920                 /*
2921                  * See __perf_remove_from_context().
2922                  */
2923                 WARN_ON_ONCE(ctx->is_active);
2924                 if (ctx->task)
2925                         WARN_ON_ONCE(cpuctx->task_ctx);
2926                 return;
2927         }
2928 
2929         ctx->is_active &= ~event_type;
2930         if (!(ctx->is_active & EVENT_ALL))
2931                 ctx->is_active = 0;
2932 
2933         if (ctx->task) {
2934                 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2935                 if (!ctx->is_active)
2936                         cpuctx->task_ctx = NULL;
2937         }
2938 
2939         /*
2940          * Always update time if it was set; not only when it changes.
2941          * Otherwise we can 'forget' to update time for any but the last
2942          * context we sched out. For example:
2943          *
2944          *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
2945          *   ctx_sched_out(.event_type = EVENT_PINNED)
2946          *
2947          * would only update time for the pinned events.
2948          */
2949         if (is_active & EVENT_TIME) {
2950                 /* update (and stop) ctx time */
2951                 update_context_time(ctx);
2952                 update_cgrp_time_from_cpuctx(cpuctx);
2953         }
2954 
2955         is_active ^= ctx->is_active; /* changed bits */
2956 
2957         if (!ctx->nr_active || !(is_active & EVENT_ALL))
2958                 return;
2959 
2960         /*
2961          * If we had been multiplexing, no rotations are necessary, now no events
2962          * are active.
2963          */
2964         ctx->rotate_necessary = 0;
2965 
2966         perf_pmu_disable(ctx->pmu);
2967         if (is_active & EVENT_PINNED) {
2968                 list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
2969                         group_sched_out(event, cpuctx, ctx);
2970         }
2971 
2972         if (is_active & EVENT_FLEXIBLE) {
2973                 list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
2974                         group_sched_out(event, cpuctx, ctx);
2975         }
2976         perf_pmu_enable(ctx->pmu);
2977 }
2978 
2979 /*
2980  * Test whether two contexts are equivalent, i.e. whether they have both been
2981  * cloned from the same version of the same context.
2982  *
2983  * Equivalence is measured using a generation number in the context that is
2984  * incremented on each modification to it; see unclone_ctx(), list_add_event()
2985  * and list_del_event().
2986  */
2987 static int context_equiv(struct perf_event_context *ctx1,
2988                          struct perf_event_context *ctx2)
2989 {
2990         lockdep_assert_held(&ctx1->lock);
2991         lockdep_assert_held(&ctx2->lock);
2992 
2993         /* Pinning disables the swap optimization */
2994         if (ctx1->pin_count || ctx2->pin_count)
2995                 return 0;
2996 
2997         /* If ctx1 is the parent of ctx2 */
2998         if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2999                 return 1;
3000 
3001         /* If ctx2 is the parent of ctx1 */
3002         if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
3003                 return 1;
3004 
3005         /*
3006          * If ctx1 and ctx2 have the same parent; we flatten the parent
3007          * hierarchy, see perf_event_init_context().
3008          */
3009         if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
3010                         ctx1->parent_gen == ctx2->parent_gen)
3011                 return 1;
3012 
3013         /* Unmatched */
3014         return 0;
3015 }
3016 
3017 static void __perf_event_sync_stat(struct perf_event *event,
3018                                      struct perf_event *next_event)
3019 {
3020         u64 value;
3021 
3022         if (!event->attr.inherit_stat)
3023                 return;
3024 
3025         /*
3026          * Update the event value, we cannot use perf_event_read()
3027          * because we're in the middle of a context switch and have IRQs
3028          * disabled, which upsets smp_call_function_single(), however
3029          * we know the event must be on the current CPU, therefore we
3030          * don't need to use it.
3031          */
3032         if (event->state == PERF_EVENT_STATE_ACTIVE)
3033                 event->pmu->read(event);
3034 
3035         perf_event_update_time(event);
3036 
3037         /*
3038          * In order to keep per-task stats reliable we need to flip the event
3039          * values when we flip the contexts.
3040          */
3041         value = local64_read(&next_event->count);
3042         value = local64_xchg(&event->count, value);
3043         local64_set(&next_event->count, value);
3044 
3045         swap(event->total_time_enabled, next_event->total_time_enabled);
3046         swap(event->total_time_running, next_event->total_time_running);
3047 
3048         /*
3049          * Since we swizzled the values, update the user visible data too.
3050          */
3051         perf_event_update_userpage(event);
3052         perf_event_update_userpage(next_event);
3053 }
3054 
3055 static void perf_event_sync_stat(struct perf_event_context *ctx,
3056                                    struct perf_event_context *next_ctx)
3057 {
3058         struct perf_event *event, *next_event;
3059 
3060         if (!ctx->nr_stat)
3061                 return;
3062 
3063         update_context_time(ctx);
3064 
3065         event = list_first_entry(&ctx->event_list,
3066                                    struct perf_event, event_entry);
3067 
3068         next_event = list_first_entry(&next_ctx->event_list,
3069                                         struct perf_event, event_entry);
3070 
3071         while (&event->event_entry != &ctx->event_list &&
3072                &next_event->event_entry != &next_ctx->event_list) {
3073 
3074                 __perf_event_sync_stat(event, next_event);
3075 
3076                 event = list_next_entry(event, event_entry);
3077                 next_event = list_next_entry(next_event, event_entry);
3078         }
3079 }
3080 
3081 static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
3082                                          struct task_struct *next)
3083 {
3084         struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
3085         struct perf_event_context *next_ctx;
3086         struct perf_event_context *parent, *next_parent;
3087         struct perf_cpu_context *cpuctx;
3088         int do_switch = 1;
3089 
3090         if (likely(!ctx))
3091                 return;
3092 
3093         cpuctx = __get_cpu_context(ctx);
3094         if (!cpuctx->task_ctx)
3095                 return;
3096 
3097         rcu_read_lock();
3098         next_ctx = next->perf_event_ctxp[ctxn];
3099         if (!next_ctx)
3100                 goto unlock;
3101 
3102         parent = rcu_dereference(ctx->parent_ctx);
3103         next_parent = rcu_dereference(next_ctx->parent_ctx);
3104 
3105         /* If neither context have a parent context; they cannot be clones. */
3106         if (!parent && !next_parent)
3107                 goto unlock;
3108 
3109         if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
3110                 /*
3111                  * Looks like the two contexts are clones, so we might be
3112                  * able to optimize the context switch.  We lock both
3113                  * contexts and check that they are clones under the
3114                  * lock (including re-checking that neither has been
3115                  * uncloned in the meantime).  It doesn't matter which
3116                  * order we take the locks because no other cpu could
3117                  * be trying to lock both of these tasks.
3118                  */
3119                 raw_spin_lock(&ctx->lock);
3120                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
3121                 if (context_equiv(ctx, next_ctx)) {
3122                         WRITE_ONCE(ctx->task, next);
3123                         WRITE_ONCE(next_ctx->task, task);
3124 
3125                         swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
3126 
3127                         /*
3128                          * RCU_INIT_POINTER here is safe because we've not
3129                          * modified the ctx and the above modification of
3130                          * ctx->task and ctx->task_ctx_data are immaterial
3131                          * since those values are always verified under
3132                          * ctx->lock which we're now holding.
3133                          */
3134                         RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
3135                         RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
3136 
3137                         do_switch = 0;
3138 
3139                         perf_event_sync_stat(ctx, next_ctx);
3140                 }
3141                 raw_spin_unlock(&next_ctx->lock);
3142                 raw_spin_unlock(&ctx->lock);
3143         }
3144 unlock:
3145         rcu_read_unlock();
3146 
3147         if (do_switch) {
3148                 raw_spin_lock(&ctx->lock);
3149                 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
3150                 raw_spin_unlock(&ctx->lock);
3151         }
3152 }
3153 
3154 static DEFINE_PER_CPU(struct list_head, sched_cb_list);
3155 
3156 void perf_sched_cb_dec(struct pmu *pmu)
3157 {
3158         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3159 
3160         this_cpu_dec(perf_sched_cb_usages);
3161 
3162         if (!--cpuctx->sched_cb_usage)
3163                 list_del(&cpuctx->sched_cb_entry);
3164 }
3165 
3166 
3167 void perf_sched_cb_inc(struct pmu *pmu)
3168 {
3169         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3170 
3171         if (!cpuctx->sched_cb_usage++)
3172                 list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
3173 
3174         this_cpu_inc(perf_sched_cb_usages);
3175 }
3176 
3177 /*
3178  * This function provides the context switch callback to the lower code
3179  * layer. It is invoked ONLY when the context switch callback is enabled.
3180  *
3181  * This callback is relevant even to per-cpu events; for example multi event
3182  * PEBS requires this to provide PID/TID information. This requires we flush
3183  * all queued PEBS records before we context switch to a new task.
3184  */
3185 static void perf_pmu_sched_task(struct task_struct *prev,
3186                                 struct task_struct *next,
3187                                 bool sched_in)
3188 {
3189         struct perf_cpu_context *cpuctx;
3190         struct pmu *pmu;
3191 
3192         if (prev == next)
3193                 return;
3194 
3195         list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
3196                 pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
3197 
3198                 if (WARN_ON_ONCE(!pmu->sched_task))
3199                         continue;
3200 
3201                 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3202                 perf_pmu_disable(pmu);
3203 
3204                 pmu->sched_task(cpuctx->task_ctx, sched_in);
3205 
3206                 perf_pmu_enable(pmu);
3207                 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3208         }
3209 }
3210 
3211 static void perf_event_switch(struct task_struct *task,
3212                               struct task_struct *next_prev, bool sched_in);
3213 
3214 #define for_each_task_context_nr(ctxn)                                  \
3215         for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
3216 
3217 /*
3218  * Called from scheduler to remove the events of the current task,
3219  * with interrupts disabled.
3220  *
3221  * We stop each event and update the event value in event->count.
3222  *
3223  * This does not protect us against NMI, but disable()
3224  * sets the disabled bit in the control field of event _before_
3225  * accessing the event control register. If a NMI hits, then it will
3226  * not restart the event.
3227  */
3228 void __perf_event_task_sched_out(struct task_struct *task,
3229                                  struct task_struct *next)
3230 {
3231         int ctxn;
3232 
3233         if (__this_cpu_read(perf_sched_cb_usages))
3234                 perf_pmu_sched_task(task, next, false);
3235 
3236         if (atomic_read(&nr_switch_events))
3237                 perf_event_switch(task, next, false);
3238 
3239         for_each_task_context_nr(ctxn)
3240                 perf_event_context_sched_out(task, ctxn, next);
3241 
3242         /*
3243          * if cgroup events exist on this CPU, then we need
3244          * to check if we have to switch out PMU state.
3245          * cgroup event are system-wide mode only
3246          */
3247         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3248                 perf_cgroup_sched_out(task, next);
3249 }
3250 
3251 /*
3252  * Called with IRQs disabled
3253  */
3254 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
3255                               enum event_type_t event_type)
3256 {
3257         ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
3258 }
3259 
3260 static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
3261                               int (*func)(struct perf_event *, void *), void *data)
3262 {
3263         struct perf_event **evt, *evt1, *evt2;
3264         int ret;
3265 
3266         evt1 = perf_event_groups_first(groups, -1);
3267         evt2 = perf_event_groups_first(groups, cpu);
3268 
3269         while (evt1 || evt2) {
3270                 if (evt1 && evt2) {
3271                         if (evt1->group_index < evt2->group_index)
3272                                 evt = &evt1;
3273                         else
3274                                 evt = &evt2;
3275                 } else if (evt1) {
3276                         evt = &evt1;
3277                 } else {
3278                         evt = &evt2;
3279                 }
3280 
3281                 ret = func(*evt, data);
3282                 if (ret)
3283                         return ret;
3284 
3285                 *evt = perf_event_groups_next(*evt);
3286         }
3287 
3288         return 0;
3289 }
3290 
3291 struct sched_in_data {
3292         struct perf_event_context *ctx;
3293         struct perf_cpu_context *cpuctx;
3294         int can_add_hw;
3295 };
3296 
3297 static int pinned_sched_in(struct perf_event *event, void *data)
3298 {
3299         struct sched_in_data *sid = data;
3300 
3301         if (event->state <= PERF_EVENT_STATE_OFF)
3302                 return 0;
3303 
3304         if (!event_filter_match(event))
3305                 return 0;
3306 
3307         if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
3308                 if (!group_sched_in(event, sid->cpuctx, sid->ctx))
3309                         list_add_tail(&event->active_list, &sid->ctx->pinned_active);
3310         }
3311 
3312         /*
3313          * If this pinned group hasn't been scheduled,
3314          * put it in error state.
3315          */
3316         if (event->state == PERF_EVENT_STATE_INACTIVE)
3317                 perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3318 
3319         return 0;
3320 }
3321 
3322 static int flexible_sched_in(struct perf_event *event, void *data)
3323 {
3324         struct sched_in_data *sid = data;
3325 
3326         if (event->state <= PERF_EVENT_STATE_OFF)
3327                 return 0;
3328 
3329         if (!event_filter_match(event))
3330                 return 0;
3331 
3332         if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
3333                 int ret = group_sched_in(event, sid->cpuctx, sid->ctx);
3334                 if (ret) {
3335                         sid->can_add_hw = 0;
3336                         sid->ctx->rotate_necessary = 1;
3337                         return 0;
3338                 }
3339                 list_add_tail(&event->active_list, &sid->ctx->flexible_active);
3340         }
3341 
3342         return 0;
3343 }
3344 
3345 static void
3346 ctx_pinned_sched_in(struct perf_event_context *ctx,
3347                     struct perf_cpu_context *cpuctx)
3348 {
3349         struct sched_in_data sid = {
3350                 .ctx = ctx,
3351                 .cpuctx = cpuctx,
3352                 .can_add_hw = 1,
3353         };
3354 
3355         visit_groups_merge(&ctx->pinned_groups,
3356                            smp_processor_id(),
3357                            pinned_sched_in, &sid);
3358 }
3359 
3360 static void
3361 ctx_flexible_sched_in(struct perf_event_context *ctx,
3362                       struct perf_cpu_context *cpuctx)
3363 {
3364         struct sched_in_data sid = {
3365                 .ctx = ctx,
3366                 .cpuctx = cpuctx,
3367                 .can_add_hw = 1,
3368         };
3369 
3370         visit_groups_merge(&ctx->flexible_groups,
3371                            smp_processor_id(),
3372                            flexible_sched_in, &sid);
3373 }
3374 
3375 static void
3376 ctx_sched_in(struct perf_event_context *ctx,
3377              struct perf_cpu_context *cpuctx,
3378              enum event_type_t event_type,
3379              struct task_struct *task)
3380 {
3381         int is_active = ctx->is_active;
3382         u64 now;
3383 
3384         lockdep_assert_held(&ctx->lock);
3385 
3386         if (likely(!ctx->nr_events))
3387                 return;
3388 
3389         ctx->is_active |= (event_type | EVENT_TIME);
3390         if (ctx->task) {
3391                 if (!is_active)
3392                         cpuctx->task_ctx = ctx;
3393                 else
3394                         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3395         }
3396 
3397         is_active ^= ctx->is_active; /* changed bits */
3398 
3399         if (is_active & EVENT_TIME) {
3400                 /* start ctx time */
3401                 now = perf_clock();
3402                 ctx->timestamp = now;
3403                 perf_cgroup_set_timestamp(task, ctx);
3404         }
3405 
3406         /*
3407          * First go through the list and put on any pinned groups
3408          * in order to give them the best chance of going on.
3409          */
3410         if (is_active & EVENT_PINNED)
3411                 ctx_pinned_sched_in(ctx, cpuctx);
3412 
3413         /* Then walk through the lower prio flexible groups */
3414         if (is_active & EVENT_FLEXIBLE)
3415                 ctx_flexible_sched_in(ctx, cpuctx);
3416 }
3417 
3418 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
3419                              enum event_type_t event_type,
3420                              struct task_struct *task)
3421 {
3422         struct perf_event_context *ctx = &cpuctx->ctx;
3423 
3424         ctx_sched_in(ctx, cpuctx, event_type, task);
3425 }
3426 
3427 static void perf_event_context_sched_in(struct perf_event_context *ctx,
3428                                         struct task_struct *task)
3429 {
3430         struct perf_cpu_context *cpuctx;
3431 
3432         cpuctx = __get_cpu_context(ctx);
3433         if (cpuctx->task_ctx == ctx)
3434                 return;
3435 
3436         perf_ctx_lock(cpuctx, ctx);
3437         /*
3438          * We must check ctx->nr_events while holding ctx->lock, such
3439          * that we serialize against perf_install_in_context().
3440          */
3441         if (!ctx->nr_events)
3442                 goto unlock;
3443 
3444         perf_pmu_disable(ctx->pmu);
3445         /*
3446          * We want to keep the following priority order:
3447          * cpu pinned (that don't need to move), task pinned,
3448          * cpu flexible, task flexible.
3449          *
3450          * However, if task's ctx is not carrying any pinned
3451          * events, no need to flip the cpuctx's events around.
3452          */
3453         if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
3454                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3455         perf_event_sched_in(cpuctx, ctx, task);
3456         perf_pmu_enable(ctx->pmu);
3457 
3458 unlock:
3459         perf_ctx_unlock(cpuctx, ctx);
3460 }
3461 
3462 /*
3463  * Called from scheduler to add the events of the current task
3464  * with interrupts disabled.
3465  *
3466  * We restore the event value and then enable it.
3467  *
3468  * This does not protect us against NMI, but enable()
3469  * sets the enabled bit in the control field of event _before_
3470  * accessing the event control register. If a NMI hits, then it will
3471  * keep the event running.
3472  */
3473 void __perf_event_task_sched_in(struct task_struct *prev,
3474                                 struct task_struct *task)
3475 {
3476         struct perf_event_context *ctx;
3477         int ctxn;
3478 
3479         /*
3480          * If cgroup events exist on this CPU, then we need to check if we have
3481          * to switch in PMU state; cgroup event are system-wide mode only.
3482          *
3483          * Since cgroup events are CPU events, we must schedule these in before
3484          * we schedule in the task events.
3485          */
3486         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3487                 perf_cgroup_sched_in(prev, task);
3488 
3489         for_each_task_context_nr(ctxn) {
3490                 ctx = task->perf_event_ctxp[ctxn];
3491                 if (likely(!ctx))
3492                         continue;
3493 
3494                 perf_event_context_sched_in(ctx, task);
3495         }
3496 
3497         if (atomic_read(&nr_switch_events))
3498                 perf_event_switch(task, prev, true);
3499 
3500         if (__this_cpu_read(perf_sched_cb_usages))
3501                 perf_pmu_sched_task(prev, task, true);
3502 }
3503 
3504 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
3505 {
3506         u64 frequency = event->attr.sample_freq;
3507         u64 sec = NSEC_PER_SEC;
3508         u64 divisor, dividend;
3509 
3510         int count_fls, nsec_fls, frequency_fls, sec_fls;
3511 
3512         count_fls = fls64(count);
3513         nsec_fls = fls64(nsec);
3514         frequency_fls = fls64(frequency);
3515         sec_fls = 30;
3516 
3517         /*
3518          * We got @count in @nsec, with a target of sample_freq HZ
3519          * the target period becomes:
3520          *
3521          *             @count * 10^9
3522          * period = -------------------
3523          *          @nsec * sample_freq
3524          *
3525          */
3526 
3527         /*
3528          * Reduce accuracy by one bit such that @a and @b converge
3529          * to a similar magnitude.
3530          */
3531 #define REDUCE_FLS(a, b)                \
3532 do {                                    \
3533         if (a##_fls > b##_fls) {        \
3534                 a >>= 1;                \
3535                 a##_fls--;              \
3536         } else {                        \
3537                 b >>= 1;                \
3538                 b##_fls--;              \
3539         }                               \
3540 } while (0)
3541 
3542         /*
3543          * Reduce accuracy until either term fits in a u64, then proceed with
3544          * the other, so that finally we can do a u64/u64 division.
3545          */
3546         while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
3547                 REDUCE_FLS(nsec, frequency);
3548                 REDUCE_FLS(sec, count);
3549         }
3550 
3551         if (count_fls + sec_fls > 64) {
3552                 divisor = nsec * frequency;
3553 
3554                 while (count_fls + sec_fls > 64) {
3555                         REDUCE_FLS(count, sec);
3556                         divisor >>= 1;
3557                 }
3558 
3559                 dividend = count * sec;
3560         } else {
3561                 dividend = count * sec;
3562 
3563                 while (nsec_fls + frequency_fls > 64) {
3564                         REDUCE_FLS(nsec, frequency);
3565                         dividend >>= 1;
3566                 }
3567 
3568                 divisor = nsec * frequency;
3569         }
3570 
3571         if (!divisor)
3572                 return dividend;
3573 
3574         return div64_u64(dividend, divisor);
3575 }
3576 
3577 static DEFINE_PER_CPU(int, perf_throttled_count);
3578 static DEFINE_PER_CPU(u64, perf_throttled_seq);
3579 
3580 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
3581 {
3582         struct hw_perf_event *hwc = &event->hw;
3583         s64 period, sample_period;
3584         s64 delta;
3585 
3586         period = perf_calculate_period(event, nsec, count);
3587 
3588         delta = (s64)(period - hwc->sample_period);
3589         delta = (delta + 7) / 8; /* low pass filter */
3590 
3591         sample_period = hwc->sample_period + delta;
3592 
3593         if (!sample_period)
3594                 sample_period = 1;
3595 
3596         hwc->sample_period = sample_period;
3597 
3598         if (local64_read(&hwc->period_left) > 8*sample_period) {
3599                 if (disable)
3600                         event->pmu->stop(event, PERF_EF_UPDATE);
3601 
3602                 local64_set(&hwc->period_left, 0);
3603 
3604                 if (disable)
3605                         event->pmu->start(event, PERF_EF_RELOAD);
3606         }
3607 }
3608 
3609 /*
3610  * combine freq adjustment with unthrottling to avoid two passes over the
3611  * events. At the same time, make sure, having freq events does not change
3612  * the rate of unthrottling as that would introduce bias.
3613  */
3614 static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
3615                                            int needs_unthr)
3616 {
3617         struct perf_event *event;
3618         struct hw_perf_event *hwc;
3619         u64 now, period = TICK_NSEC;
3620         s64 delta;
3621 
3622         /*
3623          * only need to iterate over all events iff:
3624          * - context have events in frequency mode (needs freq adjust)
3625          * - there are events to unthrottle on this cpu
3626          */
3627         if (!(ctx->nr_freq || needs_unthr))
3628                 return;
3629 
3630         raw_spin_lock(&ctx->lock);
3631         perf_pmu_disable(ctx->pmu);
3632 
3633         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3634                 if (event->state != PERF_EVENT_STATE_ACTIVE)
3635                         continue;
3636 
3637                 if (!event_filter_match(event))
3638                         continue;
3639 
3640                 perf_pmu_disable(event->pmu);
3641 
3642                 hwc = &event->hw;
3643 
3644                 if (hwc->interrupts == MAX_INTERRUPTS) {
3645                         hwc->interrupts = 0;
3646                         perf_log_throttle(event, 1);
3647                         event->pmu->start(event, 0);
3648                 }
3649 
3650                 if (!event->attr.freq || !event->attr.sample_freq)
3651                         goto next;
3652 
3653                 /*
3654                  * stop the event and update event->count
3655                  */
3656                 event->pmu->stop(event, PERF_EF_UPDATE);
3657 
3658                 now = local64_read(&event->count);
3659                 delta = now - hwc->freq_count_stamp;
3660                 hwc->freq_count_stamp = now;
3661 
3662                 /*
3663                  * restart the event
3664                  * reload only if value has changed
3665                  * we have stopped the event so tell that
3666                  * to perf_adjust_period() to avoid stopping it
3667                  * twice.
3668                  */
3669                 if (delta > 0)
3670                         perf_adjust_period(event, period, delta, false);
3671 
3672                 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
3673         next:
3674                 perf_pmu_enable(event->pmu);
3675         }
3676 
3677         perf_pmu_enable(ctx->pmu);
3678         raw_spin_unlock(&ctx->lock);
3679 }
3680 
3681 /*
3682  * Move @event to the tail of the @ctx's elegible events.
3683  */
3684 static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
3685 {
3686         /*
3687          * Rotate the first entry last of non-pinned groups. Rotation might be
3688          * disabled by the inheritance code.
3689          */
3690         if (ctx->rotate_disable)
3691                 return;
3692 
3693         perf_event_groups_delete(&ctx->flexible_groups, event);
3694         perf_event_groups_insert(&ctx->flexible_groups, event);
3695 }
3696 
3697 /* pick an event from the flexible_groups to rotate */
3698 static inline struct perf_event *
3699 ctx_event_to_rotate(struct perf_event_context *ctx)
3700 {
3701         struct perf_event *event;
3702 
3703         /* pick the first active flexible event */
3704         event = list_first_entry_or_null(&ctx->flexible_active,
3705                                          struct perf_event, active_list);
3706 
3707         /* if no active flexible event, pick the first event */
3708         if (!event) {
3709                 event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
3710                                       typeof(*event), group_node);
3711         }
3712 
3713         return event;
3714 }
3715 
3716 static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
3717 {
3718         struct perf_event *cpu_event = NULL, *task_event = NULL;
3719         struct perf_event_context *task_ctx = NULL;
3720         int cpu_rotate, task_rotate;
3721 
3722         /*
3723          * Since we run this from IRQ context, nobody can install new
3724          * events, thus the event count values are stable.
3725          */
3726 
3727         cpu_rotate = cpuctx->ctx.rotate_necessary;
3728         task_ctx = cpuctx->task_ctx;
3729         task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
3730 
3731         if (!(cpu_rotate || task_rotate))
3732                 return false;
3733 
3734         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3735         perf_pmu_disable(cpuctx->ctx.pmu);
3736 
3737         if (task_rotate)
3738                 task_event = ctx_event_to_rotate(task_ctx);
3739         if (cpu_rotate)
3740                 cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
3741 
3742         /*
3743          * As per the order given at ctx_resched() first 'pop' task flexible
3744          * and then, if needed CPU flexible.
3745          */
3746         if (task_event || (task_ctx && cpu_event))
3747                 ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
3748         if (cpu_event)
3749                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3750 
3751         if (task_event)
3752                 rotate_ctx(task_ctx, task_event);
3753         if (cpu_event)
3754                 rotate_ctx(&cpuctx->ctx, cpu_event);
3755 
3756         perf_event_sched_in(cpuctx, task_ctx, current);
3757 
3758         perf_pmu_enable(cpuctx->ctx.pmu);
3759         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3760 
3761         return true;
3762 }
3763 
3764 void perf_event_task_tick(void)
3765 {
3766         struct list_head *head = this_cpu_ptr(&active_ctx_list);
3767         struct perf_event_context *ctx, *tmp;
3768         int throttled;
3769 
3770         lockdep_assert_irqs_disabled();
3771 
3772         __this_cpu_inc(perf_throttled_seq);
3773         throttled = __this_cpu_xchg(perf_throttled_count, 0);
3774         tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
3775 
3776         list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
3777                 perf_adjust_freq_unthr_context(ctx, throttled);
3778 }
3779 
3780 static int event_enable_on_exec(struct perf_event *event,
3781                                 struct perf_event_context *ctx)
3782 {
3783         if (!event->attr.enable_on_exec)
3784                 return 0;
3785 
3786         event->attr.enable_on_exec = 0;
3787         if (event->state >= PERF_EVENT_STATE_INACTIVE)
3788                 return 0;
3789 
3790         perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
3791 
3792         return 1;
3793 }
3794 
3795 /*
3796  * Enable all of a task's events that have been marked enable-on-exec.
3797  * This expects task == current.
3798  */
3799 static void perf_event_enable_on_exec(int ctxn)
3800 {
3801         struct perf_event_context *ctx, *clone_ctx = NULL;
3802         enum event_type_t event_type = 0;
3803         struct perf_cpu_context *cpuctx;
3804         struct perf_event *event;
3805         unsigned long flags;
3806         int enabled = 0;
3807 
3808         local_irq_save(flags);
3809         ctx = current->perf_event_ctxp[ctxn];
3810         if (!ctx || !ctx->nr_events)
3811                 goto out;
3812 
3813         cpuctx = __get_cpu_context(ctx);
3814         perf_ctx_lock(cpuctx, ctx);
3815         ctx_sched_out(ctx, cpuctx, EVENT_TIME);
3816         list_for_each_entry(event, &ctx->event_list, event_entry) {
3817                 enabled |= event_enable_on_exec(event, ctx);
3818                 event_type |= get_event_type(event);
3819         }
3820 
3821         /*
3822          * Unclone and reschedule this context if we enabled any event.
3823          */
3824         if (enabled) {
3825                 clone_ctx = unclone_ctx(ctx);
3826                 ctx_resched(cpuctx, ctx, event_type);
3827         } else {
3828                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
3829         }
3830         perf_ctx_unlock(cpuctx, ctx);
3831 
3832 out:
3833         local_irq_restore(flags);
3834 
3835         if (clone_ctx)
3836                 put_ctx(clone_ctx);
3837 }
3838 
3839 struct perf_read_data {
3840         struct perf_event *event;
3841         bool group;
3842         int ret;
3843 };
3844 
3845 static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
3846 {
3847         u16 local_pkg, event_pkg;
3848 
3849         if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
3850                 int local_cpu = smp_processor_id();
3851 
3852                 event_pkg = topology_physical_package_id(event_cpu);
3853                 local_pkg = topology_physical_package_id(local_cpu);
3854 
3855                 if (event_pkg == local_pkg)
3856                         return local_cpu;
3857         }
3858 
3859         return event_cpu;
3860 }
3861 
3862 /*
3863  * Cross CPU call to read the hardware event
3864  */
3865 static void __perf_event_read(void *info)
3866 {
3867         struct perf_read_data *data = info;
3868         struct perf_event *sub, *event = data->event;
3869         struct perf_event_context *ctx = event->ctx;
3870         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3871         struct pmu *pmu = event->pmu;
3872 
3873         /*
3874          * If this is a task context, we need to check whether it is
3875          * the current task context of this cpu.  If not it has been
3876          * scheduled out before the smp call arrived.  In that case
3877          * event->count would have been updated to a recent sample
3878          * when the event was scheduled out.
3879          */
3880         if (ctx->task && cpuctx->task_ctx != ctx)
3881                 return;
3882 
3883         raw_spin_lock(&ctx->lock);
3884         if (ctx->is_active & EVENT_TIME) {
3885                 update_context_time(ctx);
3886                 update_cgrp_time_from_event(event);
3887         }
3888 
3889         perf_event_update_time(event);
3890         if (data->group)
3891                 perf_event_update_sibling_time(event);
3892 
3893         if (event->state != PERF_EVENT_STATE_ACTIVE)
3894                 goto unlock;
3895 
3896         if (!data->group) {
3897                 pmu->read(event);
3898                 data->ret = 0;
3899                 goto unlock;
3900         }
3901 
3902         pmu->start_txn(pmu, PERF_PMU_TXN_READ);
3903 
3904         pmu->read(event);
3905 
3906         for_each_sibling_event(sub, event) {
3907                 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
3908                         /*
3909                          * Use sibling's PMU rather than @event's since
3910                          * sibling could be on different (eg: software) PMU.
3911                          */
3912                         sub->pmu->read(sub);
3913                 }
3914         }
3915 
3916         data->ret = pmu->commit_txn(pmu);
3917 
3918 unlock:
3919         raw_spin_unlock(&ctx->lock);
3920 }
3921 
3922 static inline u64 perf_event_count(struct perf_event *event)
3923 {
3924         return local64_read(&event->count) + atomic64_read(&event->child_count);
3925 }
3926 
3927 /*
3928  * NMI-safe method to read a local event, that is an event that
3929  * is:
3930  *   - either for the current task, or for this CPU
3931  *   - does not have inherit set, for inherited task events
3932  *     will not be local and we cannot read them atomically
3933  *   - must not have a pmu::count method
3934  */
3935 int perf_event_read_local(struct perf_event *event, u64 *value,
3936                           u64 *enabled, u64 *running)
3937 {
3938         unsigned long flags;
3939         int ret = 0;
3940 
3941         /*
3942          * Disabling interrupts avoids all counter scheduling (context
3943          * switches, timer based rotation and IPIs).
3944          */
3945         local_irq_save(flags);
3946 
3947         /*
3948          * It must not be an event with inherit set, we cannot read
3949          * all child counters from atomic context.
3950          */
3951         if (event->attr.inherit) {
3952                 ret = -EOPNOTSUPP;
3953                 goto out;
3954         }
3955 
3956         /* If this is a per-task event, it must be for current */
3957         if ((event->attach_state & PERF_ATTACH_TASK) &&
3958             event->hw.target != current) {
3959                 ret = -EINVAL;
3960                 goto out;
3961         }
3962 
3963         /* If this is a per-CPU event, it must be for this CPU */
3964         if (!(event->attach_state & PERF_ATTACH_TASK) &&
3965             event->cpu != smp_processor_id()) {
3966                 ret = -EINVAL;
3967                 goto out;
3968         }
3969 
3970         /* If this is a pinned event it must be running on this CPU */
3971         if (event->attr.pinned && event->oncpu != smp_processor_id()) {
3972                 ret = -EBUSY;
3973                 goto out;
3974         }
3975 
3976         /*
3977          * If the event is currently on this CPU, its either a per-task event,
3978          * or local to this CPU. Furthermore it means its ACTIVE (otherwise
3979          * oncpu == -1).
3980          */
3981         if (event->oncpu == smp_processor_id())
3982                 event->pmu->read(event);
3983 
3984         *value = local64_read(&event->count);
3985         if (enabled || running) {
3986                 u64 now = event->shadow_ctx_time + perf_clock();
3987                 u64 __enabled, __running;
3988 
3989                 __perf_update_times(event, now, &__enabled, &__running);
3990                 if (enabled)
3991                         *enabled = __enabled;
3992                 if (running)
3993                         *running = __running;
3994         }
3995 out:
3996         local_irq_restore(flags);
3997 
3998         return ret;
3999 }
4000 
4001 static int perf_event_read(struct perf_event *event, bool group)
4002 {
4003         enum perf_event_state state = READ_ONCE(event->state);
4004         int event_cpu, ret = 0;
4005 
4006         /*
4007          * If event is enabled and currently active on a CPU, update the
4008          * value in the event structure:
4009          */
4010 again:
4011         if (state == PERF_EVENT_STATE_ACTIVE) {
4012                 struct perf_read_data data;
4013 
4014                 /*
4015                  * Orders the ->state and ->oncpu loads such that if we see
4016                  * ACTIVE we must also see the right ->oncpu.
4017                  *
4018                  * Matches the smp_wmb() from event_sched_in().
4019                  */
4020                 smp_rmb();
4021 
4022                 event_cpu = READ_ONCE(event->oncpu);
4023                 if ((unsigned)event_cpu >= nr_cpu_ids)
4024                         return 0;
4025 
4026                 data = (struct perf_read_data){
4027                         .event = event,
4028                         .group = group,
4029                         .ret = 0,
4030                 };
4031 
4032                 preempt_disable();
4033                 event_cpu = __perf_event_read_cpu(event, event_cpu);
4034 
4035                 /*
4036                  * Purposely ignore the smp_call_function_single() return
4037                  * value.
4038                  *
4039                  * If event_cpu isn't a valid CPU it means the event got
4040                  * scheduled out and that will have updated the event count.
4041                  *
4042                  * Therefore, either way, we'll have an up-to-date event count
4043                  * after this.
4044                  */
4045                 (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
4046                 preempt_enable();
4047                 ret = data.ret;
4048 
4049         } else if (state == PERF_EVENT_STATE_INACTIVE) {
4050                 struct perf_event_context *ctx = event->ctx;
4051                 unsigned long flags;
4052 
4053                 raw_spin_lock_irqsave(&ctx->lock, flags);
4054                 state = event->state;
4055                 if (state != PERF_EVENT_STATE_INACTIVE) {
4056                         raw_spin_unlock_irqrestore(&ctx->lock, flags);
4057                         goto again;
4058                 }
4059 
4060                 /*
4061                  * May read while context is not active (e.g., thread is
4062                  * blocked), in that case we cannot update context time
4063                  */
4064                 if (ctx->is_active & EVENT_TIME) {
4065                         update_context_time(ctx);
4066                         update_cgrp_time_from_event(event);
4067                 }
4068 
4069                 perf_event_update_time(event);
4070                 if (group)
4071                         perf_event_update_sibling_time(event);
4072                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4073         }
4074 
4075         return ret;
4076 }
4077 
4078 /*
4079  * Initialize the perf_event context in a task_struct:
4080  */
4081 static void __perf_event_init_context(struct perf_event_context *ctx)
4082 {
4083         raw_spin_lock_init(&ctx->lock);
4084         mutex_init(&ctx->mutex);
4085         INIT_LIST_HEAD(&ctx->active_ctx_list);
4086         perf_event_groups_init(&ctx->pinned_groups);
4087         perf_event_groups_init(&ctx->flexible_groups);
4088         INIT_LIST_HEAD(&ctx->event_list);
4089         INIT_LIST_HEAD(&ctx->pinned_active);
4090         INIT_LIST_HEAD(&ctx->flexible_active);
4091         refcount_set(&ctx->refcount, 1);
4092 }
4093 
4094 static struct perf_event_context *
4095 alloc_perf_context(struct pmu *pmu, struct task_struct *task)
4096 {
4097         struct perf_event_context *ctx;
4098 
4099         ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4100         if (!ctx)
4101                 return NULL;
4102 
4103         __perf_event_init_context(ctx);
4104         if (task) {
4105                 ctx->task = task;
4106                 get_task_struct(task);
4107         }
4108         ctx->pmu = pmu;
4109 
4110         return ctx;
4111 }
4112 
4113 static struct task_struct *
4114 find_lively_task_by_vpid(pid_t vpid)
4115 {
4116         struct task_struct *task;
4117 
4118         rcu_read_lock();
4119         if (!vpid)
4120                 task = current;
4121         else
4122                 task = find_task_by_vpid(vpid);
4123         if (task)
4124                 get_task_struct(task);
4125         rcu_read_unlock();
4126 
4127         if (!task)
4128                 return ERR_PTR(-ESRCH);
4129 
4130         return task;
4131 }
4132 
4133 /*
4134  * Returns a matching context with refcount and pincount.
4135  */
4136 static struct perf_event_context *
4137 find_get_context(struct pmu *pmu, struct task_struct *task,
4138                 struct perf_event *event)
4139 {
4140         struct perf_event_context *ctx, *clone_ctx = NULL;
4141         struct perf_cpu_context *cpuctx;
4142         void *task_ctx_data = NULL;
4143         unsigned long flags;
4144         int ctxn, err;
4145         int cpu = event->cpu;
4146 
4147         if (!task) {
4148                 /* Must be root to operate on a CPU event: */
4149                 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
4150                         return ERR_PTR(-EACCES);
4151 
4152                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
4153                 ctx = &cpuctx->ctx;
4154                 get_ctx(ctx);
4155                 ++ctx->pin_count;
4156 
4157                 return ctx;
4158         }
4159 
4160         err = -EINVAL;
4161         ctxn = pmu->task_ctx_nr;
4162         if (ctxn < 0)
4163                 goto errout;
4164 
4165         if (event->attach_state & PERF_ATTACH_TASK_DATA) {
4166                 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
4167                 if (!task_ctx_data) {
4168                         err = -ENOMEM;
4169                         goto errout;
4170                 }
4171         }
4172 
4173 retry:
4174         ctx = perf_lock_task_context(task, ctxn, &flags);
4175         if (ctx) {
4176                 clone_ctx = unclone_ctx(ctx);
4177                 ++ctx->pin_count;
4178 
4179                 if (task_ctx_data && !ctx->task_ctx_data) {
4180                         ctx->task_ctx_data = task_ctx_data;
4181                         task_ctx_data = NULL;
4182                 }
4183                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4184 
4185                 if (clone_ctx)
4186                         put_ctx(clone_ctx);
4187         } else {
4188                 ctx = alloc_perf_context(pmu, task);
4189                 err = -ENOMEM;
4190                 if (!ctx)
4191                         goto errout;
4192 
4193                 if (task_ctx_data) {
4194                         ctx->task_ctx_data = task_ctx_data;
4195                         task_ctx_data = NULL;
4196                 }
4197 
4198                 err = 0;
4199                 mutex_lock(&task->perf_event_mutex);
4200                 /*
4201                  * If it has already passed perf_event_exit_task().
4202                  * we must see PF_EXITING, it takes this mutex too.
4203                  */
4204                 if (task->flags & PF_EXITING)
4205                         err = -ESRCH;
4206                 else if (task->perf_event_ctxp[ctxn])
4207                         err = -EAGAIN;
4208                 else {
4209                         get_ctx(ctx);
4210                         ++ctx->pin_count;
4211                         rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
4212                 }
4213                 mutex_unlock(&task->perf_event_mutex);
4214 
4215                 if (unlikely(err)) {
4216                         put_ctx(ctx);
4217 
4218                         if (err == -EAGAIN)
4219                                 goto retry;
4220                         goto errout;
4221                 }
4222         }
4223 
4224         kfree(task_ctx_data);
4225         return ctx;
4226 
4227 errout:
4228         kfree(task_ctx_data);
4229         return ERR_PTR(err);
4230 }
4231 
4232 static void perf_event_free_filter(struct perf_event *event);
4233 static void perf_event_free_bpf_prog(struct perf_event *event);
4234 
4235 static void free_event_rcu(struct rcu_head *head)
4236 {
4237         struct perf_event *event;
4238 
4239         event = container_of(head, struct perf_event, rcu_head);
4240         if (event->ns)
4241                 put_pid_ns(event->ns);
4242         perf_event_free_filter(event);
4243         kfree(event);
4244 }
4245 
4246 static void ring_buffer_attach(struct perf_event *event,
4247                                struct ring_buffer *rb);
4248 
4249 static void detach_sb_event(struct perf_event *event)
4250 {
4251         struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
4252 
4253         raw_spin_lock(&pel->lock);
4254         list_del_rcu(&event->sb_list);
4255         raw_spin_unlock(&pel->lock);
4256 }
4257 
4258 static bool is_sb_event(struct perf_event *event)
4259 {
4260         struct perf_event_attr *attr = &event->attr;
4261 
4262         if (event->parent)
4263                 return false;
4264 
4265         if (event->attach_state & PERF_ATTACH_TASK)
4266                 return false;
4267 
4268         if (attr->mmap || attr->mmap_data || attr->mmap2 ||
4269             attr->comm || attr->comm_exec ||
4270             attr->task || attr->ksymbol ||
4271             attr->context_switch ||
4272             attr->bpf_event)
4273                 return true;
4274         return false;
4275 }
4276 
4277 static void unaccount_pmu_sb_event(struct perf_event *event)
4278 {
4279         if (is_sb_event(event))
4280                 detach_sb_event(event);
4281 }
4282 
4283 static void unaccount_event_cpu(struct perf_event *event, int cpu)
4284 {
4285         if (event->parent)
4286                 return;
4287 
4288         if (is_cgroup_event(event))
4289                 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
4290 }
4291 
4292 #ifdef CONFIG_NO_HZ_FULL
4293 static DEFINE_SPINLOCK(nr_freq_lock);
4294 #endif
4295 
4296 static void unaccount_freq_event_nohz(void)
4297 {
4298 #ifdef CONFIG_NO_HZ_FULL
4299         spin_lock(&nr_freq_lock);
4300         if (atomic_dec_and_test(&nr_freq_events))
4301                 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
4302         spin_unlock(&nr_freq_lock);
4303 #endif
4304 }
4305 
4306 static void unaccount_freq_event(void)
4307 {
4308         if (tick_nohz_full_enabled())
4309                 unaccount_freq_event_nohz();
4310         else
4311                 atomic_dec(&nr_freq_events);
4312 }
4313 
4314 static void unaccount_event(struct perf_event *event)
4315 {
4316         bool dec = false;
4317 
4318         if (event->parent)
4319                 return;
4320 
4321         if (event->attach_state & PERF_ATTACH_TASK)
4322                 dec = true;
4323         if (event->attr.mmap || event->attr.mmap_data)
4324                 atomic_dec(&nr_mmap_events);
4325         if (event->attr.comm)
4326                 atomic_dec(&nr_comm_events);
4327         if (event->attr.namespaces)
4328                 atomic_dec(&nr_namespaces_events);
4329         if (event->attr.task)
4330                 atomic_dec(&nr_task_events);
4331         if (event->attr.freq)
4332                 unaccount_freq_event();
4333         if (event->attr.context_switch) {
4334                 dec = true;
4335                 atomic_dec(&nr_switch_events);
4336         }
4337         if (is_cgroup_event(event))
4338                 dec = true;
4339         if (has_branch_stack(event))
4340                 dec = true;
4341         if (event->attr.ksymbol)
4342                 atomic_dec(&nr_ksymbol_events);
4343         if (event->attr.bpf_event)
4344                 atomic_dec(&nr_bpf_events);
4345 
4346         if (dec) {
4347                 if (!atomic_add_unless(&perf_sched_count, -1, 1))
4348                         schedule_delayed_work(&perf_sched_work, HZ);
4349         }
4350 
4351         unaccount_event_cpu(event, event->cpu);
4352 
4353         unaccount_pmu_sb_event(event);
4354 }
4355 
4356 static void perf_sched_delayed(struct work_struct *work)
4357 {
4358         mutex_lock(&perf_sched_mutex);
4359         if (atomic_dec_and_test(&perf_sched_count))
4360                 static_branch_disable(&perf_sched_events);
4361         mutex_unlock(&perf_sched_mutex);
4362 }
4363 
4364 /*
4365  * The following implement mutual exclusion of events on "exclusive" pmus
4366  * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
4367  * at a time, so we disallow creating events that might conflict, namely:
4368  *
4369  *  1) cpu-wide events in the presence of per-task events,
4370  *  2) per-task events in the presence of cpu-wide events,
4371  *  3) two matching events on the same context.
4372  *
4373  * The former two cases are handled in the allocation path (perf_event_alloc(),
4374  * _free_event()), the latter -- before the first perf_install_in_context().
4375  */
4376 static int exclusive_event_init(struct perf_event *event)
4377 {
4378         struct pmu *pmu = event->pmu;
4379 
4380         if (!is_exclusive_pmu(pmu))
4381                 return 0;
4382 
4383         /*
4384          * Prevent co-existence of per-task and cpu-wide events on the
4385          * same exclusive pmu.
4386          *
4387          * Negative pmu::exclusive_cnt means there are cpu-wide
4388          * events on this "exclusive" pmu, positive means there are
4389          * per-task events.
4390          *
4391          * Since this is called in perf_event_alloc() path, event::ctx
4392          * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
4393          * to mean "per-task event", because unlike other attach states it
4394          * never gets cleared.
4395          */
4396         if (event->attach_state & PERF_ATTACH_TASK) {
4397                 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
4398                         return -EBUSY;
4399         } else {
4400                 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
4401                         return -EBUSY;
4402         }
4403 
4404         return 0;
4405 }
4406 
4407 static void exclusive_event_destroy(struct perf_event *event)
4408 {
4409         struct pmu *pmu = event->pmu;
4410 
4411         if (!is_exclusive_pmu(pmu))
4412                 return;
4413 
4414         /* see comment in exclusive_event_init() */
4415         if (event->attach_state & PERF_ATTACH_TASK)
4416                 atomic_dec(&pmu->exclusive_cnt);
4417         else
4418                 atomic_inc(&pmu->exclusive_cnt);
4419 }
4420 
4421 static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
4422 {
4423         if ((e1->pmu == e2->pmu) &&
4424             (e1->cpu == e2->cpu ||
4425              e1->cpu == -1 ||
4426              e2->cpu == -1))
4427                 return true;
4428         return false;
4429 }
4430 
4431 static bool exclusive_event_installable(struct perf_event *event,
4432                                         struct perf_event_context *ctx)
4433 {
4434         struct perf_event *iter_event;
4435         struct pmu *pmu = event->pmu;
4436 
4437         lockdep_assert_held(&ctx->mutex);
4438 
4439         if (!is_exclusive_pmu(pmu))
4440                 return true;
4441 
4442         list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
4443                 if (exclusive_event_match(iter_event, event))
4444                         return false;
4445         }
4446 
4447         return true;
4448 }
4449 
4450 static void perf_addr_filters_splice(struct perf_event *event,
4451                                        struct list_head *head);
4452 
4453 static void _free_event(struct perf_event *event)
4454 {
4455         irq_work_sync(&event->pending);
4456 
4457         unaccount_event(event);
4458 
4459         if (event->rb) {
4460                 /*
4461                  * Can happen when we close an event with re-directed output.
4462                  *
4463                  * Since we have a 0 refcount, perf_mmap_close() will skip
4464                  * over us; possibly making our ring_buffer_put() the last.
4465                  */
4466                 mutex_lock(&event->mmap_mutex);
4467                 ring_buffer_attach(event, NULL);
4468                 mutex_unlock(&event->mmap_mutex);
4469         }
4470 
4471         if (is_cgroup_event(event))
4472                 perf_detach_cgroup(event);
4473 
4474         if (!event->parent) {
4475                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
4476                         put_callchain_buffers();
4477         }
4478 
4479         perf_event_free_bpf_prog(event);
4480         perf_addr_filters_splice(event, NULL);
4481         kfree(event->addr_filter_ranges);
4482 
4483         if (event->destroy)
4484                 event->destroy(event);
4485 
4486         /*
4487          * Must be after ->destroy(), due to uprobe_perf_close() using
4488          * hw.target.
4489          */
4490         if (event->hw.target)
4491                 put_task_struct(event->hw.target);
4492 
4493         /*
4494          * perf_event_free_task() relies on put_ctx() being 'last', in particular
4495          * all task references must be cleaned up.
4496          */
4497         if (event->ctx)
4498                 put_ctx(event->ctx);
4499 
4500         exclusive_event_destroy(event);
4501         module_put(event->pmu->module);
4502 
4503         call_rcu(&event->rcu_head, free_event_rcu);
4504 }
4505 
4506 /*
4507  * Used to free events which have a known refcount of 1, such as in error paths
4508  * where the event isn't exposed yet and inherited events.
4509  */
4510 static void free_event(struct perf_event *event)
4511 {
4512         if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
4513                                 "unexpected event refcount: %ld; ptr=%p\n",
4514                                 atomic_long_read(&event->refcount), event)) {
4515                 /* leak to avoid use-after-free */
4516                 return;
4517         }
4518 
4519         _free_event(event);
4520 }
4521 
4522 /*
4523  * Remove user event from the owner task.
4524  */
4525 static void perf_remove_from_owner(struct perf_event *event)
4526 {
4527         struct task_struct *owner;
4528 
4529         rcu_read_lock();
4530         /*
4531          * Matches the smp_store_release() in perf_event_exit_task(). If we
4532          * observe !owner it means the list deletion is complete and we can
4533          * indeed free this event, otherwise we need to serialize on
4534          * owner->perf_event_mutex.
4535          */
4536         owner = READ_ONCE(event->owner);
4537         if (owner) {
4538                 /*
4539                  * Since delayed_put_task_struct() also drops the last
4540                  * task reference we can safely take a new reference
4541                  * while holding the rcu_read_lock().
4542                  */
4543                 get_task_struct(owner);
4544         }
4545         rcu_read_unlock();
4546 
4547         if (owner) {
4548                 /*
4549                  * If we're here through perf_event_exit_task() we're already
4550                  * holding ctx->mutex which would be an inversion wrt. the
4551                  * normal lock order.
4552                  *
4553                  * However we can safely take this lock because its the child
4554                  * ctx->mutex.
4555                  */
4556                 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
4557 
4558                 /*
4559                  * We have to re-check the event->owner field, if it is cleared
4560                  * we raced with perf_event_exit_task(), acquiring the mutex
4561                  * ensured they're done, and we can proceed with freeing the
4562                  * event.
4563                  */
4564                 if (event->owner) {
4565                         list_del_init(&event->owner_entry);
4566                         smp_store_release(&event->owner, NULL);
4567                 }
4568                 mutex_unlock(&owner->perf_event_mutex);
4569                 put_task_struct(owner);
4570         }
4571 }
4572 
4573 static void put_event(struct perf_event *event)
4574 {
4575         if (!atomic_long_dec_and_test(&event->refcount))
4576                 return;
4577 
4578         _free_event(event);
4579 }
4580 
4581 /*
4582  * Kill an event dead; while event:refcount will preserve the event
4583  * object, it will not preserve its functionality. Once the last 'user'
4584  * gives up the object, we'll destroy the thing.
4585  */
4586 int perf_event_release_kernel(struct perf_event *event)
4587 {
4588         struct perf_event_context *ctx = event->ctx;
4589         struct perf_event *child, *tmp;
4590         LIST_HEAD(free_list);
4591 
4592         /*
4593          * If we got here through err_file: fput(event_file); we will not have
4594          * attached to a context yet.
4595          */
4596         if (!ctx) {
4597                 WARN_ON_ONCE(event->attach_state &
4598                                 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
4599                 goto no_ctx;
4600         }
4601 
4602         if (!is_kernel_event(event))
4603                 perf_remove_from_owner(event);
4604 
4605         ctx = perf_event_ctx_lock(event);
4606         WARN_ON_ONCE(ctx->parent_ctx);
4607         perf_remove_from_context(event, DETACH_GROUP);
4608 
4609         raw_spin_lock_irq(&ctx->lock);
4610         /*
4611          * Mark this event as STATE_DEAD, there is no external reference to it
4612          * anymore.
4613          *
4614          * Anybody acquiring event->child_mutex after the below loop _must_
4615          * also see this, most importantly inherit_event() which will avoid
4616          * placing more children on the list.
4617          *
4618          * Thus this guarantees that we will in fact observe and kill _ALL_
4619          * child events.
4620          */
4621         event->state = PERF_EVENT_STATE_DEAD;
4622         raw_spin_unlock_irq(&ctx->lock);
4623 
4624         perf_event_ctx_unlock(event, ctx);
4625 
4626 again:
4627         mutex_lock(&event->child_mutex);
4628         list_for_each_entry(child, &event->child_list, child_list) {
4629 
4630                 /*
4631                  * Cannot change, child events are not migrated, see the
4632                  * comment with perf_event_ctx_lock_nested().
4633                  */
4634                 ctx = READ_ONCE(child->ctx);
4635                 /*
4636                  * Since child_mutex nests inside ctx::mutex, we must jump
4637                  * through hoops. We start by grabbing a reference on the ctx.
4638                  *
4639                  * Since the event cannot get freed while we hold the
4640                  * child_mutex, the context must also exist and have a !0
4641                  * reference count.
4642                  */
4643                 get_ctx(ctx);
4644 
4645                 /*
4646                  * Now that we have a ctx ref, we can drop child_mutex, and
4647                  * acquire ctx::mutex without fear of it going away. Then we
4648                  * can re-acquire child_mutex.
4649                  */
4650                 mutex_unlock(&event->child_mutex);
4651                 mutex_lock(&ctx->mutex);
4652                 mutex_lock(&event->child_mutex);
4653 
4654                 /*
4655                  * Now that we hold ctx::mutex and child_mutex, revalidate our
4656                  * state, if child is still the first entry, it didn't get freed
4657                  * and we can continue doing so.
4658                  */
4659                 tmp = list_first_entry_or_null(&event->child_list,
4660                                                struct perf_event, child_list);
4661                 if (tmp == child) {
4662                         perf_remove_from_context(child, DETACH_GROUP);
4663                         list_move(&child->child_list, &free_list);
4664                         /*
4665                          * This matches the refcount bump in inherit_event();
4666                          * this can't be the last reference.
4667                          */
4668                         put_event(event);
4669                 }
4670 
4671                 mutex_unlock(&event->child_mutex);
4672                 mutex_unlock(&ctx->mutex);
4673                 put_ctx(ctx);
4674                 goto again;
4675         }
4676         mutex_unlock(&event->child_mutex);
4677 
4678         list_for_each_entry_safe(child, tmp, &free_list, child_list) {
4679                 void *var = &child->ctx->refcount;
4680 
4681                 list_del(&child->child_list);
4682                 free_event(child);
4683 
4684                 /*
4685                  * Wake any perf_event_free_task() waiting for this event to be
4686                  * freed.
4687                  */
4688                 smp_mb(); /* pairs with wait_var_event() */
4689                 wake_up_var(var);
4690         }
4691 
4692 no_ctx:
4693         put_event(event); /* Must be the 'last' reference */
4694         return 0;
4695 }
4696 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
4697 
4698 /*
4699  * Called when the last reference to the file is gone.
4700  */
4701 static int perf_release(struct inode *inode, struct file *file)
4702 {
4703         perf_event_release_kernel(file->private_data);
4704         return 0;
4705 }
4706 
4707 static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
4708 {
4709         struct perf_event *child;
4710         u64 total = 0;
4711 
4712         *enabled = 0;
4713         *running = 0;
4714 
4715         mutex_lock(&event->child_mutex);
4716 
4717         (void)perf_event_read(event, false);
4718         total += perf_event_count(event);
4719 
4720         *enabled += event->total_time_enabled +
4721                         atomic64_read(&event->child_total_time_enabled);
4722         *running += event->total_time_running +
4723                         atomic64_read(&event->child_total_time_running);
4724 
4725         list_for_each_entry(child, &event->child_list, child_list) {
4726                 (void)perf_event_read(child, false);
4727                 total += perf_event_count(child);
4728                 *enabled += child->total_time_enabled;
4729                 *running += child->total_time_running;
4730         }
4731         mutex_unlock(&event->child_mutex);
4732 
4733         return total;
4734 }
4735 
4736 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
4737 {
4738         struct perf_event_context *ctx;
4739         u64 count;
4740 
4741         ctx = perf_event_ctx_lock(event);
4742         count = __perf_event_read_value(event, enabled, running);
4743         perf_event_ctx_unlock(event, ctx);
4744 
4745         return count;
4746 }
4747 EXPORT_SYMBOL_GPL(perf_event_read_value);
4748 
4749 static int __perf_read_group_add(struct perf_event *leader,
4750                                         u64 read_format, u64 *values)
4751 {
4752         struct perf_event_context *ctx = leader->ctx;
4753         struct perf_event *sub;
4754         unsigned long flags;
4755         int n = 1; /* skip @nr */
4756         int ret;
4757 
4758         ret = perf_event_read(leader, true);
4759         if (ret)
4760                 return ret;
4761 
4762         raw_spin_lock_irqsave(&ctx->lock, flags);
4763 
4764         /*
4765          * Since we co-schedule groups, {enabled,running} times of siblings
4766          * will be identical to those of the leader, so we only publish one
4767          * set.
4768          */
4769         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
4770                 values[n++] += leader->total_time_enabled +
4771                         atomic64_read(&leader->child_total_time_enabled);
4772         }
4773 
4774         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
4775                 values[n++] += leader->total_time_running +
4776                         atomic64_read(&leader->child_total_time_running);
4777         }
4778 
4779         /*
4780          * Write {count,id} tuples for every sibling.
4781          */
4782         values[n++] += perf_event_count(leader);
4783         if (read_format & PERF_FORMAT_ID)
4784                 values[n++] = primary_event_id(leader);
4785 
4786         for_each_sibling_event(sub, leader) {
4787                 values[n++] += perf_event_count(sub);
4788                 if (read_format & PERF_FORMAT_ID)
4789                         values[n++] = primary_event_id(sub);
4790         }
4791 
4792         raw_spin_unlock_irqrestore(&ctx->lock, flags);
4793         return 0;
4794 }
4795 
4796 static int perf_read_group(struct perf_event *event,
4797                                    u64 read_format, char __user *buf)
4798 {
4799         struct perf_event *leader = event->group_leader, *child;
4800         struct perf_event_context *ctx = leader->ctx;
4801         int ret;
4802         u64 *values;
4803 
4804         lockdep_assert_held(&ctx->mutex);
4805 
4806         values = kzalloc(event->read_size, GFP_KERNEL);
4807         if (!values)
4808                 return -ENOMEM;
4809 
4810         values[0] = 1 + leader->nr_siblings;
4811 
4812         /*
4813          * By locking the child_mutex of the leader we effectively
4814          * lock the child list of all siblings.. XXX explain how.
4815          */
4816         mutex_lock(&leader->child_mutex);
4817 
4818         ret = __perf_read_group_add(leader, read_format, values);
4819         if (ret)
4820                 goto unlock;
4821 
4822         list_for_each_entry(child, &leader->child_list, child_list) {
4823                 ret = __perf_read_group_add(child, read_format, values);
4824                 if (ret)
4825                         goto unlock;
4826         }
4827 
4828         mutex_unlock(&leader->child_mutex);
4829 
4830         ret = event->read_size;
4831         if (copy_to_user(buf, values, event->read_size))
4832                 ret = -EFAULT;
4833         goto out;
4834 
4835 unlock:
4836         mutex_unlock(&leader->child_mutex);
4837 out:
4838         kfree(values);
4839         return ret;
4840 }
4841 
4842 static int perf_read_one(struct perf_event *event,
4843                                  u64 read_format, char __user *buf)
4844 {
4845         u64 enabled, running;
4846         u64 values[4];
4847         int n = 0;
4848 
4849         values[n++] = __perf_event_read_value(event, &enabled, &running);
4850         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
4851                 values[n++] = enabled;
4852         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
4853                 values[n++] = running;
4854         if (read_format & PERF_FORMAT_ID)
4855                 values[n++] = primary_event_id(event);
4856 
4857         if (copy_to_user(buf, values, n * sizeof(u64)))
4858                 return -EFAULT;
4859 
4860         return n * sizeof(u64);
4861 }
4862 
4863 static bool is_event_hup(struct perf_event *event)
4864 {
4865         bool no_children;
4866 
4867         if (event->state > PERF_EVENT_STATE_EXIT)
4868                 return false;
4869 
4870         mutex_lock(&event->child_mutex);
4871         no_children = list_empty(&event->child_list);
4872         mutex_unlock(&event->child_mutex);
4873         return no_children;
4874 }
4875 
4876 /*
4877  * Read the performance event - simple non blocking version for now
4878  */
4879 static ssize_t
4880 __perf_read(struct perf_event *event, char __user *buf, size_t count)
4881 {
4882         u64 read_format = event->attr.read_format;
4883         int ret;
4884 
4885         /*
4886          * Return end-of-file for a read on an event that is in
4887          * error state (i.e. because it was pinned but it couldn't be
4888          * scheduled on to the CPU at some point).
4889          */
4890         if (event->state == PERF_EVENT_STATE_ERROR)
4891                 return 0;
4892 
4893         if (count < event->read_size)
4894                 return -ENOSPC;
4895 
4896         WARN_ON_ONCE(event->ctx->parent_ctx);
4897         if (read_format & PERF_FORMAT_GROUP)
4898                 ret = perf_read_group(event, read_format, buf);
4899         else
4900                 ret = perf_read_one(event, read_format, buf);
4901 
4902         return ret;
4903 }
4904 
4905 static ssize_t
4906 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
4907 {
4908         struct perf_event *event = file->private_data;
4909         struct perf_event_context *ctx;
4910         int ret;
4911 
4912         ctx = perf_event_ctx_lock(event);
4913         ret = __perf_read(event, buf, count);
4914         perf_event_ctx_unlock(event, ctx);
4915 
4916         return ret;
4917 }
4918 
4919 static __poll_t perf_poll(struct file *file, poll_table *wait)
4920 {
4921         struct perf_event *event = file->private_data;
4922         struct ring_buffer *rb;
4923         __poll_t events = EPOLLHUP;
4924 
4925         poll_wait(file, &event->waitq, wait);
4926 
4927         if (is_event_hup(event))
4928                 return events;
4929 
4930         /*
4931          * Pin the event->rb by taking event->mmap_mutex; otherwise
4932          * perf_event_set_output() can swizzle our rb and make us miss wakeups.
4933          */
4934         mutex_lock(&event->mmap_mutex);
4935         rb = event->rb;
4936         if (rb)
4937                 events = atomic_xchg(&rb->poll, 0);
4938         mutex_unlock(&event->mmap_mutex);
4939         return events;
4940 }
4941 
4942 static void _perf_event_reset(struct perf_event *event)
4943 {
4944         (void)perf_event_read(event, false);
4945         local64_set(&event->count, 0);
4946         perf_event_update_userpage(event);
4947 }
4948 
4949 /*
4950  * Holding the top-level event's child_mutex means that any
4951  * descendant process that has inherited this event will block
4952  * in perf_event_exit_event() if it goes to exit, thus satisfying the
4953  * task existence requirements of perf_event_enable/disable.
4954  */
4955 static void perf_event_for_each_child(struct perf_event *event,
4956                                         void (*func)(struct perf_event *))
4957 {
4958         struct perf_event *child;
4959 
4960         WARN_ON_ONCE(event->ctx->parent_ctx);
4961 
4962         mutex_lock(&event->child_mutex);
4963         func(event);
4964         list_for_each_entry(child, &event->child_list, child_list)
4965                 func(child);
4966         mutex_unlock(&event->child_mutex);
4967 }
4968 
4969 static void perf_event_for_each(struct perf_event *event,
4970                                   void (*func)(struct perf_event *))
4971 {
4972         struct perf_event_context *ctx = event->ctx;
4973         struct perf_event *sibling;
4974 
4975         lockdep_assert_held(&ctx->mutex);
4976 
4977         event = event->group_leader;
4978 
4979         perf_event_for_each_child(event, func);
4980         for_each_sibling_event(sibling, event)
4981                 perf_event_for_each_child(sibling, func);
4982 }
4983 
4984 static void __perf_event_period(struct perf_event *event,
4985                                 struct perf_cpu_context *cpuctx,
4986                                 struct perf_event_context *ctx,
4987                                 void *info)
4988 {
4989         u64 value = *((u64 *)info);
4990         bool active;
4991 
4992         if (event->attr.freq) {
4993                 event->attr.sample_freq = value;
4994         } else {
4995                 event->attr.sample_period = value;
4996                 event->hw.sample_period = value;
4997         }
4998 
4999         active = (event->state == PERF_EVENT_STATE_ACTIVE);
5000         if (active) {
5001                 perf_pmu_disable(ctx->pmu);
5002                 /*
5003                  * We could be throttled; unthrottle now to avoid the tick
5004                  * trying to unthrottle while we already re-started the event.
5005                  */
5006                 if (event->hw.interrupts == MAX_INTERRUPTS) {
5007                         event->hw.interrupts = 0;
5008                         perf_log_throttle(event, 1);
5009                 }
5010                 event->pmu->stop(event, PERF_EF_UPDATE);
5011         }
5012 
5013         local64_set(&event->hw.period_left, 0);
5014 
5015         if (active) {
5016                 event->pmu->start(event, PERF_EF_RELOAD);
5017                 perf_pmu_enable(ctx->pmu);
5018         }
5019 }
5020 
5021 static int perf_event_check_period(struct perf_event *event, u64 value)
5022 {
5023         return event->pmu->check_period(event, value);
5024 }
5025 
5026 static int perf_event_period(struct perf_event *event, u64 __user *arg)
5027 {
5028         u64 value;
5029 
5030         if (!is_sampling_event(event))
5031                 return -EINVAL;
5032 
5033         if (copy_from_user(&value, arg, sizeof(value)))
5034                 return -EFAULT;
5035 
5036         if (!value)
5037                 return -EINVAL;
5038 
5039         if (event->attr.freq && value > sysctl_perf_event_sample_rate)
5040                 return -EINVAL;
5041 
5042         if (perf_event_check_period(event, value))
5043                 return -EINVAL;
5044 
5045         if (!event->attr.freq && (value & (1ULL << 63)))
5046                 return -EINVAL;
5047 
5048         event_function_call(event, __perf_event_period, &value);
5049 
5050         return 0;
5051 }
5052 
5053 static const struct file_operations perf_fops;
5054 
5055 static inline int perf_fget_light(int fd, struct fd *p)
5056 {
5057         struct fd f = fdget(fd);
5058         if (!f.file)
5059                 return -EBADF;
5060 
5061         if (f.file->f_op != &perf_fops) {
5062                 fdput(f);
5063                 return -EBADF;
5064         }
5065         *p = f;
5066         return 0;
5067 }
5068 
5069 static int perf_event_set_output(struct perf_event *event,
5070                                  struct perf_event *output_event);
5071 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
5072 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
5073 static int perf_copy_attr(struct perf_event_attr __user *uattr,
5074                           struct perf_event_attr *attr);
5075 
5076 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
5077 {
5078         void (*func)(struct perf_event *);
5079         u32 flags = arg;
5080 
5081         switch (cmd) {
5082         case PERF_EVENT_IOC_ENABLE:
5083                 func = _perf_event_enable;
5084                 break;
5085         case PERF_EVENT_IOC_DISABLE:
5086                 func = _perf_event_disable;
5087                 break;
5088         case PERF_EVENT_IOC_RESET:
5089                 func = _perf_event_reset;
5090                 break;
5091 
5092         case PERF_EVENT_IOC_REFRESH:
5093                 return _perf_event_refresh(event, arg);
5094 
5095         case PERF_EVENT_IOC_PERIOD:
5096                 return perf_event_period(event, (u64 __user *)arg);
5097 
5098         case PERF_EVENT_IOC_ID:
5099         {
5100                 u64 id = primary_event_id(event);
5101 
5102                 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
5103                         return -EFAULT;
5104                 return 0;
5105         }
5106 
5107         case PERF_EVENT_IOC_SET_OUTPUT:
5108         {
5109                 int ret;
5110                 if (arg != -1) {
5111                         struct perf_event *output_event;
5112                         struct fd output;
5113                         ret = perf_fget_light(arg, &output);
5114                         if (ret)
5115                                 return ret;
5116                         output_event = output.file->private_data;
5117                         ret = perf_event_set_output(event, output_event);
5118                         fdput(output);
5119                 } else {
5120                         ret = perf_event_set_output(event, NULL);
5121                 }
5122                 return ret;
5123         }
5124 
5125         case PERF_EVENT_IOC_SET_FILTER:
5126                 return perf_event_set_filter(event, (void __user *)arg);
5127 
5128         case PERF_EVENT_IOC_SET_BPF:
5129                 return perf_event_set_bpf_prog(event, arg);
5130 
5131         case PERF_EVENT_IOC_PAUSE_OUTPUT: {
5132                 struct ring_buffer *rb;
5133 
5134                 rcu_read_lock();
5135                 rb = rcu_dereference(event->rb);
5136                 if (!rb || !rb->nr_pages) {
5137                         rcu_read_unlock();
5138                         return -EINVAL;
5139                 }
5140                 rb_toggle_paused(rb, !!arg);
5141                 rcu_read_unlock();
5142                 return 0;
5143         }
5144 
5145         case PERF_EVENT_IOC_QUERY_BPF:
5146                 return perf_event_query_prog_array(event, (void __user *)arg);
5147 
5148         case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
5149                 struct perf_event_attr new_attr;
5150                 int err = perf_copy_attr((struct perf_event_attr __user *)arg,
5151                                          &new_attr);
5152 
5153                 if (err)
5154                         return err;
5155 
5156                 return perf_event_modify_attr(event,  &new_attr);
5157         }
5158         default:
5159                 return -ENOTTY;
5160         }
5161 
5162         if (flags & PERF_IOC_FLAG_GROUP)
5163                 perf_event_for_each(event, func);
5164         else
5165                 perf_event_for_each_child(event, func);
5166 
5167         return 0;
5168 }
5169 
5170 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
5171 {
5172         struct perf_event *event = file->private_data;
5173         struct perf_event_context *ctx;
5174         long ret;
5175 
5176         ctx = perf_event_ctx_lock(event);
5177         ret = _perf_ioctl(event, cmd, arg);
5178         perf_event_ctx_unlock(event, ctx);
5179 
5180         return ret;
5181 }
5182 
5183 #ifdef CONFIG_COMPAT
5184 static long perf_compat_ioctl(struct file *file, unsigned int cmd,
5185                                 unsigned long arg)
5186 {
5187         switch (_IOC_NR(cmd)) {
5188         case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
5189         case _IOC_NR(PERF_EVENT_IOC_ID):
5190         case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
5191         case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
5192                 /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */
5193                 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
5194                         cmd &= ~IOCSIZE_MASK;
5195                         cmd |= sizeof(void *) << IOCSIZE_SHIFT;
5196                 }
5197                 break;
5198         }
5199         return perf_ioctl(file, cmd, arg);
5200 }
5201 #else
5202 # define perf_compat_ioctl NULL
5203 #endif
5204 
5205 int perf_event_task_enable(void)
5206 {
5207         struct perf_event_context *ctx;
5208         struct perf_event *event;
5209 
5210         mutex_lock(&current->perf_event_mutex);
5211         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5212                 ctx = perf_event_ctx_lock(event);
5213                 perf_event_for_each_child(event, _perf_event_enable);
5214                 perf_event_ctx_unlock(event, ctx);
5215         }
5216         mutex_unlock(&current->perf_event_mutex);
5217 
5218         return 0;
5219 }
5220 
5221 int perf_event_task_disable(void)
5222 {
5223         struct perf_event_context *ctx;
5224         struct perf_event *event;
5225 
5226         mutex_lock(&current->perf_event_mutex);
5227         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5228                 ctx = perf_event_ctx_lock(event);
5229                 perf_event_for_each_child(event, _perf_event_disable);
5230                 perf_event_ctx_unlock(event, ctx);
5231         }
5232         mutex_unlock(&current->perf_event_mutex);
5233 
5234         return 0;
5235 }
5236 
5237 static int perf_event_index(struct perf_event *event)
5238 {
5239         if (event->hw.state & PERF_HES_STOPPED)
5240                 return 0;
5241 
5242         if (event->state != PERF_EVENT_STATE_ACTIVE)
5243                 return 0;
5244 
5245         return event->pmu->event_idx(event);
5246 }
5247 
5248 static void calc_timer_values(struct perf_event *event,
5249                                 u64 *now,
5250                                 u64 *enabled,
5251                                 u64 *running)
5252 {
5253         u64 ctx_time;
5254 
5255         *now = perf_clock();
5256         ctx_time = event->shadow_ctx_time + *now;
5257         __perf_update_times(event, ctx_time, enabled, running);
5258 }
5259 
5260 static void perf_event_init_userpage(struct perf_event *event)
5261 {
5262         struct perf_event_mmap_page *userpg;
5263         struct ring_buffer *rb;
5264 
5265         rcu_read_lock();
5266         rb = rcu_dereference(event->rb);
5267         if (!rb)
5268                 goto unlock;
5269 
5270         userpg = rb->user_page;
5271 
5272         /* Allow new userspace to detect that bit 0 is deprecated */
5273         userpg->cap_bit0_is_deprecated = 1;
5274         userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
5275         userpg->data_offset = PAGE_SIZE;
5276         userpg->data_size = perf_data_size(rb);
5277 
5278 unlock:
5279         rcu_read_unlock();
5280 }
5281 
5282 void __weak arch_perf_update_userpage(
5283         struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
5284 {
5285 }
5286 
5287 /*
5288  * Callers need to ensure there can be no nesting of this function, otherwise
5289  * the seqlock logic goes bad. We can not serialize this because the arch
5290  * code calls this from NMI context.
5291  */
5292 void perf_event_update_userpage(struct perf_event *event)
5293 {
5294         struct perf_event_mmap_page *userpg;
5295         struct ring_buffer *rb;
5296         u64 enabled, running, now;
5297 
5298         rcu_read_lock();
5299         rb = rcu_dereference(event->rb);
5300         if (!rb)
5301                 goto unlock;
5302 
5303         /*
5304          * compute total_time_enabled, total_time_running
5305          * based on snapshot values taken when the event
5306          * was last scheduled in.
5307          *
5308          * we cannot simply called update_context_time()
5309          * because of locking issue as we can be called in
5310          * NMI context
5311          */
5312         calc_timer_values(event, &now, &enabled, &running);
5313 
5314         userpg = rb->user_page;
5315         /*
5316          * Disable preemption to guarantee consistent time stamps are stored to
5317          * the user page.
5318          */
5319         preempt_disable();
5320         ++userpg->lock;
5321         barrier();
5322         userpg->index = perf_event_index(event);
5323         userpg->offset = perf_event_count(event);
5324         if (userpg->index)
5325                 userpg->offset -= local64_read(&event->hw.prev_count);
5326 
5327         userpg->time_enabled = enabled +
5328                         atomic64_read(&event->child_total_time_enabled);
5329 
5330         userpg->time_running = running +
5331                         atomic64_read(&event->child_total_time_running);
5332 
5333         arch_perf_update_userpage(event, userpg, now);
5334 
5335         barrier();
5336         ++userpg->lock;
5337         preempt_enable();
5338 unlock:
5339         rcu_read_unlock();
5340 }
5341 EXPORT_SYMBOL_GPL(perf_event_update_userpage);
5342 
5343 static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
5344 {
5345         struct perf_event *event = vmf->vma->vm_file->private_data;
5346         struct ring_buffer *rb;
5347         vm_fault_t ret = VM_FAULT_SIGBUS;
5348 
5349         if (vmf->flags & FAULT_FLAG_MKWRITE) {
5350                 if (vmf->pgoff == 0)
5351                         ret = 0;
5352                 return ret;
5353         }
5354 
5355         rcu_read_lock();
5356         rb = rcu_dereference(event->rb);
5357         if (!rb)
5358                 goto unlock;
5359 
5360         if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
5361                 goto unlock;
5362 
5363         vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
5364         if (!vmf->page)
5365                 goto unlock;
5366 
5367         get_page(vmf->page);
5368         vmf->page->mapping = vmf->vma->vm_file->f_mapping;
5369         vmf->page->index   = vmf->pgoff;
5370 
5371         ret = 0;
5372 unlock:
5373         rcu_read_unlock();
5374 
5375         return ret;
5376 }
5377 
5378 static void ring_buffer_attach(struct perf_event *event,
5379                                struct ring_buffer *rb)
5380 {
5381         struct ring_buffer *old_rb = NULL;
5382         unsigned long flags;
5383 
5384         if (event->rb) {
5385                 /*
5386                  * Should be impossible, we set this when removing
5387                  * event->rb_entry and wait/clear when adding event->rb_entry.
5388                  */
5389                 WARN_ON_ONCE(event->rcu_pending);
5390 
5391                 old_rb = event->rb;
5392                 spin_lock_irqsave(&old_rb->event_lock, flags);
5393                 list_del_rcu(&event->rb_entry);
5394                 spin_unlock_irqrestore(&old_rb->event_lock, flags);
5395 
5396                 event->rcu_batches = get_state_synchronize_rcu();
5397                 event->rcu_pending = 1;
5398         }
5399 
5400         if (rb) {
5401                 if (event->rcu_pending) {
5402                         cond_synchronize_rcu(event->rcu_batches);
5403                         event->rcu_pending = 0;
5404                 }
5405 
5406                 spin_lock_irqsave(&rb->event_lock, flags);
5407                 list_add_rcu(&event->rb_entry, &rb->event_list);
5408                 spin_unlock_irqrestore(&rb->event_lock, flags);
5409         }
5410 
5411         /*
5412          * Avoid racing with perf_mmap_close(AUX): stop the event
5413          * before swizzling the event::rb pointer; if it's getting
5414          * unmapped, its aux_mmap_count will be 0 and it won't
5415          * restart. See the comment in __perf_pmu_output_stop().
5416          *
5417          * Data will inevitably be lost when set_output is done in
5418          * mid-air, but then again, whoever does it like this is
5419          * not in for the data anyway.
5420          */
5421         if (has_aux(event))
5422                 perf_event_stop(event, 0);
5423 
5424         rcu_assign_pointer(event->rb, rb);
5425 
5426         if (old_rb) {
5427                 ring_buffer_put(old_rb);
5428                 /*
5429                  * Since we detached before setting the new rb, so that we
5430                  * could attach the new rb, we could have missed a wakeup.
5431                  * Provide it now.
5432                  */
5433                 wake_up_all(&event->waitq);
5434         }
5435 }
5436 
5437 static void ring_buffer_wakeup(struct perf_event *event)
5438 {
5439         struct ring_buffer *rb;
5440 
5441         rcu_read_lock();
5442         rb = rcu_dereference(event->rb);
5443         if (rb) {
5444                 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
5445                         wake_up_all(&event->waitq);
5446         }
5447         rcu_read_unlock();
5448 }
5449 
5450 struct ring_buffer *ring_buffer_get(struct perf_event *event)
5451 {
5452         struct ring_buffer *rb;
5453 
5454         rcu_read_lock();
5455         rb = rcu_dereference(event->rb);
5456         if (rb) {
5457                 if (!refcount_inc_not_zero(&rb->refcount))
5458                         rb = NULL;
5459         }
5460         rcu_read_unlock();
5461 
5462         return rb;
5463 }
5464 
5465 void ring_buffer_put(struct ring_buffer *rb)
5466 {
5467         if (!refcount_dec_and_test(&rb->refcount))
5468                 return;
5469 
5470         WARN_ON_ONCE(!list_empty(&rb->event_list));
5471 
5472         call_rcu(&rb->rcu_head, rb_free_rcu);
5473 }
5474 
5475 static void perf_mmap_open(struct vm_area_struct *vma)
5476 {
5477         struct perf_event *event = vma->vm_file->private_data;
5478 
5479         atomic_inc(&event->mmap_count);
5480         atomic_inc(&event->rb->mmap_count);
5481 
5482         if (vma->vm_pgoff)
5483                 atomic_inc(&event->rb->aux_mmap_count);
5484 
5485         if (event->pmu->event_mapped)
5486                 event->pmu->event_mapped(event, vma->vm_mm);
5487 }
5488 
5489 static void perf_pmu_output_stop(struct perf_event *event);
5490 
5491 /*
5492  * A buffer can be mmap()ed multiple times; either directly through the same
5493  * event, or through other events by use of perf_event_set_output().
5494  *
5495  * In order to undo the VM accounting done by perf_mmap() we need to destroy
5496  * the buffer here, where we still have a VM context. This means we need
5497  * to detach all events redirecting to us.
5498  */
5499 static void perf_mmap_close(struct vm_area_struct *vma)
5500 {
5501         struct perf_event *event = vma->vm_file->private_data;
5502 
5503         struct ring_buffer *rb = ring_buffer_get(event);
5504         struct user_struct *mmap_user = rb->mmap_user;
5505         int mmap_locked = rb->mmap_locked;
5506         unsigned long size = perf_data_size(rb);
5507 
5508         if (event->pmu->event_unmapped)
5509                 event->pmu->event_unmapped(event, vma->vm_mm);
5510 
5511         /*
5512          * rb->aux_mmap_count will always drop before rb->mmap_count and
5513          * event->mmap_count, so it is ok to use event->mmap_mutex to
5514          * serialize with perf_mmap here.
5515          */
5516         if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
5517             atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
5518                 /*
5519                  * Stop all AUX events that are writing to this buffer,
5520                  * so that we can free its AUX pages and corresponding PMU
5521                  * data. Note that after rb::aux_mmap_count dropped to zero,
5522                  * they won't start any more (see perf_aux_output_begin()).
5523                  */
5524                 perf_pmu_output_stop(event);
5525 
5526                 /* now it's safe to free the pages */
5527                 if (!rb->aux_mmap_locked)
5528                         atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
5529                 else
5530                         atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
5531 
5532                 /* this has to be the last one */
5533                 rb_free_aux(rb);
5534                 WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
5535 
5536                 mutex_unlock(&event->mmap_mutex);
5537         }
5538 
5539         atomic_dec(&rb->mmap_count);
5540 
5541         if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
5542                 goto out_put;
5543 
5544         ring_buffer_attach(event, NULL);
5545         mutex_unlock(&event->mmap_mutex);
5546 
5547         /* If there's still other mmap()s of this buffer, we're done. */
5548         if (atomic_read(&rb->mmap_count))
5549                 goto out_put;
5550 
5551         /*
5552          * No other mmap()s, detach from all other events that might redirect
5553          * into the now unreachable buffer. Somewhat complicated by the
5554          * fact that rb::event_lock otherwise nests inside mmap_mutex.
5555          */
5556 again:
5557         rcu_read_lock();
5558         list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
5559                 if (!atomic_long_inc_not_zero(&event->refcount)) {
5560                         /*
5561                          * This event is en-route to free_event() which will
5562                          * detach it and remove it from the list.
5563                          */
5564                         continue;
5565                 }
5566                 rcu_read_unlock();
5567 
5568                 mutex_lock(&event->mmap_mutex);
5569                 /*
5570                  * Check we didn't race with perf_event_set_output() which can
5571                  * swizzle the rb from under us while we were waiting to
5572                  * acquire mmap_mutex.
5573                  *
5574                  * If we find a different rb; ignore this event, a next
5575                  * iteration will no longer find it on the list. We have to
5576                  * still restart the iteration to make sure we're not now
5577                  * iterating the wrong list.
5578                  */
5579                 if (event->rb == rb)
5580                         ring_buffer_attach(event, NULL);
5581 
5582                 mutex_unlock(&event->mmap_mutex);
5583                 put_event(event);
5584 
5585                 /*
5586                  * Restart the iteration; either we're on the wrong list or
5587                  * destroyed its integrity by doing a deletion.
5588                  */
5589                 goto again;
5590         }
5591         rcu_read_unlock();
5592 
5593         /*
5594          * It could be there's still a few 0-ref events on the list; they'll
5595          * get cleaned up by free_event() -- they'll also still have their
5596          * ref on the rb and will free it whenever they are done with it.
5597          *
5598          * Aside from that, this buffer is 'fully' detached and unmapped,
5599          * undo the VM accounting.
5600          */
5601 
5602         atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
5603                         &mmap_user->locked_vm);
5604         atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
5605         free_uid(mmap_user);
5606 
5607 out_put:
5608         ring_buffer_put(rb); /* could be last */
5609 }
5610 
5611 static const struct vm_operations_struct perf_mmap_vmops = {
5612         .open           = perf_mmap_open,
5613         .close          = perf_mmap_close, /* non mergeable */
5614         .fault          = perf_mmap_fault,
5615         .page_mkwrite   = perf_mmap_fault,
5616 };
5617 
5618 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
5619 {
5620         struct perf_event *event = file->private_data;
5621         unsigned long user_locked, user_lock_limit;
5622         struct user_struct *user = current_user();
5623         unsigned long locked, lock_limit;
5624         struct ring_buffer *rb = NULL;
5625         unsigned long vma_size;
5626         unsigned long nr_pages;
5627         long user_extra = 0, extra = 0;
5628         int ret = 0, flags = 0;
5629 
5630         /*
5631          * Don't allow mmap() of inherited per-task counters. This would
5632          * create a performance issue due to all children writing to the
5633          * same rb.
5634          */
5635         if (event->cpu == -1 && event->attr.inherit)
5636                 return -EINVAL;
5637 
5638         if (!(vma->vm_flags & VM_SHARED))
5639                 return -EINVAL;
5640 
5641         vma_size = vma->vm_end - vma->vm_start;
5642 
5643         if (vma->vm_pgoff == 0) {
5644                 nr_pages = (vma_size / PAGE_SIZE) - 1;
5645         } else {
5646                 /*
5647                  * AUX area mapping: if rb->aux_nr_pages != 0, it's already
5648                  * mapped, all subsequent mappings should have the same size
5649                  * and offset. Must be above the normal perf buffer.
5650                  */
5651                 u64 aux_offset, aux_size;
5652 
5653                 if (!event->rb)
5654                         return -EINVAL;
5655 
5656                 nr_pages = vma_size / PAGE_SIZE;
5657 
5658                 mutex_lock(&event->mmap_mutex);
5659                 ret = -EINVAL;
5660 
5661                 rb = event->rb;
5662                 if (!rb)
5663                         goto aux_unlock;
5664 
5665                 aux_offset = READ_ONCE(rb->user_page->aux_offset);
5666                 aux_size = READ_ONCE(rb->user_page->aux_size);
5667 
5668                 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
5669                         goto aux_unlock;
5670 
5671                 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
5672                         goto aux_unlock;
5673 
5674                 /* already mapped with a different offset */
5675                 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
5676                         goto aux_unlock;
5677 
5678                 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
5679                         goto aux_unlock;
5680 
5681                 /* already mapped with a different size */
5682                 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
5683                         goto aux_unlock;
5684 
5685                 if (!is_power_of_2(nr_pages))
5686                         goto aux_unlock;
5687 
5688                 if (!atomic_inc_not_zero(&rb->mmap_count))
5689                         goto aux_unlock;
5690 
5691                 if (rb_has_aux(rb)) {
5692                         atomic_inc(&rb->aux_mmap_count);
5693                         ret = 0;
5694                         goto unlock;
5695                 }
5696 
5697                 atomic_set(&rb->aux_mmap_count, 1);
5698                 user_extra = nr_pages;
5699 
5700                 goto accounting;
5701         }
5702 
5703         /*
5704          * If we have rb pages ensure they're a power-of-two number, so we
5705          * can do bitmasks instead of modulo.
5706          */
5707         if (nr_pages != 0 && !is_power_of_2(nr_pages))
5708                 return -EINVAL;
5709 
5710         if (vma_size != PAGE_SIZE * (1 + nr_pages))
5711                 return -EINVAL;
5712 
5713         WARN_ON_ONCE(event->ctx->parent_ctx);
5714 again:
5715         mutex_lock(&event->mmap_mutex);
5716         if (event->rb) {
5717                 if (event->rb->nr_pages != nr_pages) {
5718                         ret = -EINVAL;
5719                         goto unlock;
5720                 }
5721 
5722                 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
5723                         /*
5724                          * Raced against perf_mmap_close() through
5725                          * perf_event_set_output(). Try again, hope for better
5726                          * luck.
5727                          */
5728                         mutex_unlock(&event->mmap_mutex);
5729                         goto again;
5730                 }
5731 
5732                 goto unlock;
5733         }
5734 
5735         user_extra = nr_pages + 1;
5736 
5737 accounting:
5738         user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
5739 
5740         /*
5741          * Increase the limit linearly with more CPUs:
5742          */
5743         user_lock_limit *= num_online_cpus();
5744 
5745         user_locked = atomic_long_read(&user->locked_vm) + user_extra;
5746 
5747         if (user_locked <= user_lock_limit) {
5748                 /* charge all to locked_vm */
5749         } else if (atomic_long_read(&user->locked_vm) >= user_lock_limit) {
5750                 /* charge all to pinned_vm */
5751                 extra = user_extra;
5752                 user_extra = 0;
5753         } else {
5754                 /*
5755                  * charge locked_vm until it hits user_lock_limit;
5756                  * charge the rest from pinned_vm
5757                  */
5758                 extra = user_locked - user_lock_limit;
5759                 user_extra -= extra;
5760         }
5761 
5762         lock_limit = rlimit(RLIMIT_MEMLOCK);
5763         lock_limit >>= PAGE_SHIFT;
5764         locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
5765 
5766         if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
5767                 !capable(CAP_IPC_LOCK)) {
5768                 ret = -EPERM;
5769                 goto unlock;
5770         }
5771 
5772         WARN_ON(!rb && event->rb);
5773 
5774         if (vma->vm_flags & VM_WRITE)
5775                 flags |= RING_BUFFER_WRITABLE;
5776 
5777         if (!rb) {
5778                 rb = rb_alloc(nr_pages,
5779                               event->attr.watermark ? event->attr.wakeup_watermark : 0,
5780                               event->cpu, flags);
5781 
5782                 if (!rb) {
5783                         ret = -ENOMEM;
5784                         goto unlock;
5785                 }
5786 
5787                 atomic_set(&rb->mmap_count, 1);
5788                 rb->mmap_user = get_current_user();
5789                 rb->mmap_locked = extra;
5790 
5791                 ring_buffer_attach(event, rb);
5792 
5793                 perf_event_init_userpage(event);
5794                 perf_event_update_userpage(event);
5795         } else {
5796                 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
5797                                    event->attr.aux_watermark, flags);
5798                 if (!ret)
5799                         rb->aux_mmap_locked = extra;
5800         }
5801 
5802 unlock:
5803         if (!ret) {
5804                 atomic_long_add(user_extra, &user->locked_vm);
5805                 atomic64_add(extra, &vma->vm_mm->pinned_vm);
5806 
5807                 atomic_inc(&event->mmap_count);
5808         } else if (rb) {
5809                 atomic_dec(&rb->mmap_count);
5810         }
5811 aux_unlock:
5812         mutex_unlock(&event->mmap_mutex);
5813 
5814         /*
5815          * Since pinned accounting is per vm we cannot allow fork() to copy our
5816          * vma.
5817          */
5818         vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
5819         vma->vm_ops = &perf_mmap_vmops;
5820 
5821         if (event->pmu->event_mapped)
5822                 event->pmu->event_mapped(event, vma->vm_mm);
5823 
5824         return ret;
5825 }
5826 
5827 static int perf_fasync(int fd, struct file *filp, int on)
5828 {
5829         struct inode *inode = file_inode(filp);
5830         struct perf_event *event = filp->private_data;
5831         int retval;
5832 
5833         inode_lock(inode);
5834         retval = fasync_helper(fd, filp, on, &event->fasync);
5835         inode_unlock(inode);
5836 
5837         if (retval < 0)
5838                 return retval;
5839 
5840         return 0;
5841 }
5842 
5843 static const struct file_operations perf_fops = {
5844         .llseek                 = no_llseek,
5845         .release                = perf_release,
5846         .read                   = perf_read,
5847         .poll                   = perf_poll,
5848         .unlocked_ioctl         = perf_ioctl,
5849         .compat_ioctl           = perf_compat_ioctl,
5850         .mmap                   = perf_mmap,
5851         .fasync                 = perf_fasync,
5852 };
5853 
5854 /*
5855  * Perf event wakeup
5856  *
5857  * If there's data, ensure we set the poll() state and publish everything
5858  * to user-space before waking everybody up.
5859  */
5860 
5861 static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
5862 {
5863         /* only the parent has fasync state */
5864         if (event->parent)
5865                 event = event->parent;
5866         return &event->fasync;
5867 }
5868 
5869 void perf_event_wakeup(struct perf_event *event)
5870 {
5871         ring_buffer_wakeup(event);
5872 
5873         if (event->pending_kill) {
5874                 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
5875                 event->pending_kill = 0;
5876         }
5877 }
5878 
5879 static void perf_pending_event_disable(struct perf_event *event)
5880 {
5881         int cpu = READ_ONCE(event->pending_disable);
5882 
5883         if (cpu < 0)
5884                 return;
5885 
5886         if (cpu == smp_processor_id()) {
5887                 WRITE_ONCE(event->pending_disable, -1);
5888                 perf_event_disable_local(event);
5889                 return;
5890         }
5891 
5892         /*
5893          *  CPU-A                       CPU-B
5894          *
5895          *  perf_event_disable_inatomic()
5896          *    @pending_disable = CPU-A;
5897          *    irq_work_queue();
5898          *
5899          *  sched-out
5900          *    @pending_disable = -1;
5901          *
5902          *                              sched-in
5903          *                              perf_event_disable_inatomic()
5904          *                                @pending_disable = CPU-B;
5905          *                                irq_work_queue(); // FAILS
5906          *
5907          *  irq_work_run()
5908          *    perf_pending_event()
5909          *
5910          * But the event runs on CPU-B and wants disabling there.
5911          */
5912         irq_work_queue_on(&event->pending, cpu);
5913 }
5914 
5915 static void perf_pending_event(struct irq_work *entry)
5916 {
5917         struct perf_event *event = container_of(entry, struct perf_event, pending);
5918         int rctx;
5919 
5920         rctx = perf_swevent_get_recursion_context();
5921         /*
5922          * If we 'fail' here, that's OK, it means recursion is already disabled
5923          * and we won't recurse 'further'.
5924          */
5925 
5926         perf_pending_event_disable(event);
5927 
5928         if (event->pending_wakeup) {
5929                 event->pending_wakeup = 0;
5930                 perf_event_wakeup(event);
5931         }
5932 
5933         if (rctx >= 0)
5934                 perf_swevent_put_recursion_context(rctx);
5935 }
5936 
5937 /*
5938  * We assume there is only KVM supporting the callbacks.
5939  * Later on, we might change it to a list if there is
5940  * another virtualization implementation supporting the callbacks.
5941  */
5942 struct perf_guest_info_callbacks *perf_guest_cbs;
5943 
5944 int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5945 {
5946         perf_guest_cbs = cbs;
5947         return 0;
5948 }
5949 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
5950 
5951 int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5952 {
5953         perf_guest_cbs = NULL;
5954         return 0;
5955 }
5956 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
5957 
5958 static void
5959 perf_output_sample_regs(struct perf_output_handle *handle,
5960                         struct pt_regs *regs, u64 mask)
5961 {
5962         int bit;
5963         DECLARE_BITMAP(_mask, 64);
5964 
5965         bitmap_from_u64(_mask, mask);
5966         for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
5967                 u64 val;
5968 
5969                 val = perf_reg_value(regs, bit);
5970                 perf_output_put(handle, val);
5971         }
5972 }
5973 
5974 static void perf_sample_regs_user(struct perf_regs *regs_user,
5975                                   struct pt_regs *regs,
5976                                   struct pt_regs *regs_user_copy)
5977 {
5978         if (user_mode(regs)) {
5979                 regs_user->abi = perf_reg_abi(current);
5980                 regs_user->regs = regs;
5981         } else if (!(current->flags & PF_KTHREAD)) {
5982                 perf_get_regs_user(regs_user, regs, regs_user_copy);
5983         } else {
5984                 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
5985                 regs_user->regs = NULL;
5986         }
5987 }
5988 
5989 static void perf_sample_regs_intr(struct perf_regs *regs_intr,
5990                                   struct pt_regs *regs)
5991 {
5992         regs_intr->regs = regs;
5993         regs_intr->abi  = perf_reg_abi(current);
5994 }
5995 
5996 
5997 /*
5998  * Get remaining task size from user stack pointer.
5999  *
6000  * It'd be better to take stack vma map and limit this more
6001  * precisly, but there's no way to get it safely under interrupt,
6002  * so using TASK_SIZE as limit.
6003  */
6004 static u64 perf_ustack_task_size(struct pt_regs *regs)
6005 {
6006         unsigned long addr = perf_user_stack_pointer(regs);
6007 
6008         if (!addr || addr >= TASK_SIZE)
6009                 return 0;
6010 
6011         return TASK_SIZE - addr;
6012 }
6013 
6014 static u16
6015 perf_sample_ustack_size(u16 stack_size, u16 header_size,
6016                         struct pt_regs *regs)
6017 {
6018         u64 task_size;
6019 
6020         /* No regs, no stack pointer, no dump. */
6021         if (!regs)
6022                 return 0;
6023 
6024         /*
6025          * Check if we fit in with the requested stack size into the:
6026          * - TASK_SIZE
6027          *   If we don't, we limit the size to the TASK_SIZE.
6028          *
6029          * - remaining sample size
6030          *   If we don't, we customize the stack size to
6031          *   fit in to the remaining sample size.
6032          */
6033 
6034         task_size  = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
6035         stack_size = min(stack_size, (u16) task_size);
6036 
6037         /* Current header size plus static size and dynamic size. */
6038         header_size += 2 * sizeof(u64);
6039 
6040         /* Do we fit in with the current stack dump size? */
6041         if ((u16) (header_size + stack_size) < header_size) {
6042                 /*
6043                  * If we overflow the maximum size for the sample,
6044                  * we customize the stack dump size to fit in.
6045                  */
6046                 stack_size = USHRT_MAX - header_size - sizeof(u64);
6047                 stack_size = round_up(stack_size, sizeof(u64));
6048         }
6049 
6050         return stack_size;
6051 }
6052 
6053 static void
6054 perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
6055                           struct pt_regs *regs)
6056 {
6057         /* Case of a kernel thread, nothing to dump */
6058         if (!regs) {
6059                 u64 size = 0;
6060                 perf_output_put(handle, size);
6061         } else {
6062                 unsigned long sp;
6063                 unsigned int rem;
6064                 u64 dyn_size;
6065                 mm_segment_t fs;
6066 
6067                 /*
6068                  * We dump:
6069                  * static size
6070                  *   - the size requested by user or the best one we can fit
6071                  *     in to the sample max size
6072                  * data
6073                  *   - user stack dump data
6074                  * dynamic size
6075                  *   - the actual dumped size
6076                  */
6077 
6078                 /* Static size. */
6079                 perf_output_put(handle, dump_size);
6080 
6081                 /* Data. */
6082                 sp = perf_user_stack_pointer(regs);
6083                 fs = get_fs();
6084                 set_fs(USER_DS);
6085                 rem = __output_copy_user(handle, (void *) sp, dump_size);
6086                 set_fs(fs);
6087                 dyn_size = dump_size - rem;
6088 
6089                 perf_output_skip(handle, rem);
6090 
6091                 /* Dynamic size. */
6092                 perf_output_put(handle, dyn_size);
6093         }
6094 }
6095 
6096 static void __perf_event_header__init_id(struct perf_event_header *header,
6097                                          struct perf_sample_data *data,
6098                                          struct perf_event *event)
6099 {
6100         u64 sample_type = event->attr.sample_type;
6101 
6102         data->type = sample_type;
6103         header->size += event->id_header_size;
6104 
6105         if (sample_type & PERF_SAMPLE_TID) {
6106                 /* namespace issues */
6107                 data->tid_entry.pid = perf_event_pid(event, current);
6108                 data->tid_entry.tid = perf_event_tid(event, current);
6109         }
6110 
6111         if (sample_type & PERF_SAMPLE_TIME)
6112                 data->time = perf_event_clock(event);
6113 
6114         if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
6115                 data->id = primary_event_id(event);
6116 
6117         if (sample_type & PERF_SAMPLE_STREAM_ID)
6118                 data->stream_id = event->id;
6119 
6120         if (sample_type & PERF_SAMPLE_CPU) {
6121                 data->cpu_entry.cpu      = raw_smp_processor_id();
6122                 data->cpu_entry.reserved = 0;
6123         }
6124 }
6125 
6126 void perf_event_header__init_id(struct perf_event_header *header,
6127                                 struct perf_sample_data *data,
6128                                 struct perf_event *event)
6129 {
6130         if (event->attr.sample_id_all)
6131                 __perf_event_header__init_id(header, data, event);
6132 }
6133 
6134 static void __perf_event__output_id_sample(struct perf_output_handle *handle,
6135                                            struct perf_sample_data *data)
6136 {
6137         u64 sample_type = data->type;
6138 
6139         if (sample_type & PERF_SAMPLE_TID)
6140                 perf_output_put(handle, data->tid_entry);
6141 
6142         if (sample_type & PERF_SAMPLE_TIME)
6143                 perf_output_put(handle, data->time);
6144 
6145         if (sample_type & PERF_SAMPLE_ID)
6146                 perf_output_put(handle, data->id);
6147 
6148         if (sample_type & PERF_SAMPLE_STREAM_ID)
6149                 perf_output_put(handle, data->stream_id);
6150 
6151         if (sample_type & PERF_SAMPLE_CPU)
6152                 perf_output_put(handle, data->cpu_entry);
6153 
6154         if (sample_type & PERF_SAMPLE_IDENTIFIER)
6155                 perf_output_put(handle, data->id);
6156 }
6157 
6158 void perf_event__output_id_sample(struct perf_event *event,
6159                                   struct perf_output_handle *handle,
6160                                   struct perf_sample_data *sample)
6161 {
6162         if (event->attr.sample_id_all)
6163                 __perf_event__output_id_sample(handle, sample);
6164 }
6165 
6166 static void perf_output_read_one(struct perf_output_handle *handle,
6167                                  struct perf_event *event,
6168                                  u64 enabled, u64 running)
6169 {
6170         u64 read_format = event->attr.read_format;
6171         u64 values[4];
6172         int n = 0;
6173 
6174         values[n++] = perf_event_count(event);
6175         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
6176                 values[n++] = enabled +
6177                         atomic64_read(&event->child_total_time_enabled);
6178         }
6179         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
6180                 values[n++] = running +
6181                         atomic64_read(&event->child_total_time_running);
6182         }
6183         if (read_format & PERF_FORMAT_ID)
6184                 values[n++] = primary_event_id(event);
6185 
6186         __output_copy(handle, values, n * sizeof(u64));
6187 }
6188 
6189 static void perf_output_read_group(struct perf_output_handle *handle,
6190                             struct perf_event *event,
6191                             u64 enabled, u64 running)
6192 {
6193         struct perf_event *leader = event->group_leader, *sub;
6194         u64 read_format = event