
TOMOYO Linux Cross Reference
Linux/kernel/events/core.c


  1 /*
  2  * Performance events core code:
  3  *
  4  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
  5  *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
  6  *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  7  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  8  *
  9  * For licensing details see kernel-base/COPYING
 10  */
 11 
 12 #include <linux/fs.h>
 13 #include <linux/mm.h>
 14 #include <linux/cpu.h>
 15 #include <linux/smp.h>
 16 #include <linux/idr.h>
 17 #include <linux/file.h>
 18 #include <linux/poll.h>
 19 #include <linux/slab.h>
 20 #include <linux/hash.h>
 21 #include <linux/tick.h>
 22 #include <linux/sysfs.h>
 23 #include <linux/dcache.h>
 24 #include <linux/percpu.h>
 25 #include <linux/ptrace.h>
 26 #include <linux/reboot.h>
 27 #include <linux/vmstat.h>
 28 #include <linux/device.h>
 29 #include <linux/export.h>
 30 #include <linux/vmalloc.h>
 31 #include <linux/hardirq.h>
 32 #include <linux/rculist.h>
 33 #include <linux/uaccess.h>
 34 #include <linux/syscalls.h>
 35 #include <linux/anon_inodes.h>
 36 #include <linux/kernel_stat.h>
 37 #include <linux/cgroup.h>
 38 #include <linux/perf_event.h>
 39 #include <linux/ftrace_event.h>
 40 #include <linux/hw_breakpoint.h>
 41 #include <linux/mm_types.h>
 42 #include <linux/module.h>
 43 #include <linux/mman.h>
 44 #include <linux/compat.h>
 45 #include <linux/bpf.h>
 46 #include <linux/filter.h>
 47 
 48 #include "internal.h"
 49 
 50 #include <asm/irq_regs.h>
 51 
 52 static struct workqueue_struct *perf_wq;
 53 
 54 struct remote_function_call {
 55         struct task_struct      *p;
 56         int                     (*func)(void *info);
 57         void                    *info;
 58         int                     ret;
 59 };
 60 
 61 static void remote_function(void *data)
 62 {
 63         struct remote_function_call *tfc = data;
 64         struct task_struct *p = tfc->p;
 65 
 66         if (p) {
 67                 tfc->ret = -EAGAIN;
 68                 if (task_cpu(p) != smp_processor_id() || !task_curr(p))
 69                         return;
 70         }
 71 
 72         tfc->ret = tfc->func(tfc->info);
 73 }
 74 
 75 /**
 76  * task_function_call - call a function on the cpu on which a task runs
 77  * @p:          the task to evaluate
 78  * @func:       the function to be called
 79  * @info:       the function call argument
 80  *
 81  * Calls the function @func when the task is currently running. This might
 82  * be on the current CPU, in which case the function is called directly.
 83  *
 84  * returns: @func return value, or
 85  *          -ESRCH  - when the process isn't running
 86  *          -EAGAIN - when the process moved away
 87  */
 88 static int
 89 task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
 90 {
 91         struct remote_function_call data = {
 92                 .p      = p,
 93                 .func   = func,
 94                 .info   = info,
 95                 .ret    = -ESRCH, /* No such (running) process */
 96         };
 97 
 98         if (task_curr(p))
 99                 smp_call_function_single(task_cpu(p), remote_function, &data, 1);
100 
101         return data.ret;
102 }
103 
104 /**
105  * cpu_function_call - call a function on the cpu
106  * @func:       the function to be called
107  * @info:       the function call argument
108  *
109  * Calls the function @func on the remote cpu.
110  *
111  * returns: @func return value or -ENXIO when the cpu is offline
112  */
113 static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
114 {
115         struct remote_function_call data = {
116                 .p      = NULL,
117                 .func   = func,
118                 .info   = info,
119                 .ret    = -ENXIO, /* No such CPU */
120         };
121 
122         smp_call_function_single(cpu, remote_function, &data, 1);
123 
124         return data.ret;
125 }
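
/*
 * Editorial sketch (not part of the original file): how a caller might use
 * cpu_function_call() above.  The callback and its argument type below are
 * invented for illustration; the -ENXIO check mirrors the convention of the
 * helper.  The callback runs on the target CPU, typically in IPI context
 * with interrupts disabled.
 *
 *	struct dummy_state {
 *		int enabled;
 *	};
 *
 *	static int __toggle_dummy_state(void *info)
 *	{
 *		struct dummy_state *st = info;	// runs on the target CPU
 *
 *		st->enabled = !st->enabled;
 *		return 0;
 *	}
 *
 *	...
 *	err = cpu_function_call(cpu, __toggle_dummy_state, &st);
 *	if (err == -ENXIO)
 *		;	// the CPU was offline; nothing was toggled
 */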
126 
127 #define EVENT_OWNER_KERNEL ((void *) -1)
128 
129 static bool is_kernel_event(struct perf_event *event)
130 {
131         return event->owner == EVENT_OWNER_KERNEL;
132 }
133 
134 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
135                        PERF_FLAG_FD_OUTPUT  |\
136                        PERF_FLAG_PID_CGROUP |\
137                        PERF_FLAG_FD_CLOEXEC)
138 
139 /*
140  * branch priv levels that need permission checks
141  */
142 #define PERF_SAMPLE_BRANCH_PERM_PLM \
143         (PERF_SAMPLE_BRANCH_KERNEL |\
144          PERF_SAMPLE_BRANCH_HV)
145 
146 enum event_type_t {
147         EVENT_FLEXIBLE = 0x1,
148         EVENT_PINNED = 0x2,
149         EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
150 };
151 
152 /*
153  * perf_sched_events : >0 events exist
154  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
155  */
156 struct static_key_deferred perf_sched_events __read_mostly;
157 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
158 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
159 
160 static atomic_t nr_mmap_events __read_mostly;
161 static atomic_t nr_comm_events __read_mostly;
162 static atomic_t nr_task_events __read_mostly;
163 static atomic_t nr_freq_events __read_mostly;
164 
165 static LIST_HEAD(pmus);
166 static DEFINE_MUTEX(pmus_lock);
167 static struct srcu_struct pmus_srcu;
168 
169 /*
170  * perf event paranoia level:
171  *  -1 - not paranoid at all
172  *   0 - disallow raw tracepoint access for unpriv
173  *   1 - disallow cpu events for unpriv
174  *   2 - disallow kernel profiling for unpriv
175  */
176 int sysctl_perf_event_paranoid __read_mostly = 1;
177 
178 /* Minimum for 512 kiB + 1 user control page */
179 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
180 
181 /*
182  * max perf event sample rate
183  */
184 #define DEFAULT_MAX_SAMPLE_RATE         100000
185 #define DEFAULT_SAMPLE_PERIOD_NS        (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
186 #define DEFAULT_CPU_TIME_MAX_PERCENT    25
187 
188 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
189 
190 static int max_samples_per_tick __read_mostly   = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
191 static int perf_sample_period_ns __read_mostly  = DEFAULT_SAMPLE_PERIOD_NS;
192 
193 static int perf_sample_allowed_ns __read_mostly =
194         DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
195 
196 void update_perf_cpu_limits(void)
197 {
198         u64 tmp = perf_sample_period_ns;
199 
200         tmp *= sysctl_perf_cpu_time_max_percent;
201         tmp = div_u64(tmp, 100);
202         if (!tmp)
203                 tmp = 1;
204 
205         WRITE_ONCE(perf_sample_allowed_ns, tmp);
206 }
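
/*
 * Editorial worked example: with the defaults above
 * (sysctl_perf_event_sample_rate = 100000, so perf_sample_period_ns =
 * NSEC_PER_SEC / 100000 = 10000 ns, and sysctl_perf_cpu_time_max_percent
 * = 25), update_perf_cpu_limits() computes
 *
 *	tmp = 10000 * 25 / 100 = 2500
 *
 * i.e. perf_sample_allowed_ns = 2500 ns: each sample may consume at most
 * 2.5 us of CPU time before perf_sample_event_took() below starts lowering
 * the sample rate.
 */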
207 
208 static int perf_rotate_context(struct perf_cpu_context *cpuctx);
209 
210 int perf_proc_update_handler(struct ctl_table *table, int write,
211                 void __user *buffer, size_t *lenp,
212                 loff_t *ppos)
213 {
214         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
215 
216         if (ret || !write)
217                 return ret;
218 
219         /*
220          * If throttling is disabled don't allow the write:
221          */
222         if (sysctl_perf_cpu_time_max_percent == 100 ||
223             sysctl_perf_cpu_time_max_percent == 0)
224                 return -EINVAL;
225 
226         max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
227         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
228         update_perf_cpu_limits();
229 
230         return 0;
231 }
232 
233 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
234 
235 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
236                                 void __user *buffer, size_t *lenp,
237                                 loff_t *ppos)
238 {
239         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
240 
241         if (ret || !write)
242                 return ret;
243 
244         if (sysctl_perf_cpu_time_max_percent == 100 ||
245             sysctl_perf_cpu_time_max_percent == 0) {
246                 printk(KERN_WARNING
247                        "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
248                 WRITE_ONCE(perf_sample_allowed_ns, 0);
249         } else {
250                 update_perf_cpu_limits();
251         }
252 
253         return 0;
254 }
255 
256 /*
257  * perf samples are done in some very critical code paths (NMIs).
258  * If they take too much CPU time, the system can lock up and not
259  * get any real work done.  This will drop the sample rate when
260  * we detect that events are taking too long.
261  */
262 #define NR_ACCUMULATED_SAMPLES 128
263 static DEFINE_PER_CPU(u64, running_sample_length);
264 
265 static u64 __report_avg;
266 static u64 __report_allowed;
267 
268 static void perf_duration_warn(struct irq_work *w)
269 {
270         printk_ratelimited(KERN_WARNING
271                 "perf: interrupt took too long (%lld > %lld), lowering "
272                 "kernel.perf_event_max_sample_rate to %d\n",
273                 __report_avg, __report_allowed,
274                 sysctl_perf_event_sample_rate);
275 }
276 
277 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
278 
279 void perf_sample_event_took(u64 sample_len_ns)
280 {
281         u64 max_len = READ_ONCE(perf_sample_allowed_ns);
282         u64 running_len;
283         u64 avg_len;
284         u32 max;
285 
286         if (max_len == 0)
287                 return;
288 
289         /* Decay the counter by 1 average sample. */
290         running_len = __this_cpu_read(running_sample_length);
291         running_len -= running_len/NR_ACCUMULATED_SAMPLES;
292         running_len += sample_len_ns;
293         __this_cpu_write(running_sample_length, running_len);
294 
295         /*
296          * Note: this will be biased artificially low until we have
297          * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
298          * from having to maintain a count.
299          */
300         avg_len = running_len/NR_ACCUMULATED_SAMPLES;
301         if (avg_len <= max_len)
302                 return;
303 
304         __report_avg = avg_len;
305         __report_allowed = max_len;
306 
307         /*
308          * Compute a throttle threshold 25% below the current duration.
309          */
310         avg_len += avg_len / 4;
311         max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
312         if (avg_len < max)
313                 max /= (u32)avg_len;
314         else
315                 max = 1;
316 
317         WRITE_ONCE(perf_sample_allowed_ns, avg_len);
318         WRITE_ONCE(max_samples_per_tick, max);
319 
320         sysctl_perf_event_sample_rate = max * HZ;
321         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
322 
323         if (!irq_work_queue(&perf_duration_work)) {
324                 early_printk("perf: interrupt took too long (%lld > %lld), lowering "
325                              "kernel.perf_event_max_sample_rate to %d\n",
326                              __report_avg, __report_allowed,
327                              sysctl_perf_event_sample_rate);
328         }
329 }
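
/*
 * Editorial worked example (assuming HZ = 1000, so TICK_NSEC = 1000000 ns,
 * and sysctl_perf_cpu_time_max_percent = 25): suppose the decayed average
 * sample length comes out at avg_len = 4000 ns while the current
 * perf_sample_allowed_ns is 2500 ns.  Then:
 *
 *	avg_len += avg_len / 4;			// 4000 -> 5000 ns
 *	max = (1000000 / 100) * 25;		// 250000
 *	max /= 5000;				// 50 samples per tick
 *
 * so perf_sample_allowed_ns becomes 5000 ns, max_samples_per_tick becomes
 * 50, and kernel.perf_event_max_sample_rate drops to 50 * HZ = 50000.
 */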
330 
331 static atomic64_t perf_event_id;
332 
333 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
334                               enum event_type_t event_type);
335 
336 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
337                              enum event_type_t event_type,
338                              struct task_struct *task);
339 
340 static void update_context_time(struct perf_event_context *ctx);
341 static u64 perf_event_time(struct perf_event *event);
342 
343 void __weak perf_event_print_debug(void)        { }
344 
345 extern __weak const char *perf_pmu_name(void)
346 {
347         return "pmu";
348 }
349 
350 static inline u64 perf_clock(void)
351 {
352         return local_clock();
353 }
354 
355 static inline u64 perf_event_clock(struct perf_event *event)
356 {
357         return event->clock();
358 }
359 
360 static inline struct perf_cpu_context *
361 __get_cpu_context(struct perf_event_context *ctx)
362 {
363         return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
364 }
365 
366 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
367                           struct perf_event_context *ctx)
368 {
369         raw_spin_lock(&cpuctx->ctx.lock);
370         if (ctx)
371                 raw_spin_lock(&ctx->lock);
372 }
373 
374 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
375                             struct perf_event_context *ctx)
376 {
377         if (ctx)
378                 raw_spin_unlock(&ctx->lock);
379         raw_spin_unlock(&cpuctx->ctx.lock);
380 }
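
/*
 * Editorial note: these two helpers encode the lock ordering used throughout
 * this file -- the CPU context lock is taken first, the task context lock
 * nests inside it, and they are dropped in the opposite order.  A typical
 * caller (see perf_cgroup_switch() below) looks like:
 *
 *	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 *	perf_pmu_disable(cpuctx->ctx.pmu);
 *	... reschedule events ...
 *	perf_pmu_enable(cpuctx->ctx.pmu);
 *	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 */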
381 
382 #ifdef CONFIG_CGROUP_PERF
383 
384 static inline bool
385 perf_cgroup_match(struct perf_event *event)
386 {
387         struct perf_event_context *ctx = event->ctx;
388         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
389 
390         /* @event doesn't care about cgroup */
391         if (!event->cgrp)
392                 return true;
393 
394         /* wants specific cgroup scope but @cpuctx isn't associated with any */
395         if (!cpuctx->cgrp)
396                 return false;
397 
398         /*
399          * Cgroup scoping is recursive.  An event enabled for a cgroup is
400          * also enabled for all its descendant cgroups.  If @cpuctx's
401          * cgroup is a descendant of @event's (the test covers identity
402          * case), it's a match.
403          */
404         return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
405                                     event->cgrp->css.cgroup);
406 }
407 
408 static inline void perf_detach_cgroup(struct perf_event *event)
409 {
410         css_put(&event->cgrp->css);
411         event->cgrp = NULL;
412 }
413 
414 static inline int is_cgroup_event(struct perf_event *event)
415 {
416         return event->cgrp != NULL;
417 }
418 
419 static inline u64 perf_cgroup_event_time(struct perf_event *event)
420 {
421         struct perf_cgroup_info *t;
422 
423         t = per_cpu_ptr(event->cgrp->info, event->cpu);
424         return t->time;
425 }
426 
427 static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
428 {
429         struct perf_cgroup_info *info;
430         u64 now;
431 
432         now = perf_clock();
433 
434         info = this_cpu_ptr(cgrp->info);
435 
436         info->time += now - info->timestamp;
437         info->timestamp = now;
438 }
439 
440 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
441 {
442         struct perf_cgroup *cgrp_out = cpuctx->cgrp;
443         if (cgrp_out)
444                 __update_cgrp_time(cgrp_out);
445 }
446 
447 static inline void update_cgrp_time_from_event(struct perf_event *event)
448 {
449         struct perf_cgroup *cgrp;
450 
451         /*
452          * ensure we access cgroup data only when needed and
453          * when we know the cgroup is pinned (css_get)
454          */
455         if (!is_cgroup_event(event))
456                 return;
457 
458         cgrp = perf_cgroup_from_task(current);
459         /*
460          * Do not update time when cgroup is not active
461          */
462         if (cgrp == event->cgrp)
463                 __update_cgrp_time(event->cgrp);
464 }
465 
466 static inline void
467 perf_cgroup_set_timestamp(struct task_struct *task,
468                           struct perf_event_context *ctx)
469 {
470         struct perf_cgroup *cgrp;
471         struct perf_cgroup_info *info;
472 
473         /*
474          * ctx->lock held by caller
475          * ensure we do not access cgroup data
476          * unless we have the cgroup pinned (css_get)
477          */
478         if (!task || !ctx->nr_cgroups)
479                 return;
480 
481         cgrp = perf_cgroup_from_task(task);
482         info = this_cpu_ptr(cgrp->info);
483         info->timestamp = ctx->timestamp;
484 }
485 
486 #define PERF_CGROUP_SWOUT       0x1 /* cgroup switch out every event */
487 #define PERF_CGROUP_SWIN        0x2 /* cgroup switch in events based on task */
488 
489 /*
490  * reschedule events based on the cgroup constraint of task.
491  *
492  * mode SWOUT : schedule out everything
493  * mode SWIN : schedule in based on cgroup for next
494  */
495 void perf_cgroup_switch(struct task_struct *task, int mode)
496 {
497         struct perf_cpu_context *cpuctx;
498         struct pmu *pmu;
499         unsigned long flags;
500 
501         /*
502          * disable interrupts to avoid getting nr_cgroup
503          * changes via __perf_event_disable(). Also
504          * avoids preemption.
505          */
506         local_irq_save(flags);
507 
508         /*
509          * we reschedule only in the presence of cgroup
510          * constrained events.
511          */
512         rcu_read_lock();
513 
514         list_for_each_entry_rcu(pmu, &pmus, entry) {
515                 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
516                 if (cpuctx->unique_pmu != pmu)
517                         continue; /* ensure we process each cpuctx once */
518 
519                 /*
520                  * perf_cgroup_events says at least one
521                  * context on this CPU has cgroup events.
522                  *
523                  * ctx->nr_cgroups reports the number of cgroup
524                  * events for a context.
525                  */
526                 if (cpuctx->ctx.nr_cgroups > 0) {
527                         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
528                         perf_pmu_disable(cpuctx->ctx.pmu);
529 
530                         if (mode & PERF_CGROUP_SWOUT) {
531                                 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
532                                 /*
533                                  * must not be done before ctxswout due
534                                  * to event_filter_match() in event_sched_out()
535                                  */
536                                 cpuctx->cgrp = NULL;
537                         }
538 
539                         if (mode & PERF_CGROUP_SWIN) {
540                                 WARN_ON_ONCE(cpuctx->cgrp);
541                                 /*
542                                  * set cgrp before ctxsw in to allow
543                                  * event_filter_match() to not have to pass
544                                  * task around
545                                  */
546                                 cpuctx->cgrp = perf_cgroup_from_task(task);
547                                 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
548                         }
549                         perf_pmu_enable(cpuctx->ctx.pmu);
550                         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
551                 }
552         }
553 
554         rcu_read_unlock();
555 
556         local_irq_restore(flags);
557 }
558 
559 static inline void perf_cgroup_sched_out(struct task_struct *task,
560                                          struct task_struct *next)
561 {
562         struct perf_cgroup *cgrp1;
563         struct perf_cgroup *cgrp2 = NULL;
564 
565         /*
566          * we come here when we know perf_cgroup_events > 0
567          */
568         cgrp1 = perf_cgroup_from_task(task);
569 
570         /*
571          * next is NULL when called from perf_event_enable_on_exec()
572          * that will systematically cause a cgroup_switch()
573          */
574         if (next)
575                 cgrp2 = perf_cgroup_from_task(next);
576 
577         /*
578          * only schedule out current cgroup events if we know
579          * that we are switching to a different cgroup. Otherwise,
580          * do not touch the cgroup events.
581          */
582         if (cgrp1 != cgrp2)
583                 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
584 }
585 
586 static inline void perf_cgroup_sched_in(struct task_struct *prev,
587                                         struct task_struct *task)
588 {
589         struct perf_cgroup *cgrp1;
590         struct perf_cgroup *cgrp2 = NULL;
591 
592         /*
593          * we come here when we know perf_cgroup_events > 0
594          */
595         cgrp1 = perf_cgroup_from_task(task);
596 
597         /* prev can never be NULL */
598         cgrp2 = perf_cgroup_from_task(prev);
599 
600         /*
601          * only need to schedule in cgroup events if we are changing
602          * cgroup during ctxsw. Cgroup events were not scheduled
603          * out during ctxsw-out if that was not the case.
604          */
605         if (cgrp1 != cgrp2)
606                 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
607 }
608 
609 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
610                                       struct perf_event_attr *attr,
611                                       struct perf_event *group_leader)
612 {
613         struct perf_cgroup *cgrp;
614         struct cgroup_subsys_state *css;
615         struct fd f = fdget(fd);
616         int ret = 0;
617 
618         if (!f.file)
619                 return -EBADF;
620 
621         css = css_tryget_online_from_dir(f.file->f_path.dentry,
622                                          &perf_event_cgrp_subsys);
623         if (IS_ERR(css)) {
624                 ret = PTR_ERR(css);
625                 goto out;
626         }
627 
628         cgrp = container_of(css, struct perf_cgroup, css);
629         event->cgrp = cgrp;
630 
631         /*
632          * all events in a group must monitor
633          * the same cgroup because a task belongs
634          * to only one perf cgroup at a time
635          */
636         if (group_leader && group_leader->cgrp != cgrp) {
637                 perf_detach_cgroup(event);
638                 ret = -EINVAL;
639         }
640 out:
641         fdput(f);
642         return ret;
643 }
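
/*
 * Editorial sketch of the userspace side (not part of this file): the fd
 * consumed by perf_cgroup_connect() comes from a perf_event_open() call made
 * with PERF_FLAG_PID_CGROUP, where the "pid" argument is an open fd on a
 * perf_event cgroup directory.  The mount path and cgroup name below are
 * examples only:
 *
 *	int cgrp_fd = open("/sys/fs/cgroup/perf_event/mygroup", O_RDONLY);
 *	int ev_fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, cpu,
 *			    -1, PERF_FLAG_PID_CGROUP);
 */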
644 
645 static inline void
646 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
647 {
648         struct perf_cgroup_info *t;
649         t = per_cpu_ptr(event->cgrp->info, event->cpu);
650         event->shadow_ctx_time = now - t->timestamp;
651 }
652 
653 static inline void
654 perf_cgroup_defer_enabled(struct perf_event *event)
655 {
656         /*
657          * when the current task's perf cgroup does not match
658          * the event's, we need to remember to call the
659          * perf_cgroup_mark_enabled() function the first time a task with
660          * a matching perf cgroup is scheduled in.
661          */
662         if (is_cgroup_event(event) && !perf_cgroup_match(event))
663                 event->cgrp_defer_enabled = 1;
664 }
665 
666 static inline void
667 perf_cgroup_mark_enabled(struct perf_event *event,
668                          struct perf_event_context *ctx)
669 {
670         struct perf_event *sub;
671         u64 tstamp = perf_event_time(event);
672 
673         if (!event->cgrp_defer_enabled)
674                 return;
675 
676         event->cgrp_defer_enabled = 0;
677 
678         event->tstamp_enabled = tstamp - event->total_time_enabled;
679         list_for_each_entry(sub, &event->sibling_list, group_entry) {
680                 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
681                         sub->tstamp_enabled = tstamp - sub->total_time_enabled;
682                         sub->cgrp_defer_enabled = 0;
683                 }
684         }
685 }
686 #else /* !CONFIG_CGROUP_PERF */
687 
688 static inline bool
689 perf_cgroup_match(struct perf_event *event)
690 {
691         return true;
692 }
693 
694 static inline void perf_detach_cgroup(struct perf_event *event)
695 {}
696 
697 static inline int is_cgroup_event(struct perf_event *event)
698 {
699         return 0;
700 }
701 
702 static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
703 {
704         return 0;
705 }
706 
707 static inline void update_cgrp_time_from_event(struct perf_event *event)
708 {
709 }
710 
711 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
712 {
713 }
714 
715 static inline void perf_cgroup_sched_out(struct task_struct *task,
716                                          struct task_struct *next)
717 {
718 }
719 
720 static inline void perf_cgroup_sched_in(struct task_struct *prev,
721                                         struct task_struct *task)
722 {
723 }
724 
725 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
726                                       struct perf_event_attr *attr,
727                                       struct perf_event *group_leader)
728 {
729         return -EINVAL;
730 }
731 
732 static inline void
733 perf_cgroup_set_timestamp(struct task_struct *task,
734                           struct perf_event_context *ctx)
735 {
736 }
737 
738 void
739 perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
740 {
741 }
742 
743 static inline void
744 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
745 {
746 }
747 
748 static inline u64 perf_cgroup_event_time(struct perf_event *event)
749 {
750         return 0;
751 }
752 
753 static inline void
754 perf_cgroup_defer_enabled(struct perf_event *event)
755 {
756 }
757 
758 static inline void
759 perf_cgroup_mark_enabled(struct perf_event *event,
760                          struct perf_event_context *ctx)
761 {
762 }
763 #endif
764 
765 /*
766  * set default to be dependent on timer tick just
767  * like original code
768  */
769 #define PERF_CPU_HRTIMER (1000 / HZ)
770 /*
771  * function must be called with interrupts disabled
772  */
773 static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
774 {
775         struct perf_cpu_context *cpuctx;
776         enum hrtimer_restart ret = HRTIMER_NORESTART;
777         int rotations = 0;
778 
779         WARN_ON(!irqs_disabled());
780 
781         cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
782 
783         rotations = perf_rotate_context(cpuctx);
784 
785         /*
786          * arm timer if needed
787          */
788         if (rotations) {
789                 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
790                 ret = HRTIMER_RESTART;
791         }
792 
793         return ret;
794 }
795 
796 /* CPU is going down */
797 void perf_cpu_hrtimer_cancel(int cpu)
798 {
799         struct perf_cpu_context *cpuctx;
800         struct pmu *pmu;
801         unsigned long flags;
802 
803         if (WARN_ON(cpu != smp_processor_id()))
804                 return;
805 
806         local_irq_save(flags);
807 
808         rcu_read_lock();
809 
810         list_for_each_entry_rcu(pmu, &pmus, entry) {
811                 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
812 
813                 if (pmu->task_ctx_nr == perf_sw_context)
814                         continue;
815 
816                 hrtimer_cancel(&cpuctx->hrtimer);
817         }
818 
819         rcu_read_unlock();
820 
821         local_irq_restore(flags);
822 }
823 
824 static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
825 {
826         struct hrtimer *hr = &cpuctx->hrtimer;
827         struct pmu *pmu = cpuctx->ctx.pmu;
828         int timer;
829 
830         /* no multiplexing needed for SW PMU */
831         if (pmu->task_ctx_nr == perf_sw_context)
832                 return;
833 
834         /*
835          * check default is sane, if not set then force to
836          * default interval (1/tick)
837          */
838         timer = pmu->hrtimer_interval_ms;
839         if (timer < 1)
840                 timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
841 
842         cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
843 
844         hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
845         hr->function = perf_cpu_hrtimer_handler;
846 }
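
/*
 * Editorial worked example: PERF_CPU_HRTIMER is (1000 / HZ) ms, i.e. one
 * tick.  Assuming HZ = 250 and a PMU that does not set hrtimer_interval_ms,
 * timer = 1000 / 250 = 4, so
 *
 *	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * 4);	// 4 ms
 *
 * which is the multiplexing (rotation) period used by the handler above.
 */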
847 
848 static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
849 {
850         struct hrtimer *hr = &cpuctx->hrtimer;
851         struct pmu *pmu = cpuctx->ctx.pmu;
852 
853         /* not for SW PMU */
854         if (pmu->task_ctx_nr == perf_sw_context)
855                 return;
856 
857         if (hrtimer_active(hr))
858                 return;
859 
860         if (!hrtimer_callback_running(hr))
861                 __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
862                                          0, HRTIMER_MODE_REL_PINNED, 0);
863 }
864 
865 void perf_pmu_disable(struct pmu *pmu)
866 {
867         int *count = this_cpu_ptr(pmu->pmu_disable_count);
868         if (!(*count)++)
869                 pmu->pmu_disable(pmu);
870 }
871 
872 void perf_pmu_enable(struct pmu *pmu)
873 {
874         int *count = this_cpu_ptr(pmu->pmu_disable_count);
875         if (!--(*count))
876                 pmu->pmu_enable(pmu);
877 }
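
/*
 * Editorial usage note: perf_pmu_disable()/perf_pmu_enable() nest through the
 * per-cpu pmu_disable_count, so only the outermost pair touches the hardware:
 *
 *	perf_pmu_disable(pmu);	// count 0 -> 1: pmu->pmu_disable() called
 *	perf_pmu_disable(pmu);	// count 1 -> 2: no hardware access
 *	...
 *	perf_pmu_enable(pmu);	// count 2 -> 1: no hardware access
 *	perf_pmu_enable(pmu);	// count 1 -> 0: pmu->pmu_enable() called
 */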
878 
879 static DEFINE_PER_CPU(struct list_head, active_ctx_list);
880 
881 /*
882  * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
883  * perf_event_task_tick() are fully serialized because they're strictly cpu
884  * affine and perf_event_ctx{activate,deactivate} are called with IRQs
885  * disabled, while perf_event_task_tick is called from IRQ context.
886  */
887 static void perf_event_ctx_activate(struct perf_event_context *ctx)
888 {
889         struct list_head *head = this_cpu_ptr(&active_ctx_list);
890 
891         WARN_ON(!irqs_disabled());
892 
893         WARN_ON(!list_empty(&ctx->active_ctx_list));
894 
895         list_add(&ctx->active_ctx_list, head);
896 }
897 
898 static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
899 {
900         WARN_ON(!irqs_disabled());
901 
902         WARN_ON(list_empty(&ctx->active_ctx_list));
903 
904         list_del_init(&ctx->active_ctx_list);
905 }
906 
907 static void get_ctx(struct perf_event_context *ctx)
908 {
909         WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
910 }
911 
912 static void free_ctx(struct rcu_head *head)
913 {
914         struct perf_event_context *ctx;
915 
916         ctx = container_of(head, struct perf_event_context, rcu_head);
917         kfree(ctx->task_ctx_data);
918         kfree(ctx);
919 }
920 
921 static void put_ctx(struct perf_event_context *ctx)
922 {
923         if (atomic_dec_and_test(&ctx->refcount)) {
924                 if (ctx->parent_ctx)
925                         put_ctx(ctx->parent_ctx);
926                 if (ctx->task)
927                         put_task_struct(ctx->task);
928                 call_rcu(&ctx->rcu_head, free_ctx);
929         }
930 }
931 
932 /*
933  * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
934  * perf_pmu_migrate_context() we need some magic.
935  *
936  * Those places that change perf_event::ctx will hold both
937  * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
938  *
939  * Lock ordering is by mutex address. There are two other sites where
940  * perf_event_context::mutex nests and those are:
941  *
942  *  - perf_event_exit_task_context()    [ child , 0 ]
943  *      __perf_event_exit_task()
944  *        sync_child_event()
945  *          put_event()                 [ parent, 1 ]
946  *
947  *  - perf_event_init_context()         [ parent, 0 ]
948  *      inherit_task_group()
949  *        inherit_group()
950  *          inherit_event()
951  *            perf_event_alloc()
952  *              perf_init_event()
953  *                perf_try_init_event() [ child , 1 ]
954  *
955  * While it appears there is an obvious deadlock here -- the parent and child
956  * nesting levels are inverted between the two -- this is in fact safe because
957  * life-time rules separate them. That is, an exiting task cannot fork, and a
958  * spawning task cannot (yet) exit.
959  *
960  * But remember that these are parent<->child context relations, and
961  * migration does not affect children, therefore these two orderings should not
962  * interact.
963  *
964  * The change in perf_event::ctx does not affect children (as claimed above)
965  * because the sys_perf_event_open() case will install a new event and break
966  * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
967  * concerned with cpuctx and that doesn't have children.
968  *
969  * The places that change perf_event::ctx will issue:
970  *
971  *   perf_remove_from_context();
972  *   synchronize_rcu();
973  *   perf_install_in_context();
974  *
975  * to effect the change. The remove_from_context() + synchronize_rcu() should
976  * quiesce the event, after which we can install it in the new location. This
977  * means that only external vectors (perf_fops, prctl) can perturb the event
978  * while in transit. Therefore all such accessors should also acquire
979  * perf_event_context::mutex to serialize against this.
980  *
981  * However, because event->ctx can change while we're waiting to acquire
982  * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
983  * function.
984  *
985  * Lock order:
986  *      task_struct::perf_event_mutex
987  *        perf_event_context::mutex
988  *          perf_event_context::lock
989  *          perf_event::child_mutex;
990  *          perf_event::mmap_mutex
991  *          mmap_sem
992  */
993 static struct perf_event_context *
994 perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
995 {
996         struct perf_event_context *ctx;
997 
998 again:
999         rcu_read_lock();
1000         ctx = ACCESS_ONCE(event->ctx);
1001         if (!atomic_inc_not_zero(&ctx->refcount)) {
1002                 rcu_read_unlock();
1003                 goto again;
1004         }
1005         rcu_read_unlock();
1006 
1007         mutex_lock_nested(&ctx->mutex, nesting);
1008         if (event->ctx != ctx) {
1009                 mutex_unlock(&ctx->mutex);
1010                 put_ctx(ctx);
1011                 goto again;
1012         }
1013 
1014         return ctx;
1015 }
1016 
1017 static inline struct perf_event_context *
1018 perf_event_ctx_lock(struct perf_event *event)
1019 {
1020         return perf_event_ctx_lock_nested(event, 0);
1021 }
1022 
1023 static void perf_event_ctx_unlock(struct perf_event *event,
1024                                   struct perf_event_context *ctx)
1025 {
1026         mutex_unlock(&ctx->mutex);
1027         put_ctx(ctx);
1028 }
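
/*
 * Editorial usage sketch: callers that must operate on an event while its
 * context may be changing pin the context through these helpers rather than
 * dereferencing event->ctx directly:
 *
 *	ctx = perf_event_ctx_lock(event);
 *	... event->ctx is now stable and ctx->mutex is held ...
 *	perf_event_ctx_unlock(event, ctx);
 */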
1029 
1030 /*
1031  * This must be done under the ctx->lock, so as to serialize against
1032  * context_equiv(); therefore we cannot call put_ctx(), since that might end up
1033  * taking scheduler-related locks, and ctx->lock nests inside those.
1034  */
1035 static __must_check struct perf_event_context *
1036 unclone_ctx(struct perf_event_context *ctx)
1037 {
1038         struct perf_event_context *parent_ctx = ctx->parent_ctx;
1039 
1040         lockdep_assert_held(&ctx->lock);
1041 
1042         if (parent_ctx)
1043                 ctx->parent_ctx = NULL;
1044         ctx->generation++;
1045 
1046         return parent_ctx;
1047 }
1048 
1049 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1050 {
1051         /*
1052          * only top level events have the pid namespace they were created in
1053          */
1054         if (event->parent)
1055                 event = event->parent;
1056 
1057         return task_tgid_nr_ns(p, event->ns);
1058 }
1059 
1060 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1061 {
1062         /*
1063          * only top level events have the pid namespace they were created in
1064          */
1065         if (event->parent)
1066                 event = event->parent;
1067 
1068         return task_pid_nr_ns(p, event->ns);
1069 }
1070 
1071 /*
1072  * If we inherit events we want to return the parent event id
1073  * to userspace.
1074  */
1075 static u64 primary_event_id(struct perf_event *event)
1076 {
1077         u64 id = event->id;
1078 
1079         if (event->parent)
1080                 id = event->parent->id;
1081 
1082         return id;
1083 }
1084 
1085 /*
1086  * Get the perf_event_context for a task and lock it.
1087  * This has to cope with the fact that until it is locked,
1088  * the context could get moved to another task.
1089  */
1090 static struct perf_event_context *
1091 perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1092 {
1093         struct perf_event_context *ctx;
1094 
1095 retry:
1096         /*
1097          * One of the few rules of preemptible RCU is that one cannot do
1098          * rcu_read_unlock() while holding a scheduler (or nested) lock when
1099          * part of the read side critical section was preemptible -- see
1100          * rcu_read_unlock_special().
1101          *
1102          * Since ctx->lock nests under rq->lock we must ensure the entire read
1103          * side critical section is non-preemptible.
1104          */
1105         preempt_disable();
1106         rcu_read_lock();
1107         ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1108         if (ctx) {
1109                 /*
1110                  * If this context is a clone of another, it might
1111                  * get swapped for another underneath us by
1112                  * perf_event_task_sched_out, though the
1113                  * rcu_read_lock() protects us from any context
1114                  * getting freed.  Lock the context and check if it
1115                  * got swapped before we could get the lock, and retry
1116                  * if so.  If we locked the right context, then it
1117                  * can't get swapped on us any more.
1118                  */
1119                 raw_spin_lock_irqsave(&ctx->lock, *flags);
1120                 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1121                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
1122                         rcu_read_unlock();
1123                         preempt_enable();
1124                         goto retry;
1125                 }
1126 
1127                 if (!atomic_inc_not_zero(&ctx->refcount)) {
1128                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
1129                         ctx = NULL;
1130                 }
1131         }
1132         rcu_read_unlock();
1133         preempt_enable();
1134         return ctx;
1135 }
1136 
1137 /*
1138  * Get the context for a task and increment its pin_count so it
1139  * can't get swapped to another task.  This also increments its
1140  * reference count so that the context can't get freed.
1141  */
1142 static struct perf_event_context *
1143 perf_pin_task_context(struct task_struct *task, int ctxn)
1144 {
1145         struct perf_event_context *ctx;
1146         unsigned long flags;
1147 
1148         ctx = perf_lock_task_context(task, ctxn, &flags);
1149         if (ctx) {
1150                 ++ctx->pin_count;
1151                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1152         }
1153         return ctx;
1154 }
1155 
1156 static void perf_unpin_context(struct perf_event_context *ctx)
1157 {
1158         unsigned long flags;
1159 
1160         raw_spin_lock_irqsave(&ctx->lock, flags);
1161         --ctx->pin_count;
1162         raw_spin_unlock_irqrestore(&ctx->lock, flags);
1163 }
1164 
1165 /*
1166  * Update the record of the current time in a context.
1167  */
1168 static void update_context_time(struct perf_event_context *ctx)
1169 {
1170         u64 now = perf_clock();
1171 
1172         ctx->time += now - ctx->timestamp;
1173         ctx->timestamp = now;
1174 }
1175 
1176 static u64 perf_event_time(struct perf_event *event)
1177 {
1178         struct perf_event_context *ctx = event->ctx;
1179 
1180         if (is_cgroup_event(event))
1181                 return perf_cgroup_event_time(event);
1182 
1183         return ctx ? ctx->time : 0;
1184 }
1185 
1186 /*
1187  * Update the total_time_enabled and total_time_running fields for an event.
1188  * The caller of this function needs to hold the ctx->lock.
1189  */
1190 static void update_event_times(struct perf_event *event)
1191 {
1192         struct perf_event_context *ctx = event->ctx;
1193         u64 run_end;
1194 
1195         if (event->state < PERF_EVENT_STATE_INACTIVE ||
1196             event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
1197                 return;
1198         /*
1199          * in cgroup mode, time_enabled represents
1200          * the time the event was enabled AND active
1201          * tasks were in the monitored cgroup. This is
1202          * independent of the activity of the context as
1203          * there may be a mix of cgroup and non-cgroup events.
1204          *
1205          * That is why we treat cgroup events differently
1206          * here.
1207          */
1208         if (is_cgroup_event(event))
1209                 run_end = perf_cgroup_event_time(event);
1210         else if (ctx->is_active)
1211                 run_end = ctx->time;
1212         else
1213                 run_end = event->tstamp_stopped;
1214 
1215         event->total_time_enabled = run_end - event->tstamp_enabled;
1216 
1217         if (event->state == PERF_EVENT_STATE_INACTIVE)
1218                 run_end = event->tstamp_stopped;
1219         else
1220                 run_end = perf_event_time(event);
1221 
1222         event->total_time_running = run_end - event->tstamp_running;
1223 
1224 }
1225 
1226 /*
1227  * Update total_time_enabled and total_time_running for all events in a group.
1228  */
1229 static void update_group_times(struct perf_event *leader)
1230 {
1231         struct perf_event *event;
1232 
1233         update_event_times(leader);
1234         list_for_each_entry(event, &leader->sibling_list, group_entry)
1235                 update_event_times(event);
1236 }
1237 
1238 static struct list_head *
1239 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
1240 {
1241         if (event->attr.pinned)
1242                 return &ctx->pinned_groups;
1243         else
1244                 return &ctx->flexible_groups;
1245 }
1246 
1247 /*
1248  * Add an event to the lists for its context.
1249  * Must be called with ctx->mutex and ctx->lock held.
1250  */
1251 static void
1252 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1253 {
1254         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1255         event->attach_state |= PERF_ATTACH_CONTEXT;
1256 
1257         /*
1258          * If we're a standalone event or group leader, we go to the context
1259          * list; group events are kept attached to the group so that
1260          * perf_group_detach() can, at all times, locate all siblings.
1261          */
1262         if (event->group_leader == event) {
1263                 struct list_head *list;
1264 
1265                 if (is_software_event(event))
1266                         event->group_flags |= PERF_GROUP_SOFTWARE;
1267 
1268                 list = ctx_group_list(event, ctx);
1269                 list_add_tail(&event->group_entry, list);
1270         }
1271 
1272         if (is_cgroup_event(event))
1273                 ctx->nr_cgroups++;
1274 
1275         list_add_rcu(&event->event_entry, &ctx->event_list);
1276         ctx->nr_events++;
1277         if (event->attr.inherit_stat)
1278                 ctx->nr_stat++;
1279 
1280         ctx->generation++;
1281 }
1282 
1283 /*
1284  * Initialize event state based on the perf_event_attr::disabled.
1285  */
1286 static inline void perf_event__state_init(struct perf_event *event)
1287 {
1288         event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1289                                               PERF_EVENT_STATE_INACTIVE;
1290 }
1291 
1292 /*
1293  * Called at perf_event creation and when events are attached/detached from a
1294  * group.
1295  */
1296 static void perf_event__read_size(struct perf_event *event)
1297 {
1298         int entry = sizeof(u64); /* value */
1299         int size = 0;
1300         int nr = 1;
1301 
1302         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1303                 size += sizeof(u64);
1304 
1305         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1306                 size += sizeof(u64);
1307 
1308         if (event->attr.read_format & PERF_FORMAT_ID)
1309                 entry += sizeof(u64);
1310 
1311         if (event->attr.read_format & PERF_FORMAT_GROUP) {
1312                 nr += event->group_leader->nr_siblings;
1313                 size += sizeof(u64);
1314         }
1315 
1316         size += entry * nr;
1317         event->read_size = size;
1318 }
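
/*
 * Editorial worked example: for an event with
 * read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_ID and no
 * group siblings, the computation above yields entry = 16 (value + id),
 * size = 8 (time_enabled), nr = 1, so
 *
 *	event->read_size = 8 + 16 * 1 = 24 bytes
 *
 * which is the number of bytes a read() of that event returns.
 */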
1319 
1320 static void perf_event__header_size(struct perf_event *event)
1321 {
1322         struct perf_sample_data *data;
1323         u64 sample_type = event->attr.sample_type;
1324         u16 size = 0;
1325 
1326         perf_event__read_size(event);
1327 
1328         if (sample_type & PERF_SAMPLE_IP)
1329                 size += sizeof(data->ip);
1330 
1331         if (sample_type & PERF_SAMPLE_ADDR)
1332                 size += sizeof(data->addr);
1333 
1334         if (sample_type & PERF_SAMPLE_PERIOD)
1335                 size += sizeof(data->period);
1336 
1337         if (sample_type & PERF_SAMPLE_WEIGHT)
1338                 size += sizeof(data->weight);
1339 
1340         if (sample_type & PERF_SAMPLE_READ)
1341                 size += event->read_size;
1342 
1343         if (sample_type & PERF_SAMPLE_DATA_SRC)
1344                 size += sizeof(data->data_src.val);
1345 
1346         if (sample_type & PERF_SAMPLE_TRANSACTION)
1347                 size += sizeof(data->txn);
1348 
1349         event->header_size = size;
1350 }
1351 
1352 static void perf_event__id_header_size(struct perf_event *event)
1353 {
1354         struct perf_sample_data *data;
1355         u64 sample_type = event->attr.sample_type;
1356         u16 size = 0;
1357 
1358         if (sample_type & PERF_SAMPLE_TID)
1359                 size += sizeof(data->tid_entry);
1360 
1361         if (sample_type & PERF_SAMPLE_TIME)
1362                 size += sizeof(data->time);
1363 
1364         if (sample_type & PERF_SAMPLE_IDENTIFIER)
1365                 size += sizeof(data->id);
1366 
1367         if (sample_type & PERF_SAMPLE_ID)
1368                 size += sizeof(data->id);
1369 
1370         if (sample_type & PERF_SAMPLE_STREAM_ID)
1371                 size += sizeof(data->stream_id);
1372 
1373         if (sample_type & PERF_SAMPLE_CPU)
1374                 size += sizeof(data->cpu_entry);
1375 
1376         event->id_header_size = size;
1377 }
1378 
1379 static void perf_group_attach(struct perf_event *event)
1380 {
1381         struct perf_event *group_leader = event->group_leader, *pos;
1382 
1383         /*
1384          * We can have double attach due to group movement in perf_event_open.
1385          */
1386         if (event->attach_state & PERF_ATTACH_GROUP)
1387                 return;
1388 
1389         event->attach_state |= PERF_ATTACH_GROUP;
1390 
1391         if (group_leader == event)
1392                 return;
1393 
1394         WARN_ON_ONCE(group_leader->ctx != event->ctx);
1395 
1396         if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
1397                         !is_software_event(event))
1398                 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
1399 
1400         list_add_tail(&event->group_entry, &group_leader->sibling_list);
1401         group_leader->nr_siblings++;
1402 
1403         perf_event__header_size(group_leader);
1404 
1405         list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
1406                 perf_event__header_size(pos);
1407 }
1408 
1409 /*
1410  * Remove an event from the lists for its context.
1411  * Must be called with ctx->mutex and ctx->lock held.
1412  */
1413 static void
1414 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1415 {
1416         struct perf_cpu_context *cpuctx;
1417 
1418         WARN_ON_ONCE(event->ctx != ctx);
1419         lockdep_assert_held(&ctx->lock);
1420 
1421         /*
1422          * We can have double detach due to exit/hot-unplug + close.
1423          */
1424         if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1425                 return;
1426 
1427         event->attach_state &= ~PERF_ATTACH_CONTEXT;
1428 
1429         if (is_cgroup_event(event)) {
1430                 ctx->nr_cgroups--;
1431                 cpuctx = __get_cpu_context(ctx);
1432                 /*
1433                  * if there are no more cgroup events
1434                  * then clear cgrp to avoid a stale pointer
1435                  * in update_cgrp_time_from_cpuctx()
1436                  */
1437                 if (!ctx->nr_cgroups)
1438                         cpuctx->cgrp = NULL;
1439         }
1440 
1441         ctx->nr_events--;
1442         if (event->attr.inherit_stat)
1443                 ctx->nr_stat--;
1444 
1445         list_del_rcu(&event->event_entry);
1446 
1447         if (event->group_leader == event)
1448                 list_del_init(&event->group_entry);
1449 
1450         update_group_times(event);
1451 
1452         /*
1453          * If the event was in an error state, then keep it
1454          * that way; otherwise bogus counts will be
1455          * returned on read(). The only way to get out
1456          * of the error state is by explicitly re-enabling
1457          * the event.
1458          */
1459         if (event->state > PERF_EVENT_STATE_OFF)
1460                 event->state = PERF_EVENT_STATE_OFF;
1461 
1462         ctx->generation++;
1463 }
1464 
1465 static void perf_group_detach(struct perf_event *event)
1466 {
1467         struct perf_event *sibling, *tmp;
1468         struct list_head *list = NULL;
1469 
1470         /*
1471          * We can have double detach due to exit/hot-unplug + close.
1472          */
1473         if (!(event->attach_state & PERF_ATTACH_GROUP))
1474                 return;
1475 
1476         event->attach_state &= ~PERF_ATTACH_GROUP;
1477 
1478         /*
1479          * If this is a sibling, remove it from its group.
1480          */
1481         if (event->group_leader != event) {
1482                 list_del_init(&event->group_entry);
1483                 event->group_leader->nr_siblings--;
1484                 goto out;
1485         }
1486 
1487         if (!list_empty(&event->group_entry))
1488                 list = &event->group_entry;
1489 
1490         /*
1491          * If this was a group event with sibling events then
1492          * upgrade the siblings to singleton events by adding them
1493          * to whatever list we are on.
1494          */
1495         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
1496                 if (list)
1497                         list_move_tail(&sibling->group_entry, list);
1498                 sibling->group_leader = sibling;
1499 
1500                 /* Inherit group flags from the previous leader */
1501                 sibling->group_flags = event->group_flags;
1502 
1503                 WARN_ON_ONCE(sibling->ctx != event->ctx);
1504         }
1505 
1506 out:
1507         perf_event__header_size(event->group_leader);
1508 
1509         list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1510                 perf_event__header_size(tmp);
1511 }
1512 
1513 /*
1514  * User event without the task.
1515  */
1516 static bool is_orphaned_event(struct perf_event *event)
1517 {
1518         return event && !is_kernel_event(event) && !event->owner;
1519 }
1520 
1521 /*
1522  * Event has a parent but parent's task finished and it's
1523  * alive only because of children holding a reference.
1524  */
1525 static bool is_orphaned_child(struct perf_event *event)
1526 {
1527         return is_orphaned_event(event->parent);
1528 }
1529 
1530 static void orphans_remove_work(struct work_struct *work);
1531 
1532 static void schedule_orphans_remove(struct perf_event_context *ctx)
1533 {
1534         if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
1535                 return;
1536 
1537         if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
1538                 get_ctx(ctx);
1539                 ctx->orphans_remove_sched = true;
1540         }
1541 }
1542 
1543 static int __init perf_workqueue_init(void)
1544 {
1545         perf_wq = create_singlethread_workqueue("perf");
1546         WARN(!perf_wq, "failed to create perf workqueue\n");
1547         return perf_wq ? 0 : -1;
1548 }
1549 
1550 core_initcall(perf_workqueue_init);
1551 
1552 static inline int
1553 event_filter_match(struct perf_event *event)
1554 {
1555         return (event->cpu == -1 || event->cpu == smp_processor_id())
1556             && perf_cgroup_match(event);
1557 }
1558 
1559 static void
1560 event_sched_out(struct perf_event *event,
1561                   struct perf_cpu_context *cpuctx,
1562                   struct perf_event_context *ctx)
1563 {
1564         u64 tstamp = perf_event_time(event);
1565         u64 delta;
1566 
1567         WARN_ON_ONCE(event->ctx != ctx);
1568         lockdep_assert_held(&ctx->lock);
1569 
1570         /*
1571          * An event which could not be activated because of
1572          * filter mismatch still needs to have its timings
1573          * maintained, otherwise bogus information is returned
1574          * via read() for time_enabled, time_running:
1575          */
1576         if (event->state == PERF_EVENT_STATE_INACTIVE
1577             && !event_filter_match(event)) {
1578                 delta = tstamp - event->tstamp_stopped;
1579                 event->tstamp_running += delta;
1580                 event->tstamp_stopped = tstamp;
1581         }
1582 
1583         if (event->state != PERF_EVENT_STATE_ACTIVE)
1584                 return;
1585 
1586         perf_pmu_disable(event->pmu);
1587 
1588         event->tstamp_stopped = tstamp;
1589         event->pmu->del(event, 0);
1590         event->oncpu = -1;
1591         event->state = PERF_EVENT_STATE_INACTIVE;
1592         if (event->pending_disable) {
1593                 event->pending_disable = 0;
1594                 event->state = PERF_EVENT_STATE_OFF;
1595         }
1596 
1597         if (!is_software_event(event))
1598                 cpuctx->active_oncpu--;
1599         if (!--ctx->nr_active)
1600                 perf_event_ctx_deactivate(ctx);
1601         if (event->attr.freq && event->attr.sample_freq)
1602                 ctx->nr_freq--;
1603         if (event->attr.exclusive || !cpuctx->active_oncpu)
1604                 cpuctx->exclusive = 0;
1605 
1606         if (is_orphaned_child(event))
1607                 schedule_orphans_remove(ctx);
1608 
1609         perf_pmu_enable(event->pmu);
1610 }
1611 
1612 static void
1613 group_sched_out(struct perf_event *group_event,
1614                 struct perf_cpu_context *cpuctx,
1615                 struct perf_event_context *ctx)
1616 {
1617         struct perf_event *event;
1618         int state = group_event->state;
1619 
1620         event_sched_out(group_event, cpuctx, ctx);
1621 
1622         /*
1623          * Schedule out siblings (if any):
1624          */
1625         list_for_each_entry(event, &group_event->sibling_list, group_entry)
1626                 event_sched_out(event, cpuctx, ctx);
1627 
1628         if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
1629                 cpuctx->exclusive = 0;
1630 }
1631 
1632 struct remove_event {
1633         struct perf_event *event;
1634         bool detach_group;
1635 };
1636 
1637 /*
1638  * Cross CPU call to remove a performance event
1639  *
1640  * We disable the event on the hardware level first. After that we
1641  * remove it from the context list.
1642  */
1643 static int __perf_remove_from_context(void *info)
1644 {
1645         struct remove_event *re = info;
1646         struct perf_event *event = re->event;
1647         struct perf_event_context *ctx = event->ctx;
1648         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1649 
1650         raw_spin_lock(&ctx->lock);
1651         event_sched_out(event, cpuctx, ctx);
1652         if (re->detach_group)
1653                 perf_group_detach(event);
1654         list_del_event(event, ctx);
1655         if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
1656                 ctx->is_active = 0;
1657                 cpuctx->task_ctx = NULL;
1658         }
1659         raw_spin_unlock(&ctx->lock);
1660 
1661         return 0;
1662 }
1663 
1664 
1665 /*
1666  * Remove the event from a task's (or a CPU's) list of events.
1667  *
1668  * CPU events are removed with a smp call. For task events we only
1669  * call when the task is on a CPU.
1670  *
1671  * If event->ctx is a cloned context, callers must make sure that
1672  * every task struct that event->ctx->task could possibly point to
1673  * remains valid.  This is OK when called from perf_release since
1674  * that only calls us on the top-level context, which can't be a clone.
1675  * When called from perf_event_exit_task, it's OK because the
1676  * context has been detached from its task.
1677  */
1678 static void perf_remove_from_context(struct perf_event *event, bool detach_group)
1679 {
1680         struct perf_event_context *ctx = event->ctx;
1681         struct task_struct *task = ctx->task;
1682         struct remove_event re = {
1683                 .event = event,
1684                 .detach_group = detach_group,
1685         };
1686 
1687         lockdep_assert_held(&ctx->mutex);
1688 
1689         if (!task) {
1690                 /*
1691                  * Per cpu events are removed via an smp call. The removal can
1692                  * fail if the CPU is currently offline, but in that case we
1693                  * already called __perf_remove_from_context from
1694                  * perf_event_exit_cpu.
1695                  */
1696                 cpu_function_call(event->cpu, __perf_remove_from_context, &re);
1697                 return;
1698         }
1699 
1700 retry:
1701         if (!task_function_call(task, __perf_remove_from_context, &re))
1702                 return;
1703 
1704         raw_spin_lock_irq(&ctx->lock);
1705         /*
1706          * If we failed to find a running task, but find the context active now
1707          * that we've acquired the ctx->lock, retry.
1708          */
1709         if (ctx->is_active) {
1710                 raw_spin_unlock_irq(&ctx->lock);
1711                 /*
1712                  * Reload the task pointer, it might have been changed by
1713                  * a concurrent perf_event_context_sched_out().
1714                  */
1715                 task = ctx->task;
1716                 goto retry;
1717         }
1718 
1719         /*
1720          * Since the task isn't running, it's safe to remove the event; our
1721          * holding the ctx->lock ensures the task won't get scheduled in.
1722          */
1723         if (detach_group)
1724                 perf_group_detach(event);
1725         list_del_event(event, ctx);
1726         raw_spin_unlock_irq(&ctx->lock);
1727 }
1728 
1729 /*
1730  * Cross CPU call to disable a performance event
1731  */
1732 int __perf_event_disable(void *info)
1733 {
1734         struct perf_event *event = info;
1735         struct perf_event_context *ctx = event->ctx;
1736         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1737 
1738         /*
1739          * If this is a per-task event, we need to check whether this
1740          * event's task is the current task on this cpu.
1741          *
1742          * Can trigger due to concurrent perf_event_context_sched_out()
1743          * flipping contexts around.
1744          */
1745         if (ctx->task && cpuctx->task_ctx != ctx)
1746                 return -EINVAL;
1747 
1748         raw_spin_lock(&ctx->lock);
1749 
1750         /*
1751          * If the event is on, turn it off.
1752          * If it is in error state, leave it in error state.
1753          */
1754         if (event->state >= PERF_EVENT_STATE_INACTIVE) {
1755                 update_context_time(ctx);
1756                 update_cgrp_time_from_event(event);
1757                 update_group_times(event);
1758                 if (event == event->group_leader)
1759                         group_sched_out(event, cpuctx, ctx);
1760                 else
1761                         event_sched_out(event, cpuctx, ctx);
1762                 event->state = PERF_EVENT_STATE_OFF;
1763         }
1764 
1765         raw_spin_unlock(&ctx->lock);
1766 
1767         return 0;
1768 }
1769 
1770 /*
1771  * Disable an event.
1772  *
1773  * If event->ctx is a cloned context, callers must make sure that
1774  * every task struct that event->ctx->task could possibly point to
1775  * remains valid.  This condition is satisfied when called through
1776  * perf_event_for_each_child or perf_event_for_each because they
1777  * hold the top-level event's child_mutex, so any descendant that
1778  * goes to exit will block in sync_child_event.
1779  * When called from perf_pending_event it's OK because event->ctx
1780  * is the current context on this CPU and preemption is disabled,
1781  * hence we can't get into perf_event_task_sched_out for this context.
1782  */
1783 static void _perf_event_disable(struct perf_event *event)
1784 {
1785         struct perf_event_context *ctx = event->ctx;
1786         struct task_struct *task = ctx->task;
1787 
1788         if (!task) {
1789                 /*
1790                  * Disable the event on the cpu that it's on
1791                  */
1792                 cpu_function_call(event->cpu, __perf_event_disable, event);
1793                 return;
1794         }
1795 
1796 retry:
1797         if (!task_function_call(task, __perf_event_disable, event))
1798                 return;
1799 
1800         raw_spin_lock_irq(&ctx->lock);
1801         /*
1802          * If the event is still active, we need to retry the cross-call.
1803          */
1804         if (event->state == PERF_EVENT_STATE_ACTIVE) {
1805                 raw_spin_unlock_irq(&ctx->lock);
1806                 /*
1807                  * Reload the task pointer, it might have been changed by
1808                  * a concurrent perf_event_context_sched_out().
1809                  */
1810                 task = ctx->task;
1811                 goto retry;
1812         }
1813 
1814         /*
1815          * Since we have the lock this context can't be scheduled
1816          * in, so we can change the state safely.
1817          */
1818         if (event->state == PERF_EVENT_STATE_INACTIVE) {
1819                 update_group_times(event);
1820                 event->state = PERF_EVENT_STATE_OFF;
1821         }
1822         raw_spin_unlock_irq(&ctx->lock);
1823 }
1824 
1825 /*
1826  * Strictly speaking kernel users cannot create groups and therefore this
1827  * interface does not need the perf_event_ctx_lock() magic.
1828  */
1829 void perf_event_disable(struct perf_event *event)
1830 {
1831         struct perf_event_context *ctx;
1832 
1833         ctx = perf_event_ctx_lock(event);
1834         _perf_event_disable(event);
1835         perf_event_ctx_unlock(event, ctx);
1836 }
1837 EXPORT_SYMBOL_GPL(perf_event_disable);
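
/*
 * For reference, the same enable/disable paths are reachable from user space
 * through the PERF_EVENT_IOC_ENABLE/PERF_EVENT_IOC_DISABLE ioctls on a perf
 * event fd.  A minimal user-space sketch of that lifecycle (illustrative
 * only; error handling trimmed, the helper name is made up, and
 * <linux/perf_event.h>, <sys/syscall.h>, <sys/ioctl.h>, <unistd.h>,
 * <string.h> and <stdint.h> are assumed to be included):
 *
 *	static uint64_t count_instructions(void (*workload)(void))
 *	{
 *		struct perf_event_attr attr;
 *		uint64_t count = 0;
 *		int fd;
 *
 *		memset(&attr, 0, sizeof(attr));
 *		attr.type = PERF_TYPE_HARDWARE;
 *		attr.size = sizeof(attr);
 *		attr.config = PERF_COUNT_HW_INSTRUCTIONS;
 *		attr.disabled = 1;	// created off, enabled explicitly below
 *
 *		// measure the calling thread, on any CPU
 *		fd = (int)syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 *		if (fd < 0)
 *			return 0;
 *
 *		ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);	// enable path above
 *		workload();
 *		ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);	// disable path above
 *		read(fd, &count, sizeof(count));
 *		close(fd);
 *		return count;
 *	}
 */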
1838 
1839 static void perf_set_shadow_time(struct perf_event *event,
1840                                  struct perf_event_context *ctx,
1841                                  u64 tstamp)
1842 {
1843         /*
1844          * use the correct time source for the time snapshot
1845          *
1846          * We could get by without this by leveraging the
1847          * fact that to get to this function, the caller
1848          * has most likely already called update_context_time()
1849          * and update_cgrp_time_xx() and thus both timestamps
1850          * are identical (or very close). Given that tstamp is
1851          * already adjusted for cgroup, we could say that:
1852          *    tstamp - ctx->timestamp
1853          * is equivalent to
1854          *    tstamp - cgrp->timestamp.
1855          *
1856          * Then, in perf_output_read(), the calculation would
1857          * work with no changes because:
1858          * - event is guaranteed scheduled in
1859          * - no scheduled out in between
1860          * - thus the timestamp would be the same
1861          *
1862          * But this is a bit hairy.
1863          *
1864          * So instead, we have an explicit cgroup call to remain
1865          * within the time source all along. We believe it
1866          * is cleaner and simpler to understand.
1867          */
1868         if (is_cgroup_event(event))
1869                 perf_cgroup_set_shadow_time(event, tstamp);
1870         else
1871                 event->shadow_ctx_time = tstamp - ctx->timestamp;
1872 }
1873 
1874 #define MAX_INTERRUPTS (~0ULL)
1875 
1876 static void perf_log_throttle(struct perf_event *event, int enable);
1877 static void perf_log_itrace_start(struct perf_event *event);
1878 
1879 static int
1880 event_sched_in(struct perf_event *event,
1881                  struct perf_cpu_context *cpuctx,
1882                  struct perf_event_context *ctx)
1883 {
1884         u64 tstamp = perf_event_time(event);
1885         int ret = 0;
1886 
1887         lockdep_assert_held(&ctx->lock);
1888 
1889         if (event->state <= PERF_EVENT_STATE_OFF)
1890                 return 0;
1891 
1892         event->state = PERF_EVENT_STATE_ACTIVE;
1893         event->oncpu = smp_processor_id();
1894 
1895         /*
1896          * Unthrottle events: since we just got scheduled in we might have
1897          * missed several ticks already, and for a heavily scheduling task
1898          * there is little guarantee it'll get a tick in a timely manner.
1899          */
1900         if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
1901                 perf_log_throttle(event, 1);
1902                 event->hw.interrupts = 0;
1903         }
1904 
1905         /*
1906          * The new state must be visible before we turn it on in the hardware:
1907          */
1908         smp_wmb();
1909 
1910         perf_pmu_disable(event->pmu);
1911 
1912         perf_set_shadow_time(event, ctx, tstamp);
1913 
1914         perf_log_itrace_start(event);
1915 
1916         if (event->pmu->add(event, PERF_EF_START)) {
1917                 event->state = PERF_EVENT_STATE_INACTIVE;
1918                 event->oncpu = -1;
1919                 ret = -EAGAIN;
1920                 goto out;
1921         }
1922 
1923         event->tstamp_running += tstamp - event->tstamp_stopped;
1924 
1925         if (!is_software_event(event))
1926                 cpuctx->active_oncpu++;
1927         if (!ctx->nr_active++)
1928                 perf_event_ctx_activate(ctx);
1929         if (event->attr.freq && event->attr.sample_freq)
1930                 ctx->nr_freq++;
1931 
1932         if (event->attr.exclusive)
1933                 cpuctx->exclusive = 1;
1934 
1935         if (is_orphaned_child(event))
1936                 schedule_orphans_remove(ctx);
1937 
1938 out:
1939         perf_pmu_enable(event->pmu);
1940 
1941         return ret;
1942 }
1943 
1944 static int
1945 group_sched_in(struct perf_event *group_event,
1946                struct perf_cpu_context *cpuctx,
1947                struct perf_event_context *ctx)
1948 {
1949         struct perf_event *event, *partial_group = NULL;
1950         struct pmu *pmu = ctx->pmu;
1951         u64 now = ctx->time;
1952         bool simulate = false;
1953 
1954         if (group_event->state == PERF_EVENT_STATE_OFF)
1955                 return 0;
1956 
1957         pmu->start_txn(pmu);
1958 
1959         if (event_sched_in(group_event, cpuctx, ctx)) {
1960                 pmu->cancel_txn(pmu);
1961                 perf_cpu_hrtimer_restart(cpuctx);
1962                 return -EAGAIN;
1963         }
1964 
1965         /*
1966          * Schedule in siblings as one group (if any):
1967          */
1968         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1969                 if (event_sched_in(event, cpuctx, ctx)) {
1970                         partial_group = event;
1971                         goto group_error;
1972                 }
1973         }
1974 
1975         if (!pmu->commit_txn(pmu))
1976                 return 0;
1977 
1978 group_error:
1979         /*
1980          * Groups can be scheduled in as one unit only, so undo any
1981          * partial group before returning:
1982          * The events up to the failed event are scheduled out normally,
1983          * tstamp_stopped will be updated.
1984          *
1985          * The failed events and the remaining siblings need to have
1986          * their timings updated as if they had gone through event_sched_in()
1987          * and event_sched_out(). This is required to get consistent timings
1988          * across the group. This also takes care of the case where the group
1989          * could never be scheduled by ensuring tstamp_stopped is set to mark
1990          * the time the event was actually stopped, such that time delta
1991          * calculation in update_event_times() is correct.
1992          */
1993         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1994                 if (event == partial_group)
1995                         simulate = true;
1996 
1997                 if (simulate) {
1998                         event->tstamp_running += now - event->tstamp_stopped;
1999                         event->tstamp_stopped = now;
2000                 } else {
2001                         event_sched_out(event, cpuctx, ctx);
2002                 }
2003         }
2004         event_sched_out(group_event, cpuctx, ctx);
2005 
2006         pmu->cancel_txn(pmu);
2007 
2008         perf_cpu_hrtimer_restart(cpuctx);
2009 
2010         return -EAGAIN;
2011 }
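
/*
 * Example of the partial-group undo above: for a group with leader L and
 * siblings S1, S2 and S3 where S2 is the first sibling to fail
 * event_sched_in(), partial_group == S2.  S1 made it onto the PMU and is
 * scheduled out normally (event_sched_out() updates its tstamp_stopped),
 * while for S2 and S3 the timings are only simulated: tstamp_running is
 * advanced by (now - tstamp_stopped) and tstamp_stopped is set to now,
 * which is the combined effect event_sched_in() + event_sched_out() would
 * have had at now == ctx->time.
 */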
2012 
2013 /*
2014  * Work out whether we can put this event group on the CPU now.
2015  */
2016 static int group_can_go_on(struct perf_event *event,
2017                            struct perf_cpu_context *cpuctx,
2018                            int can_add_hw)
2019 {
2020         /*
2021          * Groups consisting entirely of software events can always go on.
2022          */
2023         if (event->group_flags & PERF_GROUP_SOFTWARE)
2024                 return 1;
2025         /*
2026          * If an exclusive group is already on, no other hardware
2027          * events can go on.
2028          */
2029         if (cpuctx->exclusive)
2030                 return 0;
2031         /*
2032          * If this group is exclusive and there are already
2033          * events on the CPU, it can't go on.
2034          */
2035         if (event->attr.exclusive && cpuctx->active_oncpu)
2036                 return 0;
2037         /*
2038          * Otherwise, try to add it if all previous groups were able
2039          * to go on.
2040          */
2041         return can_add_hw;
2042 }
2043 
2044 static void add_event_to_ctx(struct perf_event *event,
2045                                struct perf_event_context *ctx)
2046 {
2047         u64 tstamp = perf_event_time(event);
2048 
2049         list_add_event(event, ctx);
2050         perf_group_attach(event);
2051         event->tstamp_enabled = tstamp;
2052         event->tstamp_running = tstamp;
2053         event->tstamp_stopped = tstamp;
2054 }
2055 
2056 static void task_ctx_sched_out(struct perf_event_context *ctx);
2057 static void
2058 ctx_sched_in(struct perf_event_context *ctx,
2059              struct perf_cpu_context *cpuctx,
2060              enum event_type_t event_type,
2061              struct task_struct *task);
2062 
2063 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2064                                 struct perf_event_context *ctx,
2065                                 struct task_struct *task)
2066 {
2067         cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2068         if (ctx)
2069                 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2070         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2071         if (ctx)
2072                 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2073 }
2074 
2075 /*
2076  * Cross CPU call to install and enable a performance event
2077  *
2078  * Must be called with ctx->mutex held
2079  */
2080 static int  __perf_install_in_context(void *info)
2081 {
2082         struct perf_event *event = info;
2083         struct perf_event_context *ctx = event->ctx;
2084         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2085         struct perf_event_context *task_ctx = cpuctx->task_ctx;
2086         struct task_struct *task = current;
2087 
2088         perf_ctx_lock(cpuctx, task_ctx);
2089         perf_pmu_disable(cpuctx->ctx.pmu);
2090 
2091         /*
2092          * If there was an active task_ctx schedule it out.
2093          */
2094         if (task_ctx)
2095                 task_ctx_sched_out(task_ctx);
2096 
2097         /*
2098          * If the context we're installing events in is not the
2099          * active task_ctx, flip them.
2100          */
2101         if (ctx->task && task_ctx != ctx) {
2102                 if (task_ctx)
2103                         raw_spin_unlock(&task_ctx->lock);
2104                 raw_spin_lock(&ctx->lock);
2105                 task_ctx = ctx;
2106         }
2107 
2108         if (task_ctx) {
2109                 cpuctx->task_ctx = task_ctx;
2110                 task = task_ctx->task;
2111         }
2112 
2113         cpu_ctx_sched_out(cpuctx, EVENT_ALL);
2114 
2115         update_context_time(ctx);
2116         /*
2117          * update cgrp time only if current cgrp
2118          * matches event->cgrp. Must be done before
2119          * calling add_event_to_ctx()
2120          */
2121         update_cgrp_time_from_event(event);
2122 
2123         add_event_to_ctx(event, ctx);
2124 
2125         /*
2126          * Schedule everything back in
2127          */
2128         perf_event_sched_in(cpuctx, task_ctx, task);
2129 
2130         perf_pmu_enable(cpuctx->ctx.pmu);
2131         perf_ctx_unlock(cpuctx, task_ctx);
2132 
2133         return 0;
2134 }
2135 
2136 /*
2137  * Attach a performance event to a context
2138  *
2139  * First we add the event to the list with the hardware enable bit
2140  * in event->hw_config cleared.
2141  *
2142  * If the event is attached to a task which is on a CPU we use a smp
2143  * call to enable it in the task context. The task might have been
2144  * scheduled away, but we check this in the smp call again.
2145  */
2146 static void
2147 perf_install_in_context(struct perf_event_context *ctx,
2148                         struct perf_event *event,
2149                         int cpu)
2150 {
2151         struct task_struct *task = ctx->task;
2152 
2153         lockdep_assert_held(&ctx->mutex);
2154 
2155         event->ctx = ctx;
2156         if (event->cpu != -1)
2157                 event->cpu = cpu;
2158 
2159         if (!task) {
2160                 /*
2161                  * Per cpu events are installed via an smp call and
2162                  * the install is always successful.
2163                  */
2164                 cpu_function_call(cpu, __perf_install_in_context, event);
2165                 return;
2166         }
2167 
2168 retry:
2169         if (!task_function_call(task, __perf_install_in_context, event))
2170                 return;
2171 
2172         raw_spin_lock_irq(&ctx->lock);
2173         /*
2174          * If we failed to find a running task, but find the context active now
2175          * that we've acquired the ctx->lock, retry.
2176          */
2177         if (ctx->is_active) {
2178                 raw_spin_unlock_irq(&ctx->lock);
2179                 /*
2180                  * Reload the task pointer, it might have been changed by
2181                  * a concurrent perf_event_context_sched_out().
2182                  */
2183                 task = ctx->task;
2184                 goto retry;
2185         }
2186 
2187         /*
2188          * Since the task isn't running, it's safe to add the event; our holding
2189          * the ctx->lock ensures the task won't get scheduled in.
2190          */
2191         add_event_to_ctx(event, ctx);
2192         raw_spin_unlock_irq(&ctx->lock);
2193 }
2194 
2195 /*
2196  * Put an event into inactive state and update time fields.
2197  * Enabling the leader of a group effectively enables all
2198  * the group members that aren't explicitly disabled, so we
2199  * have to update their ->tstamp_enabled also.
2200  * Note: this works for group members as well as group leaders
2201  * since the non-leader members' sibling_lists will be empty.
2202  */
2203 static void __perf_event_mark_enabled(struct perf_event *event)
2204 {
2205         struct perf_event *sub;
2206         u64 tstamp = perf_event_time(event);
2207 
2208         event->state = PERF_EVENT_STATE_INACTIVE;
2209         event->tstamp_enabled = tstamp - event->total_time_enabled;
2210         list_for_each_entry(sub, &event->sibling_list, group_entry) {
2211                 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
2212                         sub->tstamp_enabled = tstamp - sub->total_time_enabled;
2213         }
2214 }
2215 
2216 /*
2217  * Cross CPU call to enable a performance event
2218  */
2219 static int __perf_event_enable(void *info)
2220 {
2221         struct perf_event *event = info;
2222         struct perf_event_context *ctx = event->ctx;
2223         struct perf_event *leader = event->group_leader;
2224         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2225         int err;
2226 
2227         /*
2228          * There's a time window between the 'ctx->is_active' check
2229          * in perf_event_enable() and this place, with:
2230          *   - IRQs on
2231          *   - ctx->lock unlocked
2232          *
2233          * where the task could be killed and 'ctx' deactivated
2234          * by perf_event_exit_task.
2235          */
2236         if (!ctx->is_active)
2237                 return -EINVAL;
2238 
2239         raw_spin_lock(&ctx->lock);
2240         update_context_time(ctx);
2241 
2242         if (event->state >= PERF_EVENT_STATE_INACTIVE)
2243                 goto unlock;
2244 
2245         /*
2246          * set current task's cgroup time reference point
2247          */
2248         perf_cgroup_set_timestamp(current, ctx);
2249 
2250         __perf_event_mark_enabled(event);
2251 
2252         if (!event_filter_match(event)) {
2253                 if (is_cgroup_event(event))
2254                         perf_cgroup_defer_enabled(event);
2255                 goto unlock;
2256         }
2257 
2258         /*
2259          * If the event is in a group and isn't the group leader,
2260          * then don't put it on unless the group is on.
2261          */
2262         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
2263                 goto unlock;
2264 
2265         if (!group_can_go_on(event, cpuctx, 1)) {
2266                 err = -EEXIST;
2267         } else {
2268                 if (event == leader)
2269                         err = group_sched_in(event, cpuctx, ctx);
2270                 else
2271                         err = event_sched_in(event, cpuctx, ctx);
2272         }
2273 
2274         if (err) {
2275                 /*
2276                  * If this event can't go on and it's part of a
2277                  * group, then the whole group has to come off.
2278                  */
2279                 if (leader != event) {
2280                         group_sched_out(leader, cpuctx, ctx);
2281                         perf_cpu_hrtimer_restart(cpuctx);
2282                 }
2283                 if (leader->attr.pinned) {
2284                         update_group_times(leader);
2285                         leader->state = PERF_EVENT_STATE_ERROR;
2286                 }
2287         }
2288 
2289 unlock:
2290         raw_spin_unlock(&ctx->lock);
2291 
2292         return 0;
2293 }
2294 
2295 /*
2296  * Enable an event.
2297  *
2298  * If event->ctx is a cloned context, callers must make sure that
2299  * every task struct that event->ctx->task could possibly point to
2300  * remains valid.  This condition is satisfied when called through
2301  * perf_event_for_each_child or perf_event_for_each as described
2302  * for perf_event_disable.
2303  */
2304 static void _perf_event_enable(struct perf_event *event)
2305 {
2306         struct perf_event_context *ctx = event->ctx;
2307         struct task_struct *task = ctx->task;
2308 
2309         if (!task) {
2310                 /*
2311                  * Enable the event on the cpu that it's on
2312                  */
2313                 cpu_function_call(event->cpu, __perf_event_enable, event);
2314                 return;
2315         }
2316 
2317         raw_spin_lock_irq(&ctx->lock);
2318         if (event->state >= PERF_EVENT_STATE_INACTIVE)
2319                 goto out;
2320 
2321         /*
2322          * If the event is in error state, clear that first.
2323          * That way, if we see the event in error state below, we
2324          * know that it has gone back into error state, as distinct
2325          * from the task having been scheduled away before the
2326          * cross-call arrived.
2327          */
2328         if (event->state == PERF_EVENT_STATE_ERROR)
2329                 event->state = PERF_EVENT_STATE_OFF;
2330 
2331 retry:
2332         if (!ctx->is_active) {
2333                 __perf_event_mark_enabled(event);
2334                 goto out;
2335         }
2336 
2337         raw_spin_unlock_irq(&ctx->lock);
2338 
2339         if (!task_function_call(task, __perf_event_enable, event))
2340                 return;
2341 
2342         raw_spin_lock_irq(&ctx->lock);
2343 
2344         /*
2345          * If the context is active and the event is still off,
2346          * we need to retry the cross-call.
2347          */
2348         if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
2349                 /*
2350                  * task could have been flipped by a concurrent
2351                  * perf_event_context_sched_out()
2352                  */
2353                 task = ctx->task;
2354                 goto retry;
2355         }
2356 
2357 out:
2358         raw_spin_unlock_irq(&ctx->lock);
2359 }
2360 
2361 /*
2362  * See perf_event_disable();
2363  */
2364 void perf_event_enable(struct perf_event *event)
2365 {
2366         struct perf_event_context *ctx;
2367 
2368         ctx = perf_event_ctx_lock(event);
2369         _perf_event_enable(event);
2370         perf_event_ctx_unlock(event, ctx);
2371 }
2372 EXPORT_SYMBOL_GPL(perf_event_enable);
2373 
2374 static int _perf_event_refresh(struct perf_event *event, int refresh)
2375 {
2376         /*
2377          * not supported on inherited events
2378          */
2379         if (event->attr.inherit || !is_sampling_event(event))
2380                 return -EINVAL;
2381 
2382         atomic_add(refresh, &event->event_limit);
2383         _perf_event_enable(event);
2384 
2385         return 0;
2386 }
2387 
2388 /*
2389  * See perf_event_disable()
2390  */
2391 int perf_event_refresh(struct perf_event *event, int refresh)
2392 {
2393         struct perf_event_context *ctx;
2394         int ret;
2395 
2396         ctx = perf_event_ctx_lock(event);
2397         ret = _perf_event_refresh(event, refresh);
2398         perf_event_ctx_unlock(event, ctx);
2399 
2400         return ret;
2401 }
2402 EXPORT_SYMBOL_GPL(perf_event_refresh);
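
/*
 * The refresh count feeds event->event_limit: each overflow decrements it,
 * and when it reaches zero the event is disabled again.  From user space
 * the same mechanism is reached via the PERF_EVENT_IOC_REFRESH ioctl on a
 * sampling event, e.g. (fd being an assumed perf event fd opened with a
 * sample_period/sample_freq set):
 *
 *	ioctl(fd, PERF_EVENT_IOC_REFRESH, 3);	// allow 3 more overflows
 */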
2403 
2404 static void ctx_sched_out(struct perf_event_context *ctx,
2405                           struct perf_cpu_context *cpuctx,
2406                           enum event_type_t event_type)
2407 {
2408         struct perf_event *event;
2409         int is_active = ctx->is_active;
2410 
2411         ctx->is_active &= ~event_type;
2412         if (likely(!ctx->nr_events))
2413                 return;
2414 
2415         update_context_time(ctx);
2416         update_cgrp_time_from_cpuctx(cpuctx);
2417         if (!ctx->nr_active)
2418                 return;
2419 
2420         perf_pmu_disable(ctx->pmu);
2421         if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
2422                 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
2423                         group_sched_out(event, cpuctx, ctx);
2424         }
2425 
2426         if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
2427                 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
2428                         group_sched_out(event, cpuctx, ctx);
2429         }
2430         perf_pmu_enable(ctx->pmu);
2431 }
2432 
2433 /*
2434  * Test whether two contexts are equivalent, i.e. whether they have both been
2435  * cloned from the same version of the same context.
2436  *
2437  * Equivalence is measured using a generation number in the context that is
2438  * incremented on each modification to it; see unclone_ctx(), list_add_event()
2439  * and list_del_event().
2440  */
2441 static int context_equiv(struct perf_event_context *ctx1,
2442                          struct perf_event_context *ctx2)
2443 {
2444         lockdep_assert_held(&ctx1->lock);
2445         lockdep_assert_held(&ctx2->lock);
2446 
2447         /* Pinning disables the swap optimization */
2448         if (ctx1->pin_count || ctx2->pin_count)
2449                 return 0;
2450 
2451         /* If ctx1 is the parent of ctx2 */
2452         if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2453                 return 1;
2454 
2455         /* If ctx2 is the parent of ctx1 */
2456         if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2457                 return 1;
2458 
2459         /*
2460          * If ctx1 and ctx2 have the same parent; we flatten the parent
2461          * hierarchy, see perf_event_init_context().
2462          */
2463         if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2464                         ctx1->parent_gen == ctx2->parent_gen)
2465                 return 1;
2466 
2467         /* Unmatched */
2468         return 0;
2469 }
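
/*
 * Example: a parent context at generation 7 that gets cloned for a child
 * records parent_gen == 7 in the clone.  As long as neither context is
 * modified afterwards (see unclone_ctx(), list_add_event() and
 * list_del_event(), which all bump the generation), the two compare
 * equivalent here and the sched-out path below may simply swap them
 * instead of rescheduling every event.
 */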
2470 
2471 static void __perf_event_sync_stat(struct perf_event *event,
2472                                      struct perf_event *next_event)
2473 {
2474         u64 value;
2475 
2476         if (!event->attr.inherit_stat)
2477                 return;
2478 
2479         /*
2480          * Update the event value; we cannot use perf_event_read()
2481          * because we're in the middle of a context switch and have IRQs
2482          * disabled, which upsets smp_call_function_single(). However,
2483          * we know the event must be on the current CPU, therefore we
2484          * don't need to use it.
2485          */
2486         switch (event->state) {
2487         case PERF_EVENT_STATE_ACTIVE:
2488                 event->pmu->read(event);
2489                 /* fall-through */
2490 
2491         case PERF_EVENT_STATE_INACTIVE:
2492                 update_event_times(event);
2493                 break;
2494 
2495         default:
2496                 break;
2497         }
2498 
2499         /*
2500          * In order to keep per-task stats reliable we need to flip the event
2501          * values when we flip the contexts.
2502          */
2503         value = local64_read(&next_event->count);
2504         value = local64_xchg(&event->count, value);
2505         local64_set(&next_event->count, value);
2506 
2507         swap(event->total_time_enabled, next_event->total_time_enabled);
2508         swap(event->total_time_running, next_event->total_time_running);
2509 
2510         /*
2511          * Since we swizzled the values, update the user visible data too.
2512          */
2513         perf_event_update_userpage(event);
2514         perf_event_update_userpage(next_event);
2515 }
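
/*
 * Example of the flip above: if the outgoing task's inherit_stat event has
 * counted 9000 and the corresponding event in the incoming task's cloned
 * context has counted 4000, the two values (and the time_enabled /
 * time_running totals) are exchanged, because the context structures
 * themselves are being swapped between the tasks.  The totals thereby
 * follow the task rather than the context structure, which is what keeps
 * per-task stats reliable.
 */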
2516 
2517 static void perf_event_sync_stat(struct perf_event_context *ctx,
2518                                    struct perf_event_context *next_ctx)
2519 {
2520         struct perf_event *event, *next_event;
2521 
2522         if (!ctx->nr_stat)
2523                 return;
2524 
2525         update_context_time(ctx);
2526 
2527         event = list_first_entry(&ctx->event_list,
2528                                    struct perf_event, event_entry);
2529 
2530         next_event = list_first_entry(&next_ctx->event_list,
2531                                         struct perf_event, event_entry);
2532 
2533         while (&event->event_entry != &ctx->event_list &&
2534                &next_event->event_entry != &next_ctx->event_list) {
2535 
2536                 __perf_event_sync_stat(event, next_event);
2537 
2538                 event = list_next_entry(event, event_entry);
2539                 next_event = list_next_entry(next_event, event_entry);
2540         }
2541 }
2542 
2543 static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2544                                          struct task_struct *next)
2545 {
2546         struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
2547         struct perf_event_context *next_ctx;
2548         struct perf_event_context *parent, *next_parent;
2549         struct perf_cpu_context *cpuctx;
2550         int do_switch = 1;
2551 
2552         if (likely(!ctx))
2553                 return;
2554 
2555         cpuctx = __get_cpu_context(ctx);
2556         if (!cpuctx->task_ctx)
2557                 return;
2558 
2559         rcu_read_lock();
2560         next_ctx = next->perf_event_ctxp[ctxn];
2561         if (!next_ctx)
2562                 goto unlock;
2563 
2564         parent = rcu_dereference(ctx->parent_ctx);
2565         next_parent = rcu_dereference(next_ctx->parent_ctx);
2566 
2567         /* If neither context has a parent context, they cannot be clones. */
2568         if (!parent && !next_parent)
2569                 goto unlock;
2570 
2571         if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
2572                 /*
2573                  * Looks like the two contexts are clones, so we might be
2574                  * able to optimize the context switch.  We lock both
2575                  * contexts and check that they are clones under the
2576                  * lock (including re-checking that neither has been
2577                  * uncloned in the meantime).  It doesn't matter which
2578                  * order we take the locks because no other cpu could
2579                  * be trying to lock both of these tasks.
2580                  */
2581                 raw_spin_lock(&ctx->lock);
2582                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
2583                 if (context_equiv(ctx, next_ctx)) {
2584                         /*
2585                          * XXX do we need a memory barrier of sorts
2586                          * wrt rcu_dereference() of perf_event_ctxp
2587                          */
2588                         task->perf_event_ctxp[ctxn] = next_ctx;
2589                         next->perf_event_ctxp[ctxn] = ctx;
2590                         ctx->task = next;
2591                         next_ctx->task = task;
2592 
2593                         swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
2594 
2595                         do_switch = 0;
2596 
2597                         perf_event_sync_stat(ctx, next_ctx);
2598                 }
2599                 raw_spin_unlock(&next_ctx->lock);
2600                 raw_spin_unlock(&ctx->lock);
2601         }
2602 unlock:
2603         rcu_read_unlock();
2604 
2605         if (do_switch) {
2606                 raw_spin_lock(&ctx->lock);
2607                 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2608                 cpuctx->task_ctx = NULL;
2609                 raw_spin_unlock(&ctx->lock);
2610         }
2611 }
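
/*
 * Example of the optimized switch above: after a fork() both parent and
 * child carry cloned, equivalent contexts.  When the scheduler switches
 * from one to the other, instead of unscheduling every event of the
 * outgoing task and rescheduling every event of the incoming one, the two
 * perf_event_ctxp pointers (plus ctx->task and task_ctx_data) are simply
 * exchanged under both ctx->locks and only the inherit_stat values are
 * synced via perf_event_sync_stat().
 */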
2612 
2613 void perf_sched_cb_dec(struct pmu *pmu)
2614 {
2615         this_cpu_dec(perf_sched_cb_usages);
2616 }
2617 
2618 void perf_sched_cb_inc(struct pmu *pmu)
2619 {
2620         this_cpu_inc(perf_sched_cb_usages);
2621 }
2622 
2623 /*
2624  * This function provides the context switch callback to the lower code
2625  * layer. It is invoked ONLY when the context switch callback is enabled.
2626  */
2627 static void perf_pmu_sched_task(struct task_struct *prev,
2628                                 struct task_struct *next,
2629                                 bool sched_in)
2630 {
2631         struct perf_cpu_context *cpuctx;
2632         struct pmu *pmu;
2633         unsigned long flags;
2634 
2635         if (prev == next)
2636                 return;
2637 
2638         local_irq_save(flags);
2639 
2640         rcu_read_lock();
2641 
2642         list_for_each_entry_rcu(pmu, &pmus, entry) {
2643                 if (pmu->sched_task) {
2644                         cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2645 
2646                         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2647 
2648                         perf_pmu_disable(pmu);
2649 
2650                         pmu->sched_task(cpuctx->task_ctx, sched_in);
2651 
2652                         perf_pmu_enable(pmu);
2653 
2654                         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2655                 }
2656         }
2657 
2658         rcu_read_unlock();
2659 
2660         local_irq_restore(flags);
2661 }
2662 
2663 #define for_each_task_context_nr(ctxn)                                  \
2664         for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2665 
2666 /*
2667  * Called from scheduler to remove the events of the current task,
2668  * with interrupts disabled.
2669  *
2670  * We stop each event and update the event value in event->count.
2671  *
2672  * This does not protect us against NMI, but disable()
2673  * sets the disabled bit in the control field of event _before_
2674  * accessing the event control register. If an NMI hits, then it will
2675  * not restart the event.
2676  */
2677 void __perf_event_task_sched_out(struct task_struct *task,
2678                                  struct task_struct *next)
2679 {
2680         int ctxn;
2681 
2682         if (__this_cpu_read(perf_sched_cb_usages))
2683                 perf_pmu_sched_task(task, next, false);
2684 
2685         for_each_task_context_nr(ctxn)
2686                 perf_event_context_sched_out(task, ctxn, next);
2687 
2688         /*
2689          * if cgroup events exist on this CPU, then we need
2690          * to check if we have to switch out PMU state.
2691          * cgroup events are in system-wide mode only
2692          */
2693         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2694                 perf_cgroup_sched_out(task, next);
2695 }
2696 
2697 static void task_ctx_sched_out(struct perf_event_context *ctx)
2698 {
2699         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2700 
2701         if (!cpuctx->task_ctx)
2702                 return;
2703 
2704         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2705                 return;
2706 
2707         ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2708         cpuctx->task_ctx = NULL;
2709 }
2710 
2711 /*
2712  * Called with IRQs disabled
2713  */
2714 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
2715                               enum event_type_t event_type)
2716 {
2717         ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
2718 }
2719 
2720 static void
2721 ctx_pinned_sched_in(struct perf_event_context *ctx,
2722                     struct perf_cpu_context *cpuctx)
2723 {
2724         struct perf_event *event;
2725 
2726         list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2727                 if (event->state <= PERF_EVENT_STATE_OFF)
2728                         continue;
2729                 if (!event_filter_match(event))
2730                         continue;
2731 
2732                 /* may need to reset tstamp_enabled */
2733                 if (is_cgroup_event(event))
2734                         perf_cgroup_mark_enabled(event, ctx);
2735 
2736                 if (group_can_go_on(event, cpuctx, 1))
2737                         group_sched_in(event, cpuctx, ctx);
2738 
2739                 /*
2740                  * If this pinned group hasn't been scheduled,
2741                  * put it in error state.
2742                  */
2743                 if (event->state == PERF_EVENT_STATE_INACTIVE) {
2744                         update_group_times(event);
2745                         event->state = PERF_EVENT_STATE_ERROR;
2746                 }
2747         }
2748 }
2749 
2750 static void
2751 ctx_flexible_sched_in(struct perf_event_context *ctx,
2752                       struct perf_cpu_context *cpuctx)
2753 {
2754         struct perf_event *event;
2755         int can_add_hw = 1;
2756 
2757         list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2758                 /* Ignore events in OFF or ERROR state */
2759                 if (event->state <= PERF_EVENT_STATE_OFF)
2760                         continue;
2761                 /*
2762                  * Listen to the 'cpu' scheduling filter constraint
2763                  * of events:
2764                  */
2765                 if (!event_filter_match(event))
2766                         continue;
2767 
2768                 /* may need to reset tstamp_enabled */
2769                 if (is_cgroup_event(event))
2770                         perf_cgroup_mark_enabled(event, ctx);
2771 
2772                 if (group_can_go_on(event, cpuctx, can_add_hw)) {
2773                         if (group_sched_in(event, cpuctx, ctx))
2774                                 can_add_hw = 0;
2775                 }
2776         }
2777 }
2778 
2779 static void
2780 ctx_sched_in(struct perf_event_context *ctx,
2781              struct perf_cpu_context *cpuctx,
2782              enum event_type_t event_type,
2783              struct task_struct *task)
2784 {
2785         u64 now;
2786         int is_active = ctx->is_active;
2787 
2788         ctx->is_active |= event_type;
2789         if (likely(!ctx->nr_events))
2790                 return;
2791 
2792         now = perf_clock();
2793         ctx->timestamp = now;
2794         perf_cgroup_set_timestamp(task, ctx);
2795         /*
2796          * First go through the list and put on any pinned groups
2797          * in order to give them the best chance of going on.
2798          */
2799         if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
2800                 ctx_pinned_sched_in(ctx, cpuctx);
2801 
2802         /* Then walk through the lower prio flexible groups */
2803         if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
2804                 ctx_flexible_sched_in(ctx, cpuctx);
2805 }
2806 
2807 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
2808                              enum event_type_t event_type,
2809                              struct task_struct *task)
2810 {
2811         struct perf_event_context *ctx = &cpuctx->ctx;
2812 
2813         ctx_sched_in(ctx, cpuctx, event_type, task);
2814 }
2815 
2816 static void perf_event_context_sched_in(struct perf_event_context *ctx,
2817                                         struct task_struct *task)
2818 {
2819         struct perf_cpu_context *cpuctx;
2820 
2821         cpuctx = __get_cpu_context(ctx);
2822         if (cpuctx->task_ctx == ctx)
2823                 return;
2824 
2825         perf_ctx_lock(cpuctx, ctx);
2826         perf_pmu_disable(ctx->pmu);
2827         /*
2828          * We want to keep the following priority order:
2829          * cpu pinned (that don't need to move), task pinned,
2830          * cpu flexible, task flexible.
2831          */
2832         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2833 
2834         if (ctx->nr_events)
2835                 cpuctx->task_ctx = ctx;
2836 
2837         perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
2838 
2839         perf_pmu_enable(ctx->pmu);
2840         perf_ctx_unlock(cpuctx, ctx);
2841 }
2842 
2843 /*
2844  * Called from scheduler to add the events of the current task
2845  * with interrupts disabled.
2846  *
2847  * We restore the event value and then enable it.
2848  *
2849  * This does not protect us against NMI, but enable()
2850  * sets the enabled bit in the control field of event _before_
2851  * accessing the event control register. If an NMI hits, then it will
2852  * keep the event running.
2853  */
2854 void __perf_event_task_sched_in(struct task_struct *prev,
2855                                 struct task_struct *task)
2856 {
2857         struct perf_event_context *ctx;
2858         int ctxn;
2859 
2860         for_each_task_context_nr(ctxn) {
2861                 ctx = task->perf_event_ctxp[ctxn];
2862                 if (likely(!ctx))
2863                         continue;
2864 
2865                 perf_event_context_sched_in(ctx, task);
2866         }
2867         /*
2868          * if cgroup events exist on this CPU, then we need
2869          * to check if we have to switch in PMU state.
2870          * cgroup events are in system-wide mode only
2871          */
2872         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2873                 perf_cgroup_sched_in(prev, task);
2874 
2875         if (__this_cpu_read(perf_sched_cb_usages))
2876                 perf_pmu_sched_task(prev, task, true);
2877 }
2878 
2879 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
2880 {
2881         u64 frequency = event->attr.sample_freq;
2882         u64 sec = NSEC_PER_SEC;
2883         u64 divisor, dividend;
2884 
2885         int count_fls, nsec_fls, frequency_fls, sec_fls;
2886 
2887         count_fls = fls64(count);
2888         nsec_fls = fls64(nsec);
2889         frequency_fls = fls64(frequency);
2890         sec_fls = 30;
2891 
2892         /*
2893          * We got @count in @nsec, with a target of sample_freq HZ
2894          * the target period becomes:
2895          *
2896          *             @count * 10^9
2897          * period = -------------------
2898          *          @nsec * sample_freq
2899          *
2900          */
2901 
2902         /*
2903          * Reduce accuracy by one bit such that @a and @b converge
2904          * to a similar magnitude.
2905          */
2906 #define REDUCE_FLS(a, b)                \
2907 do {                                    \
2908         if (a##_fls > b##_fls) {        \
2909                 a >>= 1;                \
2910                 a##_fls--;              \
2911         } else {                        \
2912                 b >>= 1;                \
2913                 b##_fls--;              \
2914         }                               \
2915 } while (0)
2916 
2917         /*
2918          * Reduce accuracy until either term fits in a u64, then proceed with
2919          * the other, so that finally we can do a u64/u64 division.
2920          */
2921         while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
2922                 REDUCE_FLS(nsec, frequency);
2923                 REDUCE_FLS(sec, count);
2924         }
2925 
2926         if (count_fls + sec_fls > 64) {
2927                 divisor = nsec * frequency;
2928 
2929                 while (count_fls + sec_fls > 64) {
2930                         REDUCE_FLS(count, sec);
2931                         divisor >>= 1;
2932                 }
2933 
2934                 dividend = count * sec;
2935         } else {
2936                 dividend = count * sec;
2937 
2938                 while (nsec_fls + frequency_fls > 64) {
2939                         REDUCE_FLS(nsec, frequency);
2940                         dividend >>= 1;
2941                 }
2942 
2943                 divisor = nsec * frequency;
2944         }
2945 
2946         if (!divisor)
2947                 return dividend;
2948 
2949         return div64_u64(dividend, divisor);
2950 }
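
/*
 * Worked example for the formula above (illustrative numbers): with
 * sample_freq = 1000 Hz and a tick interval that saw count = 2,000,000
 * events in nsec = 4,000,000 ns, the event rate is 5 * 10^8 events/sec, so
 *
 *	period = (2,000,000 * 10^9) / (4,000,000 * 1000) = 500,000
 *
 * i.e. sampling every 500,000 events yields the requested 1000 samples per
 * second.  The REDUCE_FLS() dance above only exists to keep the
 * intermediate products of that division within 64 bits.
 */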
2951 
2952 static DEFINE_PER_CPU(int, perf_throttled_count);
2953 static DEFINE_PER_CPU(u64, perf_throttled_seq);
2954 
2955 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
2956 {
2957         struct hw_perf_event *hwc = &event->hw;
2958         s64 period, sample_period;
2959         s64 delta;
2960 
2961         period = perf_calculate_period(event, nsec, count);
2962 
2963         delta = (s64)(period - hwc->sample_period);
2964         delta = (delta + 7) / 8; /* low pass filter */
2965 
2966         sample_period = hwc->sample_period + delta;
2967 
2968         if (!sample_period)
2969                 sample_period = 1;
2970 
2971         hwc->sample_period = sample_period;
2972 
2973         if (local64_read(&hwc->period_left) > 8*sample_period) {
2974                 if (disable)
2975                         event->pmu->stop(event, PERF_EF_UPDATE);
2976 
2977                 local64_set(&hwc->period_left, 0);
2978 
2979                 if (disable)
2980                         event->pmu->start(event, PERF_EF_RELOAD);
2981         }
2982 }
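
/*
 * Example of the low pass filter above (illustrative numbers): if the
 * current hwc->sample_period is 400,000 and perf_calculate_period()
 * returns 500,000, then delta = 100,000 and only (100,000 + 7) / 8 =
 * 12,500 of it is applied, giving a new sample_period of 412,500.  The
 * period thus converges on the target over several ticks instead of
 * jumping, and period_left is only forcibly reset (with a stop/start of
 * the event when @disable is set) once it has drifted past 8 periods.
 */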
2983 
2984 /*
2985  * combine freq adjustment with unthrottling to avoid two passes over the
2986  * events. At the same time, make sure, having freq events does not change
2987  * the rate of unthrottling as that would introduce bias.
2988  */
2989 static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2990                                            int needs_unthr)
2991 {
2992         struct perf_event *event;
2993         struct hw_perf_event *hwc;
2994         u64 now, period = TICK_NSEC;
2995         s64 delta;
2996 
2997         /*
2998          * we only need to iterate over all events if:
2999          * - the context has events in frequency mode (needs freq adjust), or
3000          * - there are events to unthrottle on this cpu
3001          */
3002         if (!(ctx->nr_freq || needs_unthr))
3003                 return;
3004 
3005         raw_spin_lock(&ctx->lock);
3006         perf_pmu_disable(ctx->pmu);
3007 
3008         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3009                 if (event->state != PERF_EVENT_STATE_ACTIVE)
3010                         continue;
3011 
3012                 if (!event_filter_match(event))
3013                         continue;
3014 
3015                 perf_pmu_disable(event->pmu);
3016 
3017                 hwc = &event->hw;
3018 
3019                 if (hwc->interrupts == MAX_INTERRUPTS) {
3020                         hwc->interrupts = 0;
3021                         perf_log_throttle(event, 1);
3022                         event->pmu->start(event, 0);
3023                 }
3024 
3025                 if (!event->attr.freq || !event->attr.sample_freq)
3026                         goto next;
3027 
3028                 /*
3029                  * stop the event and update event->count
3030                  */
3031                 event->pmu->stop(event, PERF_EF_UPDATE);
3032 
3033                 now = local64_read(&event->count);
3034                 delta = now - hwc->freq_count_stamp;
3035                 hwc->freq_count_stamp = now;
3036 
3037                 /*
3038                  * restart the event
3039                  * reload only if value has changed
3040                  * we have stopped the event so tell that
3041                  * to perf_adjust_period() to avoid stopping it
3042                  * twice.
3043                  */
3044                 if (delta > 0)
3045                         perf_adjust_period(event, period, delta, false);
3046 
3047                 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
3048         next:
3049                 perf_pmu_enable(event->pmu);
3050         }
3051 
3052         perf_pmu_enable(ctx->pmu);
3053         raw_spin_unlock(&ctx->lock);
3054 }
3055 
3056 /*
3057  * Round-robin a context's events:
3058  */
3059 static void rotate_ctx(struct perf_event_context *ctx)
3060 {
3061         /*
3062          * Rotate the first entry of the non-pinned groups to the tail of the
3063          * list. Rotation might be disabled by the inheritance code.
3064          */
3065         if (!ctx->rotate_disable)
3066                 list_rotate_left(&ctx->flexible_groups);
3067 }
3068 
3069 static int perf_rotate_context(struct perf_cpu_context *cpuctx)
3070 {
3071         struct perf_event_context *ctx = NULL;
3072         int rotate = 0;
3073 
3074         if (cpuctx->ctx.nr_events) {
3075                 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3076                         rotate = 1;
3077         }
3078 
3079         ctx = cpuctx->task_ctx;
3080         if (ctx && ctx->nr_events) {
3081                 if (ctx->nr_events != ctx->nr_active)
3082                         rotate = 1;
3083         }
3084 
3085         if (!rotate)
3086                 goto done;
3087 
3088         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3089         perf_pmu_disable(cpuctx->ctx.pmu);
3090 
3091         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3092         if (ctx)
3093                 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
3094 
3095         rotate_ctx(&cpuctx->ctx);
3096         if (ctx)
3097                 rotate_ctx(ctx);
3098 
3099         perf_event_sched_in(cpuctx, ctx, current);
3100 
3101         perf_pmu_enable(cpuctx->ctx.pmu);
3102         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3103 done:
3104 
3105         return rotate;
3106 }
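
/*
 * Example of the rotation above (illustrative): with three flexible groups
 * A, B and C but PMU room for only two of them, one pass schedules A and B
 * while C stays inactive, so nr_events != nr_active and rotate is set.  On
 * the next rotation the flexible events are scheduled out,
 * list_rotate_left() moves A to the tail (leaving B, C, A), and the
 * re-schedule picks up B and C, so every group gets PMU time round-robin.
 */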
3107 
3108 #ifdef CONFIG_NO_HZ_FULL
3109 bool perf_event_can_stop_tick(void)
3110 {
3111         if (atomic_read(&nr_freq_events) ||
3112             __this_cpu_read(perf_throttled_count))
3113                 return false;
3114         else
3115                 return true;
3116 }
3117 #endif
3118 
3119 void perf_event_task_tick(void)
3120 {
3121         struct list_head *head = this_cpu_ptr(&active_ctx_list);
3122         struct perf_event_context *ctx, *tmp;
3123         int throttled;
3124 
3125         WARN_ON(!irqs_disabled());
3126 
3127         __this_cpu_inc(perf_throttled_seq);
3128         throttled = __this_cpu_xchg(perf_throttled_count, 0);
3129 
3130         list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
3131                 perf_adjust_freq_unthr_context(ctx, throttled);
3132 }
3133 
3134 static int event_enable_on_exec(struct perf_event *event,
3135                                 struct perf_event_context *ctx)
3136 {
3137         if (!event->attr.enable_on_exec)
3138                 return 0;
3139 
3140         event->attr.enable_on_exec = 0;
3141         if (event->state >= PERF_EVENT_STATE_INACTIVE)
3142                 return 0;
3143 
3144         __perf_event_mark_enabled(event);
3145 
3146         return 1;
3147 }
3148 
3149 /*
3150  * Enable all of a task's events that have been marked enable-on-exec.
3151  * This expects task == current.
3152  */
3153 static void perf_event_enable_on_exec(struct perf_event_context *ctx)
3154 {
3155         struct perf_event_context *clone_ctx = NULL;
3156         struct perf_event *event;
3157         unsigned long flags;
3158         int enabled = 0;
3159         int ret;
3160 
3161         local_irq_save(flags);
3162         if (!ctx || !ctx->nr_events)
3163                 goto out;
3164 
3165         /*
3166          * We must ctxsw out cgroup events to avoid conflict
3167          * when invoking perf_task_event_sched_in() later on
3168          * in this function. Otherwise we end up trying to
3169          * ctxswin cgroup events which are already scheduled
3170          * in.
3171          */
3172         perf_cgroup_sched_out(current, NULL);
3173 
3174         raw_spin_lock(&ctx->lock);
3175         task_ctx_sched_out(ctx);
3176 
3177         list_for_each_entry(event, &ctx->event_list, event_entry) {
3178                 ret = event_enable_on_exec(event, ctx);
3179                 if (ret)
3180                         enabled = 1;
3181         }
3182 
3183         /*
3184          * Unclone this context if we enabled any event.
3185          */
3186         if (enabled)
3187                 clone_ctx = unclone_ctx(ctx);
3188 
3189         raw_spin_unlock(&ctx->lock);
3190 
3191         /*
3192          * Also calls ctxswin for cgroup events, if any:
3193          */
3194         perf_event_context_sched_in(ctx, ctx->task);
3195 out:
3196         local_irq_restore(flags);
3197 
3198         if (clone_ctx)
3199                 put_ctx(clone_ctx);
3200 }
3201 
3202 void perf_event_exec(void)
3203 {
3204         struct perf_event_context *ctx;
3205         int ctxn;
3206 
3207         rcu_read_lock();
3208         for_each_task_context_nr(ctxn) {
3209                 ctx = current->perf_event_ctxp[ctxn];
3210                 if (!ctx)
3211                         continue;
3212 
3213                 perf_event_enable_on_exec(ctx);
3214         }
3215         rcu_read_unlock();
3216 }
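
/*
 * Illustrative userspace sketch of the enable-on-exec path handled above.
 * This is an example of the standard perf_event_open() ABI, not code from
 * this file; open_exec_counter() is a hypothetical helper. A counter that
 * is opened disabled with enable_on_exec set only starts counting once
 * the target task calls exec():
 *
 *	#include <linux/perf_event.h>
 *	#include <sys/syscall.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	static int open_exec_counter(pid_t pid)
 *	{
 *		struct perf_event_attr attr;
 *
 *		memset(&attr, 0, sizeof(attr));
 *		attr.size = sizeof(attr);
 *		attr.type = PERF_TYPE_HARDWARE;
 *		attr.config = PERF_COUNT_HW_INSTRUCTIONS;
 *		attr.disabled = 1;		// stays off ...
 *		attr.enable_on_exec = 1;	// ... until exec()
 *
 *		return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
 *	}
 */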
3217 
3218 /*
3219  * Cross CPU call to read the hardware event
3220  */
3221 static void __perf_event_read(void *info)
3222 {
3223         struct perf_event *event = info;
3224         struct perf_event_context *ctx = event->ctx;
3225         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3226 
3227         /*
3228          * If this is a task context, we need to check whether it is
3229          * the current task context of this cpu.  If not it has been
3230          * scheduled out before the smp call arrived.  In that case
3231          * event->count would have been updated to a recent sample
3232          * when the event was scheduled out.
3233          */
3234         if (ctx->task && cpuctx->task_ctx != ctx)
3235                 return;
3236 
3237         raw_spin_lock(&ctx->lock);
3238         if (ctx->is_active) {
3239                 update_context_time(ctx);
3240                 update_cgrp_time_from_event(event);
3241         }
3242         update_event_times(event);
3243         if (event->state == PERF_EVENT_STATE_ACTIVE)
3244                 event->pmu->read(event);
3245         raw_spin_unlock(&ctx->lock);
3246 }
3247 
3248 static inline u64 perf_event_count(struct perf_event *event)
3249 {
3250         if (event->pmu->count)
3251                 return event->pmu->count(event);
3252 
3253         return __perf_event_count(event);
3254 }
3255 
3256 static u64 perf_event_read(struct perf_event *event)
3257 {
3258         /*
3259          * If event is enabled and currently active on a CPU, update the
3260          * value in the event structure:
3261          */
3262         if (event->state == PERF_EVENT_STATE_ACTIVE) {
3263                 smp_call_function_single(event->oncpu,
3264                                          __perf_event_read, event, 1);
3265         } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
3266                 struct perf_event_context *ctx = event->ctx;
3267                 unsigned long flags;
3268 
3269                 raw_spin_lock_irqsave(&ctx->lock, flags);
3270                 /*
3271                  * We may read while the context is not active
3272                  * (e.g., the thread is blocked); in that case
3273                  * we cannot update the context time.
3274                  */
3275                 if (ctx->is_active) {
3276                         update_context_time(ctx);
3277                         update_cgrp_time_from_event(event);
3278                 }
3279                 update_event_times(event);
3280                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3281         }
3282 
3283         return perf_event_count(event);
3284 }
3285 
3286 /*
3287  * Initialize the perf_event context in a task_struct:
3288  */
3289 static void __perf_event_init_context(struct perf_event_context *ctx)
3290 {
3291         raw_spin_lock_init(&ctx->lock);
3292         mutex_init(&ctx->mutex);
3293         INIT_LIST_HEAD(&ctx->active_ctx_list);
3294         INIT_LIST_HEAD(&ctx->pinned_groups);
3295         INIT_LIST_HEAD(&ctx->flexible_groups);
3296         INIT_LIST_HEAD(&ctx->event_list);
3297         atomic_set(&ctx->refcount, 1);
3298         INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
3299 }
3300 
3301 static struct perf_event_context *
3302 alloc_perf_context(struct pmu *pmu, struct task_struct *task)
3303 {
3304         struct perf_event_context *ctx;
3305 
3306         ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
3307         if (!ctx)
3308                 return NULL;
3309 
3310         __perf_event_init_context(ctx);
3311         if (task) {
3312                 ctx->task = task;
3313                 get_task_struct(task);
3314         }
3315         ctx->pmu = pmu;
3316 
3317         return ctx;
3318 }
3319 
3320 static struct task_struct *
3321 find_lively_task_by_vpid(pid_t vpid)
3322 {
3323         struct task_struct *task;
3324         int err;
3325 
3326         rcu_read_lock();
3327         if (!vpid)
3328                 task = current;
3329         else
3330                 task = find_task_by_vpid(vpid);
3331         if (task)
3332                 get_task_struct(task);
3333         rcu_read_unlock();
3334 
3335         if (!task)
3336                 return ERR_PTR(-ESRCH);
3337 
3338         /* Reuse ptrace permission checks for now. */
3339         err = -EACCES;
3340         if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
3341                 goto errout;
3342 
3343         return task;
3344 errout:
3345         put_task_struct(task);
3346         return ERR_PTR(err);
3347 
3348 }
3349 
3350 /*
3351  * Returns a matching context with refcount and pincount.
3352  */
3353 static struct perf_event_context *
3354 find_get_context(struct pmu *pmu, struct task_struct *task,
3355                 struct perf_event *event)
3356 {
3357         struct perf_event_context *ctx, *clone_ctx = NULL;
3358         struct perf_cpu_context *cpuctx;
3359         void *task_ctx_data = NULL;
3360         unsigned long flags;
3361         int ctxn, err;
3362         int cpu = event->cpu;
3363 
3364         if (!task) {
3365                 /* Must be root to operate on a CPU event: */
3366                 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
3367                         return ERR_PTR(-EACCES);
3368 
3369                 /*
3370          * We could be clever and allow attaching an event to an
3371          * offline CPU and activating it when the CPU comes up, but
3372          * that's for later.
3373                  */
3374                 if (!cpu_online(cpu))
3375                         return ERR_PTR(-ENODEV);
3376 
3377                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
3378                 ctx = &cpuctx->ctx;
3379                 get_ctx(ctx);
3380                 ++ctx->pin_count;
3381 
3382                 return ctx;
3383         }
3384 
3385         err = -EINVAL;
3386         ctxn = pmu->task_ctx_nr;
3387         if (ctxn < 0)
3388                 goto errout;
3389 
3390         if (event->attach_state & PERF_ATTACH_TASK_DATA) {
3391                 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
3392                 if (!task_ctx_data) {
3393                         err = -ENOMEM;
3394                         goto errout;
3395                 }
3396         }
3397 
3398 retry:
3399         ctx = perf_lock_task_context(task, ctxn, &flags);
3400         if (ctx) {
3401                 clone_ctx = unclone_ctx(ctx);
3402                 ++ctx->pin_count;
3403 
3404                 if (task_ctx_data && !ctx->task_ctx_data) {
3405                         ctx->task_ctx_data = task_ctx_data;
3406                         task_ctx_data = NULL;
3407                 }
3408                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3409 
3410                 if (clone_ctx)
3411                         put_ctx(clone_ctx);
3412         } else {
3413                 ctx = alloc_perf_context(pmu, task);
3414                 err = -ENOMEM;
3415                 if (!ctx)
3416                         goto errout;
3417 
3418                 if (task_ctx_data) {
3419                         ctx->task_ctx_data = task_ctx_data;
3420                         task_ctx_data = NULL;
3421                 }
3422 
3423                 err = 0;
3424                 mutex_lock(&task->perf_event_mutex);
3425                 /*
3426                  * If it has already passed perf_event_exit_task(),
3427                  * we must see PF_EXITING; it takes this mutex too.
3428                  */
3429                 if (task->flags & PF_EXITING)
3430                         err = -ESRCH;
3431                 else if (task->perf_event_ctxp[ctxn])
3432                         err = -EAGAIN;
3433                 else {
3434                         get_ctx(ctx);
3435                         ++ctx->pin_count;
3436                         rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
3437                 }
3438                 mutex_unlock(&task->perf_event_mutex);
3439 
3440                 if (unlikely(err)) {
3441                         put_ctx(ctx);
3442 
3443                         if (err == -EAGAIN)
3444                                 goto retry;
3445                         goto errout;
3446                 }
3447         }
3448 
3449         kfree(task_ctx_data);
3450         return ctx;
3451 
3452 errout:
3453         kfree(task_ctx_data);
3454         return ERR_PTR(err);
3455 }
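
/*
 * Illustrative userspace sketch (standard perf_event_open() ABI; attr and
 * pid are assumed to exist, not code from this file): the pid/cpu syscall
 * arguments select which branch above is taken.  pid == -1 with a valid
 * cpu yields the per-CPU context, which may require CAP_SYS_ADMIN when
 * perf_event_paranoid is restrictive; a valid pid with cpu == -1 yields
 * a per-task context.
 *
 *	// CPU-wide: counts everything that runs on CPU 0
 *	int cpu_fd  = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
 *
 *	// per-task: follows the given pid across CPUs
 *	int task_fd = syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
 */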
3456 
3457 static void perf_event_free_filter(struct perf_event *event);
3458 static void perf_event_free_bpf_prog(struct perf_event *event);
3459 
3460 static void free_event_rcu(struct rcu_head *head)
3461 {
3462         struct perf_event *event;
3463 
3464         event = container_of(head, struct perf_event, rcu_head);
3465         if (event->ns)
3466                 put_pid_ns(event->ns);
3467         perf_event_free_filter(event);
3468         kfree(event);
3469 }
3470 
3471 static void ring_buffer_attach(struct perf_event *event,
3472                                struct ring_buffer *rb);
3473 
3474 static void unaccount_event_cpu(struct perf_event *event, int cpu)
3475 {
3476         if (event->parent)
3477                 return;
3478 
3479         if (is_cgroup_event(event))
3480                 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3481 }
3482 
3483 static void unaccount_event(struct perf_event *event)
3484 {
3485         if (event->parent)
3486                 return;
3487 
3488         if (event->attach_state & PERF_ATTACH_TASK)
3489                 static_key_slow_dec_deferred(&perf_sched_events);
3490         if (event->attr.mmap || event->attr.mmap_data)
3491                 atomic_dec(&nr_mmap_events);
3492         if (event->attr.comm)
3493                 atomic_dec(&nr_comm_events);
3494         if (event->attr.task)
3495                 atomic_dec(&nr_task_events);
3496         if (event->attr.freq)
3497                 atomic_dec(&nr_freq_events);
3498         if (is_cgroup_event(event))
3499                 static_key_slow_dec_deferred(&perf_sched_events);
3500         if (has_branch_stack(event))
3501                 static_key_slow_dec_deferred(&perf_sched_events);
3502 
3503         unaccount_event_cpu(event, event->cpu);
3504 }
3505 
3506 /*
3507  * The following implement mutual exclusion of events on "exclusive" pmus
3508  * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
3509  * at a time, so we disallow creating events that might conflict, namely:
3510  *
3511  *  1) cpu-wide events in the presence of per-task events,
3512  *  2) per-task events in the presence of cpu-wide events,
3513  *  3) two matching events on the same context.
3514  *
3515  * The former two cases are handled in the allocation path (perf_event_alloc(),
3516  * __free_event()), the latter -- before the first perf_install_in_context().
3517  */
3518 static int exclusive_event_init(struct perf_event *event)
3519 {
3520         struct pmu *pmu = event->pmu;
3521 
3522         if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3523                 return 0;
3524 
3525         /*
3526          * Prevent co-existence of per-task and cpu-wide events on the
3527          * same exclusive pmu.
3528          *
3529          * Negative pmu::exclusive_cnt means there are cpu-wide
3530          * events on this "exclusive" pmu, positive means there are
3531          * per-task events.
3532          *
3533          * Since this is called in perf_event_alloc() path, event::ctx
3534          * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
3535          * to mean "per-task event", because unlike other attach states it
3536          * never gets cleared.
3537          */
3538         if (event->attach_state & PERF_ATTACH_TASK) {
3539                 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
3540                         return -EBUSY;
3541         } else {
3542                 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
3543                         return -EBUSY;
3544         }
3545 
3546         return 0;
3547 }
3548 
3549 static void exclusive_event_destroy(struct perf_event *event)
3550 {
3551         struct pmu *pmu = event->pmu;
3552 
3553         if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3554                 return;
3555 
3556         /* see comment in exclusive_event_init() */
3557         if (event->attach_state & PERF_ATTACH_TASK)
3558                 atomic_dec(&pmu->exclusive_cnt);
3559         else
3560                 atomic_inc(&pmu->exclusive_cnt);
3561 }
3562 
3563 static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
3564 {
3565         if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
3566             (e1->cpu == e2->cpu ||
3567              e1->cpu == -1 ||
3568              e2->cpu == -1))
3569                 return true;
3570         return false;
3571 }
3572 
3573 /* Called under the same ctx::mutex as perf_install_in_context() */
3574 static bool exclusive_event_installable(struct perf_event *event,
3575                                         struct perf_event_context *ctx)
3576 {
3577         struct perf_event *iter_event;
3578         struct pmu *pmu = event->pmu;
3579 
3580         if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3581                 return true;
3582 
3583         list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
3584                 if (exclusive_event_match(iter_event, event))
3585                         return false;
3586         }
3587 
3588         return true;
3589 }
3590 
3591 static void __free_event(struct perf_event *event)
3592 {
3593         if (!event->parent) {
3594                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
3595                         put_callchain_buffers();
3596         }
3597 
3598         perf_event_free_bpf_prog(event);
3599 
3600         if (event->destroy)
3601                 event->destroy(event);
3602 
3603         if (event->ctx)
3604                 put_ctx(event->ctx);
3605 
3606         if (event->pmu) {
3607                 exclusive_event_destroy(event);
3608                 module_put(event->pmu->module);
3609         }
3610 
3611         call_rcu(&event->rcu_head, free_event_rcu);
3612 }
3613 
3614 static void _free_event(struct perf_event *event)
3615 {
3616         irq_work_sync(&event->pending);
3617 
3618         unaccount_event(event);
3619 
3620         if (event->rb) {
3621                 /*
3622                  * Can happen when we close an event with re-directed output.
3623                  *
3624                  * Since we have a 0 refcount, perf_mmap_close() will skip
3625                  * over us; possibly making our ring_buffer_put() the last.
3626                  */
3627                 mutex_lock(&event->mmap_mutex);
3628                 ring_buffer_attach(event, NULL);
3629                 mutex_unlock(&event->mmap_mutex);
3630         }
3631 
3632         if (is_cgroup_event(event))
3633                 perf_detach_cgroup(event);
3634 
3635         __free_event(event);
3636 }
3637 
3638 /*
3639  * Used to free events which have a known refcount of 1, such as in error paths
3640  * where the event isn't exposed yet, and for inherited events.
3641  */
3642 static void free_event(struct perf_event *event)
3643 {
3644         if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
3645                                 "unexpected event refcount: %ld; ptr=%p\n",
3646                                 atomic_long_read(&event->refcount), event)) {
3647                 /* leak to avoid use-after-free */
3648                 return;
3649         }
3650 
3651         _free_event(event);
3652 }
3653 
3654 /*
3655  * Remove user event from the owner task.
3656  */
3657 static void perf_remove_from_owner(struct perf_event *event)
3658 {
3659         struct task_struct *owner;
3660 
3661         rcu_read_lock();
3662         owner = ACCESS_ONCE(event->owner);
3663         /*
3664          * Matches the smp_wmb() in perf_event_exit_task(). If we observe
3665          * !owner it means the list deletion is complete and we can indeed
3666          * free this event, otherwise we need to serialize on
3667          * owner->perf_event_mutex.
3668          */
3669         smp_read_barrier_depends();
3670         if (owner) {
3671                 /*
3672                  * Since delayed_put_task_struct() also drops the last
3673                  * task reference we can safely take a new reference
3674                  * while holding the rcu_read_lock().
3675                  */
3676                 get_task_struct(owner);
3677         }
3678         rcu_read_unlock();
3679 
3680         if (owner) {
3681                 /*
3682                  * If we're here through perf_event_exit_task() we're already
3683                  * holding ctx->mutex which would be an inversion wrt. the
3684                  * normal lock order.
3685                  *
3686                  * However we can safely take this lock because it's the child
3687                  * ctx->mutex.
3688                  */
3689                 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
3690 
3691                 /*
3692                  * We have to re-check the event->owner field; if it is cleared
3693                  * we raced with perf_event_exit_task(). Acquiring the mutex
3694                  * ensured they're done, and we can proceed with freeing the
3695                  * event.
3696                  */
3697                 if (event->owner)
3698                         list_del_init(&event->owner_entry);
3699                 mutex_unlock(&owner->perf_event_mutex);
3700                 put_task_struct(owner);
3701         }
3702 }
3703 
3704 static void put_event(struct perf_event *event)
3705 {
3706         struct perf_event_context *ctx;
3707 
3708         if (!atomic_long_dec_and_test(&event->refcount))
3709                 return;
3710 
3711         if (!is_kernel_event(event))
3712                 perf_remove_from_owner(event);
3713 
3714         /*
3715          * There are two ways this annotation is useful:
3716          *
3717          *  1) there is a lock recursion from perf_event_exit_task;
3718          *     see the comment there.
3719          *
3720          *  2) there is a lock-inversion with mmap_sem through
3721          *     perf_event_read_group(), which takes faults while
3722          *     holding ctx->mutex, however this is called after
3723          *     the last filedesc died, so there is no possibility
3724          *     to trigger the AB-BA case.
3725          */
3726         ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
3727         WARN_ON_ONCE(ctx->parent_ctx);
3728         perf_remove_from_context(event, true);
3729         perf_event_ctx_unlock(event, ctx);
3730 
3731         _free_event(event);
3732 }
3733 
3734 int perf_event_release_kernel(struct perf_event *event)
3735 {
3736         put_event(event);
3737         return 0;
3738 }
3739 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
3740 
3741 /*
3742  * Called when the last reference to the file is gone.
3743  */
3744 static int perf_release(struct inode *inode, struct file *file)
3745 {
3746         put_event(file->private_data);
3747         return 0;
3748 }
3749 
3750 /*
3751  * Remove all orphaned events from the context.
3752  */
3753 static void orphans_remove_work(struct work_struct *work)
3754 {
3755         struct perf_event_context *ctx;
3756         struct perf_event *event, *tmp;
3757 
3758         ctx = container_of(work, struct perf_event_context,
3759                            orphans_remove.work);
3760 
3761         mutex_lock(&ctx->mutex);
3762         list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
3763                 struct perf_event *parent_event = event->parent;
3764 
3765                 if (!is_orphaned_child(event))
3766                         continue;
3767 
3768                 perf_remove_from_context(event, true);
3769 
3770                 mutex_lock(&parent_event->child_mutex);
3771                 list_del_init(&event->child_list);
3772                 mutex_unlock(&parent_event->child_mutex);
3773 
3774                 free_event(event);
3775                 put_event(parent_event);
3776         }
3777 
3778         raw_spin_lock_irq(&ctx->lock);
3779         ctx->orphans_remove_sched = false;
3780         raw_spin_unlock_irq(&ctx->lock);
3781         mutex_unlock(&ctx->mutex);
3782 
3783         put_ctx(ctx);
3784 }
3785 
3786 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
3787 {
3788         struct perf_event *child;
3789         u64 total = 0;
3790 
3791         *enabled = 0;
3792         *running = 0;
3793 
3794         mutex_lock(&event->child_mutex);
3795         total += perf_event_read(event);
3796         *enabled += event->total_time_enabled +
3797                         atomic64_read(&event->child_total_time_enabled);
3798         *running += event->total_time_running +
3799                         atomic64_read(&event->child_total_time_running);
3800 
3801         list_for_each_entry(child, &event->child_list, child_list) {
3802                 total += perf_event_read(child);
3803                 *enabled += child->total_time_enabled;
3804                 *running += child->total_time_running;
3805         }
3806         mutex_unlock(&event->child_mutex);
3807 
3808         return total;
3809 }
3810 EXPORT_SYMBOL_GPL(perf_event_read_value);
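
/*
 * Illustrative sketch of a common userspace convention (not something
 * this function does itself; scale_count() is a hypothetical helper):
 * the enabled/running times returned above let callers estimate the full
 * count when the event was multiplexed and only ran for part of the time
 * it was enabled.
 *
 *	static __u64 scale_count(__u64 count, __u64 enabled, __u64 running)
 *	{
 *		if (!running)
 *			return 0;	// never scheduled in
 *		return (__u64)((double)count * enabled / running);
 *	}
 */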
3811 
3812 static int perf_event_read_group(struct perf_event *event,
3813                                    u64 read_format, char __user *buf)
3814 {
3815         struct perf_event *leader = event->group_leader, *sub;
3816         struct perf_event_context *ctx = leader->ctx;
3817         int n = 0, size = 0, ret;
3818         u64 count, enabled, running;
3819         u64 values[5];
3820 
3821         lockdep_assert_held(&ctx->mutex);
3822 
3823         count = perf_event_read_value(leader, &enabled, &running);
3824 
3825         values[n++] = 1 + leader->nr_siblings;
3826         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3827                 values[n++] = enabled;
3828         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3829                 values[n++] = running;
3830         values[n++] = count;
3831         if (read_format & PERF_FORMAT_ID)
3832                 values[n++] = primary_event_id(leader);
3833 
3834         size = n * sizeof(u64);
3835 
3836         if (copy_to_user(buf, values, size))
3837                 return -EFAULT;
3838 
3839         ret = size;
3840 
3841         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3842                 n = 0;
3843 
3844                 values[n++] = perf_event_read_value(sub, &enabled, &running);
3845                 if (read_format & PERF_FORMAT_ID)
3846                         values[n++] = primary_event_id(sub);
3847 
3848                 size = n * sizeof(u64);
3849 
3850                 if (copy_to_user(buf + ret, values, size)) {
3851                         return -EFAULT;
3852                 }
3853 
3854                 ret += size;
3855         }
3856 
3857         return ret;
3858 }
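
/*
 * Illustrative sketch of the buffer layout produced above when all
 * read_format bits are set (group_read is a hypothetical userspace
 * struct, shown only to make the u64 stream readable):
 *
 *	struct group_read {
 *		__u64 nr;		// 1 + nr_siblings
 *		__u64 time_enabled;	// PERF_FORMAT_TOTAL_TIME_ENABLED
 *		__u64 time_running;	// PERF_FORMAT_TOTAL_TIME_RUNNING
 *		__u64 leader_value;
 *		__u64 leader_id;	// PERF_FORMAT_ID
 *		struct {
 *			__u64 value;
 *			__u64 id;	// PERF_FORMAT_ID
 *		} sibling[];		// one entry per sibling
 *	};
 */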
3859 
3860 static int perf_event_read_one(struct perf_event *event,
3861                                  u64 read_format, char __user *buf)
3862 {
3863         u64 enabled, running;
3864         u64 values[4];
3865         int n = 0;
3866 
3867         values[n++] = perf_event_read_value(event, &enabled, &running);
3868         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3869                 values[n++] = enabled;
3870         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3871                 values[n++] = running;
3872         if (read_format & PERF_FORMAT_ID)
3873                 values[n++] = primary_event_id(event);
3874 
3875         if (copy_to_user(buf, values, n * sizeof(u64)))
3876                 return -EFAULT;
3877 
3878         return n * sizeof(u64);
3879 }
3880 
3881 static bool is_event_hup(struct perf_event *event)
3882 {
3883         bool no_children;
3884 
3885         if (event->state != PERF_EVENT_STATE_EXIT)
3886                 return false;
3887 
3888         mutex_lock(&event->child_mutex);
3889         no_children = list_empty(&event->child_list);
3890         mutex_unlock(&event->child_mutex);
3891         return no_children;
3892 }
3893 
3894 /*
3895  * Read the performance event - simple non-blocking version for now
3896  */
3897 static ssize_t
3898 perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
3899 {
3900         u64 read_format = event->attr.read_format;
3901         int ret;
3902 
3903         /*
3904          * Return end-of-file for a read on an event that is in
3905          * an error state (i.e. because it was pinned but it couldn't be
3906          * scheduled on to the CPU at some point).
3907          */
3908         if (event->state == PERF_EVENT_STATE_ERROR)
3909                 return 0;
3910 
3911         if (count < event->read_size)
3912                 return -ENOSPC;
3913 
3914         WARN_ON_ONCE(event->ctx->parent_ctx);
3915         if (read_format & PERF_FORMAT_GROUP)
3916                 ret = perf_event_read_group(event, read_format, buf);
3917         else
3918                 ret = perf_event_read_one(event, read_format, buf);
3919 
3920         return ret;
3921 }
3922 
3923 static ssize_t
3924 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3925 {
3926         struct perf_event *event = file->private_data;
3927         struct perf_event_context *ctx;
3928         int ret;
3929 
3930         ctx = perf_event_ctx_lock(event);
3931         ret = perf_read_hw(event, buf, count);
3932         perf_event_ctx_unlock(event, ctx);
3933 
3934         return ret;
3935 }
3936 
3937 static unsigned int perf_poll(struct file *file, poll_table *wait)
3938 {
3939         struct perf_event *event = file->private_data;
3940         struct ring_buffer *rb;
3941         unsigned int events = POLLHUP;
3942 
3943         poll_wait(file, &event->waitq, wait);
3944 
3945         if (is_event_hup(event))
3946                 return events;
3947 
3948         /*
3949          * Pin the event->rb by taking event->mmap_mutex; otherwise
3950          * perf_event_set_output() can swizzle our rb and make us miss wakeups.
3951          */
3952         mutex_lock(&event->mmap_mutex);
3953         rb = event->rb;
3954         if (rb)
3955                 events = atomic_xchg(&rb->poll, 0);
3956         mutex_unlock(&event->mmap_mutex);
3957         return events;
3958 }
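
/*
 * Illustrative userspace sketch (standard poll(2) usage; event_fd is a
 * hypothetical perf event fd, not code from this file): waiting for
 * ring-buffer data or for the event to go away.
 *
 *	struct pollfd pfd = { .fd = event_fd, .events = POLLIN };
 *
 *	if (poll(&pfd, 1, -1) > 0) {
 *		if (pfd.revents & POLLHUP)
 *			;	// event exited and has no children left
 *		else if (pfd.revents & POLLIN)
 *			;	// new data in the mmap()ed buffer
 *	}
 */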
3959 
3960 static void _perf_event_reset(struct perf_event *event)
3961 {
3962         (void)perf_event_read(event);
3963         local64_set(&event->count, 0);
3964         perf_event_update_userpage(event);
3965 }
3966 
3967 /*
3968  * Holding the top-level event's child_mutex means that any
3969  * descendant process that has inherited this event will block
3970  * in sync_child_event if it goes to exit, thus satisfying the
3971  * task existence requirements of perf_event_enable/disable.
3972  */
3973 static void perf_event_for_each_child(struct perf_event *event,
3974                                         void (*func)(struct perf_event *))
3975 {
3976         struct perf_event *child;
3977 
3978         WARN_ON_ONCE(event->ctx->parent_ctx);
3979 
3980         mutex_lock(&event->child_mutex);
3981         func(event);
3982         list_for_each_entry(child, &event->child_list, child_list)
3983                 func(child);
3984         mutex_unlock(&event->child_mutex);
3985 }
3986 
3987 static void perf_event_for_each(struct perf_event *event,
3988                                   void (*func)(struct perf_event *))
3989 {
3990         struct perf_event_context *ctx = event->ctx;
3991         struct perf_event *sibling;
3992 
3993         lockdep_assert_held(&ctx->mutex);
3994 
3995         event = event->group_leader;
3996 
3997         perf_event_for_each_child(event, func);
3998         list_for_each_entry(sibling, &event->sibling_list, group_entry)
3999                 perf_event_for_each_child(sibling, func);
4000 }
4001 
4002 struct period_event {
4003         struct perf_event *event;
4004         u64 value;
4005 };
4006 
4007 static int __perf_event_period(void *info)
4008 {
4009         struct period_event *pe = info;
4010         struct perf_event *event = pe->event;
4011         struct perf_event_context *ctx = event->ctx;
4012         u64 value = pe->value;
4013         bool active;
4014 
4015         raw_spin_lock(&ctx->lock);
4016         if (event->attr.freq) {
4017                 event->attr.sample_freq = value;
4018         } else {
4019                 event->attr.sample_period = value;
4020                 event->hw.sample_period = value;
4021         }
4022 
4023         active = (event->state == PERF_EVENT_STATE_ACTIVE);
4024         if (active) {
4025                 perf_pmu_disable(ctx->pmu);
4026                 event->pmu->stop(event, PERF_EF_UPDATE);
4027         }
4028 
4029         local64_set(&event->hw.period_left, 0);
4030 
4031         if (active) {
4032                 event->pmu->start(event, PERF_EF_RELOAD);
4033                 perf_pmu_enable(ctx->pmu);
4034         }
4035         raw_spin_unlock(&ctx->lock);
4036 
4037         return 0;
4038 }
4039 
4040 static int perf_event_period(struct perf_event *event, u64 __user *arg)
4041 {
4042         struct period_event pe = { .event = event, };
4043         struct perf_event_context *ctx = event->ctx;
4044         struct task_struct *task;
4045         u64 value;
4046 
4047         if (!is_sampling_event(event))
4048                 return -EINVAL;
4049 
4050         if (copy_from_user(&value, arg, sizeof(value)))
4051                 return -EFAULT;
4052 
4053         if (!value)
4054                 return -EINVAL;
4055 
4056         if (event->attr.freq && value > sysctl_perf_event_sample_rate)
4057                 return -EINVAL;
4058 
4059         task = ctx->task;
4060         pe.value = value;
4061 
4062         if (!task) {
4063                 cpu_function_call(event->cpu, __perf_event_period, &pe);
4064                 return 0;
4065         }
4066 
4067 retry:
4068         if (!task_function_call(task, __perf_event_period, &pe))
4069                 return 0;
4070 
4071         raw_spin_lock_irq(&ctx->lock);
4072         if (ctx->is_active) {
4073                 raw_spin_unlock_irq(&ctx->lock);
4074                 task = ctx->task;
4075                 goto retry;
4076         }
4077 
4078         __perf_event_period(&pe);
4079         raw_spin_unlock_irq(&ctx->lock);
4080 
4081         return 0;
4082 }
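
/*
 * Illustrative userspace sketch (standard ioctl ABI; event_fd is a
 * hypothetical perf event fd, not code from this file): changing the
 * sample period of a live event without reopening it.  For a freq-based
 * event the same ioctl updates sample_freq instead, as handled above.
 *
 *	__u64 new_period = 100000;
 *
 *	if (ioctl(event_fd, PERF_EVENT_IOC_PERIOD, &new_period))
 *		perror("PERF_EVENT_IOC_PERIOD");
 */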
4083 
4084 static const struct file_operations perf_fops;
4085 
4086 static inline int perf_fget_light(int fd, struct fd *p)
4087 {
4088         struct fd f = fdget(fd);
4089         if (!f.file)
4090                 return -EBADF;
4091 
4092         if (f.file->f_op != &perf_fops) {
4093                 fdput(f);
4094                 return -EBADF;
4095         }
4096         *p = f;
4097         return 0;
4098 }
4099 
4100 static int perf_event_set_output(struct perf_event *event,
4101                                  struct perf_event *output_event);
4102 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
4103 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
4104 
4105 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
4106 {
4107         void (*func)(struct perf_event *);
4108         u32 flags = arg;
4109 
4110         switch (cmd) {
4111         case PERF_EVENT_IOC_ENABLE:
4112                 func = _perf_event_enable;
4113                 break;
4114         case PERF_EVENT_IOC_DISABLE:
4115                 func = _perf_event_disable;
4116                 break;
4117         case PERF_EVENT_IOC_RESET:
4118                 func = _perf_event_reset;
4119                 break;
4120 
4121         case PERF_EVENT_IOC_REFRESH:
4122                 return _perf_event_refresh(event, arg);
4123 
4124         case PERF_EVENT_IOC_PERIOD:
4125                 return perf_event_period(event, (u64 __user *)arg);
4126 
4127         case PERF_EVENT_IOC_ID:
4128         {
4129                 u64 id = primary_event_id(event);
4130 
4131                 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
4132                         return -EFAULT;
4133                 return 0;
4134         }
4135 
4136         case PERF_EVENT_IOC_SET_OUTPUT:
4137         {
4138                 int ret;
4139                 if (arg != -1) {
4140                         struct perf_event *output_event;
4141                         struct fd output;
4142                         ret = perf_fget_light(arg, &output);
4143                         if (ret)
4144                                 return ret;
4145                         output_event = output.file->private_data;
4146                         ret = perf_event_set_output(event, output_event);
4147                         fdput(output);
4148                 } else {
4149                         ret = perf_event_set_output(event, NULL);
4150                 }
4151                 return ret;
4152         }
4153 
4154         case PERF_EVENT_IOC_SET_FILTER:
4155                 return perf_event_set_filter(event, (void __user *)arg);
4156 
4157         case PERF_EVENT_IOC_SET_BPF:
4158                 return perf_event_set_bpf_prog(event, arg);
4159 
4160         default:
4161                 return -ENOTTY;
4162         }
4163 
4164         if (flags & PERF_IOC_FLAG_GROUP)
4165                 perf_event_for_each(event, func);
4166         else
4167                 perf_event_for_each_child(event, func);
4168 
4169         return 0;
4170 }
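
/*
 * Illustrative userspace sketch (standard ioctl ABI; group_leader_fd is a
 * hypothetical group leader fd, not code from this file): passing
 * PERF_IOC_FLAG_GROUP makes the simple commands above act on the whole
 * group via perf_event_for_each().
 *
 *	ioctl(group_leader_fd, PERF_EVENT_IOC_RESET,   PERF_IOC_FLAG_GROUP);
 *	ioctl(group_leader_fd, PERF_EVENT_IOC_ENABLE,  PERF_IOC_FLAG_GROUP);
 *	// ... run the workload ...
 *	ioctl(group_leader_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
 */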
4171 
4172 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
4173 {
4174         struct perf_event *event = file->private_data;
4175         struct perf_event_context *ctx;
4176         long ret;
4177 
4178         ctx = perf_event_ctx_lock(event);
4179         ret = _perf_ioctl(event, cmd, arg);
4180         perf_event_ctx_unlock(event, ctx);
4181 
4182         return ret;
4183 }
4184 
4185 #ifdef CONFIG_COMPAT
4186 static long perf_compat_ioctl(struct file *file, unsigned int cmd,
4187                                 unsigned long arg)
4188 {
4189         switch (_IOC_NR(cmd)) {
4190         case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
4191         case _IOC_NR(PERF_EVENT_IOC_ID):
4192                 /* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case) */
4193                 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
4194                         cmd &= ~IOCSIZE_MASK;
4195                         cmd |= sizeof(void *) << IOCSIZE_SHIFT;
4196                 }
4197                 break;
4198         }
4199         return perf_ioctl(file, cmd, arg);
4200 }
4201 #else
4202 # define perf_compat_ioctl NULL
4203 #endif
4204 
4205 int perf_event_task_enable(void)
4206 {
4207         struct perf_event_context *ctx;
4208         struct perf_event *event;
4209 
4210         mutex_lock(&current->perf_event_mutex);
4211         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4212                 ctx = perf_event_ctx_lock(event);
4213                 perf_event_for_each_child(event, _perf_event_enable);
4214                 perf_event_ctx_unlock(event, ctx);
4215         }
4216         mutex_unlock(&current->perf_event_mutex);
4217 
4218         return 0;
4219 }
4220 
4221 int perf_event_task_disable(void)
4222 {
4223         struct perf_event_context *ctx;
4224         struct perf_event *event;
4225 
4226         mutex_lock(&current->perf_event_mutex);
4227         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4228                 ctx = perf_event_ctx_lock(event);
4229                 perf_event_for_each_child(event, _perf_event_disable);
4230                 perf_event_ctx_unlock(event, ctx);
4231         }
4232         mutex_unlock(&current->perf_event_mutex);
4233 
4234         return 0;
4235 }
4236 
4237 static int perf_event_index(struct perf_event *event)
4238 {
4239         if (event->hw.state & PERF_HES_STOPPED)
4240                 return 0;
4241 
4242         if (event->state != PERF_EVENT_STATE_ACTIVE)
4243                 return 0;
4244 
4245         return event->pmu->event_idx(event);
4246 }
4247 
4248 static void calc_timer_values(struct perf_event *event,
4249                                 u64 *now,
4250                                 u64 *enabled,
4251                                 u64 *running)
4252 {
4253         u64 ctx_time;
4254 
4255         *now = perf_clock();
4256         ctx_time = event->shadow_ctx_time + *now;
4257         *enabled = ctx_time - event->tstamp_enabled;
4258         *running = ctx_time - event->tstamp_running;
4259 }
4260 
4261 static void perf_event_init_userpage(struct perf_event *event)
4262 {
4263         struct perf_event_mmap_page *userpg;
4264         struct ring_buffer *rb;
4265 
4266         rcu_read_lock();
4267         rb = rcu_dereference(event->rb);
4268         if (!rb)
4269                 goto unlock;
4270 
4271         userpg = rb->user_page;
4272 
4273         /* Allow new userspace to detect that bit 0 is deprecated */
4274         userpg->cap_bit0_is_deprecated = 1;
4275         userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
4276         userpg->data_offset = PAGE_SIZE;
4277         userpg->data_size = perf_data_size(rb);
4278 
4279 unlock:
4280         rcu_read_unlock();
4281 }
4282 
4283 void __weak arch_perf_update_userpage(
4284         struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
4285 {
4286 }
4287 
4288 /*
4289  * Callers need to ensure there can be no nesting of this function, otherwise
4290  * the seqlock logic goes bad. We cannot serialize this because the arch
4291  * code calls this from NMI context.
4292  */
4293 void perf_event_update_userpage(struct perf_event *event)
4294 {
4295         struct perf_event_mmap_page *userpg;
4296         struct ring_buffer *rb;
4297         u64 enabled, running, now;
4298 
4299         rcu_read_lock();
4300         rb = rcu_dereference(event->rb);
4301         if (!rb)
4302                 goto unlock;
4303 
4304         /*
4305          * Compute total_time_enabled and total_time_running
4306          * based on snapshot values taken when the event
4307          * was last scheduled in.
4308          *
4309          * We cannot simply call update_context_time()
4310          * because of locking issues, as we can be called in
4311          * NMI context.
4312          */
4313         calc_timer_values(event, &now, &enabled, &running);
4314 
4315         userpg = rb->user_page;
4316         /*
4317          * Disable preemption so as to not let the corresponding user-space
4318          * spin too long if we get preempted.
4319          */
4320         preempt_disable();
4321         ++userpg->lock;
4322         barrier();
4323         userpg->index = perf_event_index(event);
4324         userpg->offset = perf_event_count(event);
4325         if (userpg->index)
4326                 userpg->offset -= local64_read(&event->hw.prev_count);
4327 
4328         userpg->time_enabled = enabled +
4329                         atomic64_read(&event->child_total_time_enabled);
4330 
4331         userpg->time_running = running +
4332                         atomic64_read(&event->child_total_time_running);
4333 
4334         arch_perf_update_userpage(event, userpg, now);
4335 
4336         barrier();
4337         ++userpg->lock;
4338         preempt_enable();
4339 unlock:
4340         rcu_read_unlock();
4341 }
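
/*
 * Illustrative userspace sketch of the reader side of the seqlock above
 * (follows the pattern documented for struct perf_event_mmap_page;
 * mapped_base is a hypothetical pointer to the first mmap()ed page):
 *
 *	volatile struct perf_event_mmap_page *pc = mapped_base;
 *	__u32 seq;
 *	__u64 offset, enabled, running;
 *
 *	do {
 *		seq = pc->lock;
 *		__sync_synchronize();		// barrier()
 *		offset  = pc->offset;
 *		enabled = pc->time_enabled;
 *		running = pc->time_running;
 *		__sync_synchronize();
 *	} while (pc->lock != seq);
 */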
4342 
4343 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
4344 {
4345         struct perf_event *event = vma->vm_file->private_data;
4346         struct ring_buffer *rb;
4347         int ret = VM_FAULT_SIGBUS;
4348 
4349         if (vmf->flags & FAULT_FLAG_MKWRITE) {
4350                 if (vmf->pgoff == 0)
4351                         ret = 0;
4352                 return ret;
4353         }
4354 
4355         rcu_read_lock();
4356         rb = rcu_dereference(event->rb);
4357         if (!rb)
4358                 goto unlock;
4359 
4360         if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
4361                 goto unlock;
4362 
4363         vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
4364         if (!vmf->page)
4365                 goto unlock;
4366 
4367         get_page(vmf->page);
4368         vmf->page->mapping = vma->vm_file->f_mapping;
4369         vmf->page->index   = vmf->pgoff;
4370 
4371         ret = 0;
4372 unlock:
4373         rcu_read_unlock();
4374 
4375         return ret;
4376 }
4377 
4378 static void ring_buffer_attach(struct perf_event *event,
4379                                struct ring_buffer *rb)
4380 {
4381         struct ring_buffer *old_rb = NULL;
4382         unsigned long flags;
4383 
4384         if (event->rb) {
4385                 /*
4386                  * Should be impossible, we set this when removing
4387                  * event->rb_entry and wait/clear when adding event->rb_entry.
4388                  */
4389                 WARN_ON_ONCE(event->rcu_pending);
4390 
4391                 old_rb = event->rb;
4392                 spin_lock_irqsave(&old_rb->event_lock, flags);
4393                 list_del_rcu(&event->rb_entry);
4394                 spin_unlock_irqrestore(&old_rb->event_lock, flags);
4395 
4396                 event->rcu_batches = get_state_synchronize_rcu();
4397                 event->rcu_pending = 1;
4398         }
4399 
4400         if (rb) {
4401                 if (event->rcu_pending) {
4402                         cond_synchronize_rcu(event->rcu_batches);
4403                         event->rcu_pending = 0;
4404                 }
4405 
4406                 spin_lock_irqsave(&rb->event_lock, flags);
4407                 list_add_rcu(&event->rb_entry, &rb->event_list);
4408                 spin_unlock_irqrestore(&rb->event_lock, flags);
4409         }
4410 
4411         rcu_assign_pointer(event->rb, rb);
4412 
4413         if (old_rb) {
4414                 ring_buffer_put(old_rb);
4415                 /*
4416                  * Since we detached before setting the new rb (so that we
4417                  * could attach the new rb), we could have missed a wakeup.
4418                  * Provide it now.
4419                  */
4420                 wake_up_all(&event->waitq);
4421         }
4422 }
4423 
4424 static void ring_buffer_wakeup(struct perf_event *event)
4425 {
4426         struct ring_buffer *rb;
4427 
4428         rcu_read_lock();
4429         rb = rcu_dereference(event->rb);
4430         if (rb) {
4431                 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
4432                         wake_up_all(&event->waitq);
4433         }
4434         rcu_read_unlock();
4435 }
4436 
4437 struct ring_buffer *ring_buffer_get(struct perf_event *event)
4438 {
4439         struct ring_buffer *rb;
4440 
4441         rcu_read_lock();
4442         rb = rcu_dereference(event->rb);
4443         if (rb) {
4444                 if (!atomic_inc_not_zero(&rb->refcount))
4445                         rb = NULL;
4446         }
4447         rcu_read_unlock();
4448 
4449         return rb;
4450 }
4451 
4452 void ring_buffer_put(struct ring_buffer *rb)
4453 {
4454         if (!atomic_dec_and_test(&rb->refcount))
4455                 return;
4456 
4457         WARN_ON_ONCE(!list_empty(&rb->event_list));
4458 
4459         call_rcu(&rb->rcu_head, rb_free_rcu);
4460 }
4461 
4462 static void perf_mmap_open(struct vm_area_struct *vma)
4463 {
4464         struct perf_event *event = vma->vm_file->private_data;
4465 
4466         atomic_inc(&event->mmap_count);
4467         atomic_inc(&event->rb->mmap_count);
4468 
4469         if (vma->vm_pgoff)
4470                 atomic_inc(&event->rb->aux_mmap_count);
4471 
4472         if (event->pmu->event_mapped)
4473                 event->pmu->event_mapped(event);
4474 }
4475 
4476 /*
4477  * A buffer can be mmap()ed multiple times; either directly through the same
4478  * event, or through other events by use of perf_event_set_output().
4479  *
4480  * In order to undo the VM accounting done by perf_mmap() we need to destroy
4481  * the buffer here, where we still have a VM context. This means we need
4482  * to detach all events redirecting to us.
4483  */
4484 static void perf_mmap_close(struct vm_area_struct *vma)
4485 {
4486         struct perf_event *event = vma->vm_file->private_data;
4487 
4488         struct ring_buffer *rb = ring_buffer_get(event);
4489         struct user_struct *mmap_user = rb->mmap_user;
4490         int mmap_locked = rb->mmap_locked;
4491         unsigned long size = perf_data_size(rb);
4492 
4493         if (event->pmu->event_unmapped)
4494                 event->pmu->event_unmapped(event);
4495 
4496         /*
4497          * rb->aux_mmap_count will always drop before rb->mmap_count and
4498          * event->mmap_count, so it is ok to use event->mmap_mutex to
4499          * serialize with perf_mmap here.
4500          */
4501         if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
4502             atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
4503                 atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
4504                 vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
4505 
4506                 rb_free_aux(rb);
4507                 mutex_unlock(&event->mmap_mutex);
4508         }
4509 
4510         atomic_dec(&rb->mmap_count);
4511 
4512         if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
4513                 goto out_put;
4514 
4515         ring_buffer_attach(event, NULL);
4516         mutex_unlock(&event->mmap_mutex);
4517 
4518         /* If there's still other mmap()s of this buffer, we're done. */
4519         if (atomic_read(&rb->mmap_count))
4520                 goto out_put;
4521 
4522         /*
4523          * No other mmap()s, detach from all other events that might redirect
4524          * into the now unreachable buffer. Somewhat complicated by the
4525          * fact that rb::event_lock otherwise nests inside mmap_mutex.
4526          */
4527 again:
4528         rcu_read_lock();
4529         list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
4530                 if (!atomic_long_inc_not_zero(&event->refcount)) {
4531                         /*
4532                          * This event is en-route to free_event() which will
4533                          * detach it and remove it from the list.
4534                          */
4535                         continue;
4536                 }
4537                 rcu_read_unlock();
4538 
4539                 mutex_lock(&event->mmap_mutex);
4540                 /*
4541                  * Check we didn't race with perf_event_set_output() which can
4542                  * swizzle the rb from under us while we were waiting to
4543                  * acquire mmap_mutex.
4544                  *
4545                  * If we find a different rb, ignore this event; the next
4546                  * iteration will no longer find it on the list. We still
4547                  * have to restart the iteration to make sure we're not now
4548                  * iterating the wrong list.
4549                  */
4550                 if (event->rb == rb)
4551                         ring_buffer_attach(event, NULL);
4552 
4553                 mutex_unlock(&event->mmap_mutex);
4554                 put_event(event);
4555 
4556                 /*
4557                  * Restart the iteration; either we're on the wrong list or
4558                  * we destroyed its integrity by doing a deletion.
4559                  */
4560                 goto again;
4561         }
4562         rcu_read_unlock();
4563 
4564         /*
4565          * It could be that there are still a few 0-ref events on the list; they'll
4566          * get cleaned up by free_event() -- they'll also still have their
4567          * ref on the rb and will free it whenever they are done with it.
4568          *
4569          * Aside from that, this buffer is 'fully' detached and unmapped,
4570          * undo the VM accounting.
4571          */
4572 
4573         atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
4574         vma->vm_mm->pinned_vm -= mmap_locked;
4575         free_uid(mmap_user);
4576 
4577 out_put:
4578         ring_buffer_put(rb); /* could be last */
4579 }
4580 
4581 static const struct vm_operations_struct perf_mmap_vmops = {
4582         .open           = perf_mmap_open,
4583         .close          = perf_mmap_close, /* non-mergeable */
4584         .fault          = perf_mmap_fault,
4585         .page_mkwrite   = perf_mmap_fault,
4586 };
4587 
4588 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
4589 {
4590         struct perf_event *event = file->private_data;
4591         unsigned long user_locked, user_lock_limit;
4592         struct user_struct *user = current_user();
4593         unsigned long locked, lock_limit;
4594         struct ring_buffer *rb = NULL;
4595         unsigned long vma_size;
4596         unsigned long nr_pages;
4597         long user_extra = 0, extra = 0;
4598         int ret = 0, flags = 0;
4599 
4600         /*
4601          * Don't allow mmap() of inherited per-task counters. This would
4602          * create a performance issue due to all children writing to the
4603          * same rb.
4604          */
4605         if (event->cpu == -1 && event->attr.inherit)
4606                 return -EINVAL;
4607 
4608         if (!(vma->vm_flags & VM_SHARED))
4609                 return -EINVAL;
4610 
4611         vma_size = vma->vm_end - vma->vm_start;
4612 
4613         if (vma->vm_pgoff == 0) {
4614                 nr_pages = (vma_size / PAGE_SIZE) - 1;
4615         } else {
4616                 /*
4617                  * AUX area mapping: if rb->aux_nr_pages != 0, it's already
4618                  * mapped; all subsequent mappings should have the same size
4619                  * and offset. Must be above the normal perf buffer.
4620                  */
4621                 u64 aux_offset, aux_size;
4622 
4623                 if (!event->rb)
4624                         return -EINVAL;
4625 
4626                 nr_pages = vma_size / PAGE_SIZE;
4627 
4628                 mutex_lock(&event->mmap_mutex);
4629                 ret = -EINVAL;
4630 
4631                 rb = event->rb;
4632                 if (!rb)
4633                         goto aux_unlock;
4634 
4635                 aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
4636                 aux_size = ACCESS_ONCE(rb->user_page->aux_size);
4637 
4638                 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
4639                         goto aux_unlock;
4640 
4641                 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
4642                         goto aux_unlock;
4643 
4644                 /* already mapped with a different offset */
4645                 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
4646                         goto aux_unlock;
4647 
4648                 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
4649                         goto aux_unlock;
4650 
4651                 /* already mapped with a different size */
4652                 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
4653                         goto aux_unlock;
4654 
4655                 if (!is_power_of_2(nr_pages))
4656                         goto aux_unlock;
4657 
4658                 if (!atomic_inc_not_zero(&rb->mmap_count))
4659                         goto aux_unlock;
4660 
4661                 if (rb_has_aux(rb)) {
4662                         atomic_inc(&rb->aux_mmap_count);
4663                         ret = 0;
4664                         goto unlock;
4665                 }
4666 
4667                 atomic_set(&rb->aux_mmap_count, 1);
4668                 user_extra = nr_pages;
4669 
4670                 goto accounting;
4671         }
4672 
4673         /*
4674          * If we have rb pages ensure they're a power-of-two number, so we
4675          * can do bitmasks instead of modulo.
4676          */
4677         if (nr_pages != 0 && !is_power_of_2(nr_pages))
4678                 return -EINVAL;
4679 
4680         if (vma_size != PAGE_SIZE * (1 + nr_pages))
4681                 return -EINVAL;
4682 
4683         WARN_ON_ONCE(event->ctx->parent_ctx);
4684 again:
4685         mutex_lock(&event->mmap_mutex);
4686         if (event->rb) {
4687                 if (event->rb->nr_pages != nr_pages) {
4688                         ret = -EINVAL;
4689                         goto unlock;
4690                 }
4691 
4692                 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
4693                         /*
4694                          * Raced against perf_mmap_close() through
4695                          * perf_event_set_output(). Try again, hope for better
4696                          * luck.
4697                          */
4698                         mutex_unlock(&event->mmap_mutex);
4699                         goto again;
4700                 }
4701 
4702                 goto unlock;
4703         }
4704 
4705         user_extra = nr_pages + 1;
4706 
4707 accounting:
4708         user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
4709 
4710         /*
4711          * Increase the limit linearly with more CPUs:
4712          */
4713         user_lock_limit *= num_online_cpus();
4714 
4715         user_locked = atomic_long_read(&user->locked_vm) + user_extra;
4716 
4717         if (user_locked > user_lock_limit)
4718                 extra = user_locked - user_lock_limit;
4719 
4720         lock_limit = rlimit(RLIMIT_MEMLOCK);
4721         lock_limit >>= PAGE_SHIFT;
4722         locked = vma->vm_mm->pinned_vm + extra;
4723 
4724         if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
4725                 !capable(CAP_IPC_LOCK)) {
4726                 ret = -EPERM;
4727                 goto unlock;
4728         }
4729 
4730         WARN_ON(!rb && event->rb);
4731 
4732         if (vma->vm_flags & VM_WRITE)
4733                 flags |= RING_BUFFER_WRITABLE;
4734 
4735         if (!rb) {
4736                 rb = rb_alloc(nr_pages,
4737                               event->attr.watermark ? event->attr.wakeup_watermark : 0,
4738                               event->cpu, flags);
4739 
4740                 if (!rb) {
4741                         ret = -ENOMEM;
4742                         goto unlock;
4743                 }
4744 
4745                 atomic_set(&rb->mmap_count, 1);
4746                 rb->mmap_user = get_current_user();
4747                 rb->mmap_locked = extra;
4748 
4749                 ring_buffer_attach(event, rb);
4750 
4751                 perf_event_init_userpage(event);
4752                 perf_event_update_userpage(event);
4753         } else {
4754                 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
4755                                    event->attr.aux_watermark, flags);
4756                 if (!ret)
4757                         rb->aux_mmap_locked = extra;
4758         }
4759 
4760 unlock:
4761         if (!ret) {
4762                 atomic_long_add(user_extra, &user->locked_vm);
4763                 vma->vm_mm->pinned_vm += extra;
4764 
4765                 atomic_inc(&event->mmap_count);
4766         } else if (rb) {
4767                 atomic_dec(&rb->mmap_count);
4768         }
4769 aux_unlock:
4770         mutex_unlock(&event->mmap_mutex);
4771 
4772         /*
4773          * Since pinned accounting is per vm we cannot allow fork() to copy our
4774          * vma.
4775          */
4776         vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
4777         vma->vm_ops = &perf_mmap_vmops;
4778 
4779         if (event->pmu->event_mapped)
4780                 event->pmu->event_mapped(event);
4781 
4782         return ret;
4783 }
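
/*
 * Illustrative user-space sketch, not part of this file: a caller that
 * wants the ring buffer handled by perf_mmap() above typically maps one
 * metadata page plus a power-of-two number of data pages over an fd
 * obtained from perf_event_open() (fd and n below are hypothetical
 * parameters).  The VM_DONTCOPY/VM_DONTEXPAND flags set above mean the
 * resulting mapping is neither inherited across fork() nor resizable.
 */
#if 0	/* example only -- user-space, not kernel code */
#include <unistd.h>
#include <sys/mman.h>

static void *map_perf_buffer(int fd, unsigned int n)
{
	long page_size = sysconf(_SC_PAGESIZE);
	/* One metadata page followed by 2^n data pages. */
	size_t len = (size_t)(1 + (1UL << n)) * page_size;

	return mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
}
#endif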
4784 
4785 static int perf_fasync(int fd, struct file *filp, int on)
4786 {
4787         struct inode *inode = file_inode(filp);
4788         struct perf_event *event = filp->private_data;
4789         int retval;
4790 
4791         mutex_lock(&inode->i_mutex);
4792         retval = fasync_helper(fd, filp, on, &event->fasync);
4793         mutex_unlock(&inode->i_mutex);
4794 
4795         if (retval < 0)
4796                 return retval;
4797 
4798         return 0;
4799 }
4800 
4801 static const struct file_operations perf_fops = {
4802         .llseek                 = no_llseek,
4803         .release                = perf_release,
4804         .read                   = perf_read,
4805         .poll                   = perf_poll,
4806         .unlocked_ioctl         = perf_ioctl,
4807         .compat_ioctl           = perf_compat_ioctl,
4808         .mmap                   = perf_mmap,
4809         .fasync                 = perf_fasync,
4810 };
4811 
4812 /*
4813  * Perf event wakeup
4814  *
4815  * If there's data, ensure we set the poll() state and publish everything
4816  * to user-space before waking everybody up.
4817  */
4818 
4819 static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
4820 {
4821         /* only the parent has fasync state */
4822         if (event->parent)
4823                 event = event->parent;
4824         return &event->fasync;
4825 }
4826 
4827 void perf_event_wakeup(struct perf_event *event)
4828 {
4829         ring_buffer_wakeup(event);
4830 
4831         if (event->pending_kill) {
4832                 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
4833                 event->pending_kill = 0;
4834         }
4835 }
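
/*
 * Illustrative user-space sketch, not part of this file: the wakeup above
 * is what makes a blocked poll() on the event fd return (and, when
 * pending_kill is set, what raises SIGIO for fasync owners).  A minimal
 * consumer, assuming fd came from perf_event_open(), could look like this.
 */
#if 0	/* example only -- user-space, not kernel code */
#include <poll.h>

static int wait_for_samples(int fd)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	int ret;

	/* Returns once the kernel side has called perf_event_wakeup(). */
	ret = poll(&pfd, 1, -1);
	if (ret < 0)
		return ret;

	return (pfd.revents & POLLIN) ? 1 : 0;
}
#endif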
4836 
4837 static void perf_pending_event(struct irq_work *entry)
4838 {
4839         struct perf_event *event = container_of(entry,
4840                         struct perf_event, pending);
4841         int rctx;
4842 
4843         rctx = perf_swevent_get_recursion_context();
4844         /*
4845          * If we 'fail' here, that's OK, it means recursion is already disabled
4846          * and we won't recurse 'further'.
4847          */
4848 
4849         if (event->pending_disable) {
4850                 event->pending_disable = 0;
4851                 __perf_event_disable(event);
4852         }
4853 
4854         if (event->pending_wakeup) {
4855                 event->pending_wakeup = 0;
4856                 perf_event_wakeup(event);
4857         }
4858 
4859         if (rctx >= 0)
4860                 perf_swevent_put_recursion_context(rctx);
4861 }
4862 
4863 /*
4864  * We assume KVM is the only user of these callbacks.
4865  * Later on, we might change this to a list if another
4866  * virtualization implementation needs the callbacks as well.
4867  */
4868 struct perf_guest_info_callbacks *perf_guest_cbs;
4869 
4870 int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
4871 {
4872         perf_guest_cbs = cbs;
4873         return 0;
4874 }
4875 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
4876 
4877 int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
4878 {
4879         perf_guest_cbs = NULL;
4880         return 0;
4881 }
4882 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
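
/*
 * Illustrative sketch, not part of this file: a hypervisor module would
 * register a static callback table at init time and unregister it on
 * exit.  The names below are hypothetical; the three callbacks follow
 * the field names used by struct perf_guest_info_callbacks in this
 * kernel series.
 */
#if 0	/* example only -- hypothetical hypervisor module */
static int example_is_in_guest(void)
{
	return 0;		/* this stub never reports "in guest" */
}

static int example_is_user_mode(void)
{
	return 0;
}

static unsigned long example_get_guest_ip(void)
{
	return 0;
}

static struct perf_guest_info_callbacks example_guest_cbs = {
	.is_in_guest	= example_is_in_guest,
	.is_user_mode	= example_is_user_mode,
	.get_guest_ip	= example_get_guest_ip,
};

/* module init:  perf_register_guest_info_callbacks(&example_guest_cbs);   */
/* module exit:  perf_unregister_guest_info_callbacks(&example_guest_cbs); */
#endif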
4883 
4884 static void
4885 perf_output_sample_regs(struct perf_output_handle *handle,
4886                         struct pt_regs *regs, u64 mask)
4887 {
4888         int bit;
4889 
4890         for_each_set_bit(bit, (const unsigned long *) &mask,
4891                          sizeof(mask) * BITS_PER_BYTE) {
4892                 u64 val;
4893 
4894                 val = perf_reg_value(regs, bit);
4895                 perf_output_put(handle, val);
4896         }
4897 }
4898 
4899 static void perf_sample_regs_user(struct perf_regs *regs_user,
4900                                   struct pt_regs *regs,
4901                                   struct pt_regs *regs_user_copy)
4902 {
4903         if (user_mode(regs)) {
4904                 regs_user->abi = perf_reg_abi(current);
4905                 regs_user->regs = regs;
4906         } else if (current->mm) {
4907                 perf_get_regs_user(regs_user, regs, regs_user_copy);
4908         } else {
4909                 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
4910                 regs_user->regs = NULL;
4911         }
4912 }
4913 
4914 static void perf_sample_regs_intr(struct perf_regs *regs_intr,
4915                                   struct pt_regs *regs)
4916 {
4917         regs_intr->regs = regs;
4918         regs_intr->abi  = perf_reg_abi(current);
4919 }
4920 
4921 
4922 /*
4923  * Get remaining task size from user stack pointer.
4924  *
4925  * It would be better to look up the stack VMA and limit this more
4926  * precisely, but there is no safe way to do that from interrupt
4927  * context, so TASK_SIZE is used as the limit.
4928  */
4929 static u64 perf_ustack_task_size(struct pt_regs *regs)
4930 {
4931         unsigned long addr = perf_user_stack_pointer(regs);
4932 
4933         if (!addr || addr >= TASK_SIZE)
4934                 return 0;
4935 
4936         return TASK_SIZE - addr;
4937 }
4938 
4939 static u16
4940 perf_sample_ustack_size(u16 stack_size, u16 header_size,
4941                         struct pt_regs *regs)
4942 {
4943         u64 task_size;
4944 
4945         /* No regs, no stack pointer, no dump. */
4946         if (!regs)
4947                 return 0;
4948 
4949         /*
4950          * Check that the requested stack size fits into both:
4951          * - TASK_SIZE
4952          *   If it does not, limit the size to TASK_SIZE.
4953          *
4954          * - the remaining sample size
4955          *   If it does not, shrink the stack size to fit into
4956          *   the remaining sample size.
4957          */
4958 
4959         task_size  = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
4960         stack_size = min(stack_size, (u16) task_size);
4961 
4962         /* Current header size plus static size and dynamic size. */
4963         header_size += 2 * sizeof(u64);
4964 
4965         /* Do we fit in with the current stack dump size? */
4966         if ((u16) (header_size + stack_size) < header_size) {
4967                 /*
4968                  * If we would overflow the maximum sample size,
4969                  * we shrink the stack dump size so it fits.
4970                  */
4971                 stack_size = USHRT_MAX - header_size - sizeof(u64);
4972                 stack_size = round_up(stack_size, sizeof(u64));
4973         }
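
        /*
         * Worked example for the clamp above, with illustrative numbers:
         * a requested stack_size of 0xffff (65535) combined with a
         * header_size (after the 2 * sizeof(u64) bump) of, say, 352 wraps
         * the u16 sum, so the branch clamps stack_size to
         * USHRT_MAX - 352 - sizeof(u64) = 65175 and rounds it up to
         * 65176, keeping the total within the u16 record size.
         */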
4974 
4975         return stack_size;
4976 }
4977 
4978 static void
4979 perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
4980                           struct pt_regs *regs)
4981 {
4982         /* Case of a kernel thread, nothing to dump */
4983         if (!regs) {
4984                 u64 size = 0;
4985                 perf_output_put(handle, size);
4986         } else {
4987                 unsigned long sp;
4988                 unsigned int rem;
4989                 u64 dyn_size;
4990 
4991                 /*
4992                  * We dump:
4993                  * static size
4994                  *   - the size requested by the user, or the largest size
4995                  *     that fits into the maximum sample size
4996                  * data
4997                  *   - the user stack dump data
4998                  * dynamic size
4999                  *   - the number of bytes actually dumped
5000                  */
5001 
5002                 /* Static size. */
5003                 perf_output_put(handle, dump_size);
5004 
5005                 /* Data. */
5006                 sp = perf_user_stack_pointer(regs);
5007                 rem = __output_copy_user(handle, (void *) sp, dump_size);
5008                 dyn_size = dump_size - rem;
5009 
5010                 perf_output_skip(handle, rem);
5011 
5012                 /* Dynamic size. */
5013                 perf_output_put(handle, dyn_size);
5014         }
5015 }
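
/*
 * Illustrative user-space sketch, not part of this file: a consumer that
 * has already located the PERF_SAMPLE_STACK_USER portion of a sample can
 * walk the "size, data[size], dyn_size" layout written above.  The helper
 * name and struct are hypothetical; dyn_size is only present when size is
 * non-zero (the kernel-thread case above emits just a zero size).
 */
#if 0	/* example only -- user-space decoder, not kernel code */
#include <stdint.h>
#include <string.h>

struct ustack_dump {
	uint64_t	size;		/* static size, written first */
	const void	*data;		/* size bytes of raw stack data */
	uint64_t	dyn_size;	/* bytes actually copied */
};

static const void *decode_ustack(const void *p, struct ustack_dump *out)
{
	const uint8_t *cur = p;

	memcpy(&out->size, cur, sizeof(uint64_t));
	cur += sizeof(uint64_t);

	out->data = cur;
	cur += out->size;

	out->dyn_size = 0;
	if (out->size) {
		memcpy(&out->dyn_size, cur, sizeof(uint64_t));
		cur += sizeof(uint64_t);
	}
	return cur;			/* start of the next sample field */
}
#endif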
5016 
5017 static void __perf_event_header__init_id(struct perf_event_header *header,
5018                                          struct perf_sample_data *data,
5019                                          struct perf_event *event)
5020 {
5021         u64 sample_type = event->attr.sample_type;
5022 
5023         data->type = sample_type;
5024         header->size += event->id_header_size;
5025 
5026         if (sample_type & PERF_SAMPLE_TID) {
5027                 /* namespace issues */
5028                 data->tid_entry.pid = perf_event_pid(event, current);
5029                 data->tid_entry.tid = perf_event_tid(event, current);
5030         }
5031 
5032         if (sample_type & PERF_SAMPLE_TIME)
5033                 data->time = perf_event_clock(event);
5034 
5035         if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
5036                 data->id = primary_event_id(event);
5037 
5038         if (sample_type & PERF_SAMPLE_STREAM_ID)
5039                 data->stream_id = event->id;
5040 
5041         if (sample_type & PERF_SAMPLE_CPU) {
5042                 data->cpu_entry.cpu      = raw_smp_processor_id();
5043                 data->cpu_entry.reserved = 0;
5044         }
5045 }
5046 
5047 void perf_event_header__init_id(struct perf_event_header *header,
5048                                 struct perf_sample_data *data,
5049                                 struct perf_event *event)
5050 {
5051         if (event->attr.sample_id_all)
5052                 __perf_event_header__init_id(header, data, event);
5053 }
5054 
5055 static void __perf_event__output_id_sample(struct perf_output_handle *handle,
5056                                            struct perf_sample_data *data)
5057 {
5058         u64 sample_type = data->type;
5059 
5060         if (sample_type & PERF_SAMPLE_TID)
5061                 perf_output_put(handle, data->tid_entry);
5062 
5063         if (sample_type & PERF_SAMPLE_TIME)
5064                 perf_output_put(handle, data->time);
5065 
5066         if (sample_type & PERF_SAMPLE_ID)
5067                 perf_output_put(handle, data->id);
5068 
5069         if (sample_type & PERF_SAMPLE_STREAM_ID)
5070                 perf_output_put(handle, data->stream_id);
5071 
5072         if (sample_type & PERF_SAMPLE_CPU)
5073                 perf_output_put(handle, data->cpu_entry);
5074 
5075         if (sample_type & PERF_SAMPLE_IDENTIFIER)
5076                 perf_output_put(handle, data->id);
5077 }
5078 
5079 void perf_event__output_id_sample(struct perf_event *event,
5080                                   struct perf_output_handle *handle,
5081                                   struct perf_sample_data *sample)
5082 {
5083         if (event->attr.sample_id_all)
5084                 __perf_event__output_id_sample(handle, sample);
5085 }
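
/*
 * Illustrative sketch, not part of this file: with attr.sample_id_all set,
 * the fields emitted above are appended, in this order, to the end of
 * non-sample records.  The struct below is a hypothetical overlay for the
 * case where all of the listed sample_type bits are set; fields whose bit
 * is clear are simply absent.
 */
#if 0	/* example only -- user-space view of the sample_id trailer */
#include <stdint.h>

struct sample_id_trailer {
	uint32_t	pid, tid;	/* PERF_SAMPLE_TID */
	uint64_t	time;		/* PERF_SAMPLE_TIME */
	uint64_t	id;		/* PERF_SAMPLE_ID */
	uint64_t	stream_id;	/* PERF_SAMPLE_STREAM_ID */
	uint32_t	cpu, res;	/* PERF_SAMPLE_CPU */
	uint64_t	identifier;	/* PERF_SAMPLE_IDENTIFIER */
};
#endif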
5086 
5087 static void perf_output_read_one(struct perf_output_handle *handle,
5088                                  struct perf_event *event,
5089                                  u64 enabled, u64 running)
5090 {
5091         u64 read_format = event->attr.read_format;
5092         u64 values[4];
5093         int n = 0;
5094 
5095         values[n++] = perf_event_count(event);
5096         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5097                 values[n++] = enabled +
5098                         atomic64_read(&event->child_total_time_enabled);
5099         }
5100         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5101                 values[n++] = running +
5102                         atomic64_read(&event->child_total_time_running);
5103         }
5104         if (read_format & PERF_FORMAT_ID)
5105                 values[n++] = primary_event_id(event);
5106 
5107         __output_copy(handle, values, n * sizeof(u64));
5108 }
5109 
5110 static void perf_output_read_group(struct perf_output_handle *handle,
5111                             struct perf_event *event,
5112                             u64 enabled, u64 running)
5113 {
5114         struct perf_event *leader = event->group_leader, *sub;
5115         u64 read_format = event->attr.read_format;
5116         u64 values[5];
5117         int n = 0;
5118 
5119         values[n++] = 1 + leader->nr_siblings;
5120 
5121         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5122                 values[n++] = enabled;
5123 
5124         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5125                 values[n++] = running;
5126 
5127         if (leader != event)
5128                 leader->pmu->read(leader);
5129 
5130         values[n++] = perf_event_count(leader);
5131         if (read_format & PERF_FORMAT_ID)
5132                 values[n++] = primary_event_id(leader);
5133 
5134         __output_copy(handle, values, n * sizeof(u64));
5135 
5136         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
5137                 n = 0;
5138 
5139                 if ((sub != event) &&
5140                     (sub->state == PERF_EVENT_STATE_ACTIVE))
5141                         sub->pmu->read(sub);
5142 
5143                 values[n++] = perf_event_count(sub);
5144                 if (read_format & PERF_FORMAT_ID)
5145                         values[n++] = primary_event_id(sub);
5146 
5147                 __output_copy(handle, values, n * sizeof(u64));
5148         }
5149 }
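
/*
 * Illustrative sketch, not part of this file: the values written by
 * perf_output_read_group() above form the PERF_FORMAT_GROUP read layout.
 * The hypothetical structs below show the case where
 * PERF_FORMAT_TOTAL_TIME_ENABLED, PERF_FORMAT_TOTAL_TIME_RUNNING and
 * PERF_FORMAT_ID are all set; with other read_format combinations the
 * corresponding fields are simply omitted.
 */
#if 0	/* example only -- user-space view of the group read layout */
#include <stdint.h>

struct group_read_entry {
	uint64_t	value;		/* count of one group member */
	uint64_t	id;		/* PERF_FORMAT_ID */
};

struct group_read {
	uint64_t		nr;		/* 1 + number of siblings */
	uint64_t		time_enabled;	/* PERF_FORMAT_TOTAL_TIME_ENABLED */
	uint64_t		time_running;	/* PERF_FORMAT_TOTAL_TIME_RUNNING */
	struct group_read_entry	cnt[];		/* nr entries, leader first */
};
#endif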
5150 
5151 #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
5152                                  PERF_FORMAT_TOTAL_TIME_RUNNING)
5153 
5154 /*
5155  * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
5156  *
5157  * The problem is that it is both hard and excessively expensive to iterate the
5158  * child list, not to mention that it is impossible to IPI the children running
5159  * on another CPU, from interrupt/NMI context.
5160  */
5161 static void perf_output_read(struct perf_output_handle *handle,
5162                              struct perf_event *event)
5163 {
5164         u64 enabled = 0, running = 0, now;
5165         u64 read_format = event->attr.read_format;
5166 
5167         /*
5168          * compute total_time_enabled, total_time_running
5169          * based on snapshot values taken when the event
5170          * was last scheduled in.
5171          *
5172          * we cannot simply call update_context_time()
5173          * because of locking issues, as we are called in
5174          * NMI context
5175          */
5176         if (read_format & PERF_FORMAT_TOTAL_TIMES)
5177                 calc_timer_values(event, &now, &enabled, &running);
5178 
5179         if (event->attr.read_format & PERF_FORMAT_GROUP)
5180                 perf_output_read_group(handle, event, enabled, running);
5181         else
5182                 perf_output_read_one(handle, event, enabled, running);
5183 }
5184 
5185 void perf_output_sample(struct perf_output_handle *handle,
5186                         struct perf_event_header *header,
5187                         struct perf_sample_data *data,
5188                         struct perf_event *event)
5189 {
5190         u64 sample_type = data->type;
5191 
5192         perf_output_put(handle, *header);
5193 
5194         if (sample_type & PERF_SAMPLE_IDENTIFIER)
5195                 perf_output_put(handle, data->id);
5196 
5197         if (sample_type & PERF_SAMPLE_IP)
5198                 perf_output_put(handle, data->ip);
5199 
5200         if (sample_type & PERF_SAMPLE_TID)
5201                 perf_output_put(handle, data->tid_entry);
5202 
5203         if (sample_type & PERF_SAMPLE_TIME)
5204                 perf_output_put(handle, data->time);
5205 
5206         if (sample_type & PERF_SAMPLE_ADDR)
5207                 perf_output_put(handle, data->addr);
5208 
5209         if (sample_type & PERF_SAMPLE_ID)
5210                 perf_output_put(handle, data->id);
5211 
5212         if (sample_type & PERF_SAMPLE_STREAM_ID)
5213                 perf_output_put(handle, data->stream_id);
5214 
5215         if (sample_type & PERF_SAMPLE_CPU)
5216                 perf_output_put(handle, data->cpu_entry);
5217 
5218         if (sample_type & PERF_SAMPLE_PERIOD)
5219                 perf_output_put(handle, data->period);
5220 
5221         if (sample_type & PERF_SAMPLE_READ)
5222                 perf_output_read(handle, event);
5223 
5224         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5225                 if (data->callchain) {
5226                         int size = 1;
5227 
5228                         if (data->callchain)
5229                                 size += data->callchain->nr;
5230 
5231                         size *= sizeof(u64);
5232 
5233                         __output_copy(handle, data->callchain, size);
5234                 } else {
5235                         u64 nr = 0;
5236                         perf_output_put(handle, nr);
5237                 }
5238         }
5239 
5240         if (sample_type & PERF_SAMPLE_RAW) {
5241                 if (data->raw) {
5242                         perf_output_put(handle, data->raw->size);
5243                         __output_copy(handle, data->raw->data,
5244                                            data->raw->size);
5245                 } else {
5246                         struct {
5247                                 u32     size;
5248                                 u32     data;
5249                         } raw = {
5250                                 .size = sizeof(u32),
5251                                 .data = 0,
5252                         };
5253                         perf_output_put(handle, raw);
5254                 }
5255         }
5256 
5257         if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5258                 if (data->br_stack) {
5259                         size_t size;
5260 
5261                         size = data->br_stack->nr
5262                              * sizeof(struct perf_branch_entry);
5263 
5264                         perf_output_put(handle, data->br_stack->nr);
5265                         perf_output_copy(handle, data->br_stack->entries, size);
5266                 } else {
5267                         /*
5268                          * we always store at least the value of nr
5269                          */
5270                         u64 nr = 0;
5271                         perf_output_put(handle, nr);
5272                 }
5273         }
5274 
5275         if (sample_type & PERF_SAMPLE_REGS_USER) {
5276                 u64 abi = data->regs_user.abi;
5277 
5278                 /*
5279                  * If there are no regs to dump, signal it through the
5280                  * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
5281                  */
5282                 perf_output_put(handle, abi);
5283 
5284                 if (abi) {
5285                         u64 mask = event->attr.sample_regs_user;
5286                         perf_output_sample_regs(handle,
5287                                                 data->regs_user.regs,
5288                                                 mask);
5289                 }
5290         }
5291 
5292         if (sample_type & PERF_SAMPLE_STACK_USER) {
5293                 perf_output_sample_ustack(handle,
5294                                           data->stack_user_size,
5295                                           data->regs_user.regs);
5296         }
5297 
5298         if (sample_type & PERF_SAMPLE_WEIGHT)
5299                 perf_output_put(handle, data->weight);
5300 
5301         if (sample_type & PERF_SAMPLE_DATA_SRC)
5302                 perf_output_put(handle, data->data_src.val);
5303 
5304         if (sample_type & PERF_SAMPLE_TRANSACTION)
5305                 perf_output_put(handle, data->txn);
5306 
5307         if (sample_type & PERF_SAMPLE_REGS_INTR) {
5308                 u64 abi = data->regs_intr.abi;
5309                 /*
5310                  * If there are no regs to dump, signal it through the
5311                  * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
5312                  */
5313                 perf_output_put(handle, abi);
5314 
5315                 if (abi) {
5316                         u64 mask = event->attr.sample_regs_intr;
5317 
5318                         perf_output_sample_regs(handle,
5319                                                 data->regs_intr.regs,
5320                                                 mask);
5321                 }
5322         }
5323 
5324         if (!event->attr.watermark) {
5325                 int wakeup_events = event->attr.wakeup_events;
5326 
5327                 if (wakeup_events) {
5328                         struct ring_buffer *rb = handle->rb;
5329                         int events = local_inc_return(&rb->events);
5330 
5331                         if (events >= wakeup_events) {
5332                                 local_sub(wakeup_events, &rb->events);
5333                                 local_inc(&rb->wakeup);
5334                         }
5335                 }
5336         }
5337 }
5338 
5339 void perf_prepare_sample(struct perf_event_header *header,
5340                          struct perf_sample_data *data,
5341                          struct perf_event *event,
5342                          struct pt_regs *regs)
5343 {
5344         u64 sample_type = event->attr.sample_type;
5345 
5346         header->type = PERF_RECORD_SAMPLE;
5347         header->size = sizeof(*header) + event->header_size;
5348 
5349         header->misc = 0;
5350         header->misc |= perf_misc_flags(regs);
5351 
5352         __perf_event_header__init_id(header, data, event);
5353 
5354         if (sample_type & PERF_SAMPLE_IP)
5355                 data->ip = perf_instruction_pointer(regs);
5356 
5357         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5358                 int size = 1;
5359 
5360                 data->callchain = perf_callchain(event, regs);
5361 
5362                 if (data->callchain)
5363                         size += data->callchain->nr;
5364 
5365                 header->size += size * sizeof(u64);
5366         }
5367 
5368         if (sample_type & PERF_SAMPLE_RAW) {
5369                 int size = sizeof(u32);
5370 
5371                 if (data->raw)
5372                         size += data->raw->size;
5373                 else
5374                         size += sizeof(u32);
5375 
5376                 WARN_ON_ONCE(size & (sizeof(u64)-1));
5377                 header->size += size;
5378         }
5379 
5380         if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5381                 int size = sizeof(u64); /* nr */
5382                 if (data->br_stack) {
5383                         size += data->br_stack->nr
5384                               * sizeof(struct perf_branch_entry);
5385                 }
5386                 header->size += size;
5387         }
5388 
5389         if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
5390                 perf_sample_regs_user(&data->regs_user, regs,
5391                                       &data->regs_user_copy);
5392 
5393         if (sample_type & PERF_SAMPLE_REGS_USER) {
5394                 /* regs dump ABI info */
5395                 int size = sizeof(u64);
5396 
5397                 if (data->regs_user.regs) {
5398                         u64 mask = event->attr.sample_regs_user;
5399                         size += hweight64(mask) * sizeof(u64);
5400                 }
5401 
5402                 header->size += size;
5403         }
5404 
5405         if (sample_type & PERF_SAMPLE_STACK_USER) {
5406                 /*
5407                  * Either the PERF_SAMPLE_STACK_USER bit needs to always be
5408                  * processed last, or an additional check must be added
5409                  * whenever a new sample type is introduced, because it can
5410                  * eat up the rest of the sample size.
5411                  */
5412                 u16 stack_size = event->attr.sample_stack_user;
5413                 u16 size = sizeof(u64);
5414 
5415                 stack_size = perf_sample_ustack_size(stack_size, header->size,
5416                                                      data->regs_user.regs);
5417 
5418                 /*
5419                  * If there is something to dump, add space for the dump
5420                  * itself and for the field that tells the dynamic size,
5421                  * which is how many bytes were actually dumped.
5422                  */
5423                 if (stack_size)
5424                         size += sizeof(u64) + stack_size;
5425 
5426                 data->stack_user_size = stack_size;
5427                 header->size += size;
5428         }
5429 
5430         if (sample_type & PERF_SAMPLE_REGS_INTR) {
5431                 /* regs dump ABI info */
5432                 int size = sizeof(u64);
5433 
5434                 perf_sample_regs_intr(&data->regs_intr, regs);
5435 
5436                 if (data->regs_intr.regs) {
5437                         u64 mask = event->attr.sample_regs_intr;
5438 
5439                         size += hweight64(mask) * sizeof(u64);
5440                 }
5441 
5442                 header->size += size;
5443         }
5444 }
5445 
5446 static void perf_event_output(struct perf_event *event,
5447                                 struct perf_sample_data *data,
5448                                 struct pt_regs *regs)
5449 {
5450         struct perf_output_handle handle;
5451         struct perf_event_header header;
5452 
5453         /* protect the callchain buffers */
5454         rcu_read_lock();
5455 
5456         perf_prepare_sample(&header, data, event, regs);
5457 
5458         if (perf_output_begin(&handle, event, header.size))
5459                 goto exit;
5460 
5461         perf_output_sample(&handle, &header, data, event);
5462 
5463         perf_output_end(&handle);
5464 
5465 exit:
5466         rcu_read_unlock();
5467 }
5468 
5469 /*
5470  * read event_id
5471  */
5472 
5473 struct perf_read_event {
5474         struct perf_event_header        header;
5475 
5476         u32                             pid;
5477         u32                             tid;
5478 };
5479 
5480 static void
5481 perf_event_read_event(struct perf_event *event,
5482                         struct task_struct *task)
5483 {
5484         struct perf_output_handle handle;
5485         struct perf_sample_data sample;
5486         struct perf_read_event read_event = {
5487                 .header = {
5488                         .type = PERF_RECORD_READ,
5489                         .misc = 0,
5490                         .size = sizeof(read_event) + event->read_size,
5491                 },
5492                 .pid = perf_event_pid(event, task),
5493                 .tid = perf_event_tid(event, task),
5494         };
5495         int ret;
5496 
5497         perf_event_header__init_id(&read_event.header, &sample, event);
5498         ret = perf_output_begin(&handle, event, read_event.header.size);
5499         if (ret)
5500                 return;
5501 
5502         perf_output_put(&handle, read_event);
5503         perf_output_read(&handle, event);
5504         perf_event__output_id_sample(event, &handle, &sample);
5505 
5506         perf_output_end(&handle);
5507 }
5508 
5509 typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
5510 
5511 static void
5512 perf_event_aux_ctx(struct perf_event_context *ctx,
5513                    perf_event_aux_output_cb output,
5514                    void *data)
5515 {
5516         struct perf_event *event;
5517 
5518         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
5519                 if (event->state < PERF_EVENT_STATE_INACTIVE)
5520                         continue;
5521                 if (!event_filter_match(event))
5522                         continue;
5523                 output(event, data);
5524         }
5525 }
5526 
5527 static void
5528 perf_event_aux(perf_event_aux_output_cb output, void *data,
5529                struct perf_event_context *task_ctx)
5530 {
5531         struct perf_cpu_context *cpuctx;
5532         struct perf_event_context *ctx;
5533         struct pmu *pmu;
5534         int ctxn;
5535 
5536         rcu_read_lock();
5537         list_for_each_entry_rcu(pmu, &pmus, entry) {
5538                 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
5539                 if (cpuctx->unique_pmu != pmu)
5540                         goto next;
5541                 perf_event_aux_ctx(&cpuctx->ctx, output, data);
5542                 if (task_ctx)
5543                         goto next;
5544                 ctxn = pmu->task_ctx_nr;
5545                 if (ctxn < 0)
5546                         goto next;
5547                 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
5548                 if (ctx)
5549                         perf_event_aux_ctx(ctx, output, data);
5550 next:
5551                 put_cpu_ptr(pmu->pmu_cpu_context);
5552         }
5553 
5554         if (task_ctx) {
5555                 preempt_disable();
5556                 perf_event_aux_ctx(task_ctx, output, data);
5557                 preempt_enable();
5558         }
5559         rcu_read_unlock();
5560 }
5561 
5562 /*
5563  * task tracking -- fork/exit
5564  *
5565  * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
5566  */
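
/*
 * Illustrative user-space sketch, not part of this file: any of the attr
 * bits listed above requests these side-band records.  A hypothetical
 * helper setting up a dummy software event purely for task/comm/mmap
 * tracking (the fd would later come from perf_event_open()):
 */
#if 0	/* example only -- user-space, not kernel code */
#include <string.h>
#include <linux/perf_event.h>

static void init_sideband_attr(struct perf_event_attr *attr)
{
	memset(attr, 0, sizeof(*attr));
	attr->size = sizeof(*attr);
	attr->type = PERF_TYPE_SOFTWARE;
	attr->config = PERF_COUNT_SW_DUMMY;
	/* Any one of these makes perf_event_task_match() accept the event. */
	attr->task = 1;
	attr->comm = 1;
	attr->mmap = 1;
	attr->mmap_data = 1;
	attr->sample_id_all = 1;
}
#endif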
5567 
5568 struct perf_task_event {
5569         struct task_struct              *task;
5570         struct perf_event_context       *task_ctx;
5571 
5572         struct {
5573                 struct perf_event_header        header;
5574 
5575                 u32                             pid;
5576                 u32                             ppid;
5577                 u32                             tid;
5578                 u32                             ptid;
5579                 u64                             time;
5580         } event_id;
5581 };
5582 
5583 static int perf_event_task_match(struct perf_event *event)
5584 {
5585         return event->attr.comm  || event->attr.mmap ||
5586                event->attr.mmap2 || event->attr.mmap_data ||
5587                event->attr.task;
5588 }
5589 
5590 static void perf_event_task_output(struct perf_event *event,
5591                                    void *data)
5592 {
5593         struct perf_task_event *task_event = data;
5594         struct perf_output_handle handle;
5595         struct perf_sample_data sample;
5596         struct task_struct *task = task_event->task;
5597         int ret, size = task_event->event_id.header.size;
5598 
5599         if (!perf_event_task_match(event))
5600                 return;
5601 
5602         perf_event_header__init_id(&task_event->event_id.header, &sample, event);
5603 
5604         ret = perf_output_begin(&handle, event,
5605                                 task_event->event_id.header.size);
5606         if (ret)
5607                 goto out;
5608 
5609         task_event->event_id.pid = perf_event_pid(event, task);
5610         task_event->event_id.ppid = perf_event_pid(event, current);
5611 
5612         task_event->event_id.tid = perf_event_tid(event, task);
5613         task_event->event_id.ptid = perf_event_tid(event, current);
5614 
5615         task_event->event_id.time = perf_event_clock(event);
5616 
5617         perf_output_put(&handle, task_event->event_id);
5618 
5619         perf_event__output_id_sample(event, &handle, &sample);
5620 
5621         perf_output_end(&handle);
5622 out:
5623         task_event->event_id.header.size = size;
5624 }
5625 
5626 static void perf_event_task(struct task_struct *task,
5627                               struct perf_event_context *task_ctx,
5628                               int new)
5629 {
5630         struct perf_task_event task_event;
5631 
5632         if (!atomic_read(&nr_comm_events) &&
5633             !atomic_read(&nr_mmap_events) &&
5634             !atomic_read(&nr_task_events))
5635                 return;
5636 
5637         task_event = (struct perf_task_event){
5638                 .task     = task,
5639                 .task_ctx = task_ctx,
5640                 .event_id    = {
5641                         .header = {
5642                                 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
5643                                 .misc = 0,
5644                                 .size = sizeof(task_event.event_id),
5645                         },
5646                         /* .pid  */
5647                         /* .ppid */
5648                         /* .tid  */
5649                         /* .ptid */
5650                         /* .time */
5651                 },
5652         };
5653 
5654         perf_event_aux(perf_event_task_output,
5655                        &task_event,
5656                        task_ctx);
5657 }
5658 
5659 void perf_event_fork(struct task_struct *task)
5660 {
5661         perf_event_task(task, NULL, 1);
5662 }
5663 
5664 /*
5665  * comm tracking
5666  */
5667 
5668 struct perf_comm_event {
5669         struct task_struct      *task;
5670         char                    *comm;
5671         int                     comm_size;
5672 
5673         struct {
5674                 struct perf_event_header        header;
5675 
5676                 u32                             pid;
5677                 u32                             tid;
5678         } event_id;
5679 };
5680 
5681 static int perf_event_comm_match(struct perf_event *event)
5682 {
5683         return event->attr.comm;
5684 }
5685 
5686 static void perf_event_comm_output(struct perf_event *event,
5687                                    void *data)
5688 {
5689         struct perf_comm_event *comm_event = data;
5690         struct perf_output_handle handle;
5691         struct perf_sample_data sample;
5692         int size = comm_event->event_id.header.size;
5693         int ret;
5694 
5695         if (!perf_event_comm_match(event))
5696                 return;
5697 
5698         perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
5699         ret = perf_output_begin(&handle, event,
5700                                 comm_event->event_id.header.size);
5701 
5702         if (ret)
5703                 goto out;
5704 
5705         comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
5706         comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
5707 
5708         perf_output_put(&handle, comm_event->event_id);
5709         __output_copy(&handle, comm_event->comm,
5710                                    comm_event->comm_size);
5711 
5712         perf_event__output_id_sample(event, &handle, &sample);
5713 
5714         perf_output_end(&handle);
5715 out:
5716         comm_event->event_id.header.size = size;
5717 }
5718 
5719 static void perf_event_comm_event(struct perf_comm_event *comm_event)
5720 {
5721         char comm[TASK_COMM_LEN];
5722         unsigned int size;
5723 
5724         memset(comm, 0, sizeof(comm));
5725         strlcpy(comm, comm_event->task->comm, sizeof(comm));
5726         size = ALIGN(strlen(comm)+1, sizeof(u64));
5727 
5728         comm_event->comm = comm;
5729         comm_event->comm_size = size;
5730 
5731         comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
5732 
5733         perf_event_aux(perf_event_comm_output,
5734                        comm_event,
5735                        NULL);
5736 }
5737 
5738 void perf_event_comm(struct task_struct *task, bool exec)
5739 {
5740         struct perf_comm_event comm_event;
5741 
5742         if (!atomic_read(&nr_comm_events))
5743                 return;
5744 
5745         comm_event = (struct perf_comm_event){
5746                 .task   = task,
5747                 /* .comm      */
5748                 /* .comm_size */
5749                 .event_id  = {
5750                         .header = {
5751                                 .type = PERF_RECORD_COMM,
5752                                 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
5753                                 /* .size */
5754                         },
5755                         /* .pid */
5756                         /* .tid */
5757                 },
5758         };
5759 
5760         perf_event_comm_event(&comm_event);
5761 }
5762 
5763 /*
5764  * mmap tracking
5765  */
5766 
5767 struct perf_mmap_event {
5768         struct vm_area_struct   *vma;
5769 
5770         const char              *file_name;
5771         int                     file_size;
5772         int                     maj, min;
5773         u64                     ino;
5774         u64                     ino_generation;
5775         u32                     prot, flags;
5776 
5777         struct {
5778                 struct perf_event_header        header;
5779 
5780                 u32                             pid;
5781                 u32                             tid;
5782                 u64                             start;
5783                 u64                             len;
5784                 u64                             pgoff;
5785         } event_id;
5786 };
5787 
5788 static int perf_event_mmap_match(struct perf_event *event,
5789                                  void *data)
5790 {
5791         struct perf_mmap_event *mmap_event = data;
5792         struct vm_area_struct *vma = mmap_event->vma;
5793         int executable = vma->vm_flags & VM_EXEC;
5794 
5795         return (!executable && event->attr.mmap_data) ||
5796                (executable && (event->attr.mmap || event->attr.mmap2));
5797 }
5798 
5799 static void perf_event_mmap_output(struct perf_event *event,
5800                                    void *data)
5801 {
5802         struct perf_mmap_event *mmap_event = data;
5803         struct perf_output_handle handle;
5804         struct perf_sample_data sample;
5805         int size = mmap_event->event_id.header.size;
5806         int ret;
5807 
5808         if (!perf_event_mmap_match(event, data))
5809                 return;
5810 
5811         if (event->attr.mmap2) {
5812                 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
5813                 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
5814                 mmap_event->event_id.header.size += sizeof(mmap_event->min);
5815                 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
5816                 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
5817                 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
5818                 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
5819         }
5820 
5821         perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
5822         ret = perf_output_begin(&handle, event,
5823                                 mmap_event->event_id.header.size);
5824         if (ret)
5825                 goto out;
5826 
5827         mmap_event->event_id.pid = perf_event_pid(event, current);
5828         mmap_event->event_id.tid = perf_event_tid(event, current);
5829 
5830         perf_output_put(&handle, mmap_event->event_id);
5831 
5832         if (event->attr.mmap2) {
5833                 perf_output_put(&handle, mmap_event->maj);
5834                 perf_output_put(&handle, mmap_event->min);
5835                 perf_output_put(&handle, mmap_event->ino);
5836                 perf_output_put(&handle, mmap_event->ino_generation);
5837                 perf_output_put(&handle, mmap_event->prot);
5838                 perf_output_put(&handle, mmap_event->flags);
5839         }
5840 
5841         __output_copy(&handle, mmap_event->file_name,
5842                                    mmap_event->file_size);
5843 
5844         perf_event__output_id_sample(event, &handle, &sample);
5845 
5846         perf_output_end(&handle);
5847 out:
5848         mmap_event->event_id.header.size = size;
5849 }
5850 
5851 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5852 {
5853         struct vm_area_struct *vma = mmap_event->vma;
5854         struct file *file = vma->vm_file;
5855         int maj = 0, min = 0;
5856         u64 ino = 0, gen = 0;
5857         u32 prot = 0, flags = 0;
5858         unsigned int size;
5859         char tmp[16];
5860         char *buf = NULL;
5861         char *name;
5862 
5863         if (vma->vm_flags & VM_READ)
5864                 prot |= PROT_READ;
5865         if (vma->vm_flags & VM_WRITE)
5866                 prot |= PROT_WRITE;
5867         if (vma->vm_flags & VM_EXEC)
5868                 prot |= PROT_EXEC;
5869 
5870         if (vma->vm_flags & VM_MAYSHARE)
5871                 flags = MAP_SHARED;
5872         else
5873                 flags = MAP_PRIVATE;
5874 
5875         if (vma->vm_flags & VM_DENYWRITE)
5876                 flags |= MAP_DENYWRITE;
5877         if (vma->vm_flags & VM_MAYEXEC)
5878                 flags |= MAP_EXECUTABLE;
5879         if (vma->vm_flags & VM_LOCKED)
5880                 flags |= MAP_LOCKED;
5881         if (vma->vm_flags & VM_HUGETLB)
5882                 flags |= MAP_HUGETLB;
5883 
5884         if (file) {
5885                 struct inode *inode;
5886                 dev_t dev;
5887 
5888                 buf = kmalloc(PATH_MAX, GFP_KERNEL);
5889                 if (!buf) {
5890                         name = "//enomem";
5891                         goto cpy_name;
5892                 }
5893                 /*
5894                  * d_path() works from the end of the rb backwards, so we
5895                  * need to add enough zero bytes after the string to handle
5896                  * the 64bit alignment we do later.
5897                  */
5898                 name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
5899                 if (IS_ERR(name)) {
5900                         name = "//toolong";
5901                         goto cpy_name;
5902                 }
5903                 inode = file_inode(vma->vm_file);
5904                 dev = inode->i_sb->s_dev;
5905                 ino = inode->i_ino;
5906                 gen = inode->i_generation;
5907                 maj = MAJOR(dev);
5908                 min = MINOR(dev);
5909 
5910                 goto got_name;
5911         } else {
5912                 if (vma->vm_ops && vma->vm_ops->name) {
5913                         name = (char *) vma->vm_ops->name(vma);
5914                         if (name)
5915                                 goto cpy_name;
5916                 }
5917 
5918                 name = (char *)arch_vma_name(vma);
5919                 if (name)
5920                         goto cpy_name;
5921 
5922                 if (vma->vm_start <= vma->vm_mm->start_brk &&
5923                                 vma->vm_end >= vma->vm_mm->brk) {
5924                         name = "[heap]";
5925                         goto cpy_name;
5926                 }
5927                 if (vma->vm_start <= vma->vm_mm->start_stack &&
5928                                 vma->vm_end >= vma->vm_mm->start_stack) {
5929                         name = "[stack]";
5930                         goto cpy_name;
5931                 }
5932 
5933                 name = "//anon";
5934                 goto cpy_name;
5935         }
5936 
5937 cpy_name:
5938         strlcpy(tmp, name, sizeof(tmp));
5939         name = tmp;
5940 got_name:
5941         /*
5942          * Since our buffer works in 8-byte units we need to align our string
5943          * size to a multiple of 8. However, we must guarantee the tail end is
5944          * zeroed out to avoid leaking random bits to userspace.
5945          */
5946         size = strlen(name)+1;
5947         while (!IS_ALIGNED(size, sizeof(u64)))
5948                 name[size++] = '\0';
5949 
5950         mmap_event->file_name = name;
5951         mmap_event->file_size = size;
5952         mmap_event->maj = maj;
5953         mmap_event->min = min;
5954         mmap_event->ino = ino;
5955         mmap_event->ino_generation = gen;
5956         mmap_event->prot = prot;
5957         mmap_event->flags = flags;
5958 
5959         if (!(vma->vm_flags & VM_EXEC))
5960                 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
5961 
5962         mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
5963 
5964         perf_event_aux(perf_event_mmap_output,
5965                        mmap_event,
5966                        NULL);
5967 
5968         kfree(buf);
5969 }
5970 
5971 void perf_event_mmap(struct vm_area_struct *vma)
5972 {
5973         struct perf_mmap_event mmap_event;
5974 
5975         if (!atomic_read(&nr_mmap_events))
5976                 return;
5977 
5978         mmap_event = (struct perf_mmap_event){
5979                 .vma    = vma,
5980                 /* .file_name */
5981                 /* .file_size */
5982                 .event_id  = {
5983                         .header = {
5984                                 .type = PERF_RECORD_MMAP,
5985                                 .misc = PERF_RECORD_MISC_USER,
5986                                 /* .size */
5987                         },
5988                         /* .pid */
5989