// SPDX-License-Identifier: GPL-2.0
/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
 *  Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/reboot.h>
#include <linux/vmstat.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/hugetlb.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
#include <linux/trace_events.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/compat.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/namei.h>
#include <linux/parser.h>
#include <linux/sched/clock.h>
#include <linux/sched/mm.h>
#include <linux/proc_ns.h>
#include <linux/mount.h>
#include <linux/min_heap.h>
#include <linux/highmem.h>
#include <linux/pgtable.h>
#include <linux/buildid.h>

#include "internal.h"

#include <asm/irq_regs.h>

typedef int (*remote_function_f)(void *);

struct remote_function_call {
	struct task_struct	*p;
	remote_function_f	func;
	void			*info;
	int			ret;
};

static void remote_function(void *data)
{
	struct remote_function_call *tfc = data;
	struct task_struct *p = tfc->p;

	if (p) {
		/* -EAGAIN */
		if (task_cpu(p) != smp_processor_id())
			return;

		/*
		 * Now that we're on right CPU with IRQs disabled, we can test
		 * if we hit the right task without races.
		 */

		tfc->ret = -ESRCH; /* No such (running) process */
		if (p != current)
			return;
	}

	tfc->ret = tfc->func(tfc->info);
}
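/*
 * A minimal usage sketch for the two helpers below; the callback name and
 * payload are hypothetical, not part of this file. The callback runs on the
 * target CPU in IPI context with IRQs disabled and reports back through the
 * shared remote_function_call::ret.
 *
 *	static int example_read_state(void *info)
 *	{
 *		struct example_state *s = info;	// hypothetical payload
 *
 *		s->cpu = smp_processor_id();	// we are on the task's CPU
 *		return 0;
 *	}
 *
 *	// task_function_call() retries internally on -EAGAIN until it catches
 *	// @p running, or fails with -ESRCH once @p is no longer running.
 *	err = task_function_call(p, example_read_state, &state);
 */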
/**
 * task_function_call - call a function on the cpu on which a task runs
 * @p: the task to evaluate
 * @func: the function to be called
 * @info: the function call argument
 *
 * Calls the function @func when the task is currently running. This might
 * be on the current CPU, which just calls the function directly. This will
 * retry due to any failures in smp_call_function_single(), such as if the
 * task_cpu() goes offline concurrently.
 *
 * returns @func return value or -ESRCH or -ENXIO when the process isn't running
 */
static int
task_function_call(struct task_struct *p, remote_function_f func, void *info)
{
	struct remote_function_call data = {
		.p	= p,
		.func	= func,
		.info	= info,
		.ret	= -EAGAIN,
	};
	int ret;

	for (;;) {
		ret = smp_call_function_single(task_cpu(p), remote_function,
					       &data, 1);
		if (!ret)
			ret = data.ret;

		if (ret != -EAGAIN)
			break;

		cond_resched();
	}

	return ret;
}

/**
 * cpu_function_call - call a function on the cpu
 * @cpu: target cpu to call the function on
 * @func: the function to be called
 * @info: the function call argument
 *
 * Calls the function @func on the remote cpu.
 *
 * returns: @func return value or -ENXIO when the cpu is offline
 */
static int cpu_function_call(int cpu, remote_function_f func, void *info)
{
	struct remote_function_call data = {
		.p	= NULL,
		.func	= func,
		.info	= info,
		.ret	= -ENXIO, /* No such CPU */
	};

	smp_call_function_single(cpu, remote_function, &data, 1);

	return data.ret;
}

static inline struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
}

static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
			  struct perf_event_context *ctx)
{
	raw_spin_lock(&cpuctx->ctx.lock);
	if (ctx)
		raw_spin_lock(&ctx->lock);
}

static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
			    struct perf_event_context *ctx)
{
	if (ctx)
		raw_spin_unlock(&ctx->lock);
	raw_spin_unlock(&cpuctx->ctx.lock);
}

#define TASK_TOMBSTONE ((void *)-1L)

static bool is_kernel_event(struct perf_event *event)
{
	return READ_ONCE(event->owner) == TASK_TOMBSTONE;
}

/*
 * On task ctx scheduling...
 *
 * When !ctx->nr_events a task context will not be scheduled. This means
 * we can disable the scheduler hooks (for performance) without leaving
 * pending task ctx state.
 *
 * This however results in two special cases:
 *
 *  - removing the last event from a task ctx; this is relatively
 *    straightforward and is done in __perf_remove_from_context.
 *
 *  - adding the first event to a task ctx; this is tricky because we cannot
 *    rely on ctx->is_active and therefore cannot use event_function_call().
 *    See perf_install_in_context().
 *
 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
 */

typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
			struct perf_event_context *, void *);

struct event_function_struct {
	struct perf_event *event;
	event_f func;
	void *data;
};
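/*
 * A minimal sketch of how the machinery below is meant to be used; the
 * __perf_event_example()/perf_event_example() names are hypothetical, but the
 * enable/disable paths elsewhere in this file follow the same shape. The
 * callback runs with ctx->lock held (on the context's CPU when the context
 * is active).
 *
 *	static void __perf_event_example(struct perf_event *event,
 *					 struct perf_cpu_context *cpuctx,
 *					 struct perf_event_context *ctx,
 *					 void *info)
 *	{
 *		// modify @event / @ctx state here
 *	}
 *
 *	static void perf_event_example(struct perf_event *event)
 *	{
 *		struct perf_event_context *ctx = perf_event_ctx_lock(event);
 *
 *		event_function_call(event, __perf_event_example, NULL);
 *		perf_event_ctx_unlock(event, ctx);
 *	}
 */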
static int event_function(void *info)
{
	struct event_function_struct *efs = info;
	struct perf_event *event = efs->event;
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	struct perf_event_context *task_ctx = cpuctx->task_ctx;
	int ret = 0;

	lockdep_assert_irqs_disabled();

	perf_ctx_lock(cpuctx, task_ctx);
	/*
	 * Since we do the IPI call without holding ctx->lock things can have
	 * changed, double check we hit the task we set out to hit.
	 */
	if (ctx->task) {
		if (ctx->task != current) {
			ret = -ESRCH;
			goto unlock;
		}

		/*
		 * We only use event_function_call() on established contexts,
		 * and event_function() is only ever called when active (or
		 * rather, we'll have bailed in task_function_call() or the
		 * above ctx->task != current test), therefore we must have
		 * ctx->is_active here.
		 */
		WARN_ON_ONCE(!ctx->is_active);
		/*
		 * And since we have ctx->is_active, cpuctx->task_ctx must
		 * match.
		 */
		WARN_ON_ONCE(task_ctx != ctx);
	} else {
		WARN_ON_ONCE(&cpuctx->ctx != ctx);
	}

	efs->func(event, cpuctx, ctx, efs->data);
unlock:
	perf_ctx_unlock(cpuctx, task_ctx);

	return ret;
}

static void event_function_call(struct perf_event *event, event_f func, void *data)
{
	struct perf_event_context *ctx = event->ctx;
	struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
	struct event_function_struct efs = {
		.event = event,
		.func = func,
		.data = data,
	};

	if (!event->parent) {
		/*
		 * If this is a !child event, we must hold ctx::mutex to
		 * stabilize the event->ctx relation. See
		 * perf_event_ctx_lock().
		 */
		lockdep_assert_held(&ctx->mutex);
	}

	if (!task) {
		cpu_function_call(event->cpu, event_function, &efs);
		return;
	}

	if (task == TASK_TOMBSTONE)
		return;

again:
	if (!task_function_call(task, event_function, &efs))
		return;

	raw_spin_lock_irq(&ctx->lock);
	/*
	 * Reload the task pointer, it might have been changed by
	 * a concurrent perf_event_context_sched_out().
	 */
	task = ctx->task;
	if (task == TASK_TOMBSTONE) {
		raw_spin_unlock_irq(&ctx->lock);
		return;
	}
	if (ctx->is_active) {
		raw_spin_unlock_irq(&ctx->lock);
		goto again;
	}
	func(event, NULL, ctx, data);
	raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Similar to event_function_call() + event_function(), but hard assumes IRQs
 * are already disabled and we're on the right CPU.
 */
static void event_function_local(struct perf_event *event, event_f func, void *data)
{
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	struct task_struct *task = READ_ONCE(ctx->task);
	struct perf_event_context *task_ctx = NULL;

	lockdep_assert_irqs_disabled();

	if (task) {
		if (task == TASK_TOMBSTONE)
			return;

		task_ctx = ctx;
	}

	perf_ctx_lock(cpuctx, task_ctx);

	task = ctx->task;
	if (task == TASK_TOMBSTONE)
		goto unlock;

	if (task) {
		/*
		 * We must be either inactive or active and the right task,
		 * otherwise we're screwed, since we cannot IPI to somewhere
		 * else.
339 */ 340 if (ctx->is_active) { 341 if (WARN_ON_ONCE(task != current)) 342 goto unlock; 343 344 if (WARN_ON_ONCE(cpuctx->task_ctx != ctx)) 345 goto unlock; 346 } 347 } else { 348 WARN_ON_ONCE(&cpuctx->ctx != ctx); 349 } 350 351 func(event, cpuctx, ctx, data); 352 unlock: 353 perf_ctx_unlock(cpuctx, task_ctx); 354 } 355 356 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ 357 PERF_FLAG_FD_OUTPUT |\ 358 PERF_FLAG_PID_CGROUP |\ 359 PERF_FLAG_FD_CLOEXEC) 360 361 /* 362 * branch priv levels that need permission checks 363 */ 364 #define PERF_SAMPLE_BRANCH_PERM_PLM \ 365 (PERF_SAMPLE_BRANCH_KERNEL |\ 366 PERF_SAMPLE_BRANCH_HV) 367 368 enum event_type_t { 369 EVENT_FLEXIBLE = 0x1, 370 EVENT_PINNED = 0x2, 371 EVENT_TIME = 0x4, 372 /* see ctx_resched() for details */ 373 EVENT_CPU = 0x8, 374 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, 375 }; 376 377 /* 378 * perf_sched_events : >0 events exist 379 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu 380 */ 381 382 static void perf_sched_delayed(struct work_struct *work); 383 DEFINE_STATIC_KEY_FALSE(perf_sched_events); 384 static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed); 385 static DEFINE_MUTEX(perf_sched_mutex); 386 static atomic_t perf_sched_count; 387 388 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); 389 static DEFINE_PER_CPU(int, perf_sched_cb_usages); 390 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); 391 392 static atomic_t nr_mmap_events __read_mostly; 393 static atomic_t nr_comm_events __read_mostly; 394 static atomic_t nr_namespaces_events __read_mostly; 395 static atomic_t nr_task_events __read_mostly; 396 static atomic_t nr_freq_events __read_mostly; 397 static atomic_t nr_switch_events __read_mostly; 398 static atomic_t nr_ksymbol_events __read_mostly; 399 static atomic_t nr_bpf_events __read_mostly; 400 static atomic_t nr_cgroup_events __read_mostly; 401 static atomic_t nr_text_poke_events __read_mostly; 402 static atomic_t nr_build_id_events __read_mostly; 403 404 static LIST_HEAD(pmus); 405 static DEFINE_MUTEX(pmus_lock); 406 static struct srcu_struct pmus_srcu; 407 static cpumask_var_t perf_online_mask; 408 409 /* 410 * perf event paranoia level: 411 * -1 - not paranoid at all 412 * 0 - disallow raw tracepoint access for unpriv 413 * 1 - disallow cpu events for unpriv 414 * 2 - disallow kernel profiling for unpriv 415 */ 416 int sysctl_perf_event_paranoid __read_mostly = 2; 417 418 /* Minimum for 512 kiB + 1 user control page */ 419 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ 420 421 /* 422 * max perf event sample rate 423 */ 424 #define DEFAULT_MAX_SAMPLE_RATE 100000 425 #define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE) 426 #define DEFAULT_CPU_TIME_MAX_PERCENT 25 427 428 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; 429 430 static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); 431 static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; 432 433 static int perf_sample_allowed_ns __read_mostly = 434 DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100; 435 436 static void update_perf_cpu_limits(void) 437 { 438 u64 tmp = perf_sample_period_ns; 439 440 tmp *= sysctl_perf_cpu_time_max_percent; 441 tmp = div_u64(tmp, 100); 442 if (!tmp) 443 tmp = 1; 444 445 WRITE_ONCE(perf_sample_allowed_ns, tmp); 446 } 447 448 static bool perf_rotate_context(struct perf_cpu_context *cpuctx); 449 450 int perf_proc_update_handler(struct 
ctl_table *table, int write, 451 void *buffer, size_t *lenp, loff_t *ppos) 452 { 453 int ret; 454 int perf_cpu = sysctl_perf_cpu_time_max_percent; 455 /* 456 * If throttling is disabled don't allow the write: 457 */ 458 if (write && (perf_cpu == 100 || perf_cpu == 0)) 459 return -EINVAL; 460 461 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 462 if (ret || !write) 463 return ret; 464 465 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); 466 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; 467 update_perf_cpu_limits(); 468 469 return 0; 470 } 471 472 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; 473 474 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, 475 void *buffer, size_t *lenp, loff_t *ppos) 476 { 477 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 478 479 if (ret || !write) 480 return ret; 481 482 if (sysctl_perf_cpu_time_max_percent == 100 || 483 sysctl_perf_cpu_time_max_percent == 0) { 484 printk(KERN_WARNING 485 "perf: Dynamic interrupt throttling disabled, can hang your system!\n"); 486 WRITE_ONCE(perf_sample_allowed_ns, 0); 487 } else { 488 update_perf_cpu_limits(); 489 } 490 491 return 0; 492 } 493 494 /* 495 * perf samples are done in some very critical code paths (NMIs). 496 * If they take too much CPU time, the system can lock up and not 497 * get any real work done. This will drop the sample rate when 498 * we detect that events are taking too long. 499 */ 500 #define NR_ACCUMULATED_SAMPLES 128 501 static DEFINE_PER_CPU(u64, running_sample_length); 502 503 static u64 __report_avg; 504 static u64 __report_allowed; 505 506 static void perf_duration_warn(struct irq_work *w) 507 { 508 printk_ratelimited(KERN_INFO 509 "perf: interrupt took too long (%lld > %lld), lowering " 510 "kernel.perf_event_max_sample_rate to %d\n", 511 __report_avg, __report_allowed, 512 sysctl_perf_event_sample_rate); 513 } 514 515 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn); 516 517 void perf_sample_event_took(u64 sample_len_ns) 518 { 519 u64 max_len = READ_ONCE(perf_sample_allowed_ns); 520 u64 running_len; 521 u64 avg_len; 522 u32 max; 523 524 if (max_len == 0) 525 return; 526 527 /* Decay the counter by 1 average sample. */ 528 running_len = __this_cpu_read(running_sample_length); 529 running_len -= running_len/NR_ACCUMULATED_SAMPLES; 530 running_len += sample_len_ns; 531 __this_cpu_write(running_sample_length, running_len); 532 533 /* 534 * Note: this will be biased artifically low until we have 535 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us 536 * from having to maintain a count. 537 */ 538 avg_len = running_len/NR_ACCUMULATED_SAMPLES; 539 if (avg_len <= max_len) 540 return; 541 542 __report_avg = avg_len; 543 __report_allowed = max_len; 544 545 /* 546 * Compute a throttle threshold 25% below the current duration. 
	 */
	avg_len += avg_len / 4;
	max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
	if (avg_len < max)
		max /= (u32)avg_len;
	else
		max = 1;

	WRITE_ONCE(perf_sample_allowed_ns, avg_len);
	WRITE_ONCE(max_samples_per_tick, max);

	sysctl_perf_event_sample_rate = max * HZ;
	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;

	if (!irq_work_queue(&perf_duration_work)) {
		early_printk("perf: interrupt took too long (%lld > %lld), lowering "
			     "kernel.perf_event_max_sample_rate to %d\n",
			     __report_avg, __report_allowed,
			     sysctl_perf_event_sample_rate);
	}
}

static atomic64_t perf_event_id;

static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
			      enum event_type_t event_type);

static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
			     enum event_type_t event_type,
			     struct task_struct *task);

static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);

void __weak perf_event_print_debug(void) { }

extern __weak const char *perf_pmu_name(void)
{
	return "pmu";
}

static inline u64 perf_clock(void)
{
	return local_clock();
}

static inline u64 perf_event_clock(struct perf_event *event)
{
	return event->clock();
}

/*
 * State based event timekeeping...
 *
 * The basic idea is to use event->state to determine which (if any) time
 * fields to increment with the current delta. This means we only need to
 * update timestamps when we change state or when they are explicitly requested
 * (read).
 *
 * Event groups make things a little more complicated, but not terribly so. The
 * rules for a group are that if the group leader is OFF the entire group is
 * OFF, irrespective of what the group member states are. This results in
 * __perf_effective_state().
 *
 * A further ramification is that when a group leader flips between OFF and
 * !OFF, we need to update all group member times.
 *
 *
 * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
 * need to make sure the relevant context time is updated before we try and
 * update our timestamps.
 */

static __always_inline enum perf_event_state
__perf_effective_state(struct perf_event *event)
{
	struct perf_event *leader = event->group_leader;

	if (leader->state <= PERF_EVENT_STATE_OFF)
		return leader->state;

	return event->state;
}

static __always_inline void
__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
{
	enum perf_event_state state = __perf_effective_state(event);
	u64 delta = now - event->tstamp;

	*enabled = event->total_time_enabled;
	if (state >= PERF_EVENT_STATE_INACTIVE)
		*enabled += delta;

	*running = event->total_time_running;
	if (state >= PERF_EVENT_STATE_ACTIVE)
		*running += delta;
}
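/*
 * A worked example of the bookkeeping above, assuming an event that was
 * enabled at t=100, went ACTIVE at t=150 (so tstamp=150,
 * total_time_enabled=50, total_time_running=0) and is read at now=200:
 *
 *	delta   = 200 - 150 = 50
 *	enabled = 50 + 50   = 100	(state >= INACTIVE, so it accrues)
 *	running =  0 + 50   = 50	(only because the state is ACTIVE)
 *
 * With an OFF group leader, __perf_effective_state() reports OFF for every
 * sibling and neither sum advances until the leader is enabled again.
 */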
static void perf_event_update_time(struct perf_event *event)
{
	u64 now = perf_event_time(event);

	__perf_update_times(event, now, &event->total_time_enabled,
					&event->total_time_running);
	event->tstamp = now;
}

static void perf_event_update_sibling_time(struct perf_event *leader)
{
	struct perf_event *sibling;

	for_each_sibling_event(sibling, leader)
		perf_event_update_time(sibling);
}

static void
perf_event_set_state(struct perf_event *event, enum perf_event_state state)
{
	if (event->state == state)
		return;

	perf_event_update_time(event);
	/*
	 * If a group leader gets enabled/disabled all its siblings
	 * are affected too.
	 */
	if ((event->state < 0) ^ (state < 0))
		perf_event_update_sibling_time(event);

	WRITE_ONCE(event->state, state);
}

#ifdef CONFIG_CGROUP_PERF

static inline bool
perf_cgroup_match(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

	/* @event doesn't care about cgroup */
	if (!event->cgrp)
		return true;

	/* wants specific cgroup scope but @cpuctx isn't associated with any */
	if (!cpuctx->cgrp)
		return false;

	/*
	 * Cgroup scoping is recursive. An event enabled for a cgroup is
	 * also enabled for all its descendant cgroups. If @cpuctx's
	 * cgroup is a descendant of @event's (the test covers identity
	 * case), it's a match.
701 */ 702 return cgroup_is_descendant(cpuctx->cgrp->css.cgroup, 703 event->cgrp->css.cgroup); 704 } 705 706 static inline void perf_detach_cgroup(struct perf_event *event) 707 { 708 css_put(&event->cgrp->css); 709 event->cgrp = NULL; 710 } 711 712 static inline int is_cgroup_event(struct perf_event *event) 713 { 714 return event->cgrp != NULL; 715 } 716 717 static inline u64 perf_cgroup_event_time(struct perf_event *event) 718 { 719 struct perf_cgroup_info *t; 720 721 t = per_cpu_ptr(event->cgrp->info, event->cpu); 722 return t->time; 723 } 724 725 static inline void __update_cgrp_time(struct perf_cgroup *cgrp) 726 { 727 struct perf_cgroup_info *info; 728 u64 now; 729 730 now = perf_clock(); 731 732 info = this_cpu_ptr(cgrp->info); 733 734 info->time += now - info->timestamp; 735 info->timestamp = now; 736 } 737 738 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) 739 { 740 struct perf_cgroup *cgrp = cpuctx->cgrp; 741 struct cgroup_subsys_state *css; 742 743 if (cgrp) { 744 for (css = &cgrp->css; css; css = css->parent) { 745 cgrp = container_of(css, struct perf_cgroup, css); 746 __update_cgrp_time(cgrp); 747 } 748 } 749 } 750 751 static inline void update_cgrp_time_from_event(struct perf_event *event) 752 { 753 struct perf_cgroup *cgrp; 754 755 /* 756 * ensure we access cgroup data only when needed and 757 * when we know the cgroup is pinned (css_get) 758 */ 759 if (!is_cgroup_event(event)) 760 return; 761 762 cgrp = perf_cgroup_from_task(current, event->ctx); 763 /* 764 * Do not update time when cgroup is not active 765 */ 766 if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) 767 __update_cgrp_time(event->cgrp); 768 } 769 770 static inline void 771 perf_cgroup_set_timestamp(struct task_struct *task, 772 struct perf_event_context *ctx) 773 { 774 struct perf_cgroup *cgrp; 775 struct perf_cgroup_info *info; 776 struct cgroup_subsys_state *css; 777 778 /* 779 * ctx->lock held by caller 780 * ensure we do not access cgroup data 781 * unless we have the cgroup pinned (css_get) 782 */ 783 if (!task || !ctx->nr_cgroups) 784 return; 785 786 cgrp = perf_cgroup_from_task(task, ctx); 787 788 for (css = &cgrp->css; css; css = css->parent) { 789 cgrp = container_of(css, struct perf_cgroup, css); 790 info = this_cpu_ptr(cgrp->info); 791 info->timestamp = ctx->timestamp; 792 } 793 } 794 795 static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list); 796 797 #define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ 798 #define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ 799 800 /* 801 * reschedule events based on the cgroup constraint of task. 802 * 803 * mode SWOUT : schedule out everything 804 * mode SWIN : schedule in based on cgroup for next 805 */ 806 static void perf_cgroup_switch(struct task_struct *task, int mode) 807 { 808 struct perf_cpu_context *cpuctx; 809 struct list_head *list; 810 unsigned long flags; 811 812 /* 813 * Disable interrupts and preemption to avoid this CPU's 814 * cgrp_cpuctx_entry to change under us. 
	 */
	local_irq_save(flags);

	list = this_cpu_ptr(&cgrp_cpuctx_list);
	list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
		WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);

		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
		perf_pmu_disable(cpuctx->ctx.pmu);

		if (mode & PERF_CGROUP_SWOUT) {
			cpu_ctx_sched_out(cpuctx, EVENT_ALL);
			/*
			 * must not be done before ctxswout due
			 * to event_filter_match() in event_sched_out()
			 */
			cpuctx->cgrp = NULL;
		}

		if (mode & PERF_CGROUP_SWIN) {
			WARN_ON_ONCE(cpuctx->cgrp);
			/*
			 * set cgrp before ctxsw in to allow
			 * event_filter_match() to not have to pass
			 * task around
			 * we pass the cpuctx->ctx to perf_cgroup_from_task()
			 * because cgroup events are only per-cpu
			 */
			cpuctx->cgrp = perf_cgroup_from_task(task,
							     &cpuctx->ctx);
			cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
		}
		perf_pmu_enable(cpuctx->ctx.pmu);
		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
	}

	local_irq_restore(flags);
}

static inline void perf_cgroup_sched_out(struct task_struct *task,
					 struct task_struct *next)
{
	struct perf_cgroup *cgrp1;
	struct perf_cgroup *cgrp2 = NULL;

	rcu_read_lock();
	/*
	 * we come here when we know perf_cgroup_events > 0
	 * we do not need to pass the ctx here because we know
	 * we are holding the rcu lock
	 */
	cgrp1 = perf_cgroup_from_task(task, NULL);
	cgrp2 = perf_cgroup_from_task(next, NULL);

	/*
	 * only schedule out current cgroup events if we know
	 * that we are switching to a different cgroup. Otherwise,
	 * do not touch the cgroup events.
	 */
	if (cgrp1 != cgrp2)
		perf_cgroup_switch(task, PERF_CGROUP_SWOUT);

	rcu_read_unlock();
}

static inline void perf_cgroup_sched_in(struct task_struct *prev,
					struct task_struct *task)
{
	struct perf_cgroup *cgrp1;
	struct perf_cgroup *cgrp2 = NULL;

	rcu_read_lock();
	/*
	 * we come here when we know perf_cgroup_events > 0
	 * we do not need to pass the ctx here because we know
	 * we are holding the rcu lock
	 */
	cgrp1 = perf_cgroup_from_task(task, NULL);
	cgrp2 = perf_cgroup_from_task(prev, NULL);

	/*
	 * only need to schedule in cgroup events if we are changing
	 * cgroup during ctxsw. Cgroup events were not scheduled
	 * out during ctxsw if that was not the case.
	 */
	if (cgrp1 != cgrp2)
		perf_cgroup_switch(task, PERF_CGROUP_SWIN);

	rcu_read_unlock();
}

static int perf_cgroup_ensure_storage(struct perf_event *event,
				      struct cgroup_subsys_state *css)
{
	struct perf_cpu_context *cpuctx;
	struct perf_event **storage;
	int cpu, heap_size, ret = 0;

	/*
	 * Allow storage to have sufficient space for an iterator for each
	 * possibly nested cgroup plus an iterator for events with no cgroup.
916 */ 917 for (heap_size = 1; css; css = css->parent) 918 heap_size++; 919 920 for_each_possible_cpu(cpu) { 921 cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu); 922 if (heap_size <= cpuctx->heap_size) 923 continue; 924 925 storage = kmalloc_node(heap_size * sizeof(struct perf_event *), 926 GFP_KERNEL, cpu_to_node(cpu)); 927 if (!storage) { 928 ret = -ENOMEM; 929 break; 930 } 931 932 raw_spin_lock_irq(&cpuctx->ctx.lock); 933 if (cpuctx->heap_size < heap_size) { 934 swap(cpuctx->heap, storage); 935 if (storage == cpuctx->heap_default) 936 storage = NULL; 937 cpuctx->heap_size = heap_size; 938 } 939 raw_spin_unlock_irq(&cpuctx->ctx.lock); 940 941 kfree(storage); 942 } 943 944 return ret; 945 } 946 947 static inline int perf_cgroup_connect(int fd, struct perf_event *event, 948 struct perf_event_attr *attr, 949 struct perf_event *group_leader) 950 { 951 struct perf_cgroup *cgrp; 952 struct cgroup_subsys_state *css; 953 struct fd f = fdget(fd); 954 int ret = 0; 955 956 if (!f.file) 957 return -EBADF; 958 959 css = css_tryget_online_from_dir(f.file->f_path.dentry, 960 &perf_event_cgrp_subsys); 961 if (IS_ERR(css)) { 962 ret = PTR_ERR(css); 963 goto out; 964 } 965 966 ret = perf_cgroup_ensure_storage(event, css); 967 if (ret) 968 goto out; 969 970 cgrp = container_of(css, struct perf_cgroup, css); 971 event->cgrp = cgrp; 972 973 /* 974 * all events in a group must monitor 975 * the same cgroup because a task belongs 976 * to only one perf cgroup at a time 977 */ 978 if (group_leader && group_leader->cgrp != cgrp) { 979 perf_detach_cgroup(event); 980 ret = -EINVAL; 981 } 982 out: 983 fdput(f); 984 return ret; 985 } 986 987 static inline void 988 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) 989 { 990 struct perf_cgroup_info *t; 991 t = per_cpu_ptr(event->cgrp->info, event->cpu); 992 event->shadow_ctx_time = now - t->timestamp; 993 } 994 995 static inline void 996 perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) 997 { 998 struct perf_cpu_context *cpuctx; 999 1000 if (!is_cgroup_event(event)) 1001 return; 1002 1003 /* 1004 * Because cgroup events are always per-cpu events, 1005 * @ctx == &cpuctx->ctx. 1006 */ 1007 cpuctx = container_of(ctx, struct perf_cpu_context, ctx); 1008 1009 /* 1010 * Since setting cpuctx->cgrp is conditional on the current @cgrp 1011 * matching the event's cgroup, we must do this for every new event, 1012 * because if the first would mismatch, the second would not try again 1013 * and we would leave cpuctx->cgrp unset. 1014 */ 1015 if (ctx->is_active && !cpuctx->cgrp) { 1016 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); 1017 1018 if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) 1019 cpuctx->cgrp = cgrp; 1020 } 1021 1022 if (ctx->nr_cgroups++) 1023 return; 1024 1025 list_add(&cpuctx->cgrp_cpuctx_entry, 1026 per_cpu_ptr(&cgrp_cpuctx_list, event->cpu)); 1027 } 1028 1029 static inline void 1030 perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) 1031 { 1032 struct perf_cpu_context *cpuctx; 1033 1034 if (!is_cgroup_event(event)) 1035 return; 1036 1037 /* 1038 * Because cgroup events are always per-cpu events, 1039 * @ctx == &cpuctx->ctx. 
1040 */ 1041 cpuctx = container_of(ctx, struct perf_cpu_context, ctx); 1042 1043 if (--ctx->nr_cgroups) 1044 return; 1045 1046 if (ctx->is_active && cpuctx->cgrp) 1047 cpuctx->cgrp = NULL; 1048 1049 list_del(&cpuctx->cgrp_cpuctx_entry); 1050 } 1051 1052 #else /* !CONFIG_CGROUP_PERF */ 1053 1054 static inline bool 1055 perf_cgroup_match(struct perf_event *event) 1056 { 1057 return true; 1058 } 1059 1060 static inline void perf_detach_cgroup(struct perf_event *event) 1061 {} 1062 1063 static inline int is_cgroup_event(struct perf_event *event) 1064 { 1065 return 0; 1066 } 1067 1068 static inline void update_cgrp_time_from_event(struct perf_event *event) 1069 { 1070 } 1071 1072 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) 1073 { 1074 } 1075 1076 static inline void perf_cgroup_sched_out(struct task_struct *task, 1077 struct task_struct *next) 1078 { 1079 } 1080 1081 static inline void perf_cgroup_sched_in(struct task_struct *prev, 1082 struct task_struct *task) 1083 { 1084 } 1085 1086 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, 1087 struct perf_event_attr *attr, 1088 struct perf_event *group_leader) 1089 { 1090 return -EINVAL; 1091 } 1092 1093 static inline void 1094 perf_cgroup_set_timestamp(struct task_struct *task, 1095 struct perf_event_context *ctx) 1096 { 1097 } 1098 1099 static inline void 1100 perf_cgroup_switch(struct task_struct *task, struct task_struct *next) 1101 { 1102 } 1103 1104 static inline void 1105 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) 1106 { 1107 } 1108 1109 static inline u64 perf_cgroup_event_time(struct perf_event *event) 1110 { 1111 return 0; 1112 } 1113 1114 static inline void 1115 perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) 1116 { 1117 } 1118 1119 static inline void 1120 perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) 1121 { 1122 } 1123 #endif 1124 1125 /* 1126 * set default to be dependent on timer tick just 1127 * like original code 1128 */ 1129 #define PERF_CPU_HRTIMER (1000 / HZ) 1130 /* 1131 * function must be called with interrupts disabled 1132 */ 1133 static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) 1134 { 1135 struct perf_cpu_context *cpuctx; 1136 bool rotations; 1137 1138 lockdep_assert_irqs_disabled(); 1139 1140 cpuctx = container_of(hr, struct perf_cpu_context, hrtimer); 1141 rotations = perf_rotate_context(cpuctx); 1142 1143 raw_spin_lock(&cpuctx->hrtimer_lock); 1144 if (rotations) 1145 hrtimer_forward_now(hr, cpuctx->hrtimer_interval); 1146 else 1147 cpuctx->hrtimer_active = 0; 1148 raw_spin_unlock(&cpuctx->hrtimer_lock); 1149 1150 return rotations ? 
			  HRTIMER_RESTART : HRTIMER_NORESTART;
}

static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
{
	struct hrtimer *timer = &cpuctx->hrtimer;
	struct pmu *pmu = cpuctx->ctx.pmu;
	u64 interval;

	/* no multiplexing needed for SW PMU */
	if (pmu->task_ctx_nr == perf_sw_context)
		return;

	/*
	 * check default is sane, if not set then force to
	 * default interval (1/tick)
	 */
	interval = pmu->hrtimer_interval_ms;
	if (interval < 1)
		interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;

	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);

	raw_spin_lock_init(&cpuctx->hrtimer_lock);
	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
	timer->function = perf_mux_hrtimer_handler;
}

static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
{
	struct hrtimer *timer = &cpuctx->hrtimer;
	struct pmu *pmu = cpuctx->ctx.pmu;
	unsigned long flags;

	/* not for SW PMU */
	if (pmu->task_ctx_nr == perf_sw_context)
		return 0;

	raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
	if (!cpuctx->hrtimer_active) {
		cpuctx->hrtimer_active = 1;
		hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
		hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
	}
	raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);

	return 0;
}

void perf_pmu_disable(struct pmu *pmu)
{
	int *count = this_cpu_ptr(pmu->pmu_disable_count);
	if (!(*count)++)
		pmu->pmu_disable(pmu);
}

void perf_pmu_enable(struct pmu *pmu)
{
	int *count = this_cpu_ptr(pmu->pmu_disable_count);
	if (!--(*count))
		pmu->pmu_enable(pmu);
}
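/*
 * The disable count above nests per CPU, so a sequence like the following
 * (hypothetical caller) only touches the hardware at the outermost level:
 *
 *	perf_pmu_disable(pmu);		// 0 -> 1, pmu->pmu_disable() called
 *	perf_pmu_disable(pmu);		// 1 -> 2, no-op
 *	...reprogram events...
 *	perf_pmu_enable(pmu);		// 2 -> 1, no-op
 *	perf_pmu_enable(pmu);		// 1 -> 0, pmu->pmu_enable() called
 */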
static DEFINE_PER_CPU(struct list_head, active_ctx_list);

/*
 * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
 * perf_event_task_tick() are fully serialized because they're strictly cpu
 * affine and perf_event_ctx{activate,deactivate} are called with IRQs
 * disabled, while perf_event_task_tick is called from IRQ context.
 */
static void perf_event_ctx_activate(struct perf_event_context *ctx)
{
	struct list_head *head = this_cpu_ptr(&active_ctx_list);

	lockdep_assert_irqs_disabled();

	WARN_ON(!list_empty(&ctx->active_ctx_list));

	list_add(&ctx->active_ctx_list, head);
}

static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
{
	lockdep_assert_irqs_disabled();

	WARN_ON(list_empty(&ctx->active_ctx_list));

	list_del_init(&ctx->active_ctx_list);
}

static void get_ctx(struct perf_event_context *ctx)
{
	refcount_inc(&ctx->refcount);
}

static void *alloc_task_ctx_data(struct pmu *pmu)
{
	if (pmu->task_ctx_cache)
		return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);

	return NULL;
}

static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
{
	if (pmu->task_ctx_cache && task_ctx_data)
		kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
}

static void free_ctx(struct rcu_head *head)
{
	struct perf_event_context *ctx;

	ctx = container_of(head, struct perf_event_context, rcu_head);
	free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
	kfree(ctx);
}

static void put_ctx(struct perf_event_context *ctx)
{
	if (refcount_dec_and_test(&ctx->refcount)) {
		if (ctx->parent_ctx)
			put_ctx(ctx->parent_ctx);
		if (ctx->task && ctx->task != TASK_TOMBSTONE)
			put_task_struct(ctx->task);
		call_rcu(&ctx->rcu_head, free_ctx);
	}
}

/*
 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
 * perf_pmu_migrate_context() we need some magic.
 *
 * Those places that change perf_event::ctx will hold both
 * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
 *
 * Lock ordering is by mutex address. There are two other sites where
 * perf_event_context::mutex nests and those are:
 *
 *  - perf_event_exit_task_context()	[ child , 0 ]
 *      perf_event_exit_event()
 *        put_event()			[ parent, 1 ]
 *
 *  - perf_event_init_context()		[ parent, 0 ]
 *      inherit_task_group()
 *        inherit_group()
 *          inherit_event()
 *            perf_event_alloc()
 *              perf_init_event()
 *                perf_try_init_event()	[ child , 1 ]
 *
 * While it appears there is an obvious deadlock here -- the parent and child
 * nesting levels are inverted between the two. This is in fact safe because
 * life-time rules separate them. That is an exiting task cannot fork, and a
 * spawning task cannot (yet) exit.
 *
 * But remember that these are parent<->child context relations, and
 * migration does not affect children, therefore these two orderings should not
 * interact.
 *
 * The change in perf_event::ctx does not affect children (as claimed above)
 * because the sys_perf_event_open() case will install a new event and break
 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
 * concerned with cpuctx and that doesn't have children.
 *
 * The places that change perf_event::ctx will issue:
 *
 *   perf_remove_from_context();
 *   synchronize_rcu();
 *   perf_install_in_context();
 *
 * to effect the change. The remove_from_context() + synchronize_rcu() should
 * quiesce the event, after which we can install it in the new location.
This 1324 * means that only external vectors (perf_fops, prctl) can perturb the event 1325 * while in transit. Therefore all such accessors should also acquire 1326 * perf_event_context::mutex to serialize against this. 1327 * 1328 * However; because event->ctx can change while we're waiting to acquire 1329 * ctx->mutex we must be careful and use the below perf_event_ctx_lock() 1330 * function. 1331 * 1332 * Lock order: 1333 * exec_update_lock 1334 * task_struct::perf_event_mutex 1335 * perf_event_context::mutex 1336 * perf_event::child_mutex; 1337 * perf_event_context::lock 1338 * perf_event::mmap_mutex 1339 * mmap_lock 1340 * perf_addr_filters_head::lock 1341 * 1342 * cpu_hotplug_lock 1343 * pmus_lock 1344 * cpuctx->mutex / perf_event_context::mutex 1345 */ 1346 static struct perf_event_context * 1347 perf_event_ctx_lock_nested(struct perf_event *event, int nesting) 1348 { 1349 struct perf_event_context *ctx; 1350 1351 again: 1352 rcu_read_lock(); 1353 ctx = READ_ONCE(event->ctx); 1354 if (!refcount_inc_not_zero(&ctx->refcount)) { 1355 rcu_read_unlock(); 1356 goto again; 1357 } 1358 rcu_read_unlock(); 1359 1360 mutex_lock_nested(&ctx->mutex, nesting); 1361 if (event->ctx != ctx) { 1362 mutex_unlock(&ctx->mutex); 1363 put_ctx(ctx); 1364 goto again; 1365 } 1366 1367 return ctx; 1368 } 1369 1370 static inline struct perf_event_context * 1371 perf_event_ctx_lock(struct perf_event *event) 1372 { 1373 return perf_event_ctx_lock_nested(event, 0); 1374 } 1375 1376 static void perf_event_ctx_unlock(struct perf_event *event, 1377 struct perf_event_context *ctx) 1378 { 1379 mutex_unlock(&ctx->mutex); 1380 put_ctx(ctx); 1381 } 1382 1383 /* 1384 * This must be done under the ctx->lock, such as to serialize against 1385 * context_equiv(), therefore we cannot call put_ctx() since that might end up 1386 * calling scheduler related locks and ctx->lock nests inside those. 1387 */ 1388 static __must_check struct perf_event_context * 1389 unclone_ctx(struct perf_event_context *ctx) 1390 { 1391 struct perf_event_context *parent_ctx = ctx->parent_ctx; 1392 1393 lockdep_assert_held(&ctx->lock); 1394 1395 if (parent_ctx) 1396 ctx->parent_ctx = NULL; 1397 ctx->generation++; 1398 1399 return parent_ctx; 1400 } 1401 1402 static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p, 1403 enum pid_type type) 1404 { 1405 u32 nr; 1406 /* 1407 * only top level events have the pid namespace they were created in 1408 */ 1409 if (event->parent) 1410 event = event->parent; 1411 1412 nr = __task_pid_nr_ns(p, type, event->ns); 1413 /* avoid -1 if it is idle thread or runs in another ns */ 1414 if (!nr && !pid_alive(p)) 1415 nr = -1; 1416 return nr; 1417 } 1418 1419 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) 1420 { 1421 return perf_event_pid_type(event, p, PIDTYPE_TGID); 1422 } 1423 1424 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) 1425 { 1426 return perf_event_pid_type(event, p, PIDTYPE_PID); 1427 } 1428 1429 /* 1430 * If we inherit events we want to return the parent event id 1431 * to userspace. 1432 */ 1433 static u64 primary_event_id(struct perf_event *event) 1434 { 1435 u64 id = event->id; 1436 1437 if (event->parent) 1438 id = event->parent->id; 1439 1440 return id; 1441 } 1442 1443 /* 1444 * Get the perf_event_context for a task and lock it. 1445 * 1446 * This has to cope with the fact that until it is locked, 1447 * the context could get moved to another task. 
1448 */ 1449 static struct perf_event_context * 1450 perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) 1451 { 1452 struct perf_event_context *ctx; 1453 1454 retry: 1455 /* 1456 * One of the few rules of preemptible RCU is that one cannot do 1457 * rcu_read_unlock() while holding a scheduler (or nested) lock when 1458 * part of the read side critical section was irqs-enabled -- see 1459 * rcu_read_unlock_special(). 1460 * 1461 * Since ctx->lock nests under rq->lock we must ensure the entire read 1462 * side critical section has interrupts disabled. 1463 */ 1464 local_irq_save(*flags); 1465 rcu_read_lock(); 1466 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); 1467 if (ctx) { 1468 /* 1469 * If this context is a clone of another, it might 1470 * get swapped for another underneath us by 1471 * perf_event_task_sched_out, though the 1472 * rcu_read_lock() protects us from any context 1473 * getting freed. Lock the context and check if it 1474 * got swapped before we could get the lock, and retry 1475 * if so. If we locked the right context, then it 1476 * can't get swapped on us any more. 1477 */ 1478 raw_spin_lock(&ctx->lock); 1479 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { 1480 raw_spin_unlock(&ctx->lock); 1481 rcu_read_unlock(); 1482 local_irq_restore(*flags); 1483 goto retry; 1484 } 1485 1486 if (ctx->task == TASK_TOMBSTONE || 1487 !refcount_inc_not_zero(&ctx->refcount)) { 1488 raw_spin_unlock(&ctx->lock); 1489 ctx = NULL; 1490 } else { 1491 WARN_ON_ONCE(ctx->task != task); 1492 } 1493 } 1494 rcu_read_unlock(); 1495 if (!ctx) 1496 local_irq_restore(*flags); 1497 return ctx; 1498 } 1499 1500 /* 1501 * Get the context for a task and increment its pin_count so it 1502 * can't get swapped to another task. This also increments its 1503 * reference count so that the context can't get freed. 1504 */ 1505 static struct perf_event_context * 1506 perf_pin_task_context(struct task_struct *task, int ctxn) 1507 { 1508 struct perf_event_context *ctx; 1509 unsigned long flags; 1510 1511 ctx = perf_lock_task_context(task, ctxn, &flags); 1512 if (ctx) { 1513 ++ctx->pin_count; 1514 raw_spin_unlock_irqrestore(&ctx->lock, flags); 1515 } 1516 return ctx; 1517 } 1518 1519 static void perf_unpin_context(struct perf_event_context *ctx) 1520 { 1521 unsigned long flags; 1522 1523 raw_spin_lock_irqsave(&ctx->lock, flags); 1524 --ctx->pin_count; 1525 raw_spin_unlock_irqrestore(&ctx->lock, flags); 1526 } 1527 1528 /* 1529 * Update the record of the current time in a context. 1530 */ 1531 static void update_context_time(struct perf_event_context *ctx) 1532 { 1533 u64 now = perf_clock(); 1534 1535 ctx->time += now - ctx->timestamp; 1536 ctx->timestamp = now; 1537 } 1538 1539 static u64 perf_event_time(struct perf_event *event) 1540 { 1541 struct perf_event_context *ctx = event->ctx; 1542 1543 if (is_cgroup_event(event)) 1544 return perf_cgroup_event_time(event); 1545 1546 return ctx ? ctx->time : 0; 1547 } 1548 1549 static enum event_type_t get_event_type(struct perf_event *event) 1550 { 1551 struct perf_event_context *ctx = event->ctx; 1552 enum event_type_t event_type; 1553 1554 lockdep_assert_held(&ctx->lock); 1555 1556 /* 1557 * It's 'group type', really, because if our group leader is 1558 * pinned, so are we. 1559 */ 1560 if (event->group_leader != event) 1561 event = event->group_leader; 1562 1563 event_type = event->attr.pinned ? 
EVENT_PINNED : EVENT_FLEXIBLE; 1564 if (!ctx->task) 1565 event_type |= EVENT_CPU; 1566 1567 return event_type; 1568 } 1569 1570 /* 1571 * Helper function to initialize event group nodes. 1572 */ 1573 static void init_event_group(struct perf_event *event) 1574 { 1575 RB_CLEAR_NODE(&event->group_node); 1576 event->group_index = 0; 1577 } 1578 1579 /* 1580 * Extract pinned or flexible groups from the context 1581 * based on event attrs bits. 1582 */ 1583 static struct perf_event_groups * 1584 get_event_groups(struct perf_event *event, struct perf_event_context *ctx) 1585 { 1586 if (event->attr.pinned) 1587 return &ctx->pinned_groups; 1588 else 1589 return &ctx->flexible_groups; 1590 } 1591 1592 /* 1593 * Helper function to initializes perf_event_group trees. 1594 */ 1595 static void perf_event_groups_init(struct perf_event_groups *groups) 1596 { 1597 groups->tree = RB_ROOT; 1598 groups->index = 0; 1599 } 1600 1601 static inline struct cgroup *event_cgroup(const struct perf_event *event) 1602 { 1603 struct cgroup *cgroup = NULL; 1604 1605 #ifdef CONFIG_CGROUP_PERF 1606 if (event->cgrp) 1607 cgroup = event->cgrp->css.cgroup; 1608 #endif 1609 1610 return cgroup; 1611 } 1612 1613 /* 1614 * Compare function for event groups; 1615 * 1616 * Implements complex key that first sorts by CPU and then by virtual index 1617 * which provides ordering when rotating groups for the same CPU. 1618 */ 1619 static __always_inline int 1620 perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup, 1621 const u64 left_group_index, const struct perf_event *right) 1622 { 1623 if (left_cpu < right->cpu) 1624 return -1; 1625 if (left_cpu > right->cpu) 1626 return 1; 1627 1628 #ifdef CONFIG_CGROUP_PERF 1629 { 1630 const struct cgroup *right_cgroup = event_cgroup(right); 1631 1632 if (left_cgroup != right_cgroup) { 1633 if (!left_cgroup) { 1634 /* 1635 * Left has no cgroup but right does, no 1636 * cgroups come first. 1637 */ 1638 return -1; 1639 } 1640 if (!right_cgroup) { 1641 /* 1642 * Right has no cgroup but left does, no 1643 * cgroups come first. 1644 */ 1645 return 1; 1646 } 1647 /* Two dissimilar cgroups, order by id. */ 1648 if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup)) 1649 return -1; 1650 1651 return 1; 1652 } 1653 } 1654 #endif 1655 1656 if (left_group_index < right->group_index) 1657 return -1; 1658 if (left_group_index > right->group_index) 1659 return 1; 1660 1661 return 0; 1662 } 1663 1664 #define __node_2_pe(node) \ 1665 rb_entry((node), struct perf_event, group_node) 1666 1667 static inline bool __group_less(struct rb_node *a, const struct rb_node *b) 1668 { 1669 struct perf_event *e = __node_2_pe(a); 1670 return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index, 1671 __node_2_pe(b)) < 0; 1672 } 1673 1674 struct __group_key { 1675 int cpu; 1676 struct cgroup *cgroup; 1677 }; 1678 1679 static inline int __group_cmp(const void *key, const struct rb_node *node) 1680 { 1681 const struct __group_key *a = key; 1682 const struct perf_event *b = __node_2_pe(node); 1683 1684 /* partial/subtree match: @cpu, @cgroup; ignore: @group_index */ 1685 return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b); 1686 } 1687 1688 /* 1689 * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for 1690 * key (see perf_event_groups_less). This places it last inside the CPU 1691 * subtree. 
1692 */ 1693 static void 1694 perf_event_groups_insert(struct perf_event_groups *groups, 1695 struct perf_event *event) 1696 { 1697 event->group_index = ++groups->index; 1698 1699 rb_add(&event->group_node, &groups->tree, __group_less); 1700 } 1701 1702 /* 1703 * Helper function to insert event into the pinned or flexible groups. 1704 */ 1705 static void 1706 add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx) 1707 { 1708 struct perf_event_groups *groups; 1709 1710 groups = get_event_groups(event, ctx); 1711 perf_event_groups_insert(groups, event); 1712 } 1713 1714 /* 1715 * Delete a group from a tree. 1716 */ 1717 static void 1718 perf_event_groups_delete(struct perf_event_groups *groups, 1719 struct perf_event *event) 1720 { 1721 WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) || 1722 RB_EMPTY_ROOT(&groups->tree)); 1723 1724 rb_erase(&event->group_node, &groups->tree); 1725 init_event_group(event); 1726 } 1727 1728 /* 1729 * Helper function to delete event from its groups. 1730 */ 1731 static void 1732 del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) 1733 { 1734 struct perf_event_groups *groups; 1735 1736 groups = get_event_groups(event, ctx); 1737 perf_event_groups_delete(groups, event); 1738 } 1739 1740 /* 1741 * Get the leftmost event in the cpu/cgroup subtree. 1742 */ 1743 static struct perf_event * 1744 perf_event_groups_first(struct perf_event_groups *groups, int cpu, 1745 struct cgroup *cgrp) 1746 { 1747 struct __group_key key = { 1748 .cpu = cpu, 1749 .cgroup = cgrp, 1750 }; 1751 struct rb_node *node; 1752 1753 node = rb_find_first(&key, &groups->tree, __group_cmp); 1754 if (node) 1755 return __node_2_pe(node); 1756 1757 return NULL; 1758 } 1759 1760 /* 1761 * Like rb_entry_next_safe() for the @cpu subtree. 1762 */ 1763 static struct perf_event * 1764 perf_event_groups_next(struct perf_event *event) 1765 { 1766 struct __group_key key = { 1767 .cpu = event->cpu, 1768 .cgroup = event_cgroup(event), 1769 }; 1770 struct rb_node *next; 1771 1772 next = rb_next_match(&key, &event->group_node, __group_cmp); 1773 if (next) 1774 return __node_2_pe(next); 1775 1776 return NULL; 1777 } 1778 1779 /* 1780 * Iterate through the whole groups tree. 1781 */ 1782 #define perf_event_groups_for_each(event, groups) \ 1783 for (event = rb_entry_safe(rb_first(&((groups)->tree)), \ 1784 typeof(*event), group_node); event; \ 1785 event = rb_entry_safe(rb_next(&event->group_node), \ 1786 typeof(*event), group_node)) 1787 1788 /* 1789 * Add an event from the lists for its context. 1790 * Must be called with ctx->mutex and ctx->lock held. 1791 */ 1792 static void 1793 list_add_event(struct perf_event *event, struct perf_event_context *ctx) 1794 { 1795 lockdep_assert_held(&ctx->lock); 1796 1797 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); 1798 event->attach_state |= PERF_ATTACH_CONTEXT; 1799 1800 event->tstamp = perf_event_time(event); 1801 1802 /* 1803 * If we're a stand alone event or group leader, we go to the context 1804 * list, group events are kept attached to the group so that 1805 * perf_group_detach can, at all times, locate all siblings. 
1806 */ 1807 if (event->group_leader == event) { 1808 event->group_caps = event->event_caps; 1809 add_event_to_groups(event, ctx); 1810 } 1811 1812 list_add_rcu(&event->event_entry, &ctx->event_list); 1813 ctx->nr_events++; 1814 if (event->attr.inherit_stat) 1815 ctx->nr_stat++; 1816 1817 if (event->state > PERF_EVENT_STATE_OFF) 1818 perf_cgroup_event_enable(event, ctx); 1819 1820 ctx->generation++; 1821 } 1822 1823 /* 1824 * Initialize event state based on the perf_event_attr::disabled. 1825 */ 1826 static inline void perf_event__state_init(struct perf_event *event) 1827 { 1828 event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF : 1829 PERF_EVENT_STATE_INACTIVE; 1830 } 1831 1832 static void __perf_event_read_size(struct perf_event *event, int nr_siblings) 1833 { 1834 int entry = sizeof(u64); /* value */ 1835 int size = 0; 1836 int nr = 1; 1837 1838 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 1839 size += sizeof(u64); 1840 1841 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 1842 size += sizeof(u64); 1843 1844 if (event->attr.read_format & PERF_FORMAT_ID) 1845 entry += sizeof(u64); 1846 1847 if (event->attr.read_format & PERF_FORMAT_GROUP) { 1848 nr += nr_siblings; 1849 size += sizeof(u64); 1850 } 1851 1852 size += entry * nr; 1853 event->read_size = size; 1854 } 1855 1856 static void __perf_event_header_size(struct perf_event *event, u64 sample_type) 1857 { 1858 struct perf_sample_data *data; 1859 u16 size = 0; 1860 1861 if (sample_type & PERF_SAMPLE_IP) 1862 size += sizeof(data->ip); 1863 1864 if (sample_type & PERF_SAMPLE_ADDR) 1865 size += sizeof(data->addr); 1866 1867 if (sample_type & PERF_SAMPLE_PERIOD) 1868 size += sizeof(data->period); 1869 1870 if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) 1871 size += sizeof(data->weight.full); 1872 1873 if (sample_type & PERF_SAMPLE_READ) 1874 size += event->read_size; 1875 1876 if (sample_type & PERF_SAMPLE_DATA_SRC) 1877 size += sizeof(data->data_src.val); 1878 1879 if (sample_type & PERF_SAMPLE_TRANSACTION) 1880 size += sizeof(data->txn); 1881 1882 if (sample_type & PERF_SAMPLE_PHYS_ADDR) 1883 size += sizeof(data->phys_addr); 1884 1885 if (sample_type & PERF_SAMPLE_CGROUP) 1886 size += sizeof(data->cgroup); 1887 1888 if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) 1889 size += sizeof(data->data_page_size); 1890 1891 if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) 1892 size += sizeof(data->code_page_size); 1893 1894 event->header_size = size; 1895 } 1896 1897 /* 1898 * Called at perf_event creation and when events are attached/detached from a 1899 * group. 
1900 */ 1901 static void perf_event__header_size(struct perf_event *event) 1902 { 1903 __perf_event_read_size(event, 1904 event->group_leader->nr_siblings); 1905 __perf_event_header_size(event, event->attr.sample_type); 1906 } 1907 1908 static void perf_event__id_header_size(struct perf_event *event) 1909 { 1910 struct perf_sample_data *data; 1911 u64 sample_type = event->attr.sample_type; 1912 u16 size = 0; 1913 1914 if (sample_type & PERF_SAMPLE_TID) 1915 size += sizeof(data->tid_entry); 1916 1917 if (sample_type & PERF_SAMPLE_TIME) 1918 size += sizeof(data->time); 1919 1920 if (sample_type & PERF_SAMPLE_IDENTIFIER) 1921 size += sizeof(data->id); 1922 1923 if (sample_type & PERF_SAMPLE_ID) 1924 size += sizeof(data->id); 1925 1926 if (sample_type & PERF_SAMPLE_STREAM_ID) 1927 size += sizeof(data->stream_id); 1928 1929 if (sample_type & PERF_SAMPLE_CPU) 1930 size += sizeof(data->cpu_entry); 1931 1932 event->id_header_size = size; 1933 } 1934 1935 static bool perf_event_validate_size(struct perf_event *event) 1936 { 1937 /* 1938 * The values computed here will be over-written when we actually 1939 * attach the event. 1940 */ 1941 __perf_event_read_size(event, event->group_leader->nr_siblings + 1); 1942 __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ); 1943 perf_event__id_header_size(event); 1944 1945 /* 1946 * Sum the lot; should not exceed the 64k limit we have on records. 1947 * Conservative limit to allow for callchains and other variable fields. 1948 */ 1949 if (event->read_size + event->header_size + 1950 event->id_header_size + sizeof(struct perf_event_header) >= 16*1024) 1951 return false; 1952 1953 return true; 1954 } 1955 1956 static void perf_group_attach(struct perf_event *event) 1957 { 1958 struct perf_event *group_leader = event->group_leader, *pos; 1959 1960 lockdep_assert_held(&event->ctx->lock); 1961 1962 /* 1963 * We can have double attach due to group movement in perf_event_open. 1964 */ 1965 if (event->attach_state & PERF_ATTACH_GROUP) 1966 return; 1967 1968 event->attach_state |= PERF_ATTACH_GROUP; 1969 1970 if (group_leader == event) 1971 return; 1972 1973 WARN_ON_ONCE(group_leader->ctx != event->ctx); 1974 1975 group_leader->group_caps &= event->event_caps; 1976 1977 list_add_tail(&event->sibling_list, &group_leader->sibling_list); 1978 group_leader->nr_siblings++; 1979 1980 perf_event__header_size(group_leader); 1981 1982 for_each_sibling_event(pos, group_leader) 1983 perf_event__header_size(pos); 1984 } 1985 1986 /* 1987 * Remove an event from the lists for its context. 1988 * Must be called with ctx->mutex and ctx->lock held. 1989 */ 1990 static void 1991 list_del_event(struct perf_event *event, struct perf_event_context *ctx) 1992 { 1993 WARN_ON_ONCE(event->ctx != ctx); 1994 lockdep_assert_held(&ctx->lock); 1995 1996 /* 1997 * We can have double detach due to exit/hot-unplug + close. 1998 */ 1999 if (!(event->attach_state & PERF_ATTACH_CONTEXT)) 2000 return; 2001 2002 event->attach_state &= ~PERF_ATTACH_CONTEXT; 2003 2004 ctx->nr_events--; 2005 if (event->attr.inherit_stat) 2006 ctx->nr_stat--; 2007 2008 list_del_rcu(&event->event_entry); 2009 2010 if (event->group_leader == event) 2011 del_event_from_groups(event, ctx); 2012 2013 /* 2014 * If event was in error state, then keep it 2015 * that way, otherwise bogus counts will be 2016 * returned on read(). 
The only way to get out
2017	 * of error state is by explicit re-enabling
2018	 * of the event
2019	 */
2020	if (event->state > PERF_EVENT_STATE_OFF) {
2021		perf_cgroup_event_disable(event, ctx);
2022		perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2023	}
2024
2025	ctx->generation++;
2026 }
2027
2028 static int
2029 perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
2030 {
2031	if (!has_aux(aux_event))
2032		return 0;
2033
2034	if (!event->pmu->aux_output_match)
2035		return 0;
2036
2037	return event->pmu->aux_output_match(aux_event);
2038 }
2039
2040 static void put_event(struct perf_event *event);
2041 static void event_sched_out(struct perf_event *event,
2042			    struct perf_cpu_context *cpuctx,
2043			    struct perf_event_context *ctx);
2044
2045 static void perf_put_aux_event(struct perf_event *event)
2046 {
2047	struct perf_event_context *ctx = event->ctx;
2048	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2049	struct perf_event *iter;
2050
2051	/*
2052	 * If the event uses an aux_event, tear down the link.
2053	 */
2054	if (event->aux_event) {
2055		iter = event->aux_event;
2056		event->aux_event = NULL;
2057		put_event(iter);
2058		return;
2059	}
2060
2061	/*
2062	 * If the event is an aux_event, tear down all links to
2063	 * it from other events.
2064	 */
2065	for_each_sibling_event(iter, event->group_leader) {
2066		if (iter->aux_event != event)
2067			continue;
2068
2069		iter->aux_event = NULL;
2070		put_event(event);
2071
2072		/*
2073		 * If it's ACTIVE, schedule it out and put it into ERROR
2074		 * state so that we don't try to schedule it again. Note
2075		 * that perf_event_enable() will clear the ERROR status.
2076		 */
2077		event_sched_out(iter, cpuctx, ctx);
2078		perf_event_set_state(iter, PERF_EVENT_STATE_ERROR);
2079	}
2080 }
2081
2082 static bool perf_need_aux_event(struct perf_event *event)
2083 {
2084	return !!event->attr.aux_output || !!event->attr.aux_sample_size;
2085 }
2086
2087 static int perf_get_aux_event(struct perf_event *event,
2088			      struct perf_event *group_leader)
2089 {
2090	/*
2091	 * Our group leader must be an aux event if we want to be
2092	 * an aux_output. This way, the aux event will precede its
2093	 * aux_output events in the group, and therefore will always
2094	 * schedule first.
2095	 */
2096	if (!group_leader)
2097		return 0;
2098
2099	/*
2100	 * aux_output and aux_sample_size are mutually exclusive.
2101	 */
2102	if (event->attr.aux_output && event->attr.aux_sample_size)
2103		return 0;
2104
2105	if (event->attr.aux_output &&
2106	    !perf_aux_output_match(event, group_leader))
2107		return 0;
2108
2109	if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
2110		return 0;
2111
2112	if (!atomic_long_inc_not_zero(&group_leader->refcount))
2113		return 0;
2114
2115	/*
2116	 * Link aux_outputs to their aux event; this is undone in
2117	 * perf_group_detach() by perf_put_aux_event(). When the
2118	 * group is torn down, the aux_output events lose their
2119	 * link to the aux_event and can't schedule any more.
2120	 */
2121	event->aux_event = group_leader;
2122
2123	return 1;
2124 }
2125
2126 static inline struct list_head *get_event_list(struct perf_event *event)
2127 {
2128	struct perf_event_context *ctx = event->ctx;
2129	return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
2130 }
2131
2132 /*
2133  * Events that have PERF_EV_CAP_SIBLING require being part of a group and
2134  * cannot exist on their own; schedule them out and move them into the ERROR
2135  * state.
Also see _perf_event_enable(), it will not be able to recover 2136 * this ERROR state. 2137 */ 2138 static inline void perf_remove_sibling_event(struct perf_event *event) 2139 { 2140 struct perf_event_context *ctx = event->ctx; 2141 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 2142 2143 event_sched_out(event, cpuctx, ctx); 2144 perf_event_set_state(event, PERF_EVENT_STATE_ERROR); 2145 } 2146 2147 static void perf_group_detach(struct perf_event *event) 2148 { 2149 struct perf_event *leader = event->group_leader; 2150 struct perf_event *sibling, *tmp; 2151 struct perf_event_context *ctx = event->ctx; 2152 2153 lockdep_assert_held(&ctx->lock); 2154 2155 /* 2156 * We can have double detach due to exit/hot-unplug + close. 2157 */ 2158 if (!(event->attach_state & PERF_ATTACH_GROUP)) 2159 return; 2160 2161 event->attach_state &= ~PERF_ATTACH_GROUP; 2162 2163 perf_put_aux_event(event); 2164 2165 /* 2166 * If this is a sibling, remove it from its group. 2167 */ 2168 if (leader != event) { 2169 list_del_init(&event->sibling_list); 2170 event->group_leader->nr_siblings--; 2171 goto out; 2172 } 2173 2174 /* 2175 * If this was a group event with sibling events then 2176 * upgrade the siblings to singleton events by adding them 2177 * to whatever list we are on. 2178 */ 2179 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { 2180 2181 if (sibling->event_caps & PERF_EV_CAP_SIBLING) 2182 perf_remove_sibling_event(sibling); 2183 2184 sibling->group_leader = sibling; 2185 list_del_init(&sibling->sibling_list); 2186 2187 /* Inherit group flags from the previous leader */ 2188 sibling->group_caps = event->group_caps; 2189 2190 if (!RB_EMPTY_NODE(&event->group_node)) { 2191 add_event_to_groups(sibling, event->ctx); 2192 2193 if (sibling->state == PERF_EVENT_STATE_ACTIVE) 2194 list_add_tail(&sibling->active_list, get_event_list(sibling)); 2195 } 2196 2197 WARN_ON_ONCE(sibling->ctx != event->ctx); 2198 } 2199 2200 out: 2201 for_each_sibling_event(tmp, leader) 2202 perf_event__header_size(tmp); 2203 2204 perf_event__header_size(leader); 2205 } 2206 2207 static void sync_child_event(struct perf_event *child_event); 2208 2209 static void perf_child_detach(struct perf_event *event) 2210 { 2211 struct perf_event *parent_event = event->parent; 2212 2213 if (!(event->attach_state & PERF_ATTACH_CHILD)) 2214 return; 2215 2216 event->attach_state &= ~PERF_ATTACH_CHILD; 2217 2218 if (WARN_ON_ONCE(!parent_event)) 2219 return; 2220 2221 lockdep_assert_held(&parent_event->child_mutex); 2222 2223 sync_child_event(event); 2224 list_del_init(&event->child_list); 2225 } 2226 2227 static bool is_orphaned_event(struct perf_event *event) 2228 { 2229 return event->state == PERF_EVENT_STATE_DEAD; 2230 } 2231 2232 static inline int __pmu_filter_match(struct perf_event *event) 2233 { 2234 struct pmu *pmu = event->pmu; 2235 return pmu->filter_match ? pmu->filter_match(event) : 1; 2236 } 2237 2238 /* 2239 * Check whether we should attempt to schedule an event group based on 2240 * PMU-specific filtering. 
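 * A PMU can use pmu::filter_match to reject events it cannot currently
 * count; for example, heterogeneous (big.LITTLE) CPU PMUs only match on
 * the subset of CPUs they actually cover.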
An event group can consist of HW and SW events, 2241 * potentially with a SW leader, so we must check all the filters, to 2242 * determine whether a group is schedulable: 2243 */ 2244 static inline int pmu_filter_match(struct perf_event *event) 2245 { 2246 struct perf_event *sibling; 2247 2248 if (!__pmu_filter_match(event)) 2249 return 0; 2250 2251 for_each_sibling_event(sibling, event) { 2252 if (!__pmu_filter_match(sibling)) 2253 return 0; 2254 } 2255 2256 return 1; 2257 } 2258 2259 static inline int 2260 event_filter_match(struct perf_event *event) 2261 { 2262 return (event->cpu == -1 || event->cpu == smp_processor_id()) && 2263 perf_cgroup_match(event) && pmu_filter_match(event); 2264 } 2265 2266 static void 2267 event_sched_out(struct perf_event *event, 2268 struct perf_cpu_context *cpuctx, 2269 struct perf_event_context *ctx) 2270 { 2271 enum perf_event_state state = PERF_EVENT_STATE_INACTIVE; 2272 2273 WARN_ON_ONCE(event->ctx != ctx); 2274 lockdep_assert_held(&ctx->lock); 2275 2276 if (event->state != PERF_EVENT_STATE_ACTIVE) 2277 return; 2278 2279 /* 2280 * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but 2281 * we can schedule events _OUT_ individually through things like 2282 * __perf_remove_from_context(). 2283 */ 2284 list_del_init(&event->active_list); 2285 2286 perf_pmu_disable(event->pmu); 2287 2288 event->pmu->del(event, 0); 2289 event->oncpu = -1; 2290 2291 if (READ_ONCE(event->pending_disable) >= 0) { 2292 WRITE_ONCE(event->pending_disable, -1); 2293 perf_cgroup_event_disable(event, ctx); 2294 state = PERF_EVENT_STATE_OFF; 2295 } 2296 perf_event_set_state(event, state); 2297 2298 if (!is_software_event(event)) 2299 cpuctx->active_oncpu--; 2300 if (!--ctx->nr_active) 2301 perf_event_ctx_deactivate(ctx); 2302 if (event->attr.freq && event->attr.sample_freq) 2303 ctx->nr_freq--; 2304 if (event->attr.exclusive || !cpuctx->active_oncpu) 2305 cpuctx->exclusive = 0; 2306 2307 perf_pmu_enable(event->pmu); 2308 } 2309 2310 static void 2311 group_sched_out(struct perf_event *group_event, 2312 struct perf_cpu_context *cpuctx, 2313 struct perf_event_context *ctx) 2314 { 2315 struct perf_event *event; 2316 2317 if (group_event->state != PERF_EVENT_STATE_ACTIVE) 2318 return; 2319 2320 perf_pmu_disable(ctx->pmu); 2321 2322 event_sched_out(group_event, cpuctx, ctx); 2323 2324 /* 2325 * Schedule out siblings (if any): 2326 */ 2327 for_each_sibling_event(event, group_event) 2328 event_sched_out(event, cpuctx, ctx); 2329 2330 perf_pmu_enable(ctx->pmu); 2331 } 2332 2333 #define DETACH_GROUP 0x01UL 2334 #define DETACH_CHILD 0x02UL 2335 2336 /* 2337 * Cross CPU call to remove a performance event 2338 * 2339 * We disable the event on the hardware level first. After that we 2340 * remove it from the context list. 
2341 */ 2342 static void 2343 __perf_remove_from_context(struct perf_event *event, 2344 struct perf_cpu_context *cpuctx, 2345 struct perf_event_context *ctx, 2346 void *info) 2347 { 2348 unsigned long flags = (unsigned long)info; 2349 2350 if (ctx->is_active & EVENT_TIME) { 2351 update_context_time(ctx); 2352 update_cgrp_time_from_cpuctx(cpuctx); 2353 } 2354 2355 event_sched_out(event, cpuctx, ctx); 2356 if (flags & DETACH_GROUP) 2357 perf_group_detach(event); 2358 if (flags & DETACH_CHILD) 2359 perf_child_detach(event); 2360 list_del_event(event, ctx); 2361 2362 if (!ctx->nr_events && ctx->is_active) { 2363 ctx->is_active = 0; 2364 ctx->rotate_necessary = 0; 2365 if (ctx->task) { 2366 WARN_ON_ONCE(cpuctx->task_ctx != ctx); 2367 cpuctx->task_ctx = NULL; 2368 } 2369 } 2370 } 2371 2372 /* 2373 * Remove the event from a task's (or a CPU's) list of events. 2374 * 2375 * If event->ctx is a cloned context, callers must make sure that 2376 * every task struct that event->ctx->task could possibly point to 2377 * remains valid. This is OK when called from perf_release since 2378 * that only calls us on the top-level context, which can't be a clone. 2379 * When called from perf_event_exit_task, it's OK because the 2380 * context has been detached from its task. 2381 */ 2382 static void perf_remove_from_context(struct perf_event *event, unsigned long flags) 2383 { 2384 struct perf_event_context *ctx = event->ctx; 2385 2386 lockdep_assert_held(&ctx->mutex); 2387 2388 /* 2389 * Because of perf_event_exit_task(), perf_remove_from_context() ought 2390 * to work in the face of TASK_TOMBSTONE, unlike every other 2391 * event_function_call() user. 2392 */ 2393 raw_spin_lock_irq(&ctx->lock); 2394 if (!ctx->is_active) { 2395 __perf_remove_from_context(event, __get_cpu_context(ctx), 2396 ctx, (void *)flags); 2397 raw_spin_unlock_irq(&ctx->lock); 2398 return; 2399 } 2400 raw_spin_unlock_irq(&ctx->lock); 2401 2402 event_function_call(event, __perf_remove_from_context, (void *)flags); 2403 } 2404 2405 /* 2406 * Cross CPU call to disable a performance event 2407 */ 2408 static void __perf_event_disable(struct perf_event *event, 2409 struct perf_cpu_context *cpuctx, 2410 struct perf_event_context *ctx, 2411 void *info) 2412 { 2413 if (event->state < PERF_EVENT_STATE_INACTIVE) 2414 return; 2415 2416 if (ctx->is_active & EVENT_TIME) { 2417 update_context_time(ctx); 2418 update_cgrp_time_from_event(event); 2419 } 2420 2421 if (event == event->group_leader) 2422 group_sched_out(event, cpuctx, ctx); 2423 else 2424 event_sched_out(event, cpuctx, ctx); 2425 2426 perf_event_set_state(event, PERF_EVENT_STATE_OFF); 2427 perf_cgroup_event_disable(event, ctx); 2428 } 2429 2430 /* 2431 * Disable an event. 2432 * 2433 * If event->ctx is a cloned context, callers must make sure that 2434 * every task struct that event->ctx->task could possibly point to 2435 * remains valid. This condition is satisfied when called through 2436 * perf_event_for_each_child or perf_event_for_each because they 2437 * hold the top-level event's child_mutex, so any descendant that 2438 * goes to exit will block in perf_event_exit_event(). 2439 * 2440 * When called from perf_pending_event it's OK because event->ctx 2441 * is the current context on this CPU and preemption is disabled, 2442 * hence we can't get into perf_event_task_sched_out for this context. 
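 * (Userspace typically reaches this path via the PERF_EVENT_IOC_DISABLE
 * ioctl on the event fd or via prctl(PR_TASK_PERF_EVENTS_DISABLE), both
 * of which end up calling _perf_event_disable() below.)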
2443 */ 2444 static void _perf_event_disable(struct perf_event *event) 2445 { 2446 struct perf_event_context *ctx = event->ctx; 2447 2448 raw_spin_lock_irq(&ctx->lock); 2449 if (event->state <= PERF_EVENT_STATE_OFF) { 2450 raw_spin_unlock_irq(&ctx->lock); 2451 return; 2452 } 2453 raw_spin_unlock_irq(&ctx->lock); 2454 2455 event_function_call(event, __perf_event_disable, NULL); 2456 } 2457 2458 void perf_event_disable_local(struct perf_event *event) 2459 { 2460 event_function_local(event, __perf_event_disable, NULL); 2461 } 2462 2463 /* 2464 * Strictly speaking kernel users cannot create groups and therefore this 2465 * interface does not need the perf_event_ctx_lock() magic. 2466 */ 2467 void perf_event_disable(struct perf_event *event) 2468 { 2469 struct perf_event_context *ctx; 2470 2471 ctx = perf_event_ctx_lock(event); 2472 _perf_event_disable(event); 2473 perf_event_ctx_unlock(event, ctx); 2474 } 2475 EXPORT_SYMBOL_GPL(perf_event_disable); 2476 2477 void perf_event_disable_inatomic(struct perf_event *event) 2478 { 2479 WRITE_ONCE(event->pending_disable, smp_processor_id()); 2480 /* can fail, see perf_pending_event_disable() */ 2481 irq_work_queue(&event->pending); 2482 } 2483 2484 static void perf_set_shadow_time(struct perf_event *event, 2485 struct perf_event_context *ctx) 2486 { 2487 /* 2488 * use the correct time source for the time snapshot 2489 * 2490 * We could get by without this by leveraging the 2491 * fact that to get to this function, the caller 2492 * has most likely already called update_context_time() 2493 * and update_cgrp_time_xx() and thus both timestamp 2494 * are identical (or very close). Given that tstamp is, 2495 * already adjusted for cgroup, we could say that: 2496 * tstamp - ctx->timestamp 2497 * is equivalent to 2498 * tstamp - cgrp->timestamp. 2499 * 2500 * Then, in perf_output_read(), the calculation would 2501 * work with no changes because: 2502 * - event is guaranteed scheduled in 2503 * - no scheduled out in between 2504 * - thus the timestamp would be the same 2505 * 2506 * But this is a bit hairy. 2507 * 2508 * So instead, we have an explicit cgroup call to remain 2509 * within the time source all along. We believe it 2510 * is cleaner and simpler to understand. 2511 */ 2512 if (is_cgroup_event(event)) 2513 perf_cgroup_set_shadow_time(event, event->tstamp); 2514 else 2515 event->shadow_ctx_time = event->tstamp - ctx->timestamp; 2516 } 2517 2518 #define MAX_INTERRUPTS (~0ULL) 2519 2520 static void perf_log_throttle(struct perf_event *event, int enable); 2521 static void perf_log_itrace_start(struct perf_event *event); 2522 2523 static int 2524 event_sched_in(struct perf_event *event, 2525 struct perf_cpu_context *cpuctx, 2526 struct perf_event_context *ctx) 2527 { 2528 int ret = 0; 2529 2530 WARN_ON_ONCE(event->ctx != ctx); 2531 2532 lockdep_assert_held(&ctx->lock); 2533 2534 if (event->state <= PERF_EVENT_STATE_OFF) 2535 return 0; 2536 2537 WRITE_ONCE(event->oncpu, smp_processor_id()); 2538 /* 2539 * Order event::oncpu write to happen before the ACTIVE state is 2540 * visible. This allows perf_event_{stop,read}() to observe the correct 2541 * ->oncpu if it sees ACTIVE. 2542 */ 2543 smp_wmb(); 2544 perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE); 2545 2546 /* 2547 * Unthrottle events, since we scheduled we might have missed several 2548 * ticks already, also for a heavily scheduling task there is little 2549 * guarantee it'll get a tick in a timely manner. 
2550 */ 2551 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { 2552 perf_log_throttle(event, 1); 2553 event->hw.interrupts = 0; 2554 } 2555 2556 perf_pmu_disable(event->pmu); 2557 2558 perf_set_shadow_time(event, ctx); 2559 2560 perf_log_itrace_start(event); 2561 2562 if (event->pmu->add(event, PERF_EF_START)) { 2563 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); 2564 event->oncpu = -1; 2565 ret = -EAGAIN; 2566 goto out; 2567 } 2568 2569 if (!is_software_event(event)) 2570 cpuctx->active_oncpu++; 2571 if (!ctx->nr_active++) 2572 perf_event_ctx_activate(ctx); 2573 if (event->attr.freq && event->attr.sample_freq) 2574 ctx->nr_freq++; 2575 2576 if (event->attr.exclusive) 2577 cpuctx->exclusive = 1; 2578 2579 out: 2580 perf_pmu_enable(event->pmu); 2581 2582 return ret; 2583 } 2584 2585 static int 2586 group_sched_in(struct perf_event *group_event, 2587 struct perf_cpu_context *cpuctx, 2588 struct perf_event_context *ctx) 2589 { 2590 struct perf_event *event, *partial_group = NULL; 2591 struct pmu *pmu = ctx->pmu; 2592 2593 if (group_event->state == PERF_EVENT_STATE_OFF) 2594 return 0; 2595 2596 pmu->start_txn(pmu, PERF_PMU_TXN_ADD); 2597 2598 if (event_sched_in(group_event, cpuctx, ctx)) 2599 goto error; 2600 2601 /* 2602 * Schedule in siblings as one group (if any): 2603 */ 2604 for_each_sibling_event(event, group_event) { 2605 if (event_sched_in(event, cpuctx, ctx)) { 2606 partial_group = event; 2607 goto group_error; 2608 } 2609 } 2610 2611 if (!pmu->commit_txn(pmu)) 2612 return 0; 2613 2614 group_error: 2615 /* 2616 * Groups can be scheduled in as one unit only, so undo any 2617 * partial group before returning: 2618 * The events up to the failed event are scheduled out normally. 2619 */ 2620 for_each_sibling_event(event, group_event) { 2621 if (event == partial_group) 2622 break; 2623 2624 event_sched_out(event, cpuctx, ctx); 2625 } 2626 event_sched_out(group_event, cpuctx, ctx); 2627 2628 error: 2629 pmu->cancel_txn(pmu); 2630 return -EAGAIN; 2631 } 2632 2633 /* 2634 * Work out whether we can put this event group on the CPU now. 2635 */ 2636 static int group_can_go_on(struct perf_event *event, 2637 struct perf_cpu_context *cpuctx, 2638 int can_add_hw) 2639 { 2640 /* 2641 * Groups consisting entirely of software events can always go on. 2642 */ 2643 if (event->group_caps & PERF_EV_CAP_SOFTWARE) 2644 return 1; 2645 /* 2646 * If an exclusive group is already on, no other hardware 2647 * events can go on. 2648 */ 2649 if (cpuctx->exclusive) 2650 return 0; 2651 /* 2652 * If this group is exclusive and there are already 2653 * events on the CPU, it can't go on. 2654 */ 2655 if (event->attr.exclusive && !list_empty(get_event_list(event))) 2656 return 0; 2657 /* 2658 * Otherwise, try to add it if all previous groups were able 2659 * to go on. 
2660 */ 2661 return can_add_hw; 2662 } 2663 2664 static void add_event_to_ctx(struct perf_event *event, 2665 struct perf_event_context *ctx) 2666 { 2667 list_add_event(event, ctx); 2668 perf_group_attach(event); 2669 } 2670 2671 static void ctx_sched_out(struct perf_event_context *ctx, 2672 struct perf_cpu_context *cpuctx, 2673 enum event_type_t event_type); 2674 static void 2675 ctx_sched_in(struct perf_event_context *ctx, 2676 struct perf_cpu_context *cpuctx, 2677 enum event_type_t event_type, 2678 struct task_struct *task); 2679 2680 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, 2681 struct perf_event_context *ctx, 2682 enum event_type_t event_type) 2683 { 2684 if (!cpuctx->task_ctx) 2685 return; 2686 2687 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) 2688 return; 2689 2690 ctx_sched_out(ctx, cpuctx, event_type); 2691 } 2692 2693 static void perf_event_sched_in(struct perf_cpu_context *cpuctx, 2694 struct perf_event_context *ctx, 2695 struct task_struct *task) 2696 { 2697 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task); 2698 if (ctx) 2699 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); 2700 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); 2701 if (ctx) 2702 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); 2703 } 2704 2705 /* 2706 * We want to maintain the following priority of scheduling: 2707 * - CPU pinned (EVENT_CPU | EVENT_PINNED) 2708 * - task pinned (EVENT_PINNED) 2709 * - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE) 2710 * - task flexible (EVENT_FLEXIBLE). 2711 * 2712 * In order to avoid unscheduling and scheduling back in everything every 2713 * time an event is added, only do it for the groups of equal priority and 2714 * below. 2715 * 2716 * This can be called after a batch operation on task events, in which case 2717 * event_type is a bit mask of the types of events involved. For CPU events, 2718 * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE. 2719 */ 2720 static void ctx_resched(struct perf_cpu_context *cpuctx, 2721 struct perf_event_context *task_ctx, 2722 enum event_type_t event_type) 2723 { 2724 enum event_type_t ctx_event_type; 2725 bool cpu_event = !!(event_type & EVENT_CPU); 2726 2727 /* 2728 * If pinned groups are involved, flexible groups also need to be 2729 * scheduled out. 2730 */ 2731 if (event_type & EVENT_PINNED) 2732 event_type |= EVENT_FLEXIBLE; 2733 2734 ctx_event_type = event_type & EVENT_ALL; 2735 2736 perf_pmu_disable(cpuctx->ctx.pmu); 2737 if (task_ctx) 2738 task_ctx_sched_out(cpuctx, task_ctx, event_type); 2739 2740 /* 2741 * Decide which cpu ctx groups to schedule out based on the types 2742 * of events that caused rescheduling: 2743 * - EVENT_CPU: schedule out corresponding groups; 2744 * - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups; 2745 * - otherwise, do nothing more. 
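 * For example, installing a task pinned event (EVENT_PINNED without
 * EVENT_CPU) schedules out the task's pinned and flexible groups as well
 * as the CPU flexible groups, then schedules everything back in; the CPU
 * pinned groups are never touched.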
2746 */ 2747 if (cpu_event) 2748 cpu_ctx_sched_out(cpuctx, ctx_event_type); 2749 else if (ctx_event_type & EVENT_PINNED) 2750 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2751 2752 perf_event_sched_in(cpuctx, task_ctx, current); 2753 perf_pmu_enable(cpuctx->ctx.pmu); 2754 } 2755 2756 void perf_pmu_resched(struct pmu *pmu) 2757 { 2758 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 2759 struct perf_event_context *task_ctx = cpuctx->task_ctx; 2760 2761 perf_ctx_lock(cpuctx, task_ctx); 2762 ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU); 2763 perf_ctx_unlock(cpuctx, task_ctx); 2764 } 2765 2766 /* 2767 * Cross CPU call to install and enable a performance event 2768 * 2769 * Very similar to remote_function() + event_function() but cannot assume that 2770 * things like ctx->is_active and cpuctx->task_ctx are set. 2771 */ 2772 static int __perf_install_in_context(void *info) 2773 { 2774 struct perf_event *event = info; 2775 struct perf_event_context *ctx = event->ctx; 2776 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 2777 struct perf_event_context *task_ctx = cpuctx->task_ctx; 2778 bool reprogram = true; 2779 int ret = 0; 2780 2781 raw_spin_lock(&cpuctx->ctx.lock); 2782 if (ctx->task) { 2783 raw_spin_lock(&ctx->lock); 2784 task_ctx = ctx; 2785 2786 reprogram = (ctx->task == current); 2787 2788 /* 2789 * If the task is running, it must be running on this CPU, 2790 * otherwise we cannot reprogram things. 2791 * 2792 * If its not running, we don't care, ctx->lock will 2793 * serialize against it becoming runnable. 2794 */ 2795 if (task_curr(ctx->task) && !reprogram) { 2796 ret = -ESRCH; 2797 goto unlock; 2798 } 2799 2800 WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx); 2801 } else if (task_ctx) { 2802 raw_spin_lock(&task_ctx->lock); 2803 } 2804 2805 #ifdef CONFIG_CGROUP_PERF 2806 if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) { 2807 /* 2808 * If the current cgroup doesn't match the event's 2809 * cgroup, we should not try to schedule it. 2810 */ 2811 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); 2812 reprogram = cgroup_is_descendant(cgrp->css.cgroup, 2813 event->cgrp->css.cgroup); 2814 } 2815 #endif 2816 2817 if (reprogram) { 2818 ctx_sched_out(ctx, cpuctx, EVENT_TIME); 2819 add_event_to_ctx(event, ctx); 2820 ctx_resched(cpuctx, task_ctx, get_event_type(event)); 2821 } else { 2822 add_event_to_ctx(event, ctx); 2823 } 2824 2825 unlock: 2826 perf_ctx_unlock(cpuctx, task_ctx); 2827 2828 return ret; 2829 } 2830 2831 static bool exclusive_event_installable(struct perf_event *event, 2832 struct perf_event_context *ctx); 2833 2834 /* 2835 * Attach a performance event to a context. 2836 * 2837 * Very similar to event_function_call, see comment there. 2838 */ 2839 static void 2840 perf_install_in_context(struct perf_event_context *ctx, 2841 struct perf_event *event, 2842 int cpu) 2843 { 2844 struct task_struct *task = READ_ONCE(ctx->task); 2845 2846 lockdep_assert_held(&ctx->mutex); 2847 2848 WARN_ON_ONCE(!exclusive_event_installable(event, ctx)); 2849 2850 if (event->cpu != -1) 2851 event->cpu = cpu; 2852 2853 /* 2854 * Ensures that if we can observe event->ctx, both the event and ctx 2855 * will be 'complete'. See perf_iterate_sb_cpu(). 2856 */ 2857 smp_store_release(&event->ctx, ctx); 2858 2859 /* 2860 * perf_event_attr::disabled events will not run and can be initialized 2861 * without IPI. 
Except when this is the first event for the context, in
2862	 * which case we need the magic of the IPI to set ctx->is_active.
2863	 *
2864	 * The IOC_ENABLE that is sure to follow the creation of a disabled
2865	 * event will issue the IPI and reprogram the hardware.
2866	 */
2867	if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
2868		raw_spin_lock_irq(&ctx->lock);
2869		if (ctx->task == TASK_TOMBSTONE) {
2870			raw_spin_unlock_irq(&ctx->lock);
2871			return;
2872		}
2873		add_event_to_ctx(event, ctx);
2874		raw_spin_unlock_irq(&ctx->lock);
2875		return;
2876	}
2877
2878	if (!task) {
2879		cpu_function_call(cpu, __perf_install_in_context, event);
2880		return;
2881	}
2882
2883	/*
2884	 * Should not happen; we validate that the ctx is still alive before calling.
2885	 */
2886	if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2887		return;
2888
2889	/*
2890	 * Installing events is tricky because we cannot rely on ctx->is_active
2891	 * to be set in case this is the nr_events 0 -> 1 transition.
2892	 *
2893	 * Instead we use task_curr(), which tells us if the task is running.
2894	 * However, since we use task_curr() outside of rq::lock, we can race
2895	 * against the actual state. This means the result can be wrong.
2896	 *
2897	 * If we get a false positive, we retry; this is harmless.
2898	 *
2899	 * If we get a false negative, things are complicated. If we are after
2900	 * perf_event_context_sched_in() ctx::lock will serialize us, and the
2901	 * value must be correct. If we're before, it doesn't matter since
2902	 * perf_event_context_sched_in() will program the counter.
2903	 *
2904	 * However, this hinges on the remote context switch having observed
2905	 * our task->perf_event_ctxp[] store, such that it will in fact take
2906	 * ctx::lock in perf_event_context_sched_in().
2907	 *
2908	 * We do this by task_function_call(); if the IPI fails to hit the task,
2909	 * we know any future context switch of the task must see the
2910	 * perf_event_ctxp[] store.
2911	 */
2912
2913	/*
2914	 * This smp_mb() orders the task->perf_event_ctxp[] store with the
2915	 * task_cpu() load, such that if the IPI then does not find the task
2916	 * running, a future context switch of that task must observe the
2917	 * store.
2918	 */
2919	smp_mb();
2920 again:
2921	if (!task_function_call(task, __perf_install_in_context, event))
2922		return;
2923
2924	raw_spin_lock_irq(&ctx->lock);
2925	task = ctx->task;
2926	if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
2927		/*
2928		 * Cannot happen because we already checked above (which also
2929		 * cannot happen), and we hold ctx->mutex, which serializes us
2930		 * against perf_event_exit_task_context().
2931		 */
2932		raw_spin_unlock_irq(&ctx->lock);
2933		return;
2934	}
2935	/*
2936	 * If the task is not running, ctx->lock will avoid it becoming so,
2937	 * thus we can safely install the event.
2938 */ 2939 if (task_curr(task)) { 2940 raw_spin_unlock_irq(&ctx->lock); 2941 goto again; 2942 } 2943 add_event_to_ctx(event, ctx); 2944 raw_spin_unlock_irq(&ctx->lock); 2945 } 2946 2947 /* 2948 * Cross CPU call to enable a performance event 2949 */ 2950 static void __perf_event_enable(struct perf_event *event, 2951 struct perf_cpu_context *cpuctx, 2952 struct perf_event_context *ctx, 2953 void *info) 2954 { 2955 struct perf_event *leader = event->group_leader; 2956 struct perf_event_context *task_ctx; 2957 2958 if (event->state >= PERF_EVENT_STATE_INACTIVE || 2959 event->state <= PERF_EVENT_STATE_ERROR) 2960 return; 2961 2962 if (ctx->is_active) 2963 ctx_sched_out(ctx, cpuctx, EVENT_TIME); 2964 2965 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); 2966 perf_cgroup_event_enable(event, ctx); 2967 2968 if (!ctx->is_active) 2969 return; 2970 2971 if (!event_filter_match(event)) { 2972 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); 2973 return; 2974 } 2975 2976 /* 2977 * If the event is in a group and isn't the group leader, 2978 * then don't put it on unless the group is on. 2979 */ 2980 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { 2981 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); 2982 return; 2983 } 2984 2985 task_ctx = cpuctx->task_ctx; 2986 if (ctx->task) 2987 WARN_ON_ONCE(task_ctx != ctx); 2988 2989 ctx_resched(cpuctx, task_ctx, get_event_type(event)); 2990 } 2991 2992 /* 2993 * Enable an event. 2994 * 2995 * If event->ctx is a cloned context, callers must make sure that 2996 * every task struct that event->ctx->task could possibly point to 2997 * remains valid. This condition is satisfied when called through 2998 * perf_event_for_each_child or perf_event_for_each as described 2999 * for perf_event_disable. 3000 */ 3001 static void _perf_event_enable(struct perf_event *event) 3002 { 3003 struct perf_event_context *ctx = event->ctx; 3004 3005 raw_spin_lock_irq(&ctx->lock); 3006 if (event->state >= PERF_EVENT_STATE_INACTIVE || 3007 event->state < PERF_EVENT_STATE_ERROR) { 3008 out: 3009 raw_spin_unlock_irq(&ctx->lock); 3010 return; 3011 } 3012 3013 /* 3014 * If the event is in error state, clear that first. 3015 * 3016 * That way, if we see the event in error state below, we know that it 3017 * has gone back into error state, as distinct from the task having 3018 * been scheduled away before the cross-call arrived. 3019 */ 3020 if (event->state == PERF_EVENT_STATE_ERROR) { 3021 /* 3022 * Detached SIBLING events cannot leave ERROR state. 
3023 */ 3024 if (event->event_caps & PERF_EV_CAP_SIBLING && 3025 event->group_leader == event) 3026 goto out; 3027 3028 event->state = PERF_EVENT_STATE_OFF; 3029 } 3030 raw_spin_unlock_irq(&ctx->lock); 3031 3032 event_function_call(event, __perf_event_enable, NULL); 3033 } 3034 3035 /* 3036 * See perf_event_disable(); 3037 */ 3038 void perf_event_enable(struct perf_event *event) 3039 { 3040 struct perf_event_context *ctx; 3041 3042 ctx = perf_event_ctx_lock(event); 3043 _perf_event_enable(event); 3044 perf_event_ctx_unlock(event, ctx); 3045 } 3046 EXPORT_SYMBOL_GPL(perf_event_enable); 3047 3048 struct stop_event_data { 3049 struct perf_event *event; 3050 unsigned int restart; 3051 }; 3052 3053 static int __perf_event_stop(void *info) 3054 { 3055 struct stop_event_data *sd = info; 3056 struct perf_event *event = sd->event; 3057 3058 /* if it's already INACTIVE, do nothing */ 3059 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) 3060 return 0; 3061 3062 /* matches smp_wmb() in event_sched_in() */ 3063 smp_rmb(); 3064 3065 /* 3066 * There is a window with interrupts enabled before we get here, 3067 * so we need to check again lest we try to stop another CPU's event. 3068 */ 3069 if (READ_ONCE(event->oncpu) != smp_processor_id()) 3070 return -EAGAIN; 3071 3072 event->pmu->stop(event, PERF_EF_UPDATE); 3073 3074 /* 3075 * May race with the actual stop (through perf_pmu_output_stop()), 3076 * but it is only used for events with AUX ring buffer, and such 3077 * events will refuse to restart because of rb::aux_mmap_count==0, 3078 * see comments in perf_aux_output_begin(). 3079 * 3080 * Since this is happening on an event-local CPU, no trace is lost 3081 * while restarting. 3082 */ 3083 if (sd->restart) 3084 event->pmu->start(event, 0); 3085 3086 return 0; 3087 } 3088 3089 static int perf_event_stop(struct perf_event *event, int restart) 3090 { 3091 struct stop_event_data sd = { 3092 .event = event, 3093 .restart = restart, 3094 }; 3095 int ret = 0; 3096 3097 do { 3098 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) 3099 return 0; 3100 3101 /* matches smp_wmb() in event_sched_in() */ 3102 smp_rmb(); 3103 3104 /* 3105 * We only want to restart ACTIVE events, so if the event goes 3106 * inactive here (event->oncpu==-1), there's nothing more to do; 3107 * fall through with ret==-ENXIO. 3108 */ 3109 ret = cpu_function_call(READ_ONCE(event->oncpu), 3110 __perf_event_stop, &sd); 3111 } while (ret == -EAGAIN); 3112 3113 return ret; 3114 } 3115 3116 /* 3117 * In order to contain the amount of racy and tricky in the address filter 3118 * configuration management, it is a two part process: 3119 * 3120 * (p1) when userspace mappings change as a result of (1) or (2) or (3) below, 3121 * we update the addresses of corresponding vmas in 3122 * event::addr_filter_ranges array and bump the event::addr_filters_gen; 3123 * (p2) when an event is scheduled in (pmu::add), it calls 3124 * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync() 3125 * if the generation has changed since the previous call. 3126 * 3127 * If (p1) happens while the event is active, we restart it to force (p2). 
3128 * 3129 * (1) perf_addr_filters_apply(): adjusting filters' offsets based on 3130 * pre-existing mappings, called once when new filters arrive via SET_FILTER 3131 * ioctl; 3132 * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly 3133 * registered mapping, called for every new mmap(), with mm::mmap_lock down 3134 * for reading; 3135 * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process 3136 * of exec. 3137 */ 3138 void perf_event_addr_filters_sync(struct perf_event *event) 3139 { 3140 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); 3141 3142 if (!has_addr_filter(event)) 3143 return; 3144 3145 raw_spin_lock(&ifh->lock); 3146 if (event->addr_filters_gen != event->hw.addr_filters_gen) { 3147 event->pmu->addr_filters_sync(event); 3148 event->hw.addr_filters_gen = event->addr_filters_gen; 3149 } 3150 raw_spin_unlock(&ifh->lock); 3151 } 3152 EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync); 3153 3154 static int _perf_event_refresh(struct perf_event *event, int refresh) 3155 { 3156 /* 3157 * not supported on inherited events 3158 */ 3159 if (event->attr.inherit || !is_sampling_event(event)) 3160 return -EINVAL; 3161 3162 atomic_add(refresh, &event->event_limit); 3163 _perf_event_enable(event); 3164 3165 return 0; 3166 } 3167 3168 /* 3169 * See perf_event_disable() 3170 */ 3171 int perf_event_refresh(struct perf_event *event, int refresh) 3172 { 3173 struct perf_event_context *ctx; 3174 int ret; 3175 3176 ctx = perf_event_ctx_lock(event); 3177 ret = _perf_event_refresh(event, refresh); 3178 perf_event_ctx_unlock(event, ctx); 3179 3180 return ret; 3181 } 3182 EXPORT_SYMBOL_GPL(perf_event_refresh); 3183 3184 static int perf_event_modify_breakpoint(struct perf_event *bp, 3185 struct perf_event_attr *attr) 3186 { 3187 int err; 3188 3189 _perf_event_disable(bp); 3190 3191 err = modify_user_hw_breakpoint_check(bp, attr, true); 3192 3193 if (!bp->attr.disabled) 3194 _perf_event_enable(bp); 3195 3196 return err; 3197 } 3198 3199 static int perf_event_modify_attr(struct perf_event *event, 3200 struct perf_event_attr *attr) 3201 { 3202 if (event->attr.type != attr->type) 3203 return -EINVAL; 3204 3205 switch (event->attr.type) { 3206 case PERF_TYPE_BREAKPOINT: 3207 return perf_event_modify_breakpoint(event, attr); 3208 default: 3209 /* Place holder for future additions. */ 3210 return -EOPNOTSUPP; 3211 } 3212 } 3213 3214 static void ctx_sched_out(struct perf_event_context *ctx, 3215 struct perf_cpu_context *cpuctx, 3216 enum event_type_t event_type) 3217 { 3218 struct perf_event *event, *tmp; 3219 int is_active = ctx->is_active; 3220 3221 lockdep_assert_held(&ctx->lock); 3222 3223 if (likely(!ctx->nr_events)) { 3224 /* 3225 * See __perf_remove_from_context(). 3226 */ 3227 WARN_ON_ONCE(ctx->is_active); 3228 if (ctx->task) 3229 WARN_ON_ONCE(cpuctx->task_ctx); 3230 return; 3231 } 3232 3233 ctx->is_active &= ~event_type; 3234 if (!(ctx->is_active & EVENT_ALL)) 3235 ctx->is_active = 0; 3236 3237 if (ctx->task) { 3238 WARN_ON_ONCE(cpuctx->task_ctx != ctx); 3239 if (!ctx->is_active) 3240 cpuctx->task_ctx = NULL; 3241 } 3242 3243 /* 3244 * Always update time if it was set; not only when it changes. 3245 * Otherwise we can 'forget' to update time for any but the last 3246 * context we sched out. For example: 3247 * 3248 * ctx_sched_out(.event_type = EVENT_FLEXIBLE) 3249 * ctx_sched_out(.event_type = EVENT_PINNED) 3250 * 3251 * would only update time for the pinned events. 
3252	 */
3253	if (is_active & EVENT_TIME) {
3254		/* update (and stop) ctx time */
3255		update_context_time(ctx);
3256		update_cgrp_time_from_cpuctx(cpuctx);
3257	}
3258
3259	is_active ^= ctx->is_active; /* changed bits */
3260
3261	if (!ctx->nr_active || !(is_active & EVENT_ALL))
3262		return;
3263
3264	perf_pmu_disable(ctx->pmu);
3265	if (is_active & EVENT_PINNED) {
3266		list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
3267			group_sched_out(event, cpuctx, ctx);
3268	}
3269
3270	if (is_active & EVENT_FLEXIBLE) {
3271		list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
3272			group_sched_out(event, cpuctx, ctx);
3273
3274		/*
3275		 * Since we cleared EVENT_FLEXIBLE, also clear
3276		 * rotate_necessary; it will be reset by
3277		 * ctx_flexible_sched_in() when needed.
3278		 */
3279		ctx->rotate_necessary = 0;
3280	}
3281	perf_pmu_enable(ctx->pmu);
3282 }
3283
3284 /*
3285  * Test whether two contexts are equivalent, i.e. whether they have both been
3286  * cloned from the same version of the same context.
3287  *
3288  * Equivalence is measured using a generation number in the context that is
3289  * incremented on each modification to it; see unclone_ctx(), list_add_event()
3290  * and list_del_event().
3291  */
3292 static int context_equiv(struct perf_event_context *ctx1,
3293			 struct perf_event_context *ctx2)
3294 {
3295	lockdep_assert_held(&ctx1->lock);
3296	lockdep_assert_held(&ctx2->lock);
3297
3298	/* Pinning disables the swap optimization */
3299	if (ctx1->pin_count || ctx2->pin_count)
3300		return 0;
3301
3302	/* If ctx1 is the parent of ctx2 */
3303	if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
3304		return 1;
3305
3306	/* If ctx2 is the parent of ctx1 */
3307	if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
3308		return 1;
3309
3310	/*
3311	 * If ctx1 and ctx2 have the same parent: we flatten the parent
3312	 * hierarchy, see perf_event_init_context().
3313	 */
3314	if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
3315	    ctx1->parent_gen == ctx2->parent_gen)
3316		return 1;
3317
3318	/* Unmatched */
3319	return 0;
3320 }
3321
3322 static void __perf_event_sync_stat(struct perf_event *event,
3323				   struct perf_event *next_event)
3324 {
3325	u64 value;
3326
3327	if (!event->attr.inherit_stat)
3328		return;
3329
3330	/*
3331	 * Update the event value; we cannot use perf_event_read()
3332	 * because we're in the middle of a context switch and have IRQs
3333	 * disabled, which upsets smp_call_function_single(). However,
3334	 * we know the event must be on the current CPU, therefore we
3335	 * don't need to use it.
3336	 */
3337	if (event->state == PERF_EVENT_STATE_ACTIVE)
3338		event->pmu->read(event);
3339
3340	perf_event_update_time(event);
3341
3342	/*
3343	 * In order to keep per-task stats reliable we need to flip the event
3344	 * values when we flip the contexts.
3345	 */
3346	value = local64_read(&next_event->count);
3347	value = local64_xchg(&event->count, value);
3348	local64_set(&next_event->count, value);
3349
3350	swap(event->total_time_enabled, next_event->total_time_enabled);
3351	swap(event->total_time_running, next_event->total_time_running);
3352
3353	/*
3354	 * Since we swizzled the values, update the user visible data too.
3355 */ 3356 perf_event_update_userpage(event); 3357 perf_event_update_userpage(next_event); 3358 } 3359 3360 static void perf_event_sync_stat(struct perf_event_context *ctx, 3361 struct perf_event_context *next_ctx) 3362 { 3363 struct perf_event *event, *next_event; 3364 3365 if (!ctx->nr_stat) 3366 return; 3367 3368 update_context_time(ctx); 3369 3370 event = list_first_entry(&ctx->event_list, 3371 struct perf_event, event_entry); 3372 3373 next_event = list_first_entry(&next_ctx->event_list, 3374 struct perf_event, event_entry); 3375 3376 while (&event->event_entry != &ctx->event_list && 3377 &next_event->event_entry != &next_ctx->event_list) { 3378 3379 __perf_event_sync_stat(event, next_event); 3380 3381 event = list_next_entry(event, event_entry); 3382 next_event = list_next_entry(next_event, event_entry); 3383 } 3384 } 3385 3386 static void perf_event_context_sched_out(struct task_struct *task, int ctxn, 3387 struct task_struct *next) 3388 { 3389 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; 3390 struct perf_event_context *next_ctx; 3391 struct perf_event_context *parent, *next_parent; 3392 struct perf_cpu_context *cpuctx; 3393 int do_switch = 1; 3394 struct pmu *pmu; 3395 3396 if (likely(!ctx)) 3397 return; 3398 3399 pmu = ctx->pmu; 3400 cpuctx = __get_cpu_context(ctx); 3401 if (!cpuctx->task_ctx) 3402 return; 3403 3404 rcu_read_lock(); 3405 next_ctx = next->perf_event_ctxp[ctxn]; 3406 if (!next_ctx) 3407 goto unlock; 3408 3409 parent = rcu_dereference(ctx->parent_ctx); 3410 next_parent = rcu_dereference(next_ctx->parent_ctx); 3411 3412 /* If neither context have a parent context; they cannot be clones. */ 3413 if (!parent && !next_parent) 3414 goto unlock; 3415 3416 if (next_parent == ctx || next_ctx == parent || next_parent == parent) { 3417 /* 3418 * Looks like the two contexts are clones, so we might be 3419 * able to optimize the context switch. We lock both 3420 * contexts and check that they are clones under the 3421 * lock (including re-checking that neither has been 3422 * uncloned in the meantime). It doesn't matter which 3423 * order we take the locks because no other cpu could 3424 * be trying to lock both of these tasks. 3425 */ 3426 raw_spin_lock(&ctx->lock); 3427 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); 3428 if (context_equiv(ctx, next_ctx)) { 3429 3430 WRITE_ONCE(ctx->task, next); 3431 WRITE_ONCE(next_ctx->task, task); 3432 3433 perf_pmu_disable(pmu); 3434 3435 if (cpuctx->sched_cb_usage && pmu->sched_task) 3436 pmu->sched_task(ctx, false); 3437 3438 /* 3439 * PMU specific parts of task perf context can require 3440 * additional synchronization. As an example of such 3441 * synchronization see implementation details of Intel 3442 * LBR call stack data profiling; 3443 */ 3444 if (pmu->swap_task_ctx) 3445 pmu->swap_task_ctx(ctx, next_ctx); 3446 else 3447 swap(ctx->task_ctx_data, next_ctx->task_ctx_data); 3448 3449 perf_pmu_enable(pmu); 3450 3451 /* 3452 * RCU_INIT_POINTER here is safe because we've not 3453 * modified the ctx and the above modification of 3454 * ctx->task and ctx->task_ctx_data are immaterial 3455 * since those values are always verified under 3456 * ctx->lock which we're now holding. 
3457 */ 3458 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx); 3459 RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx); 3460 3461 do_switch = 0; 3462 3463 perf_event_sync_stat(ctx, next_ctx); 3464 } 3465 raw_spin_unlock(&next_ctx->lock); 3466 raw_spin_unlock(&ctx->lock); 3467 } 3468 unlock: 3469 rcu_read_unlock(); 3470 3471 if (do_switch) { 3472 raw_spin_lock(&ctx->lock); 3473 perf_pmu_disable(pmu); 3474 3475 if (cpuctx->sched_cb_usage && pmu->sched_task) 3476 pmu->sched_task(ctx, false); 3477 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL); 3478 3479 perf_pmu_enable(pmu); 3480 raw_spin_unlock(&ctx->lock); 3481 } 3482 } 3483 3484 static DEFINE_PER_CPU(struct list_head, sched_cb_list); 3485 3486 void perf_sched_cb_dec(struct pmu *pmu) 3487 { 3488 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 3489 3490 this_cpu_dec(perf_sched_cb_usages); 3491 3492 if (!--cpuctx->sched_cb_usage) 3493 list_del(&cpuctx->sched_cb_entry); 3494 } 3495 3496 3497 void perf_sched_cb_inc(struct pmu *pmu) 3498 { 3499 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 3500 3501 if (!cpuctx->sched_cb_usage++) 3502 list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); 3503 3504 this_cpu_inc(perf_sched_cb_usages); 3505 } 3506 3507 /* 3508 * This function provides the context switch callback to the lower code 3509 * layer. It is invoked ONLY when the context switch callback is enabled. 3510 * 3511 * This callback is relevant even to per-cpu events; for example multi event 3512 * PEBS requires this to provide PID/TID information. This requires we flush 3513 * all queued PEBS records before we context switch to a new task. 3514 */ 3515 static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in) 3516 { 3517 struct pmu *pmu; 3518 3519 pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */ 3520 3521 if (WARN_ON_ONCE(!pmu->sched_task)) 3522 return; 3523 3524 perf_ctx_lock(cpuctx, cpuctx->task_ctx); 3525 perf_pmu_disable(pmu); 3526 3527 pmu->sched_task(cpuctx->task_ctx, sched_in); 3528 3529 perf_pmu_enable(pmu); 3530 perf_ctx_unlock(cpuctx, cpuctx->task_ctx); 3531 } 3532 3533 static void perf_pmu_sched_task(struct task_struct *prev, 3534 struct task_struct *next, 3535 bool sched_in) 3536 { 3537 struct perf_cpu_context *cpuctx; 3538 3539 if (prev == next) 3540 return; 3541 3542 list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { 3543 /* will be handled in perf_event_context_sched_in/out */ 3544 if (cpuctx->task_ctx) 3545 continue; 3546 3547 __perf_pmu_sched_task(cpuctx, sched_in); 3548 } 3549 } 3550 3551 static void perf_event_switch(struct task_struct *task, 3552 struct task_struct *next_prev, bool sched_in); 3553 3554 #define for_each_task_context_nr(ctxn) \ 3555 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) 3556 3557 /* 3558 * Called from scheduler to remove the events of the current task, 3559 * with interrupts disabled. 3560 * 3561 * We stop each event and update the event value in event->count. 3562 * 3563 * This does not protect us against NMI, but disable() 3564 * sets the disabled bit in the control field of event _before_ 3565 * accessing the event control register. If a NMI hits, then it will 3566 * not restart the event. 
3567 */ 3568 void __perf_event_task_sched_out(struct task_struct *task, 3569 struct task_struct *next) 3570 { 3571 int ctxn; 3572 3573 if (__this_cpu_read(perf_sched_cb_usages)) 3574 perf_pmu_sched_task(task, next, false); 3575 3576 if (atomic_read(&nr_switch_events)) 3577 perf_event_switch(task, next, false); 3578 3579 for_each_task_context_nr(ctxn) 3580 perf_event_context_sched_out(task, ctxn, next); 3581 3582 /* 3583 * if cgroup events exist on this CPU, then we need 3584 * to check if we have to switch out PMU state. 3585 * cgroup event are system-wide mode only 3586 */ 3587 if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) 3588 perf_cgroup_sched_out(task, next); 3589 } 3590 3591 /* 3592 * Called with IRQs disabled 3593 */ 3594 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, 3595 enum event_type_t event_type) 3596 { 3597 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); 3598 } 3599 3600 static bool perf_less_group_idx(const void *l, const void *r) 3601 { 3602 const struct perf_event *le = *(const struct perf_event **)l; 3603 const struct perf_event *re = *(const struct perf_event **)r; 3604 3605 return le->group_index < re->group_index; 3606 } 3607 3608 static void swap_ptr(void *l, void *r) 3609 { 3610 void **lp = l, **rp = r; 3611 3612 swap(*lp, *rp); 3613 } 3614 3615 static const struct min_heap_callbacks perf_min_heap = { 3616 .elem_size = sizeof(struct perf_event *), 3617 .less = perf_less_group_idx, 3618 .swp = swap_ptr, 3619 }; 3620 3621 static void __heap_add(struct min_heap *heap, struct perf_event *event) 3622 { 3623 struct perf_event **itrs = heap->data; 3624 3625 if (event) { 3626 itrs[heap->nr] = event; 3627 heap->nr++; 3628 } 3629 } 3630 3631 static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx, 3632 struct perf_event_groups *groups, int cpu, 3633 int (*func)(struct perf_event *, void *), 3634 void *data) 3635 { 3636 #ifdef CONFIG_CGROUP_PERF 3637 struct cgroup_subsys_state *css = NULL; 3638 #endif 3639 /* Space for per CPU and/or any CPU event iterators. */ 3640 struct perf_event *itrs[2]; 3641 struct min_heap event_heap; 3642 struct perf_event **evt; 3643 int ret; 3644 3645 if (cpuctx) { 3646 event_heap = (struct min_heap){ 3647 .data = cpuctx->heap, 3648 .nr = 0, 3649 .size = cpuctx->heap_size, 3650 }; 3651 3652 lockdep_assert_held(&cpuctx->ctx.lock); 3653 3654 #ifdef CONFIG_CGROUP_PERF 3655 if (cpuctx->cgrp) 3656 css = &cpuctx->cgrp->css; 3657 #endif 3658 } else { 3659 event_heap = (struct min_heap){ 3660 .data = itrs, 3661 .nr = 0, 3662 .size = ARRAY_SIZE(itrs), 3663 }; 3664 /* Events not within a CPU context may be on any CPU. 
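 * Together with the per-CPU iterator added just below, the min-heap
 * therefore merges (at most) two group trees here; for CPU contexts it
 * instead merges the CPU iterator with one iterator per ancestor cgroup.
 * Popping/advancing the heap hands the events to func() in ascending
 * group_index order.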
*/ 3665 __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL)); 3666 } 3667 evt = event_heap.data; 3668 3669 __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL)); 3670 3671 #ifdef CONFIG_CGROUP_PERF 3672 for (; css; css = css->parent) 3673 __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup)); 3674 #endif 3675 3676 min_heapify_all(&event_heap, &perf_min_heap); 3677 3678 while (event_heap.nr) { 3679 ret = func(*evt, data); 3680 if (ret) 3681 return ret; 3682 3683 *evt = perf_event_groups_next(*evt); 3684 if (*evt) 3685 min_heapify(&event_heap, 0, &perf_min_heap); 3686 else 3687 min_heap_pop(&event_heap, &perf_min_heap); 3688 } 3689 3690 return 0; 3691 } 3692 3693 static int merge_sched_in(struct perf_event *event, void *data) 3694 { 3695 struct perf_event_context *ctx = event->ctx; 3696 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 3697 int *can_add_hw = data; 3698 3699 if (event->state <= PERF_EVENT_STATE_OFF) 3700 return 0; 3701 3702 if (!event_filter_match(event)) 3703 return 0; 3704 3705 if (group_can_go_on(event, cpuctx, *can_add_hw)) { 3706 if (!group_sched_in(event, cpuctx, ctx)) 3707 list_add_tail(&event->active_list, get_event_list(event)); 3708 } 3709 3710 if (event->state == PERF_EVENT_STATE_INACTIVE) { 3711 if (event->attr.pinned) { 3712 perf_cgroup_event_disable(event, ctx); 3713 perf_event_set_state(event, PERF_EVENT_STATE_ERROR); 3714 } 3715 3716 *can_add_hw = 0; 3717 ctx->rotate_necessary = 1; 3718 perf_mux_hrtimer_restart(cpuctx); 3719 } 3720 3721 return 0; 3722 } 3723 3724 static void 3725 ctx_pinned_sched_in(struct perf_event_context *ctx, 3726 struct perf_cpu_context *cpuctx) 3727 { 3728 int can_add_hw = 1; 3729 3730 if (ctx != &cpuctx->ctx) 3731 cpuctx = NULL; 3732 3733 visit_groups_merge(cpuctx, &ctx->pinned_groups, 3734 smp_processor_id(), 3735 merge_sched_in, &can_add_hw); 3736 } 3737 3738 static void 3739 ctx_flexible_sched_in(struct perf_event_context *ctx, 3740 struct perf_cpu_context *cpuctx) 3741 { 3742 int can_add_hw = 1; 3743 3744 if (ctx != &cpuctx->ctx) 3745 cpuctx = NULL; 3746 3747 visit_groups_merge(cpuctx, &ctx->flexible_groups, 3748 smp_processor_id(), 3749 merge_sched_in, &can_add_hw); 3750 } 3751 3752 static void 3753 ctx_sched_in(struct perf_event_context *ctx, 3754 struct perf_cpu_context *cpuctx, 3755 enum event_type_t event_type, 3756 struct task_struct *task) 3757 { 3758 int is_active = ctx->is_active; 3759 u64 now; 3760 3761 lockdep_assert_held(&ctx->lock); 3762 3763 if (likely(!ctx->nr_events)) 3764 return; 3765 3766 ctx->is_active |= (event_type | EVENT_TIME); 3767 if (ctx->task) { 3768 if (!is_active) 3769 cpuctx->task_ctx = ctx; 3770 else 3771 WARN_ON_ONCE(cpuctx->task_ctx != ctx); 3772 } 3773 3774 is_active ^= ctx->is_active; /* changed bits */ 3775 3776 if (is_active & EVENT_TIME) { 3777 /* start ctx time */ 3778 now = perf_clock(); 3779 ctx->timestamp = now; 3780 perf_cgroup_set_timestamp(task, ctx); 3781 } 3782 3783 /* 3784 * First go through the list and put on any pinned groups 3785 * in order to give them the best chance of going on. 
3786 */ 3787 if (is_active & EVENT_PINNED) 3788 ctx_pinned_sched_in(ctx, cpuctx); 3789 3790 /* Then walk through the lower prio flexible groups */ 3791 if (is_active & EVENT_FLEXIBLE) 3792 ctx_flexible_sched_in(ctx, cpuctx); 3793 } 3794 3795 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 3796 enum event_type_t event_type, 3797 struct task_struct *task) 3798 { 3799 struct perf_event_context *ctx = &cpuctx->ctx; 3800 3801 ctx_sched_in(ctx, cpuctx, event_type, task); 3802 } 3803 3804 static void perf_event_context_sched_in(struct perf_event_context *ctx, 3805 struct task_struct *task) 3806 { 3807 struct perf_cpu_context *cpuctx; 3808 struct pmu *pmu = ctx->pmu; 3809 3810 cpuctx = __get_cpu_context(ctx); 3811 if (cpuctx->task_ctx == ctx) { 3812 if (cpuctx->sched_cb_usage) 3813 __perf_pmu_sched_task(cpuctx, true); 3814 return; 3815 } 3816 3817 perf_ctx_lock(cpuctx, ctx); 3818 /* 3819 * We must check ctx->nr_events while holding ctx->lock, such 3820 * that we serialize against perf_install_in_context(). 3821 */ 3822 if (!ctx->nr_events) 3823 goto unlock; 3824 3825 perf_pmu_disable(pmu); 3826 /* 3827 * We want to keep the following priority order: 3828 * cpu pinned (that don't need to move), task pinned, 3829 * cpu flexible, task flexible. 3830 * 3831 * However, if task's ctx is not carrying any pinned 3832 * events, no need to flip the cpuctx's events around. 3833 */ 3834 if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) 3835 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 3836 perf_event_sched_in(cpuctx, ctx, task); 3837 3838 if (cpuctx->sched_cb_usage && pmu->sched_task) 3839 pmu->sched_task(cpuctx->task_ctx, true); 3840 3841 perf_pmu_enable(pmu); 3842 3843 unlock: 3844 perf_ctx_unlock(cpuctx, ctx); 3845 } 3846 3847 /* 3848 * Called from scheduler to add the events of the current task 3849 * with interrupts disabled. 3850 * 3851 * We restore the event value and then enable it. 3852 * 3853 * This does not protect us against NMI, but enable() 3854 * sets the enabled bit in the control field of event _before_ 3855 * accessing the event control register. If a NMI hits, then it will 3856 * keep the event running. 3857 */ 3858 void __perf_event_task_sched_in(struct task_struct *prev, 3859 struct task_struct *task) 3860 { 3861 struct perf_event_context *ctx; 3862 int ctxn; 3863 3864 /* 3865 * If cgroup events exist on this CPU, then we need to check if we have 3866 * to switch in PMU state; cgroup event are system-wide mode only. 3867 * 3868 * Since cgroup events are CPU events, we must schedule these in before 3869 * we schedule in the task events. 
3870 */ 3871 if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) 3872 perf_cgroup_sched_in(prev, task); 3873 3874 for_each_task_context_nr(ctxn) { 3875 ctx = task->perf_event_ctxp[ctxn]; 3876 if (likely(!ctx)) 3877 continue; 3878 3879 perf_event_context_sched_in(ctx, task); 3880 } 3881 3882 if (atomic_read(&nr_switch_events)) 3883 perf_event_switch(task, prev, true); 3884 3885 if (__this_cpu_read(perf_sched_cb_usages)) 3886 perf_pmu_sched_task(prev, task, true); 3887 } 3888 3889 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 3890 { 3891 u64 frequency = event->attr.sample_freq; 3892 u64 sec = NSEC_PER_SEC; 3893 u64 divisor, dividend; 3894 3895 int count_fls, nsec_fls, frequency_fls, sec_fls; 3896 3897 count_fls = fls64(count); 3898 nsec_fls = fls64(nsec); 3899 frequency_fls = fls64(frequency); 3900 sec_fls = 30; 3901 3902 /* 3903 * We got @count in @nsec, with a target of sample_freq HZ 3904 * the target period becomes: 3905 * 3906 * @count * 10^9 3907 * period = ------------------- 3908 * @nsec * sample_freq 3909 * 3910 */ 3911 3912 /* 3913 * Reduce accuracy by one bit such that @a and @b converge 3914 * to a similar magnitude. 3915 */ 3916 #define REDUCE_FLS(a, b) \ 3917 do { \ 3918 if (a##_fls > b##_fls) { \ 3919 a >>= 1; \ 3920 a##_fls--; \ 3921 } else { \ 3922 b >>= 1; \ 3923 b##_fls--; \ 3924 } \ 3925 } while (0) 3926 3927 /* 3928 * Reduce accuracy until either term fits in a u64, then proceed with 3929 * the other, so that finally we can do a u64/u64 division. 3930 */ 3931 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) { 3932 REDUCE_FLS(nsec, frequency); 3933 REDUCE_FLS(sec, count); 3934 } 3935 3936 if (count_fls + sec_fls > 64) { 3937 divisor = nsec * frequency; 3938 3939 while (count_fls + sec_fls > 64) { 3940 REDUCE_FLS(count, sec); 3941 divisor >>= 1; 3942 } 3943 3944 dividend = count * sec; 3945 } else { 3946 dividend = count * sec; 3947 3948 while (nsec_fls + frequency_fls > 64) { 3949 REDUCE_FLS(nsec, frequency); 3950 dividend >>= 1; 3951 } 3952 3953 divisor = nsec * frequency; 3954 } 3955 3956 if (!divisor) 3957 return dividend; 3958 3959 return div64_u64(dividend, divisor); 3960 } 3961 3962 static DEFINE_PER_CPU(int, perf_throttled_count); 3963 static DEFINE_PER_CPU(u64, perf_throttled_seq); 3964 3965 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable) 3966 { 3967 struct hw_perf_event *hwc = &event->hw; 3968 s64 period, sample_period; 3969 s64 delta; 3970 3971 period = perf_calculate_period(event, nsec, count); 3972 3973 delta = (s64)(period - hwc->sample_period); 3974 delta = (delta + 7) / 8; /* low pass filter */ 3975 3976 sample_period = hwc->sample_period + delta; 3977 3978 if (!sample_period) 3979 sample_period = 1; 3980 3981 hwc->sample_period = sample_period; 3982 3983 if (local64_read(&hwc->period_left) > 8*sample_period) { 3984 if (disable) 3985 event->pmu->stop(event, PERF_EF_UPDATE); 3986 3987 local64_set(&hwc->period_left, 0); 3988 3989 if (disable) 3990 event->pmu->start(event, PERF_EF_RELOAD); 3991 } 3992 } 3993 3994 /* 3995 * combine freq adjustment with unthrottling to avoid two passes over the 3996 * events. At the same time, make sure, having freq events does not change 3997 * the rate of unthrottling as that would introduce bias. 
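 * As a worked example (illustrative numbers): an event with
 * attr.sample_freq = 1000 that counted 4,000,000 increments during the
 * last 4ms tick gives perf_calculate_period() a target period of
 * 4e6 * 1e9 / (4e6 * 1000) = 1,000,000, which perf_adjust_period() then
 * approaches via its divide-by-8 low-pass filter on hwc->sample_period.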
3998 */ 3999 static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, 4000 int needs_unthr) 4001 { 4002 struct perf_event *event; 4003 struct hw_perf_event *hwc; 4004 u64 now, period = TICK_NSEC; 4005 s64 delta; 4006 4007 /* 4008 * only need to iterate over all events iff: 4009 * - context have events in frequency mode (needs freq adjust) 4010 * - there are events to unthrottle on this cpu 4011 */ 4012 if (!(ctx->nr_freq || needs_unthr)) 4013 return; 4014 4015 raw_spin_lock(&ctx->lock); 4016 perf_pmu_disable(ctx->pmu); 4017 4018 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 4019 if (event->state != PERF_EVENT_STATE_ACTIVE) 4020 continue; 4021 4022 if (!event_filter_match(event)) 4023 continue; 4024 4025 perf_pmu_disable(event->pmu); 4026 4027 hwc = &event->hw; 4028 4029 if (hwc->interrupts == MAX_INTERRUPTS) { 4030 hwc->interrupts = 0; 4031 perf_log_throttle(event, 1); 4032 event->pmu->start(event, 0); 4033 } 4034 4035 if (!event->attr.freq || !event->attr.sample_freq) 4036 goto next; 4037 4038 /* 4039 * stop the event and update event->count 4040 */ 4041 event->pmu->stop(event, PERF_EF_UPDATE); 4042 4043 now = local64_read(&event->count); 4044 delta = now - hwc->freq_count_stamp; 4045 hwc->freq_count_stamp = now; 4046 4047 /* 4048 * restart the event 4049 * reload only if value has changed 4050 * we have stopped the event so tell that 4051 * to perf_adjust_period() to avoid stopping it 4052 * twice. 4053 */ 4054 if (delta > 0) 4055 perf_adjust_period(event, period, delta, false); 4056 4057 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); 4058 next: 4059 perf_pmu_enable(event->pmu); 4060 } 4061 4062 perf_pmu_enable(ctx->pmu); 4063 raw_spin_unlock(&ctx->lock); 4064 } 4065 4066 /* 4067 * Move @event to the tail of the @ctx's elegible events. 4068 */ 4069 static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event) 4070 { 4071 /* 4072 * Rotate the first entry last of non-pinned groups. Rotation might be 4073 * disabled by the inheritance code. 4074 */ 4075 if (ctx->rotate_disable) 4076 return; 4077 4078 perf_event_groups_delete(&ctx->flexible_groups, event); 4079 perf_event_groups_insert(&ctx->flexible_groups, event); 4080 } 4081 4082 /* pick an event from the flexible_groups to rotate */ 4083 static inline struct perf_event * 4084 ctx_event_to_rotate(struct perf_event_context *ctx) 4085 { 4086 struct perf_event *event; 4087 4088 /* pick the first active flexible event */ 4089 event = list_first_entry_or_null(&ctx->flexible_active, 4090 struct perf_event, active_list); 4091 4092 /* if no active flexible event, pick the first event */ 4093 if (!event) { 4094 event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree), 4095 typeof(*event), group_node); 4096 } 4097 4098 /* 4099 * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in() 4100 * finds there are unschedulable events, it will set it again. 4101 */ 4102 ctx->rotate_necessary = 0; 4103 4104 return event; 4105 } 4106 4107 static bool perf_rotate_context(struct perf_cpu_context *cpuctx) 4108 { 4109 struct perf_event *cpu_event = NULL, *task_event = NULL; 4110 struct perf_event_context *task_ctx = NULL; 4111 int cpu_rotate, task_rotate; 4112 4113 /* 4114 * Since we run this from IRQ context, nobody can install new 4115 * events, thus the event count values are stable. 4116 */ 4117 4118 cpu_rotate = cpuctx->ctx.rotate_necessary; 4119 task_ctx = cpuctx->task_ctx; 4120 task_rotate = task_ctx ? 
task_ctx->rotate_necessary : 0; 4121 4122 if (!(cpu_rotate || task_rotate)) 4123 return false; 4124 4125 perf_ctx_lock(cpuctx, cpuctx->task_ctx); 4126 perf_pmu_disable(cpuctx->ctx.pmu); 4127 4128 if (task_rotate) 4129 task_event = ctx_event_to_rotate(task_ctx); 4130 if (cpu_rotate) 4131 cpu_event = ctx_event_to_rotate(&cpuctx->ctx); 4132 4133 /* 4134 * As per the order given at ctx_resched() first 'pop' task flexible 4135 * and then, if needed CPU flexible. 4136 */ 4137 if (task_event || (task_ctx && cpu_event)) 4138 ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE); 4139 if (cpu_event) 4140 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 4141 4142 if (task_event) 4143 rotate_ctx(task_ctx, task_event); 4144 if (cpu_event) 4145 rotate_ctx(&cpuctx->ctx, cpu_event); 4146 4147 perf_event_sched_in(cpuctx, task_ctx, current); 4148 4149 perf_pmu_enable(cpuctx->ctx.pmu); 4150 perf_ctx_unlock(cpuctx, cpuctx->task_ctx); 4151 4152 return true; 4153 } 4154 4155 void perf_event_task_tick(void) 4156 { 4157 struct list_head *head = this_cpu_ptr(&active_ctx_list); 4158 struct perf_event_context *ctx, *tmp; 4159 int throttled; 4160 4161 lockdep_assert_irqs_disabled(); 4162 4163 __this_cpu_inc(perf_throttled_seq); 4164 throttled = __this_cpu_xchg(perf_throttled_count, 0); 4165 tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); 4166 4167 list_for_each_entry_safe(ctx, tmp, head, active_ctx_list) 4168 perf_adjust_freq_unthr_context(ctx, throttled); 4169 } 4170 4171 static int event_enable_on_exec(struct perf_event *event, 4172 struct perf_event_context *ctx) 4173 { 4174 if (!event->attr.enable_on_exec) 4175 return 0; 4176 4177 event->attr.enable_on_exec = 0; 4178 if (event->state >= PERF_EVENT_STATE_INACTIVE) 4179 return 0; 4180 4181 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); 4182 4183 return 1; 4184 } 4185 4186 /* 4187 * Enable all of a task's events that have been marked enable-on-exec. 4188 * This expects task == current. 4189 */ 4190 static void perf_event_enable_on_exec(int ctxn) 4191 { 4192 struct perf_event_context *ctx, *clone_ctx = NULL; 4193 enum event_type_t event_type = 0; 4194 struct perf_cpu_context *cpuctx; 4195 struct perf_event *event; 4196 unsigned long flags; 4197 int enabled = 0; 4198 4199 local_irq_save(flags); 4200 ctx = current->perf_event_ctxp[ctxn]; 4201 if (!ctx || !ctx->nr_events) 4202 goto out; 4203 4204 cpuctx = __get_cpu_context(ctx); 4205 perf_ctx_lock(cpuctx, ctx); 4206 ctx_sched_out(ctx, cpuctx, EVENT_TIME); 4207 list_for_each_entry(event, &ctx->event_list, event_entry) { 4208 enabled |= event_enable_on_exec(event, ctx); 4209 event_type |= get_event_type(event); 4210 } 4211 4212 /* 4213 * Unclone and reschedule this context if we enabled any event. 
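 *
 * (For reference, tools typically reach perf_event_enable_on_exec() by
 *  opening the events with attr.disabled = 1 and attr.enable_on_exec = 1
 *  on a forked child and only then letting it execve(); a minimal,
 *  illustrative userspace sketch:
 *
 *	attr.disabled       = 1;
 *	attr.enable_on_exec = 1;
 *	fd = syscall(__NR_perf_event_open, &attr, child_pid, -1, -1, 0);
 *	// counting starts only once the child calls execve()
 *
 *  so the workload is measured from the new program image onward.)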
4214 */ 4215 if (enabled) { 4216 clone_ctx = unclone_ctx(ctx); 4217 ctx_resched(cpuctx, ctx, event_type); 4218 } else { 4219 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); 4220 } 4221 perf_ctx_unlock(cpuctx, ctx); 4222 4223 out: 4224 local_irq_restore(flags); 4225 4226 if (clone_ctx) 4227 put_ctx(clone_ctx); 4228 } 4229 4230 struct perf_read_data { 4231 struct perf_event *event; 4232 bool group; 4233 int ret; 4234 }; 4235 4236 static int __perf_event_read_cpu(struct perf_event *event, int event_cpu) 4237 { 4238 u16 local_pkg, event_pkg; 4239 4240 if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { 4241 int local_cpu = smp_processor_id(); 4242 4243 event_pkg = topology_physical_package_id(event_cpu); 4244 local_pkg = topology_physical_package_id(local_cpu); 4245 4246 if (event_pkg == local_pkg) 4247 return local_cpu; 4248 } 4249 4250 return event_cpu; 4251 } 4252 4253 /* 4254 * Cross CPU call to read the hardware event 4255 */ 4256 static void __perf_event_read(void *info) 4257 { 4258 struct perf_read_data *data = info; 4259 struct perf_event *sub, *event = data->event; 4260 struct perf_event_context *ctx = event->ctx; 4261 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 4262 struct pmu *pmu = event->pmu; 4263 4264 /* 4265 * If this is a task context, we need to check whether it is 4266 * the current task context of this cpu. If not it has been 4267 * scheduled out before the smp call arrived. In that case 4268 * event->count would have been updated to a recent sample 4269 * when the event was scheduled out. 4270 */ 4271 if (ctx->task && cpuctx->task_ctx != ctx) 4272 return; 4273 4274 raw_spin_lock(&ctx->lock); 4275 if (ctx->is_active & EVENT_TIME) { 4276 update_context_time(ctx); 4277 update_cgrp_time_from_event(event); 4278 } 4279 4280 perf_event_update_time(event); 4281 if (data->group) 4282 perf_event_update_sibling_time(event); 4283 4284 if (event->state != PERF_EVENT_STATE_ACTIVE) 4285 goto unlock; 4286 4287 if (!data->group) { 4288 pmu->read(event); 4289 data->ret = 0; 4290 goto unlock; 4291 } 4292 4293 pmu->start_txn(pmu, PERF_PMU_TXN_READ); 4294 4295 pmu->read(event); 4296 4297 for_each_sibling_event(sub, event) { 4298 if (sub->state == PERF_EVENT_STATE_ACTIVE) { 4299 /* 4300 * Use sibling's PMU rather than @event's since 4301 * sibling could be on different (eg: software) PMU. 4302 */ 4303 sub->pmu->read(sub); 4304 } 4305 } 4306 4307 data->ret = pmu->commit_txn(pmu); 4308 4309 unlock: 4310 raw_spin_unlock(&ctx->lock); 4311 } 4312 4313 static inline u64 perf_event_count(struct perf_event *event) 4314 { 4315 return local64_read(&event->count) + atomic64_read(&event->child_count); 4316 } 4317 4318 /* 4319 * NMI-safe method to read a local event, that is an event that 4320 * is: 4321 * - either for the current task, or for this CPU 4322 * - does not have inherit set, for inherited task events 4323 * will not be local and we cannot read them atomically 4324 * - must not have a pmu::count method 4325 */ 4326 int perf_event_read_local(struct perf_event *event, u64 *value, 4327 u64 *enabled, u64 *running) 4328 { 4329 unsigned long flags; 4330 int ret = 0; 4331 4332 /* 4333 * Disabling interrupts avoids all counter scheduling (context 4334 * switches, timer based rotation and IPIs). 4335 */ 4336 local_irq_save(flags); 4337 4338 /* 4339 * It must not be an event with inherit set, we cannot read 4340 * all child counters from atomic context. 
4341 */ 4342 if (event->attr.inherit) { 4343 ret = -EOPNOTSUPP; 4344 goto out; 4345 } 4346 4347 /* If this is a per-task event, it must be for current */ 4348 if ((event->attach_state & PERF_ATTACH_TASK) && 4349 event->hw.target != current) { 4350 ret = -EINVAL; 4351 goto out; 4352 } 4353 4354 /* If this is a per-CPU event, it must be for this CPU */ 4355 if (!(event->attach_state & PERF_ATTACH_TASK) && 4356 event->cpu != smp_processor_id()) { 4357 ret = -EINVAL; 4358 goto out; 4359 } 4360 4361 /* If this is a pinned event it must be running on this CPU */ 4362 if (event->attr.pinned && event->oncpu != smp_processor_id()) { 4363 ret = -EBUSY; 4364 goto out; 4365 } 4366 4367 /* 4368 * If the event is currently on this CPU, it's either a per-task event, 4369 * or local to this CPU. Furthermore it means it's ACTIVE (otherwise 4370 * oncpu == -1). 4371 */ 4372 if (event->oncpu == smp_processor_id()) 4373 event->pmu->read(event); 4374 4375 *value = local64_read(&event->count); 4376 if (enabled || running) { 4377 u64 now = event->shadow_ctx_time + perf_clock(); 4378 u64 __enabled, __running; 4379 4380 __perf_update_times(event, now, &__enabled, &__running); 4381 if (enabled) 4382 *enabled = __enabled; 4383 if (running) 4384 *running = __running; 4385 } 4386 out: 4387 local_irq_restore(flags); 4388 4389 return ret; 4390 } 4391 4392 static int perf_event_read(struct perf_event *event, bool group) 4393 { 4394 enum perf_event_state state = READ_ONCE(event->state); 4395 int event_cpu, ret = 0; 4396 4397 /* 4398 * If event is enabled and currently active on a CPU, update the 4399 * value in the event structure: 4400 */ 4401 again: 4402 if (state == PERF_EVENT_STATE_ACTIVE) { 4403 struct perf_read_data data; 4404 4405 /* 4406 * Orders the ->state and ->oncpu loads such that if we see 4407 * ACTIVE we must also see the right ->oncpu. 4408 * 4409 * Matches the smp_wmb() from event_sched_in(). 4410 */ 4411 smp_rmb(); 4412 4413 event_cpu = READ_ONCE(event->oncpu); 4414 if ((unsigned)event_cpu >= nr_cpu_ids) 4415 return 0; 4416 4417 data = (struct perf_read_data){ 4418 .event = event, 4419 .group = group, 4420 .ret = 0, 4421 }; 4422 4423 preempt_disable(); 4424 event_cpu = __perf_event_read_cpu(event, event_cpu); 4425 4426 /* 4427 * Purposely ignore the smp_call_function_single() return 4428 * value. 4429 * 4430 * If event_cpu isn't a valid CPU it means the event got 4431 * scheduled out and that will have updated the event count. 4432 * 4433 * Therefore, either way, we'll have an up-to-date event count 4434 * after this.
4435 */ 4436 (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1); 4437 preempt_enable(); 4438 ret = data.ret; 4439 4440 } else if (state == PERF_EVENT_STATE_INACTIVE) { 4441 struct perf_event_context *ctx = event->ctx; 4442 unsigned long flags; 4443 4444 raw_spin_lock_irqsave(&ctx->lock, flags); 4445 state = event->state; 4446 if (state != PERF_EVENT_STATE_INACTIVE) { 4447 raw_spin_unlock_irqrestore(&ctx->lock, flags); 4448 goto again; 4449 } 4450 4451 /* 4452 * May read while context is not active (e.g., thread is 4453 * blocked), in that case we cannot update context time 4454 */ 4455 if (ctx->is_active & EVENT_TIME) { 4456 update_context_time(ctx); 4457 update_cgrp_time_from_event(event); 4458 } 4459 4460 perf_event_update_time(event); 4461 if (group) 4462 perf_event_update_sibling_time(event); 4463 raw_spin_unlock_irqrestore(&ctx->lock, flags); 4464 } 4465 4466 return ret; 4467 } 4468 4469 /* 4470 * Initialize the perf_event context in a task_struct: 4471 */ 4472 static void __perf_event_init_context(struct perf_event_context *ctx) 4473 { 4474 raw_spin_lock_init(&ctx->lock); 4475 mutex_init(&ctx->mutex); 4476 INIT_LIST_HEAD(&ctx->active_ctx_list); 4477 perf_event_groups_init(&ctx->pinned_groups); 4478 perf_event_groups_init(&ctx->flexible_groups); 4479 INIT_LIST_HEAD(&ctx->event_list); 4480 INIT_LIST_HEAD(&ctx->pinned_active); 4481 INIT_LIST_HEAD(&ctx->flexible_active); 4482 refcount_set(&ctx->refcount, 1); 4483 } 4484 4485 static struct perf_event_context * 4486 alloc_perf_context(struct pmu *pmu, struct task_struct *task) 4487 { 4488 struct perf_event_context *ctx; 4489 4490 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); 4491 if (!ctx) 4492 return NULL; 4493 4494 __perf_event_init_context(ctx); 4495 if (task) 4496 ctx->task = get_task_struct(task); 4497 ctx->pmu = pmu; 4498 4499 return ctx; 4500 } 4501 4502 static struct task_struct * 4503 find_lively_task_by_vpid(pid_t vpid) 4504 { 4505 struct task_struct *task; 4506 4507 rcu_read_lock(); 4508 if (!vpid) 4509 task = current; 4510 else 4511 task = find_task_by_vpid(vpid); 4512 if (task) 4513 get_task_struct(task); 4514 rcu_read_unlock(); 4515 4516 if (!task) 4517 return ERR_PTR(-ESRCH); 4518 4519 return task; 4520 } 4521 4522 /* 4523 * Returns a matching context with refcount and pincount. 
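 * Callers drop the pin again with perf_unpin_context() once they are done
 * installing events into the context; error paths additionally put_ctx()
 * the reference.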
4524 */ 4525 static struct perf_event_context * 4526 find_get_context(struct pmu *pmu, struct task_struct *task, 4527 struct perf_event *event) 4528 { 4529 struct perf_event_context *ctx, *clone_ctx = NULL; 4530 struct perf_cpu_context *cpuctx; 4531 void *task_ctx_data = NULL; 4532 unsigned long flags; 4533 int ctxn, err; 4534 int cpu = event->cpu; 4535 4536 if (!task) { 4537 /* Must be root to operate on a CPU event: */ 4538 err = perf_allow_cpu(&event->attr); 4539 if (err) 4540 return ERR_PTR(err); 4541 4542 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 4543 ctx = &cpuctx->ctx; 4544 get_ctx(ctx); 4545 raw_spin_lock_irqsave(&ctx->lock, flags); 4546 ++ctx->pin_count; 4547 raw_spin_unlock_irqrestore(&ctx->lock, flags); 4548 4549 return ctx; 4550 } 4551 4552 err = -EINVAL; 4553 ctxn = pmu->task_ctx_nr; 4554 if (ctxn < 0) 4555 goto errout; 4556 4557 if (event->attach_state & PERF_ATTACH_TASK_DATA) { 4558 task_ctx_data = alloc_task_ctx_data(pmu); 4559 if (!task_ctx_data) { 4560 err = -ENOMEM; 4561 goto errout; 4562 } 4563 } 4564 4565 retry: 4566 ctx = perf_lock_task_context(task, ctxn, &flags); 4567 if (ctx) { 4568 clone_ctx = unclone_ctx(ctx); 4569 ++ctx->pin_count; 4570 4571 if (task_ctx_data && !ctx->task_ctx_data) { 4572 ctx->task_ctx_data = task_ctx_data; 4573 task_ctx_data = NULL; 4574 } 4575 raw_spin_unlock_irqrestore(&ctx->lock, flags); 4576 4577 if (clone_ctx) 4578 put_ctx(clone_ctx); 4579 } else { 4580 ctx = alloc_perf_context(pmu, task); 4581 err = -ENOMEM; 4582 if (!ctx) 4583 goto errout; 4584 4585 if (task_ctx_data) { 4586 ctx->task_ctx_data = task_ctx_data; 4587 task_ctx_data = NULL; 4588 } 4589 4590 err = 0; 4591 mutex_lock(&task->perf_event_mutex); 4592 /* 4593 * If it has already passed perf_event_exit_task(), 4594 * we must see PF_EXITING; it takes this mutex too.
4595 */ 4596 if (task->flags & PF_EXITING) 4597 err = -ESRCH; 4598 else if (task->perf_event_ctxp[ctxn]) 4599 err = -EAGAIN; 4600 else { 4601 get_ctx(ctx); 4602 ++ctx->pin_count; 4603 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); 4604 } 4605 mutex_unlock(&task->perf_event_mutex); 4606 4607 if (unlikely(err)) { 4608 put_ctx(ctx); 4609 4610 if (err == -EAGAIN) 4611 goto retry; 4612 goto errout; 4613 } 4614 } 4615 4616 free_task_ctx_data(pmu, task_ctx_data); 4617 return ctx; 4618 4619 errout: 4620 free_task_ctx_data(pmu, task_ctx_data); 4621 return ERR_PTR(err); 4622 } 4623 4624 static void perf_event_free_filter(struct perf_event *event); 4625 static void perf_event_free_bpf_prog(struct perf_event *event); 4626 4627 static void free_event_rcu(struct rcu_head *head) 4628 { 4629 struct perf_event *event; 4630 4631 event = container_of(head, struct perf_event, rcu_head); 4632 if (event->ns) 4633 put_pid_ns(event->ns); 4634 perf_event_free_filter(event); 4635 kfree(event); 4636 } 4637 4638 static void ring_buffer_attach(struct perf_event *event, 4639 struct perf_buffer *rb); 4640 4641 static void detach_sb_event(struct perf_event *event) 4642 { 4643 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); 4644 4645 raw_spin_lock(&pel->lock); 4646 list_del_rcu(&event->sb_list); 4647 raw_spin_unlock(&pel->lock); 4648 } 4649 4650 static bool is_sb_event(struct perf_event *event) 4651 { 4652 struct perf_event_attr *attr = &event->attr; 4653 4654 if (event->parent) 4655 return false; 4656 4657 if (event->attach_state & PERF_ATTACH_TASK) 4658 return false; 4659 4660 if (attr->mmap || attr->mmap_data || attr->mmap2 || 4661 attr->comm || attr->comm_exec || 4662 attr->task || attr->ksymbol || 4663 attr->context_switch || attr->text_poke || 4664 attr->bpf_event) 4665 return true; 4666 return false; 4667 } 4668 4669 static void unaccount_pmu_sb_event(struct perf_event *event) 4670 { 4671 if (is_sb_event(event)) 4672 detach_sb_event(event); 4673 } 4674 4675 static void unaccount_event_cpu(struct perf_event *event, int cpu) 4676 { 4677 if (event->parent) 4678 return; 4679 4680 if (is_cgroup_event(event)) 4681 atomic_dec(&per_cpu(perf_cgroup_events, cpu)); 4682 } 4683 4684 #ifdef CONFIG_NO_HZ_FULL 4685 static DEFINE_SPINLOCK(nr_freq_lock); 4686 #endif 4687 4688 static void unaccount_freq_event_nohz(void) 4689 { 4690 #ifdef CONFIG_NO_HZ_FULL 4691 spin_lock(&nr_freq_lock); 4692 if (atomic_dec_and_test(&nr_freq_events)) 4693 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS); 4694 spin_unlock(&nr_freq_lock); 4695 #endif 4696 } 4697 4698 static void unaccount_freq_event(void) 4699 { 4700 if (tick_nohz_full_enabled()) 4701 unaccount_freq_event_nohz(); 4702 else 4703 atomic_dec(&nr_freq_events); 4704 } 4705 4706 static void unaccount_event(struct perf_event *event) 4707 { 4708 bool dec = false; 4709 4710 if (event->parent) 4711 return; 4712 4713 if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) 4714 dec = true; 4715 if (event->attr.mmap || event->attr.mmap_data) 4716 atomic_dec(&nr_mmap_events); 4717 if (event->attr.build_id) 4718 atomic_dec(&nr_build_id_events); 4719 if (event->attr.comm) 4720 atomic_dec(&nr_comm_events); 4721 if (event->attr.namespaces) 4722 atomic_dec(&nr_namespaces_events); 4723 if (event->attr.cgroup) 4724 atomic_dec(&nr_cgroup_events); 4725 if (event->attr.task) 4726 atomic_dec(&nr_task_events); 4727 if (event->attr.freq) 4728 unaccount_freq_event(); 4729 if (event->attr.context_switch) { 4730 dec = true; 4731 atomic_dec(&nr_switch_events); 4732 } 4733 if 
(is_cgroup_event(event)) 4734 dec = true; 4735 if (has_branch_stack(event)) 4736 dec = true; 4737 if (event->attr.ksymbol) 4738 atomic_dec(&nr_ksymbol_events); 4739 if (event->attr.bpf_event) 4740 atomic_dec(&nr_bpf_events); 4741 if (event->attr.text_poke) 4742 atomic_dec(&nr_text_poke_events); 4743 4744 if (dec) { 4745 if (!atomic_add_unless(&perf_sched_count, -1, 1)) 4746 schedule_delayed_work(&perf_sched_work, HZ); 4747 } 4748 4749 unaccount_event_cpu(event, event->cpu); 4750 4751 unaccount_pmu_sb_event(event); 4752 } 4753 4754 static void perf_sched_delayed(struct work_struct *work) 4755 { 4756 mutex_lock(&perf_sched_mutex); 4757 if (atomic_dec_and_test(&perf_sched_count)) 4758 static_branch_disable(&perf_sched_events); 4759 mutex_unlock(&perf_sched_mutex); 4760 } 4761 4762 /* 4763 * The following implement mutual exclusion of events on "exclusive" pmus 4764 * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled 4765 * at a time, so we disallow creating events that might conflict, namely: 4766 * 4767 * 1) cpu-wide events in the presence of per-task events, 4768 * 2) per-task events in the presence of cpu-wide events, 4769 * 3) two matching events on the same context. 4770 * 4771 * The former two cases are handled in the allocation path (perf_event_alloc(), 4772 * _free_event()), the latter -- before the first perf_install_in_context(). 4773 */ 4774 static int exclusive_event_init(struct perf_event *event) 4775 { 4776 struct pmu *pmu = event->pmu; 4777 4778 if (!is_exclusive_pmu(pmu)) 4779 return 0; 4780 4781 /* 4782 * Prevent co-existence of per-task and cpu-wide events on the 4783 * same exclusive pmu. 4784 * 4785 * Negative pmu::exclusive_cnt means there are cpu-wide 4786 * events on this "exclusive" pmu, positive means there are 4787 * per-task events. 4788 * 4789 * Since this is called in perf_event_alloc() path, event::ctx 4790 * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK 4791 * to mean "per-task event", because unlike other attach states it 4792 * never gets cleared. 
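 *
 * For example: with two per-task events on an exclusive pmu,
 * pmu->exclusive_cnt == 2, and a subsequent cpu-wide event fails below
 * with -EBUSY because atomic_dec_unless_positive() refuses to touch a
 * positive counter (and vice versa for a negative count).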
4793 */ 4794 if (event->attach_state & PERF_ATTACH_TASK) { 4795 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt)) 4796 return -EBUSY; 4797 } else { 4798 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt)) 4799 return -EBUSY; 4800 } 4801 4802 return 0; 4803 } 4804 4805 static void exclusive_event_destroy(struct perf_event *event) 4806 { 4807 struct pmu *pmu = event->pmu; 4808 4809 if (!is_exclusive_pmu(pmu)) 4810 return; 4811 4812 /* see comment in exclusive_event_init() */ 4813 if (event->attach_state & PERF_ATTACH_TASK) 4814 atomic_dec(&pmu->exclusive_cnt); 4815 else 4816 atomic_inc(&pmu->exclusive_cnt); 4817 } 4818 4819 static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) 4820 { 4821 if ((e1->pmu == e2->pmu) && 4822 (e1->cpu == e2->cpu || 4823 e1->cpu == -1 || 4824 e2->cpu == -1)) 4825 return true; 4826 return false; 4827 } 4828 4829 static bool exclusive_event_installable(struct perf_event *event, 4830 struct perf_event_context *ctx) 4831 { 4832 struct perf_event *iter_event; 4833 struct pmu *pmu = event->pmu; 4834 4835 lockdep_assert_held(&ctx->mutex); 4836 4837 if (!is_exclusive_pmu(pmu)) 4838 return true; 4839 4840 list_for_each_entry(iter_event, &ctx->event_list, event_entry) { 4841 if (exclusive_event_match(iter_event, event)) 4842 return false; 4843 } 4844 4845 return true; 4846 } 4847 4848 static void perf_addr_filters_splice(struct perf_event *event, 4849 struct list_head *head); 4850 4851 static void _free_event(struct perf_event *event) 4852 { 4853 irq_work_sync(&event->pending); 4854 4855 unaccount_event(event); 4856 4857 security_perf_event_free(event); 4858 4859 if (event->rb) { 4860 /* 4861 * Can happen when we close an event with re-directed output. 4862 * 4863 * Since we have a 0 refcount, perf_mmap_close() will skip 4864 * over us; possibly making our ring_buffer_put() the last. 4865 */ 4866 mutex_lock(&event->mmap_mutex); 4867 ring_buffer_attach(event, NULL); 4868 mutex_unlock(&event->mmap_mutex); 4869 } 4870 4871 if (is_cgroup_event(event)) 4872 perf_detach_cgroup(event); 4873 4874 if (!event->parent) { 4875 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) 4876 put_callchain_buffers(); 4877 } 4878 4879 perf_event_free_bpf_prog(event); 4880 perf_addr_filters_splice(event, NULL); 4881 kfree(event->addr_filter_ranges); 4882 4883 if (event->destroy) 4884 event->destroy(event); 4885 4886 /* 4887 * Must be after ->destroy(), due to uprobe_perf_close() using 4888 * hw.target. 4889 */ 4890 if (event->hw.target) 4891 put_task_struct(event->hw.target); 4892 4893 /* 4894 * perf_event_free_task() relies on put_ctx() being 'last', in particular 4895 * all task references must be cleaned up. 4896 */ 4897 if (event->ctx) 4898 put_ctx(event->ctx); 4899 4900 exclusive_event_destroy(event); 4901 module_put(event->pmu->module); 4902 4903 call_rcu(&event->rcu_head, free_event_rcu); 4904 } 4905 4906 /* 4907 * Used to free events which have a known refcount of 1, such as in error paths 4908 * where the event isn't exposed yet and inherited events. 4909 */ 4910 static void free_event(struct perf_event *event) 4911 { 4912 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, 4913 "unexpected event refcount: %ld; ptr=%p\n", 4914 atomic_long_read(&event->refcount), event)) { 4915 /* leak to avoid use-after-free */ 4916 return; 4917 } 4918 4919 _free_event(event); 4920 } 4921 4922 /* 4923 * Remove user event from the owner task. 
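 *
 * (The owner is the task that perf_event_open()ed the event; it keeps
 *  the event linked on its perf_event_list via event->owner_entry so
 *  that perf_event_exit_task() can detach everything when it exits.)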
4924 */ 4925 static void perf_remove_from_owner(struct perf_event *event) 4926 { 4927 struct task_struct *owner; 4928 4929 rcu_read_lock(); 4930 /* 4931 * Matches the smp_store_release() in perf_event_exit_task(). If we 4932 * observe !owner it means the list deletion is complete and we can 4933 * indeed free this event, otherwise we need to serialize on 4934 * owner->perf_event_mutex. 4935 */ 4936 owner = READ_ONCE(event->owner); 4937 if (owner) { 4938 /* 4939 * Since delayed_put_task_struct() also drops the last 4940 * task reference we can safely take a new reference 4941 * while holding the rcu_read_lock(). 4942 */ 4943 get_task_struct(owner); 4944 } 4945 rcu_read_unlock(); 4946 4947 if (owner) { 4948 /* 4949 * If we're here through perf_event_exit_task() we're already 4950 * holding ctx->mutex which would be an inversion wrt. the 4951 * normal lock order. 4952 * 4953 * However we can safely take this lock because its the child 4954 * ctx->mutex. 4955 */ 4956 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING); 4957 4958 /* 4959 * We have to re-check the event->owner field, if it is cleared 4960 * we raced with perf_event_exit_task(), acquiring the mutex 4961 * ensured they're done, and we can proceed with freeing the 4962 * event. 4963 */ 4964 if (event->owner) { 4965 list_del_init(&event->owner_entry); 4966 smp_store_release(&event->owner, NULL); 4967 } 4968 mutex_unlock(&owner->perf_event_mutex); 4969 put_task_struct(owner); 4970 } 4971 } 4972 4973 static void put_event(struct perf_event *event) 4974 { 4975 if (!atomic_long_dec_and_test(&event->refcount)) 4976 return; 4977 4978 _free_event(event); 4979 } 4980 4981 /* 4982 * Kill an event dead; while event:refcount will preserve the event 4983 * object, it will not preserve its functionality. Once the last 'user' 4984 * gives up the object, we'll destroy the thing. 4985 */ 4986 int perf_event_release_kernel(struct perf_event *event) 4987 { 4988 struct perf_event_context *ctx = event->ctx; 4989 struct perf_event *child, *tmp; 4990 LIST_HEAD(free_list); 4991 4992 /* 4993 * If we got here through err_file: fput(event_file); we will not have 4994 * attached to a context yet. 4995 */ 4996 if (!ctx) { 4997 WARN_ON_ONCE(event->attach_state & 4998 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP)); 4999 goto no_ctx; 5000 } 5001 5002 if (!is_kernel_event(event)) 5003 perf_remove_from_owner(event); 5004 5005 ctx = perf_event_ctx_lock(event); 5006 WARN_ON_ONCE(ctx->parent_ctx); 5007 perf_remove_from_context(event, DETACH_GROUP); 5008 5009 raw_spin_lock_irq(&ctx->lock); 5010 /* 5011 * Mark this event as STATE_DEAD, there is no external reference to it 5012 * anymore. 5013 * 5014 * Anybody acquiring event->child_mutex after the below loop _must_ 5015 * also see this, most importantly inherit_event() which will avoid 5016 * placing more children on the list. 5017 * 5018 * Thus this guarantees that we will in fact observe and kill _ALL_ 5019 * child events. 5020 */ 5021 event->state = PERF_EVENT_STATE_DEAD; 5022 raw_spin_unlock_irq(&ctx->lock); 5023 5024 perf_event_ctx_unlock(event, ctx); 5025 5026 again: 5027 mutex_lock(&event->child_mutex); 5028 list_for_each_entry(child, &event->child_list, child_list) { 5029 5030 /* 5031 * Cannot change, child events are not migrated, see the 5032 * comment with perf_event_ctx_lock_nested(). 5033 */ 5034 ctx = READ_ONCE(child->ctx); 5035 /* 5036 * Since child_mutex nests inside ctx::mutex, we must jump 5037 * through hoops. We start by grabbing a reference on the ctx. 
5038 * 5039 * Since the event cannot get freed while we hold the 5040 * child_mutex, the context must also exist and have a !0 5041 * reference count. 5042 */ 5043 get_ctx(ctx); 5044 5045 /* 5046 * Now that we have a ctx ref, we can drop child_mutex, and 5047 * acquire ctx::mutex without fear of it going away. Then we 5048 * can re-acquire child_mutex. 5049 */ 5050 mutex_unlock(&event->child_mutex); 5051 mutex_lock(&ctx->mutex); 5052 mutex_lock(&event->child_mutex); 5053 5054 /* 5055 * Now that we hold ctx::mutex and child_mutex, revalidate our 5056 * state, if child is still the first entry, it didn't get freed 5057 * and we can continue doing so. 5058 */ 5059 tmp = list_first_entry_or_null(&event->child_list, 5060 struct perf_event, child_list); 5061 if (tmp == child) { 5062 perf_remove_from_context(child, DETACH_GROUP); 5063 list_move(&child->child_list, &free_list); 5064 /* 5065 * This matches the refcount bump in inherit_event(); 5066 * this can't be the last reference. 5067 */ 5068 put_event(event); 5069 } 5070 5071 mutex_unlock(&event->child_mutex); 5072 mutex_unlock(&ctx->mutex); 5073 put_ctx(ctx); 5074 goto again; 5075 } 5076 mutex_unlock(&event->child_mutex); 5077 5078 list_for_each_entry_safe(child, tmp, &free_list, child_list) { 5079 void *var = &child->ctx->refcount; 5080 5081 list_del(&child->child_list); 5082 free_event(child); 5083 5084 /* 5085 * Wake any perf_event_free_task() waiting for this event to be 5086 * freed. 5087 */ 5088 smp_mb(); /* pairs with wait_var_event() */ 5089 wake_up_var(var); 5090 } 5091 5092 no_ctx: 5093 put_event(event); /* Must be the 'last' reference */ 5094 return 0; 5095 } 5096 EXPORT_SYMBOL_GPL(perf_event_release_kernel); 5097 5098 /* 5099 * Called when the last reference to the file is gone. 5100 */ 5101 static int perf_release(struct inode *inode, struct file *file) 5102 { 5103 perf_event_release_kernel(file->private_data); 5104 return 0; 5105 } 5106 5107 static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) 5108 { 5109 struct perf_event *child; 5110 u64 total = 0; 5111 5112 *enabled = 0; 5113 *running = 0; 5114 5115 mutex_lock(&event->child_mutex); 5116 5117 (void)perf_event_read(event, false); 5118 total += perf_event_count(event); 5119 5120 *enabled += event->total_time_enabled + 5121 atomic64_read(&event->child_total_time_enabled); 5122 *running += event->total_time_running + 5123 atomic64_read(&event->child_total_time_running); 5124 5125 list_for_each_entry(child, &event->child_list, child_list) { 5126 (void)perf_event_read(child, false); 5127 total += perf_event_count(child); 5128 *enabled += child->total_time_enabled; 5129 *running += child->total_time_running; 5130 } 5131 mutex_unlock(&event->child_mutex); 5132 5133 return total; 5134 } 5135 5136 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) 5137 { 5138 struct perf_event_context *ctx; 5139 u64 count; 5140 5141 ctx = perf_event_ctx_lock(event); 5142 count = __perf_event_read_value(event, enabled, running); 5143 perf_event_ctx_unlock(event, ctx); 5144 5145 return count; 5146 } 5147 EXPORT_SYMBOL_GPL(perf_event_read_value); 5148 5149 static int __perf_read_group_add(struct perf_event *leader, 5150 u64 read_format, u64 *values) 5151 { 5152 struct perf_event_context *ctx = leader->ctx; 5153 struct perf_event *sub; 5154 unsigned long flags; 5155 int n = 1; /* skip @nr */ 5156 int ret; 5157 5158 ret = perf_event_read(leader, true); 5159 if (ret) 5160 return ret; 5161 5162 raw_spin_lock_irqsave(&ctx->lock, flags); 5163 5164 
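/*
 * Illustrative layout of the values[] array filled in below, assuming
 * read_format has PERF_FORMAT_TOTAL_TIME_ENABLED, _TOTAL_TIME_RUNNING
 * and _ID set (values[0] is the @nr slot filled in by perf_read_group()):
 *
 *	values[1] = time_enabled
 *	values[2] = time_running
 *	values[3] = leader count,   values[4] = leader id
 *	values[5] = sibling0 count, values[6] = sibling0 id
 *	...
 */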
/* 5165 * Since we co-schedule groups, {enabled,running} times of siblings 5166 * will be identical to those of the leader, so we only publish one 5167 * set. 5168 */ 5169 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 5170 values[n++] += leader->total_time_enabled + 5171 atomic64_read(&leader->child_total_time_enabled); 5172 } 5173 5174 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 5175 values[n++] += leader->total_time_running + 5176 atomic64_read(&leader->child_total_time_running); 5177 } 5178 5179 /* 5180 * Write {count,id} tuples for every sibling. 5181 */ 5182 values[n++] += perf_event_count(leader); 5183 if (read_format & PERF_FORMAT_ID) 5184 values[n++] = primary_event_id(leader); 5185 5186 for_each_sibling_event(sub, leader) { 5187 values[n++] += perf_event_count(sub); 5188 if (read_format & PERF_FORMAT_ID) 5189 values[n++] = primary_event_id(sub); 5190 } 5191 5192 raw_spin_unlock_irqrestore(&ctx->lock, flags); 5193 return 0; 5194 } 5195 5196 static int perf_read_group(struct perf_event *event, 5197 u64 read_format, char __user *buf) 5198 { 5199 struct perf_event *leader = event->group_leader, *child; 5200 struct perf_event_context *ctx = leader->ctx; 5201 int ret; 5202 u64 *values; 5203 5204 lockdep_assert_held(&ctx->mutex); 5205 5206 values = kzalloc(event->read_size, GFP_KERNEL); 5207 if (!values) 5208 return -ENOMEM; 5209 5210 values[0] = 1 + leader->nr_siblings; 5211 5212 /* 5213 * By locking the child_mutex of the leader we effectively 5214 * lock the child list of all siblings.. XXX explain how. 5215 */ 5216 mutex_lock(&leader->child_mutex); 5217 5218 ret = __perf_read_group_add(leader, read_format, values); 5219 if (ret) 5220 goto unlock; 5221 5222 list_for_each_entry(child, &leader->child_list, child_list) { 5223 ret = __perf_read_group_add(child, read_format, values); 5224 if (ret) 5225 goto unlock; 5226 } 5227 5228 mutex_unlock(&leader->child_mutex); 5229 5230 ret = event->read_size; 5231 if (copy_to_user(buf, values, event->read_size)) 5232 ret = -EFAULT; 5233 goto out; 5234 5235 unlock: 5236 mutex_unlock(&leader->child_mutex); 5237 out: 5238 kfree(values); 5239 return ret; 5240 } 5241 5242 static int perf_read_one(struct perf_event *event, 5243 u64 read_format, char __user *buf) 5244 { 5245 u64 enabled, running; 5246 u64 values[4]; 5247 int n = 0; 5248 5249 values[n++] = __perf_event_read_value(event, &enabled, &running); 5250 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 5251 values[n++] = enabled; 5252 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 5253 values[n++] = running; 5254 if (read_format & PERF_FORMAT_ID) 5255 values[n++] = primary_event_id(event); 5256 5257 if (copy_to_user(buf, values, n * sizeof(u64))) 5258 return -EFAULT; 5259 5260 return n * sizeof(u64); 5261 } 5262 5263 static bool is_event_hup(struct perf_event *event) 5264 { 5265 bool no_children; 5266 5267 if (event->state > PERF_EVENT_STATE_EXIT) 5268 return false; 5269 5270 mutex_lock(&event->child_mutex); 5271 no_children = list_empty(&event->child_list); 5272 mutex_unlock(&event->child_mutex); 5273 return no_children; 5274 } 5275 5276 /* 5277 * Read the performance event - simple non blocking version for now 5278 */ 5279 static ssize_t 5280 __perf_read(struct perf_event *event, char __user *buf, size_t count) 5281 { 5282 u64 read_format = event->attr.read_format; 5283 int ret; 5284 5285 /* 5286 * Return end-of-file for a read on an event that is in 5287 * error state (i.e. because it was pinned but it couldn't be 5288 * scheduled on to the CPU at some point). 
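 *
 * Note that @count must be at least event->read_size; e.g. a group
 * leader with two siblings and read_format = PERF_FORMAT_GROUP |
 * PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED |
 * PERF_FORMAT_TOTAL_TIME_RUNNING needs (1 + 2 + 3 * 2) * sizeof(u64) =
 * 72 bytes, otherwise the read fails with -ENOSPC below.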
5289 */ 5290 if (event->state == PERF_EVENT_STATE_ERROR) 5291 return 0; 5292 5293 if (count < event->read_size) 5294 return -ENOSPC; 5295 5296 WARN_ON_ONCE(event->ctx->parent_ctx); 5297 if (read_format & PERF_FORMAT_GROUP) 5298 ret = perf_read_group(event, read_format, buf); 5299 else 5300 ret = perf_read_one(event, read_format, buf); 5301 5302 return ret; 5303 } 5304 5305 static ssize_t 5306 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) 5307 { 5308 struct perf_event *event = file->private_data; 5309 struct perf_event_context *ctx; 5310 int ret; 5311 5312 ret = security_perf_event_read(event); 5313 if (ret) 5314 return ret; 5315 5316 ctx = perf_event_ctx_lock(event); 5317 ret = __perf_read(event, buf, count); 5318 perf_event_ctx_unlock(event, ctx); 5319 5320 return ret; 5321 } 5322 5323 static __poll_t perf_poll(struct file *file, poll_table *wait) 5324 { 5325 struct perf_event *event = file->private_data; 5326 struct perf_buffer *rb; 5327 __poll_t events = EPOLLHUP; 5328 5329 poll_wait(file, &event->waitq, wait); 5330 5331 if (is_event_hup(event)) 5332 return events; 5333 5334 /* 5335 * Pin the event->rb by taking event->mmap_mutex; otherwise 5336 * perf_event_set_output() can swizzle our rb and make us miss wakeups. 5337 */ 5338 mutex_lock(&event->mmap_mutex); 5339 rb = event->rb; 5340 if (rb) 5341 events = atomic_xchg(&rb->poll, 0); 5342 mutex_unlock(&event->mmap_mutex); 5343 return events; 5344 } 5345 5346 static void _perf_event_reset(struct perf_event *event) 5347 { 5348 (void)perf_event_read(event, false); 5349 local64_set(&event->count, 0); 5350 perf_event_update_userpage(event); 5351 } 5352 5353 /* Assume it's not an event with inherit set. */ 5354 u64 perf_event_pause(struct perf_event *event, bool reset) 5355 { 5356 struct perf_event_context *ctx; 5357 u64 count; 5358 5359 ctx = perf_event_ctx_lock(event); 5360 WARN_ON_ONCE(event->attr.inherit); 5361 _perf_event_disable(event); 5362 count = local64_read(&event->count); 5363 if (reset) 5364 local64_set(&event->count, 0); 5365 perf_event_ctx_unlock(event, ctx); 5366 5367 return count; 5368 } 5369 EXPORT_SYMBOL_GPL(perf_event_pause); 5370 5371 /* 5372 * Holding the top-level event's child_mutex means that any 5373 * descendant process that has inherited this event will block 5374 * in perf_event_exit_event() if it goes to exit, thus satisfying the 5375 * task existence requirements of perf_event_enable/disable. 
5376 */ 5377 static void perf_event_for_each_child(struct perf_event *event, 5378 void (*func)(struct perf_event *)) 5379 { 5380 struct perf_event *child; 5381 5382 WARN_ON_ONCE(event->ctx->parent_ctx); 5383 5384 mutex_lock(&event->child_mutex); 5385 func(event); 5386 list_for_each_entry(child, &event->child_list, child_list) 5387 func(child); 5388 mutex_unlock(&event->child_mutex); 5389 } 5390 5391 static void perf_event_for_each(struct perf_event *event, 5392 void (*func)(struct perf_event *)) 5393 { 5394 struct perf_event_context *ctx = event->ctx; 5395 struct perf_event *sibling; 5396 5397 lockdep_assert_held(&ctx->mutex); 5398 5399 event = event->group_leader; 5400 5401 perf_event_for_each_child(event, func); 5402 for_each_sibling_event(sibling, event) 5403 perf_event_for_each_child(sibling, func); 5404 } 5405 5406 static void __perf_event_period(struct perf_event *event, 5407 struct perf_cpu_context *cpuctx, 5408 struct perf_event_context *ctx, 5409 void *info) 5410 { 5411 u64 value = *((u64 *)info); 5412 bool active; 5413 5414 if (event->attr.freq) { 5415 event->attr.sample_freq = value; 5416 } else { 5417 event->attr.sample_period = value; 5418 event->hw.sample_period = value; 5419 } 5420 5421 active = (event->state == PERF_EVENT_STATE_ACTIVE); 5422 if (active) { 5423 perf_pmu_disable(ctx->pmu); 5424 /* 5425 * We could be throttled; unthrottle now to avoid the tick 5426 * trying to unthrottle while we already re-started the event. 5427 */ 5428 if (event->hw.interrupts == MAX_INTERRUPTS) { 5429 event->hw.interrupts = 0; 5430 perf_log_throttle(event, 1); 5431 } 5432 event->pmu->stop(event, PERF_EF_UPDATE); 5433 } 5434 5435 local64_set(&event->hw.period_left, 0); 5436 5437 if (active) { 5438 event->pmu->start(event, PERF_EF_RELOAD); 5439 perf_pmu_enable(ctx->pmu); 5440 } 5441 } 5442 5443 static int perf_event_check_period(struct perf_event *event, u64 value) 5444 { 5445 return event->pmu->check_period(event, value); 5446 } 5447 5448 static int _perf_event_period(struct perf_event *event, u64 value) 5449 { 5450 if (!is_sampling_event(event)) 5451 return -EINVAL; 5452 5453 if (!value) 5454 return -EINVAL; 5455 5456 if (event->attr.freq && value > sysctl_perf_event_sample_rate) 5457 return -EINVAL; 5458 5459 if (perf_event_check_period(event, value)) 5460 return -EINVAL; 5461 5462 if (!event->attr.freq && (value & (1ULL << 63))) 5463 return -EINVAL; 5464 5465 event_function_call(event, __perf_event_period, &value); 5466 5467 return 0; 5468 } 5469 5470 int perf_event_period(struct perf_event *event, u64 value) 5471 { 5472 struct perf_event_context *ctx; 5473 int ret; 5474 5475 ctx = perf_event_ctx_lock(event); 5476 ret = _perf_event_period(event, value); 5477 perf_event_ctx_unlock(event, ctx); 5478 5479 return ret; 5480 } 5481 EXPORT_SYMBOL_GPL(perf_event_period); 5482 5483 static const struct file_operations perf_fops; 5484 5485 static inline int perf_fget_light(int fd, struct fd *p) 5486 { 5487 struct fd f = fdget(fd); 5488 if (!f.file) 5489 return -EBADF; 5490 5491 if (f.file->f_op != &perf_fops) { 5492 fdput(f); 5493 return -EBADF; 5494 } 5495 *p = f; 5496 return 0; 5497 } 5498 5499 static int perf_event_set_output(struct perf_event *event, 5500 struct perf_event *output_event); 5501 static int perf_event_set_filter(struct perf_event *event, void __user *arg); 5502 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); 5503 static int perf_copy_attr(struct perf_event_attr __user *uattr, 5504 struct perf_event_attr *attr); 5505 5506 static long _perf_ioctl(struct 
perf_event *event, unsigned int cmd, unsigned long arg) 5507 { 5508 void (*func)(struct perf_event *); 5509 u32 flags = arg; 5510 5511 switch (cmd) { 5512 case PERF_EVENT_IOC_ENABLE: 5513 func = _perf_event_enable; 5514 break; 5515 case PERF_EVENT_IOC_DISABLE: 5516 func = _perf_event_disable; 5517 break; 5518 case PERF_EVENT_IOC_RESET: 5519 func = _perf_event_reset; 5520 break; 5521 5522 case PERF_EVENT_IOC_REFRESH: 5523 return _perf_event_refresh(event, arg); 5524 5525 case PERF_EVENT_IOC_PERIOD: 5526 { 5527 u64 value; 5528 5529 if (copy_from_user(&value, (u64 __user *)arg, sizeof(value))) 5530 return -EFAULT; 5531 5532 return _perf_event_period(event, value); 5533 } 5534 case PERF_EVENT_IOC_ID: 5535 { 5536 u64 id = primary_event_id(event); 5537 5538 if (copy_to_user((void __user *)arg, &id, sizeof(id))) 5539 return -EFAULT; 5540 return 0; 5541 } 5542 5543 case PERF_EVENT_IOC_SET_OUTPUT: 5544 { 5545 int ret; 5546 if (arg != -1) { 5547 struct perf_event *output_event; 5548 struct fd output; 5549 ret = perf_fget_light(arg, &output); 5550 if (ret) 5551 return ret; 5552 output_event = output.file->private_data; 5553 ret = perf_event_set_output(event, output_event); 5554 fdput(output); 5555 } else { 5556 ret = perf_event_set_output(event, NULL); 5557 } 5558 return ret; 5559 } 5560 5561 case PERF_EVENT_IOC_SET_FILTER: 5562 return perf_event_set_filter(event, (void __user *)arg); 5563 5564 case PERF_EVENT_IOC_SET_BPF: 5565 return perf_event_set_bpf_prog(event, arg); 5566 5567 case PERF_EVENT_IOC_PAUSE_OUTPUT: { 5568 struct perf_buffer *rb; 5569 5570 rcu_read_lock(); 5571 rb = rcu_dereference(event->rb); 5572 if (!rb || !rb->nr_pages) { 5573 rcu_read_unlock(); 5574 return -EINVAL; 5575 } 5576 rb_toggle_paused(rb, !!arg); 5577 rcu_read_unlock(); 5578 return 0; 5579 } 5580 5581 case PERF_EVENT_IOC_QUERY_BPF: 5582 return perf_event_query_prog_array(event, (void __user *)arg); 5583 5584 case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: { 5585 struct perf_event_attr new_attr; 5586 int err = perf_copy_attr((struct perf_event_attr __user *)arg, 5587 &new_attr); 5588 5589 if (err) 5590 return err; 5591 5592 return perf_event_modify_attr(event, &new_attr); 5593 } 5594 default: 5595 return -ENOTTY; 5596 } 5597 5598 if (flags & PERF_IOC_FLAG_GROUP) 5599 perf_event_for_each(event, func); 5600 else 5601 perf_event_for_each_child(event, func); 5602 5603 return 0; 5604 } 5605 5606 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 5607 { 5608 struct perf_event *event = file->private_data; 5609 struct perf_event_context *ctx; 5610 long ret; 5611 5612 /* Treat ioctl like writes as it is likely a mutating operation. 
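 *
 * A typical userspace sequence that goes through here (illustrative only):
 *
 *	ioctl(fd, PERF_EVENT_IOC_RESET,   PERF_IOC_FLAG_GROUP);
 *	ioctl(fd, PERF_EVENT_IOC_ENABLE,  PERF_IOC_FLAG_GROUP);
 *	// ... run the workload ...
 *	ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);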
*/ 5613 ret = security_perf_event_write(event); 5614 if (ret) 5615 return ret; 5616 5617 ctx = perf_event_ctx_lock(event); 5618 ret = _perf_ioctl(event, cmd, arg); 5619 perf_event_ctx_unlock(event, ctx); 5620 5621 return ret; 5622 } 5623 5624 #ifdef CONFIG_COMPAT 5625 static long perf_compat_ioctl(struct file *file, unsigned int cmd, 5626 unsigned long arg) 5627 { 5628 switch (_IOC_NR(cmd)) { 5629 case _IOC_NR(PERF_EVENT_IOC_SET_FILTER): 5630 case _IOC_NR(PERF_EVENT_IOC_ID): 5631 case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF): 5632 case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES): 5633 /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */ 5634 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) { 5635 cmd &= ~IOCSIZE_MASK; 5636 cmd |= sizeof(void *) << IOCSIZE_SHIFT; 5637 } 5638 break; 5639 } 5640 return perf_ioctl(file, cmd, arg); 5641 } 5642 #else 5643 # define perf_compat_ioctl NULL 5644 #endif 5645 5646 int perf_event_task_enable(void) 5647 { 5648 struct perf_event_context *ctx; 5649 struct perf_event *event; 5650 5651 mutex_lock(¤t->perf_event_mutex); 5652 list_for_each_entry(event, ¤t->perf_event_list, owner_entry) { 5653 ctx = perf_event_ctx_lock(event); 5654 perf_event_for_each_child(event, _perf_event_enable); 5655 perf_event_ctx_unlock(event, ctx); 5656 } 5657 mutex_unlock(¤t->perf_event_mutex); 5658 5659 return 0; 5660 } 5661 5662 int perf_event_task_disable(void) 5663 { 5664 struct perf_event_context *ctx; 5665 struct perf_event *event; 5666 5667 mutex_lock(¤t->perf_event_mutex); 5668 list_for_each_entry(event, ¤t->perf_event_list, owner_entry) { 5669 ctx = perf_event_ctx_lock(event); 5670 perf_event_for_each_child(event, _perf_event_disable); 5671 perf_event_ctx_unlock(event, ctx); 5672 } 5673 mutex_unlock(¤t->perf_event_mutex); 5674 5675 return 0; 5676 } 5677 5678 static int perf_event_index(struct perf_event *event) 5679 { 5680 if (event->hw.state & PERF_HES_STOPPED) 5681 return 0; 5682 5683 if (event->state != PERF_EVENT_STATE_ACTIVE) 5684 return 0; 5685 5686 return event->pmu->event_idx(event); 5687 } 5688 5689 static void calc_timer_values(struct perf_event *event, 5690 u64 *now, 5691 u64 *enabled, 5692 u64 *running) 5693 { 5694 u64 ctx_time; 5695 5696 *now = perf_clock(); 5697 ctx_time = event->shadow_ctx_time + *now; 5698 __perf_update_times(event, ctx_time, enabled, running); 5699 } 5700 5701 static void perf_event_init_userpage(struct perf_event *event) 5702 { 5703 struct perf_event_mmap_page *userpg; 5704 struct perf_buffer *rb; 5705 5706 rcu_read_lock(); 5707 rb = rcu_dereference(event->rb); 5708 if (!rb) 5709 goto unlock; 5710 5711 userpg = rb->user_page; 5712 5713 /* Allow new userspace to detect that bit 0 is deprecated */ 5714 userpg->cap_bit0_is_deprecated = 1; 5715 userpg->size = offsetof(struct perf_event_mmap_page, __reserved); 5716 userpg->data_offset = PAGE_SIZE; 5717 userpg->data_size = perf_data_size(rb); 5718 5719 unlock: 5720 rcu_read_unlock(); 5721 } 5722 5723 void __weak arch_perf_update_userpage( 5724 struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) 5725 { 5726 } 5727 5728 /* 5729 * Callers need to ensure there can be no nesting of this function, otherwise 5730 * the seqlock logic goes bad. We can not serialize this because the arch 5731 * code calls this from NMI context. 
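 *
 * The userspace side pairs with the lock/seq dance below via the usual
 * retry loop on the mmap()ed page (illustrative sketch, "pc" being the
 * mapped struct perf_event_mmap_page):
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *		idx     = pc->index;
 *		offset  = pc->offset;
 *		enabled = pc->time_enabled;
 *		running = pc->time_running;
 *		barrier();
 *	} while (pc->lock != seq);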
5732 */ 5733 void perf_event_update_userpage(struct perf_event *event) 5734 { 5735 struct perf_event_mmap_page *userpg; 5736 struct perf_buffer *rb; 5737 u64 enabled, running, now; 5738 5739 rcu_read_lock(); 5740 rb = rcu_dereference(event->rb); 5741 if (!rb) 5742 goto unlock; 5743 5744 /* 5745 * compute total_time_enabled, total_time_running 5746 * based on snapshot values taken when the event 5747 * was last scheduled in. 5748 * 5749 * we cannot simply called update_context_time() 5750 * because of locking issue as we can be called in 5751 * NMI context 5752 */ 5753 calc_timer_values(event, &now, &enabled, &running); 5754 5755 userpg = rb->user_page; 5756 /* 5757 * Disable preemption to guarantee consistent time stamps are stored to 5758 * the user page. 5759 */ 5760 preempt_disable(); 5761 ++userpg->lock; 5762 barrier(); 5763 userpg->index = perf_event_index(event); 5764 userpg->offset = perf_event_count(event); 5765 if (userpg->index) 5766 userpg->offset -= local64_read(&event->hw.prev_count); 5767 5768 userpg->time_enabled = enabled + 5769 atomic64_read(&event->child_total_time_enabled); 5770 5771 userpg->time_running = running + 5772 atomic64_read(&event->child_total_time_running); 5773 5774 arch_perf_update_userpage(event, userpg, now); 5775 5776 barrier(); 5777 ++userpg->lock; 5778 preempt_enable(); 5779 unlock: 5780 rcu_read_unlock(); 5781 } 5782 EXPORT_SYMBOL_GPL(perf_event_update_userpage); 5783 5784 static vm_fault_t perf_mmap_fault(struct vm_fault *vmf) 5785 { 5786 struct perf_event *event = vmf->vma->vm_file->private_data; 5787 struct perf_buffer *rb; 5788 vm_fault_t ret = VM_FAULT_SIGBUS; 5789 5790 if (vmf->flags & FAULT_FLAG_MKWRITE) { 5791 if (vmf->pgoff == 0) 5792 ret = 0; 5793 return ret; 5794 } 5795 5796 rcu_read_lock(); 5797 rb = rcu_dereference(event->rb); 5798 if (!rb) 5799 goto unlock; 5800 5801 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) 5802 goto unlock; 5803 5804 vmf->page = perf_mmap_to_page(rb, vmf->pgoff); 5805 if (!vmf->page) 5806 goto unlock; 5807 5808 get_page(vmf->page); 5809 vmf->page->mapping = vmf->vma->vm_file->f_mapping; 5810 vmf->page->index = vmf->pgoff; 5811 5812 ret = 0; 5813 unlock: 5814 rcu_read_unlock(); 5815 5816 return ret; 5817 } 5818 5819 static void ring_buffer_attach(struct perf_event *event, 5820 struct perf_buffer *rb) 5821 { 5822 struct perf_buffer *old_rb = NULL; 5823 unsigned long flags; 5824 5825 if (event->rb) { 5826 /* 5827 * Should be impossible, we set this when removing 5828 * event->rb_entry and wait/clear when adding event->rb_entry. 5829 */ 5830 WARN_ON_ONCE(event->rcu_pending); 5831 5832 old_rb = event->rb; 5833 spin_lock_irqsave(&old_rb->event_lock, flags); 5834 list_del_rcu(&event->rb_entry); 5835 spin_unlock_irqrestore(&old_rb->event_lock, flags); 5836 5837 event->rcu_batches = get_state_synchronize_rcu(); 5838 event->rcu_pending = 1; 5839 } 5840 5841 if (rb) { 5842 if (event->rcu_pending) { 5843 cond_synchronize_rcu(event->rcu_batches); 5844 event->rcu_pending = 0; 5845 } 5846 5847 spin_lock_irqsave(&rb->event_lock, flags); 5848 list_add_rcu(&event->rb_entry, &rb->event_list); 5849 spin_unlock_irqrestore(&rb->event_lock, flags); 5850 } 5851 5852 /* 5853 * Avoid racing with perf_mmap_close(AUX): stop the event 5854 * before swizzling the event::rb pointer; if it's getting 5855 * unmapped, its aux_mmap_count will be 0 and it won't 5856 * restart. See the comment in __perf_pmu_output_stop(). 
5857 * 5858 * Data will inevitably be lost when set_output is done in 5859 * mid-air, but then again, whoever does it like this is 5860 * not in for the data anyway. 5861 */ 5862 if (has_aux(event)) 5863 perf_event_stop(event, 0); 5864 5865 rcu_assign_pointer(event->rb, rb); 5866 5867 if (old_rb) { 5868 ring_buffer_put(old_rb); 5869 /* 5870 * Since we detached before setting the new rb, so that we 5871 * could attach the new rb, we could have missed a wakeup. 5872 * Provide it now. 5873 */ 5874 wake_up_all(&event->waitq); 5875 } 5876 } 5877 5878 static void ring_buffer_wakeup(struct perf_event *event) 5879 { 5880 struct perf_buffer *rb; 5881 5882 rcu_read_lock(); 5883 rb = rcu_dereference(event->rb); 5884 if (rb) { 5885 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) 5886 wake_up_all(&event->waitq); 5887 } 5888 rcu_read_unlock(); 5889 } 5890 5891 struct perf_buffer *ring_buffer_get(struct perf_event *event) 5892 { 5893 struct perf_buffer *rb; 5894 5895 rcu_read_lock(); 5896 rb = rcu_dereference(event->rb); 5897 if (rb) { 5898 if (!refcount_inc_not_zero(&rb->refcount)) 5899 rb = NULL; 5900 } 5901 rcu_read_unlock(); 5902 5903 return rb; 5904 } 5905 5906 void ring_buffer_put(struct perf_buffer *rb) 5907 { 5908 if (!refcount_dec_and_test(&rb->refcount)) 5909 return; 5910 5911 WARN_ON_ONCE(!list_empty(&rb->event_list)); 5912 5913 call_rcu(&rb->rcu_head, rb_free_rcu); 5914 } 5915 5916 static void perf_mmap_open(struct vm_area_struct *vma) 5917 { 5918 struct perf_event *event = vma->vm_file->private_data; 5919 5920 atomic_inc(&event->mmap_count); 5921 atomic_inc(&event->rb->mmap_count); 5922 5923 if (vma->vm_pgoff) 5924 atomic_inc(&event->rb->aux_mmap_count); 5925 5926 if (event->pmu->event_mapped) 5927 event->pmu->event_mapped(event, vma->vm_mm); 5928 } 5929 5930 static void perf_pmu_output_stop(struct perf_event *event); 5931 5932 /* 5933 * A buffer can be mmap()ed multiple times; either directly through the same 5934 * event, or through other events by use of perf_event_set_output(). 5935 * 5936 * In order to undo the VM accounting done by perf_mmap() we need to destroy 5937 * the buffer here, where we still have a VM context. This means we need 5938 * to detach all events redirecting to us. 5939 */ 5940 static void perf_mmap_close(struct vm_area_struct *vma) 5941 { 5942 struct perf_event *event = vma->vm_file->private_data; 5943 struct perf_buffer *rb = ring_buffer_get(event); 5944 struct user_struct *mmap_user = rb->mmap_user; 5945 int mmap_locked = rb->mmap_locked; 5946 unsigned long size = perf_data_size(rb); 5947 bool detach_rest = false; 5948 5949 if (event->pmu->event_unmapped) 5950 event->pmu->event_unmapped(event, vma->vm_mm); 5951 5952 /* 5953 * rb->aux_mmap_count will always drop before rb->mmap_count and 5954 * event->mmap_count, so it is ok to use event->mmap_mutex to 5955 * serialize with perf_mmap here. 5956 */ 5957 if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && 5958 atomic_dec_and_mu