TOMOYO Linux Cross Reference
Linux/kernel/sched/fair.c

  1 // SPDX-License-Identifier: GPL-2.0
  2 /*
  3  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
  4  *
  5  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  6  *
  7  *  Interactivity improvements by Mike Galbraith
  8  *  (C) 2007 Mike Galbraith <efault@gmx.de>
  9  *
 10  *  Various enhancements by Dmitry Adamushko.
 11  *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
 12  *
 13  *  Group scheduling enhancements by Srivatsa Vaddagiri
 14  *  Copyright IBM Corporation, 2007
 15  *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
 16  *
 17  *  Scaled math optimizations by Thomas Gleixner
 18  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
 19  *
 20  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
 21  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
 22  */
 23 #include "sched.h"
 24 
 25 #include <trace/events/sched.h>
 26 
 27 /*
 28  * Targeted preemption latency for CPU-bound tasks:
 29  *
 30  * NOTE: this latency value is not the same as the concept of
 31  * 'timeslice length' - timeslices in CFS are of variable length
 32  * and there is no persistent notion of a timeslice as there is in
 33  * traditional, time-slice based scheduling.
 34  *
 35  * (to see the precise effective timeslice length of your workload,
 36  *  run vmstat and monitor the context-switches (cs) field)
 37  *
 38  * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 39  */
 40 unsigned int sysctl_sched_latency                       = 6000000ULL;
 41 static unsigned int normalized_sysctl_sched_latency     = 6000000ULL;
 42 
 43 /*
 44  * The initial- and re-scaling of tunables is configurable
 45  *
 46  * Options are:
 47  *
 48  *   SCHED_TUNABLESCALING_NONE - unscaled, always *1
 49  *   SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
 50  *   SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
 51  *
 52  * (default: SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
 53  */
 54 enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
 55 
 56 /*
 57  * Minimal preemption granularity for CPU-bound tasks:
 58  *
 59  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 60  */
 61 unsigned int sysctl_sched_min_granularity                       = 750000ULL;
 62 static unsigned int normalized_sysctl_sched_min_granularity     = 750000ULL;
 63 
 64 /*
 65  * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
 66  */
 67 static unsigned int sched_nr_latency = 8;
 68 
 69 /*
 70  * After fork, child runs first. If set to 0 (default) then
 71  * parent will (try to) run first.
 72  */
 73 unsigned int sysctl_sched_child_runs_first __read_mostly;
 74 
 75 /*
 76  * SCHED_OTHER wake-up granularity.
 77  *
 78  * This option delays the preemption effects of decoupled workloads
 79  * and reduces their over-scheduling. Synchronous workloads will still
 80  * have immediate wakeup/sleep latencies.
 81  *
 82  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 83  */
 84 unsigned int sysctl_sched_wakeup_granularity                    = 1000000UL;
 85 static unsigned int normalized_sysctl_sched_wakeup_granularity  = 1000000UL;
 86 
 87 const_debug unsigned int sysctl_sched_migration_cost    = 500000UL;
 88 
 89 #ifdef CONFIG_SMP
 90 /*
 91  * For asym packing, by default the lower numbered CPU has higher priority.
 92  */
 93 int __weak arch_asym_cpu_priority(int cpu)
 94 {
 95         return -cpu;
 96 }
 97 
 98 /*
 99  * The margin used when comparing utilization with CPU capacity.
100  *
101  * (default: ~20%)
102  */
103 #define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
104 
105 #endif
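/*
 * Editor's note: illustrative sketch, not part of the kernel source.
 * fits_capacity() above keeps ~20% headroom: it accepts a utilization
 * only while it stays below roughly 80% of the capacity, since
 * cap * 1280 < max * 1024 is equivalent to cap < max * 0.8.  A plain C
 * rendering of the same test (fits_capacity_sketch is a hypothetical name):
 */
static inline int fits_capacity_sketch(unsigned long cap, unsigned long max)
{
        return cap * 1280 < max * 1024;         /* i.e. cap < 0.8 * max */
}
/* e.g. fits_capacity_sketch(819, 1024) == 1 but fits_capacity_sketch(820, 1024) == 0 */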
106 
107 #ifdef CONFIG_CFS_BANDWIDTH
108 /*
109  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
110  * each time a cfs_rq requests quota.
111  *
112  * Note: in the case that the slice exceeds the runtime remaining (either due
113  * to consumption or the quota being specified to be smaller than the slice)
 114  * we only ever issue the remaining available time.
115  *
116  * (default: 5 msec, units: microseconds)
117  */
118 unsigned int sysctl_sched_cfs_bandwidth_slice           = 5000UL;
119 #endif
120 
121 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
122 {
123         lw->weight += inc;
124         lw->inv_weight = 0;
125 }
126 
127 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
128 {
129         lw->weight -= dec;
130         lw->inv_weight = 0;
131 }
132 
133 static inline void update_load_set(struct load_weight *lw, unsigned long w)
134 {
135         lw->weight = w;
136         lw->inv_weight = 0;
137 }
138 
139 /*
140  * Increase the granularity value when there are more CPUs,
141  * because with more CPUs the 'effective latency' as visible
142  * to users decreases. But the relationship is not linear,
143  * so pick a second-best guess by going with the log2 of the
144  * number of CPUs.
145  *
146  * This idea comes from the SD scheduler of Con Kolivas:
147  */
148 static unsigned int get_update_sysctl_factor(void)
149 {
150         unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
151         unsigned int factor;
152 
153         switch (sysctl_sched_tunable_scaling) {
154         case SCHED_TUNABLESCALING_NONE:
155                 factor = 1;
156                 break;
157         case SCHED_TUNABLESCALING_LINEAR:
158                 factor = cpus;
159                 break;
160         case SCHED_TUNABLESCALING_LOG:
161         default:
162                 factor = 1 + ilog2(cpus);
163                 break;
164         }
165 
166         return factor;
167 }
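/*
 * Editor's note: illustrative sketch, not part of the kernel source.
 * With the default SCHED_TUNABLESCALING_LOG policy the factor computed
 * above is 1 + ilog2(min(ncpus, 8)), so on a machine with 8 or more
 * online CPUs the 6 ms normalized latency becomes 24 ms, the 0.75 ms
 * minimum granularity becomes 3 ms and the 1 ms wakeup granularity
 * becomes 4 ms.  Stand-alone sketch (ilog2_sketch and
 * scaling_factor_sketch are hypothetical helpers):
 */
static unsigned int ilog2_sketch(unsigned int x)
{
        unsigned int r = 0;

        while (x >>= 1)                 /* floor(log2(x)) for x >= 1 */
                r++;
        return r;
}

static unsigned int scaling_factor_sketch(unsigned int online_cpus)
{
        unsigned int cpus = online_cpus < 8 ? online_cpus : 8; /* capped at 8 */

        return 1 + ilog2_sketch(cpus);  /* SCHED_TUNABLESCALING_LOG */
}
/* scaling_factor_sketch(8) == 4, so sysctl_sched_latency = 4 * 6000000ULL = 24 ms */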
168 
169 static void update_sysctl(void)
170 {
171         unsigned int factor = get_update_sysctl_factor();
172 
173 #define SET_SYSCTL(name) \
174         (sysctl_##name = (factor) * normalized_sysctl_##name)
175         SET_SYSCTL(sched_min_granularity);
176         SET_SYSCTL(sched_latency);
177         SET_SYSCTL(sched_wakeup_granularity);
178 #undef SET_SYSCTL
179 }
180 
181 void sched_init_granularity(void)
182 {
183         update_sysctl();
184 }
185 
186 #define WMULT_CONST     (~0U)
187 #define WMULT_SHIFT     32
188 
189 static void __update_inv_weight(struct load_weight *lw)
190 {
191         unsigned long w;
192 
193         if (likely(lw->inv_weight))
194                 return;
195 
196         w = scale_load_down(lw->weight);
197 
198         if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
199                 lw->inv_weight = 1;
200         else if (unlikely(!w))
201                 lw->inv_weight = WMULT_CONST;
202         else
203                 lw->inv_weight = WMULT_CONST / w;
204 }
205 
206 /*
207  * delta_exec * weight / lw.weight
208  *   OR
209  * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
210  *
211  * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
212  * we're guaranteed shift stays positive because inv_weight is guaranteed to
213  * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
214  *
 215  * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
216  * weight/lw.weight <= 1, and therefore our shift will also be positive.
217  */
218 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
219 {
220         u64 fact = scale_load_down(weight);
221         int shift = WMULT_SHIFT;
222 
223         __update_inv_weight(lw);
224 
225         if (unlikely(fact >> 32)) {
226                 while (fact >> 32) {
227                         fact >>= 1;
228                         shift--;
229                 }
230         }
231 
232         fact = mul_u32_u32(fact, lw->inv_weight);
233 
234         while (fact >> 32) {
235                 fact >>= 1;
236                 shift--;
237         }
238 
239         return mul_u64_u32_shr(delta_exec, fact, shift);
240 }
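/*
 * Editor's note: illustrative sketch, not part of the kernel source.
 * __calc_delta() above evaluates delta_exec * weight / lw->weight
 * without a 64-bit division: lw->inv_weight caches roughly
 * 2^32 / lw->weight, so the quotient becomes a multiply followed by a
 * right shift of WMULT_SHIFT.  A simplified stand-alone version
 * (calc_delta_sketch is a hypothetical name; it ignores the overflow
 * handling that mul_u64_u32_shr() provides in the kernel):
 */
static unsigned long long calc_delta_sketch(unsigned long long delta_exec,
                                            unsigned int weight,
                                            unsigned int rq_weight)
{
        unsigned long long inv = 0xffffffffULL / rq_weight;     /* WMULT_CONST / w */
        unsigned long long fact = (unsigned long long)weight * inv;
        int shift = 32;                                         /* WMULT_SHIFT */

        while (fact >> 32) {            /* squeeze fact back into 32 bits */
                fact >>= 1;
                shift--;
        }
        return (delta_exec * fact) >> shift;
}
/*
 * calc_delta_sketch(6000000, 1024, 2048) == 2999998: a nice-0 entity on a
 * runqueue of weight 2048 gets just under half of a 6 ms slice, the small
 * error coming from the rounded-down reciprocal.
 */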
241 
242 
243 const struct sched_class fair_sched_class;
244 
245 /**************************************************************
246  * CFS operations on generic schedulable entities:
247  */
248 
249 #ifdef CONFIG_FAIR_GROUP_SCHED
250 static inline struct task_struct *task_of(struct sched_entity *se)
251 {
252         SCHED_WARN_ON(!entity_is_task(se));
253         return container_of(se, struct task_struct, se);
254 }
255 
256 /* Walk up scheduling entities hierarchy */
257 #define for_each_sched_entity(se) \
258                 for (; se; se = se->parent)
259 
260 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
261 {
262         return p->se.cfs_rq;
263 }
264 
265 /* runqueue on which this entity is (to be) queued */
266 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
267 {
268         return se->cfs_rq;
269 }
270 
271 /* runqueue "owned" by this group */
272 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
273 {
274         return grp->my_q;
275 }
276 
277 static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
278 {
279         if (!path)
280                 return;
281 
282         if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
283                 autogroup_path(cfs_rq->tg, path, len);
284         else if (cfs_rq && cfs_rq->tg->css.cgroup)
285                 cgroup_path(cfs_rq->tg->css.cgroup, path, len);
286         else
287                 strlcpy(path, "(null)", len);
288 }
289 
290 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
291 {
292         struct rq *rq = rq_of(cfs_rq);
293         int cpu = cpu_of(rq);
294 
295         if (cfs_rq->on_list)
296                 return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
297 
298         cfs_rq->on_list = 1;
299 
300         /*
301          * Ensure we either appear before our parent (if already
302          * enqueued) or force our parent to appear after us when it is
303          * enqueued. The fact that we always enqueue bottom-up
304          * reduces this to two cases and a special case for the root
305          * cfs_rq. Furthermore, it also means that we will always reset
306          * tmp_alone_branch either when the branch is connected
307          * to a tree or when we reach the top of the tree
308          */
309         if (cfs_rq->tg->parent &&
310             cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
311                 /*
312                  * If parent is already on the list, we add the child
313                  * just before. Thanks to circular linked property of
314                  * the list, this means to put the child at the tail
315                  * of the list that starts by parent.
316                  */
317                 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
318                         &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
319                 /*
320                  * The branch is now connected to its tree so we can
321                  * reset tmp_alone_branch to the beginning of the
322                  * list.
323                  */
324                 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
325                 return true;
326         }
327 
328         if (!cfs_rq->tg->parent) {
329                 /*
330                  * cfs rq without parent should be put
331                  * at the tail of the list.
332                  */
333                 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
334                         &rq->leaf_cfs_rq_list);
335                 /*
 336          * We have reached the top of a tree so we can reset
337                  * tmp_alone_branch to the beginning of the list.
338                  */
339                 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
340                 return true;
341         }
342 
343         /*
 344          * The parent has not been added yet, so we want to
 345          * make sure that it will be put after us.
 346          * tmp_alone_branch points to the beginning of the branch
 347          * where we will add the parent.
348          */
349         list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
350         /*
 351          * Update tmp_alone_branch to point to the new beginning
 352          * of the branch.
353          */
354         rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
355         return false;
356 }
357 
358 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
359 {
360         if (cfs_rq->on_list) {
361                 struct rq *rq = rq_of(cfs_rq);
362 
363                 /*
364                  * With cfs_rq being unthrottled/throttled during an enqueue,
 365                  * it can happen that tmp_alone_branch points to a leaf that
 366                  * we eventually want to delete. In this case, tmp_alone_branch moves
367                  * to the prev element but it will point to rq->leaf_cfs_rq_list
368                  * at the end of the enqueue.
369                  */
370                 if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
371                         rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
372 
373                 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
374                 cfs_rq->on_list = 0;
375         }
376 }
377 
378 static inline void assert_list_leaf_cfs_rq(struct rq *rq)
379 {
380         SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
381 }
382 
 383 /* Iterate through all leaf cfs_rq's on a runqueue */
384 #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)                      \
385         list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,    \
386                                  leaf_cfs_rq_list)
387 
388 /* Do the two (enqueued) entities belong to the same group ? */
389 static inline struct cfs_rq *
390 is_same_group(struct sched_entity *se, struct sched_entity *pse)
391 {
392         if (se->cfs_rq == pse->cfs_rq)
393                 return se->cfs_rq;
394 
395         return NULL;
396 }
397 
398 static inline struct sched_entity *parent_entity(struct sched_entity *se)
399 {
400         return se->parent;
401 }
402 
403 static void
404 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
405 {
406         int se_depth, pse_depth;
407 
408         /*
 409          * A preemption test can be made between sibling entities that are in the
 410          * same cfs_rq, i.e. that have a common parent. Walk up the hierarchy of
 411          * both tasks until we find ancestors that are siblings of a common
 412          * parent.
413          */
414 
415         /* First walk up until both entities are at same depth */
416         se_depth = (*se)->depth;
417         pse_depth = (*pse)->depth;
418 
419         while (se_depth > pse_depth) {
420                 se_depth--;
421                 *se = parent_entity(*se);
422         }
423 
424         while (pse_depth > se_depth) {
425                 pse_depth--;
426                 *pse = parent_entity(*pse);
427         }
428 
429         while (!is_same_group(*se, *pse)) {
430                 *se = parent_entity(*se);
431                 *pse = parent_entity(*pse);
432         }
433 }
434 
435 #else   /* !CONFIG_FAIR_GROUP_SCHED */
436 
437 static inline struct task_struct *task_of(struct sched_entity *se)
438 {
439         return container_of(se, struct task_struct, se);
440 }
441 
442 #define for_each_sched_entity(se) \
443                 for (; se; se = NULL)
444 
445 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
446 {
447         return &task_rq(p)->cfs;
448 }
449 
450 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
451 {
452         struct task_struct *p = task_of(se);
453         struct rq *rq = task_rq(p);
454 
455         return &rq->cfs;
456 }
457 
458 /* runqueue "owned" by this group */
459 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
460 {
461         return NULL;
462 }
463 
464 static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
465 {
466         if (path)
467                 strlcpy(path, "(null)", len);
468 }
469 
470 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
471 {
472         return true;
473 }
474 
475 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
476 {
477 }
478 
479 static inline void assert_list_leaf_cfs_rq(struct rq *rq)
480 {
481 }
482 
483 #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)      \
484                 for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
485 
486 static inline struct sched_entity *parent_entity(struct sched_entity *se)
487 {
488         return NULL;
489 }
490 
491 static inline void
492 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
493 {
494 }
495 
496 #endif  /* CONFIG_FAIR_GROUP_SCHED */
497 
498 static __always_inline
499 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
500 
501 /**************************************************************
502  * Scheduling class tree data structure manipulation methods:
503  */
504 
505 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
506 {
507         s64 delta = (s64)(vruntime - max_vruntime);
508         if (delta > 0)
509                 max_vruntime = vruntime;
510 
511         return max_vruntime;
512 }
513 
514 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
515 {
516         s64 delta = (s64)(vruntime - min_vruntime);
517         if (delta < 0)
518                 min_vruntime = vruntime;
519 
520         return min_vruntime;
521 }
522 
523 static inline int entity_before(struct sched_entity *a,
524                                 struct sched_entity *b)
525 {
526         return (s64)(a->vruntime - b->vruntime) < 0;
527 }
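/*
 * Editor's note: illustrative sketch, not part of the kernel source.
 * The (s64) casts above make the vruntime ordering safe across u64
 * wrap-around, because only the sign of the difference is inspected
 * (entity_before_sketch is a hypothetical stand-alone rendering):
 */
static int entity_before_sketch(unsigned long long a_vruntime,
                                unsigned long long b_vruntime)
{
        return (long long)(a_vruntime - b_vruntime) < 0;
}
/*
 * entity_before_sketch(0xfffffffffffffff0ULL, 0x10ULL) == 1: after the
 * counter wraps, the numerically huge value is still ordered just before
 * the small one, so the rbtree ordering stays correct.
 */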
528 
529 static void update_min_vruntime(struct cfs_rq *cfs_rq)
530 {
531         struct sched_entity *curr = cfs_rq->curr;
532         struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
533 
534         u64 vruntime = cfs_rq->min_vruntime;
535 
536         if (curr) {
537                 if (curr->on_rq)
538                         vruntime = curr->vruntime;
539                 else
540                         curr = NULL;
541         }
542 
543         if (leftmost) { /* non-empty tree */
544                 struct sched_entity *se;
545                 se = rb_entry(leftmost, struct sched_entity, run_node);
546 
547                 if (!curr)
548                         vruntime = se->vruntime;
549                 else
550                         vruntime = min_vruntime(vruntime, se->vruntime);
551         }
552 
553         /* ensure we never gain time by being placed backwards. */
554         cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
555 #ifndef CONFIG_64BIT
556         smp_wmb();
557         cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
558 #endif
559 }
560 
561 /*
562  * Enqueue an entity into the rb-tree:
563  */
564 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
565 {
566         struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
567         struct rb_node *parent = NULL;
568         struct sched_entity *entry;
569         bool leftmost = true;
570 
571         /*
572          * Find the right place in the rbtree:
573          */
574         while (*link) {
575                 parent = *link;
576                 entry = rb_entry(parent, struct sched_entity, run_node);
577                 /*
 578                  * We don't care about collisions. Nodes with
579                  * the same key stay together.
580                  */
581                 if (entity_before(se, entry)) {
582                         link = &parent->rb_left;
583                 } else {
584                         link = &parent->rb_right;
585                         leftmost = false;
586                 }
587         }
588 
589         rb_link_node(&se->run_node, parent, link);
590         rb_insert_color_cached(&se->run_node,
591                                &cfs_rq->tasks_timeline, leftmost);
592 }
593 
594 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
595 {
596         rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
597 }
598 
599 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
600 {
601         struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
602 
603         if (!left)
604                 return NULL;
605 
606         return rb_entry(left, struct sched_entity, run_node);
607 }
608 
609 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
610 {
611         struct rb_node *next = rb_next(&se->run_node);
612 
613         if (!next)
614                 return NULL;
615 
616         return rb_entry(next, struct sched_entity, run_node);
617 }
618 
619 #ifdef CONFIG_SCHED_DEBUG
620 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
621 {
622         struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
623 
624         if (!last)
625                 return NULL;
626 
627         return rb_entry(last, struct sched_entity, run_node);
628 }
629 
630 /**************************************************************
631  * Scheduling class statistics methods:
632  */
633 
634 int sched_proc_update_handler(struct ctl_table *table, int write,
635                 void __user *buffer, size_t *lenp,
636                 loff_t *ppos)
637 {
638         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
639         unsigned int factor = get_update_sysctl_factor();
640 
641         if (ret || !write)
642                 return ret;
643 
644         sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
645                                         sysctl_sched_min_granularity);
646 
647 #define WRT_SYSCTL(name) \
648         (normalized_sysctl_##name = sysctl_##name / (factor))
649         WRT_SYSCTL(sched_min_granularity);
650         WRT_SYSCTL(sched_latency);
651         WRT_SYSCTL(sched_wakeup_granularity);
652 #undef WRT_SYSCTL
653 
654         return 0;
655 }
656 #endif
657 
658 /*
659  * delta /= w
660  */
661 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
662 {
663         if (unlikely(se->load.weight != NICE_0_LOAD))
664                 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
665 
666         return delta;
667 }
668 
669 /*
670  * The idea is to set a period in which each task runs once.
671  *
 672  * When there are too many tasks (more than sched_nr_latency) we have to stretch
673  * this period because otherwise the slices get too small.
674  *
675  * p = (nr <= nl) ? l : l*nr/nl
676  */
677 static u64 __sched_period(unsigned long nr_running)
678 {
679         if (unlikely(nr_running > sched_nr_latency))
680                 return nr_running * sysctl_sched_min_granularity;
681         else
682                 return sysctl_sched_latency;
683 }
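/*
 * Editor's note: illustrative sketch, not part of the kernel source.
 * With the unscaled defaults (latency 6 ms, minimum granularity 0.75 ms,
 * sched_nr_latency 8), the period stays at 6 ms for up to 8 runnable
 * tasks and then grows linearly so no slice drops below the minimum
 * granularity (sched_period_sketch is a hypothetical name):
 */
static unsigned long long sched_period_sketch(unsigned long nr_running)
{
        const unsigned long long latency  = 6000000ULL;  /* 6 ms             */
        const unsigned long long min_gran = 750000ULL;   /* 0.75 ms          */
        const unsigned long nr_latency    = 8;            /* latency/min_gran */

        return nr_running > nr_latency ? nr_running * min_gran : latency;
}
/* sched_period_sketch(4) == 6000000 but sched_period_sketch(12) == 9000000 */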
684 
685 /*
686  * We calculate the wall-time slice from the period by taking a part
687  * proportional to the weight.
688  *
689  * s = p*P[w/rw]
690  */
691 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
692 {
693         u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
694 
695         for_each_sched_entity(se) {
696                 struct load_weight *load;
697                 struct load_weight lw;
698 
699                 cfs_rq = cfs_rq_of(se);
700                 load = &cfs_rq->load;
701 
702                 if (unlikely(!se->on_rq)) {
703                         lw = cfs_rq->load;
704 
705                         update_load_add(&lw, se->load.weight);
706                         load = &lw;
707                 }
708                 slice = __calc_delta(slice, se->load.weight, load);
709         }
710         return slice;
711 }
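/*
 * Editor's note: illustrative sketch, not part of the kernel source.
 * For a flat runqueue (no group scheduling) the loop above reduces to a
 * single weight-proportional share of the period, s = p * w / rw
 * (sched_slice_sketch is a hypothetical name):
 */
static unsigned long long sched_slice_sketch(unsigned long long period,
                                             unsigned long se_weight,
                                             unsigned long rq_weight)
{
        return period * se_weight / rq_weight;
}
/*
 * Two nice-0 tasks of weight 1024 each: sched_slice_sketch(6000000, 1024, 2048)
 * == 3000000, i.e. each gets a 3 ms wall-clock slice per period.
 */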
712 
713 /*
714  * We calculate the vruntime slice of a to-be-inserted task.
715  *
716  * vs = s/w
717  */
718 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
719 {
720         return calc_delta_fair(sched_slice(cfs_rq, se), se);
721 }
722 
723 #include "pelt.h"
724 #ifdef CONFIG_SMP
725 
726 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
727 static unsigned long task_h_load(struct task_struct *p);
728 static unsigned long capacity_of(int cpu);
729 
 730 /* Give a new sched_entity its starting runnable values so its load looks heavy while it is young */
731 void init_entity_runnable_average(struct sched_entity *se)
732 {
733         struct sched_avg *sa = &se->avg;
734 
735         memset(sa, 0, sizeof(*sa));
736 
737         /*
738          * Tasks are initialized with full load to be seen as heavy tasks until
739          * they get a chance to stabilize to their real load level.
740          * Group entities are initialized with zero load to reflect the fact that
741          * nothing has been attached to the task group yet.
742          */
743         if (entity_is_task(se))
744                 sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
745 
746         se->runnable_weight = se->load.weight;
747 
 748         /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
749 }
750 
751 static void attach_entity_cfs_rq(struct sched_entity *se);
752 
753 /*
754  * With new tasks being created, their initial util_avgs are extrapolated
755  * based on the cfs_rq's current util_avg:
756  *
757  *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
758  *
759  * However, in many cases, the above util_avg does not give a desired
760  * value. Moreover, the sum of the util_avgs may be divergent, such
761  * as when the series is a harmonic series.
762  *
763  * To solve this problem, we also cap the util_avg of successive tasks to
 764  * only 1/2 of the remaining utilization budget:
765  *
766  *   util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
767  *
768  * where n denotes the nth task and cpu_scale the CPU capacity.
769  *
 770  * For example, for a CPU with a capacity of 1024, the simplest series from
 771  * the beginning would look like this:
772  *
773  *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
774  * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
775  *
776  * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
777  * if util_avg > util_avg_cap.
778  */
779 void post_init_entity_util_avg(struct task_struct *p)
780 {
781         struct sched_entity *se = &p->se;
782         struct cfs_rq *cfs_rq = cfs_rq_of(se);
783         struct sched_avg *sa = &se->avg;
784         long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
785         long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
786 
787         if (cap > 0) {
788                 if (cfs_rq->avg.util_avg != 0) {
789                         sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
790                         sa->util_avg /= (cfs_rq->avg.load_avg + 1);
791 
792                         if (sa->util_avg > cap)
793                                 sa->util_avg = cap;
794                 } else {
795                         sa->util_avg = cap;
796                 }
797         }
798 
799         if (p->sched_class != &fair_sched_class) {
800                 /*
801                  * For !fair tasks do:
802                  *
803                 update_cfs_rq_load_avg(now, cfs_rq);
804                 attach_entity_load_avg(cfs_rq, se);
805                 switched_from_fair(rq, p);
806                  *
807                  * such that the next switched_to_fair() has the
808                  * expected state.
809                  */
810                 se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
811                 return;
812         }
813 
814         attach_entity_cfs_rq(se);
815 }
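/*
 * Editor's note: illustrative sketch, not part of the kernel source.
 * The cap used above, (cpu_scale - cfs_rq->avg.util_avg) / 2, is what
 * produces the 512, 256, 128, ... series from the comment when tasks
 * are forked one after another on an initially idle CPU of capacity
 * 1024 (post_init_util_series_sketch is a hypothetical name):
 */
static long post_init_util_series_sketch(void)
{
        long cpu_scale = 1024;
        long cfs_util = 0;
        int i;

        for (i = 0; i < 7; i++) {
                long cap = (cpu_scale - cfs_util) / 2;  /* new task's util_avg */

                cfs_util += cap;                        /* cfs_rq absorbs it */
                /* i = 0: cap = 512, i = 1: cap = 256, ..., i = 6: cap = 8 */
        }
        return cfs_util;        /* 1016, matching the cfs_rq series above */
}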
816 
817 #else /* !CONFIG_SMP */
818 void init_entity_runnable_average(struct sched_entity *se)
819 {
820 }
821 void post_init_entity_util_avg(struct task_struct *p)
822 {
823 }
824 static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
825 {
826 }
827 #endif /* CONFIG_SMP */
828 
829 /*
830  * Update the current task's runtime statistics.
831  */
832 static void update_curr(struct cfs_rq *cfs_rq)
833 {
834         struct sched_entity *curr = cfs_rq->curr;
835         u64 now = rq_clock_task(rq_of(cfs_rq));
836         u64 delta_exec;
837 
838         if (unlikely(!curr))
839                 return;
840 
841         delta_exec = now - curr->exec_start;
842         if (unlikely((s64)delta_exec <= 0))
843                 return;
844 
845         curr->exec_start = now;
846 
847         schedstat_set(curr->statistics.exec_max,
848                       max(delta_exec, curr->statistics.exec_max));
849 
850         curr->sum_exec_runtime += delta_exec;
851         schedstat_add(cfs_rq->exec_clock, delta_exec);
852 
853         curr->vruntime += calc_delta_fair(delta_exec, curr);
854         update_min_vruntime(cfs_rq);
855 
856         if (entity_is_task(curr)) {
857                 struct task_struct *curtask = task_of(curr);
858 
859                 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
860                 cgroup_account_cputime(curtask, delta_exec);
861                 account_group_exec_runtime(curtask, delta_exec);
862         }
863 
864         account_cfs_rq_runtime(cfs_rq, delta_exec);
865 }
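/*
 * Editor's note: illustrative sketch, not part of the kernel source.
 * update_curr() advances vruntime by delta_exec scaled inversely with
 * the entity's weight (via calc_delta_fair()), so a heavier entity
 * accumulates virtual time more slowly (vruntime_delta_sketch is a
 * hypothetical name):
 */
static unsigned long long vruntime_delta_sketch(unsigned long long delta_exec,
                                                unsigned long weight)
{
        const unsigned long nice_0_load = 1024;         /* NICE_0_LOAD, scaled down */

        return delta_exec * nice_0_load / weight;
}
/*
 * vruntime_delta_sketch(1000000, 1024) == 1000000 for a nice-0 entity,
 * while an entity with twice the weight only advances by 500000 for the
 * same 1 ms of real execution time.
 */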
866 
867 static void update_curr_fair(struct rq *rq)
868 {
869         update_curr(cfs_rq_of(&rq->curr->se));
870 }
871 
872 static inline void
873 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
874 {
875         u64 wait_start, prev_wait_start;
876 
877         if (!schedstat_enabled())
878                 return;
879 
880         wait_start = rq_clock(rq_of(cfs_rq));
881         prev_wait_start = schedstat_val(se->statistics.wait_start);
882 
883         if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
884             likely(wait_start > prev_wait_start))
885                 wait_start -= prev_wait_start;
886 
887         __schedstat_set(se->statistics.wait_start, wait_start);
888 }
889 
890 static inline void
891 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
892 {
893         struct task_struct *p;
894         u64 delta;
895 
896         if (!schedstat_enabled())
897                 return;
898 
899         delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
900 
901         if (entity_is_task(se)) {
902                 p = task_of(se);
903                 if (task_on_rq_migrating(p)) {
904                         /*
905                          * Preserve migrating task's wait time so wait_start
906                          * time stamp can be adjusted to accumulate wait time
907                          * prior to migration.
908                          */
909                         __schedstat_set(se->statistics.wait_start, delta);
910                         return;
911                 }
912                 trace_sched_stat_wait(p, delta);
913         }
914 
915         __schedstat_set(se->statistics.wait_max,
916                       max(schedstat_val(se->statistics.wait_max), delta));
917         __schedstat_inc(se->statistics.wait_count);
918         __schedstat_add(se->statistics.wait_sum, delta);
919         __schedstat_set(se->statistics.wait_start, 0);
920 }
921 
922 static inline void
923 update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
924 {
925         struct task_struct *tsk = NULL;
926         u64 sleep_start, block_start;
927 
928         if (!schedstat_enabled())
929                 return;
930 
931         sleep_start = schedstat_val(se->statistics.sleep_start);
932         block_start = schedstat_val(se->statistics.block_start);
933 
934         if (entity_is_task(se))
935                 tsk = task_of(se);
936 
937         if (sleep_start) {
938                 u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
939 
940                 if ((s64)delta < 0)
941                         delta = 0;
942 
943                 if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
944                         __schedstat_set(se->statistics.sleep_max, delta);
945 
946                 __schedstat_set(se->statistics.sleep_start, 0);
947                 __schedstat_add(se->statistics.sum_sleep_runtime, delta);
948 
949                 if (tsk) {
950                         account_scheduler_latency(tsk, delta >> 10, 1);
951                         trace_sched_stat_sleep(tsk, delta);
952                 }
953         }
954         if (block_start) {
955                 u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
956 
957                 if ((s64)delta < 0)
958                         delta = 0;
959 
960                 if (unlikely(delta > schedstat_val(se->statistics.block_max)))
961                         __schedstat_set(se->statistics.block_max, delta);
962 
963                 __schedstat_set(se->statistics.block_start, 0);
964                 __schedstat_add(se->statistics.sum_sleep_runtime, delta);
965 
966                 if (tsk) {
967                         if (tsk->in_iowait) {
968                                 __schedstat_add(se->statistics.iowait_sum, delta);
969                                 __schedstat_inc(se->statistics.iowait_count);
970                                 trace_sched_stat_iowait(tsk, delta);
971                         }
972 
973                         trace_sched_stat_blocked(tsk, delta);
974 
975                         /*
976                          * Blocking time is in units of nanosecs, so shift by
977                          * 20 to get a milliseconds-range estimation of the
978                          * amount of time that the task spent sleeping:
979                          */
980                         if (unlikely(prof_on == SLEEP_PROFILING)) {
981                                 profile_hits(SLEEP_PROFILING,
982                                                 (void *)get_wchan(tsk),
983                                                 delta >> 20);
984                         }
985                         account_scheduler_latency(tsk, delta >> 10, 0);
986                 }
987         }
988 }
989 
990 /*
991  * Task is being enqueued - update stats:
992  */
993 static inline void
994 update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
995 {
996         if (!schedstat_enabled())
997                 return;
998 
999         /*
1000          * Are we enqueueing a waiting task? (for current tasks
1001          * a dequeue/enqueue event is a NOP)
1002          */
1003         if (se != cfs_rq->curr)
1004                 update_stats_wait_start(cfs_rq, se);
1005 
1006         if (flags & ENQUEUE_WAKEUP)
1007                 update_stats_enqueue_sleeper(cfs_rq, se);
1008 }
1009 
1010 static inline void
1011 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1012 {
1013 
1014         if (!schedstat_enabled())
1015                 return;
1016 
1017         /*
1018          * Mark the end of the wait period if dequeueing a
1019          * waiting task:
1020          */
1021         if (se != cfs_rq->curr)
1022                 update_stats_wait_end(cfs_rq, se);
1023 
1024         if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
1025                 struct task_struct *tsk = task_of(se);
1026 
1027                 if (tsk->state & TASK_INTERRUPTIBLE)
1028                         __schedstat_set(se->statistics.sleep_start,
1029                                       rq_clock(rq_of(cfs_rq)));
1030                 if (tsk->state & TASK_UNINTERRUPTIBLE)
1031                         __schedstat_set(se->statistics.block_start,
1032                                       rq_clock(rq_of(cfs_rq)));
1033         }
1034 }
1035 
1036 /*
1037  * We are picking a new current task - update its stats:
1038  */
1039 static inline void
1040 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
1041 {
1042         /*
1043          * We are starting a new run period:
1044          */
1045         se->exec_start = rq_clock_task(rq_of(cfs_rq));
1046 }
1047 
1048 /**************************************************
1049  * Scheduling class queueing methods:
1050  */
1051 
1052 #ifdef CONFIG_NUMA_BALANCING
1053 /*
1054  * Approximate time, in ms, to scan a task's whole address space. The task scan period is
1055  * calculated based on the task's virtual memory size and
1056  * numa_balancing_scan_size.
1057  */
1058 unsigned int sysctl_numa_balancing_scan_period_min = 1000;
1059 unsigned int sysctl_numa_balancing_scan_period_max = 60000;
1060 
1061 /* Portion of address space to scan in MB */
1062 unsigned int sysctl_numa_balancing_scan_size = 256;
1063 
1064 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
1065 unsigned int sysctl_numa_balancing_scan_delay = 1000;
1066 
1067 struct numa_group {
1068         refcount_t refcount;
1069 
1070         spinlock_t lock; /* nr_tasks, tasks */
1071         int nr_tasks;
1072         pid_t gid;
1073         int active_nodes;
1074 
1075         struct rcu_head rcu;
1076         unsigned long total_faults;
1077         unsigned long max_faults_cpu;
1078         /*
1079          * Faults_cpu is used to decide whether memory should move
1080          * towards the CPU. As a consequence, these stats are weighted
1081          * more by CPU use than by memory faults.
1082          */
1083         unsigned long *faults_cpu;
1084         unsigned long faults[0];
1085 };
1086 
1087 /*
1088  * For functions that can be called in multiple contexts that permit reading
1089  * ->numa_group (see struct task_struct for locking rules).
1090  */
1091 static struct numa_group *deref_task_numa_group(struct task_struct *p)
1092 {
1093         return rcu_dereference_check(p->numa_group, p == current ||
1094                 (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu)));
1095 }
1096 
1097 static struct numa_group *deref_curr_numa_group(struct task_struct *p)
1098 {
1099         return rcu_dereference_protected(p->numa_group, p == current);
1100 }
1101 
1102 static inline unsigned long group_faults_priv(struct numa_group *ng);
1103 static inline unsigned long group_faults_shared(struct numa_group *ng);
1104 
1105 static unsigned int task_nr_scan_windows(struct task_struct *p)
1106 {
1107         unsigned long rss = 0;
1108         unsigned long nr_scan_pages;
1109 
1110         /*
1111          * Calculations are based on RSS, as non-present and empty pages are skipped
1112          * by the PTE scanner and NUMA hinting faults should be trapped based
1113          * on resident pages.
1114          */
1115         nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
1116         rss = get_mm_rss(p->mm);
1117         if (!rss)
1118                 rss = nr_scan_pages;
1119 
1120         rss = round_up(rss, nr_scan_pages);
1121         return rss / nr_scan_pages;
1122 }
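/*
 * Editor's note: illustrative sketch, not part of the kernel source.
 * With the default 256 MB scan size and 4 KB pages (an assumption here;
 * PAGE_SHIFT is architecture dependent), one scan window covers
 * 256 << (20 - 12) = 65536 resident pages.  Stand-alone version taking
 * the RSS directly in pages (task_nr_scan_windows_sketch is a
 * hypothetical name):
 */
static unsigned int task_nr_scan_windows_sketch(unsigned long rss_pages)
{
        const unsigned long scan_size_mb = 256;
        const unsigned int page_shift = 12;                     /* 4 KB pages */
        unsigned long nr_scan_pages = scan_size_mb << (20 - page_shift);

        if (!rss_pages)
                rss_pages = nr_scan_pages;

        /* round up to a whole number of windows */
        return (rss_pages + nr_scan_pages - 1) / nr_scan_pages;
}
/* task_nr_scan_windows_sketch(262144) == 4: a 1 GB RSS is split into four windows */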
1123 
1124 /* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
1125 #define MAX_SCAN_WINDOW 2560
1126 
1127 static unsigned int task_scan_min(struct task_struct *p)
1128 {
1129         unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
1130         unsigned int scan, floor;
1131         unsigned int windows = 1;
1132 
1133         if (scan_size < MAX_SCAN_WINDOW)
1134                 windows = MAX_SCAN_WINDOW / scan_size;
1135         floor = 1000 / windows;
1136 
1137         scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1138         return max_t(unsigned int, floor, scan);
1139 }
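/*
 * Editor's note: illustrative sketch, not part of the kernel source.
 * With the defaults (scan_period_min 1000 ms, scan_size 256 MB,
 * MAX_SCAN_WINDOW 2560 MB/sec) the floor works out to
 * 1000 / (2560 / 256) = 100 ms, so a task with four scan windows (about
 * 1 GB of RSS) ends up with max(100, 1000 / 4) = 250 ms between scans
 * (task_scan_min_sketch is a hypothetical name; nr_scan_windows >= 1):
 */
static unsigned int task_scan_min_sketch(unsigned int nr_scan_windows)
{
        const unsigned int scan_period_min = 1000;      /* ms, sysctl default   */
        const unsigned int scan_size = 256;             /* MB, sysctl default   */
        const unsigned int max_scan_window = 2560;      /* MB per second        */
        unsigned int windows = max_scan_window / scan_size;
        unsigned int floor = 1000 / windows;            /* ms-per-second budget */
        unsigned int scan = scan_period_min / nr_scan_windows;

        return scan > floor ? scan : floor;
}
/* task_scan_min_sketch(4) == 250 while task_scan_min_sketch(64) == 100 */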
1140 
1141 static unsigned int task_scan_start(struct task_struct *p)
1142 {
1143         unsigned long smin = task_scan_min(p);
1144         unsigned long period = smin;
1145         struct numa_group *ng;
1146 
1147         /* Scale the maximum scan period with the amount of shared memory. */
1148         rcu_read_lock();
1149         ng = rcu_dereference(p->numa_group);
1150         if (ng) {
1151                 unsigned long shared = group_faults_shared(ng);
1152                 unsigned long private = group_faults_priv(ng);
1153 
1154                 period *= refcount_read(&ng->refcount);
1155                 period *= shared + 1;
1156                 period /= private + shared + 1;
1157         }
1158         rcu_read_unlock();
1159 
1160         return max(smin, period);
1161 }
1162 
1163 static unsigned int task_scan_max(struct task_struct *p)
1164 {
1165         unsigned long smin = task_scan_min(p);
1166         unsigned long smax;
1167         struct numa_group *ng;
1168 
1169         /* Watch for min being lower than max due to floor calculations */
1170         smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1171 
1172         /* Scale the maximum scan period with the amount of shared memory. */
1173         ng = deref_curr_numa_group(p);
1174         if (ng) {
1175                 unsigned long shared = group_faults_shared(ng);
1176                 unsigned long private = group_faults_priv(ng);
1177                 unsigned long period = smax;
1178 
1179                 period *= refcount_read(&ng->refcount);
1180                 period *= shared + 1;
1181                 period /= private + shared + 1;
1182 
1183                 smax = max(smax, period);
1184         }
1185 
1186         return max(smin, smax);
1187 }
1188 
1189 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1190 {
1191         rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
1192         rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1193 }
1194 
1195 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1196 {
1197         rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
1198         rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1199 }
1200 
1201 /* Shared or private faults. */
1202 #define NR_NUMA_HINT_FAULT_TYPES 2
1203 
1204 /* Memory and CPU locality */
1205 #define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1206 
1207 /* Averaged statistics, and temporary buffers. */
1208 #define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1209 
1210 pid_t task_numa_group_id(struct task_struct *p)
1211 {
1212         struct numa_group *ng;
1213         pid_t gid = 0;
1214 
1215         rcu_read_lock();
1216         ng = rcu_dereference(p->numa_group);
1217         if (ng)
1218                 gid = ng->gid;
1219         rcu_read_unlock();
1220 
1221         return gid;
1222 }
1223 
1224 /*
1225  * The averaged statistics, shared & private, memory & CPU,
1226  * occupy the first half of the array. The second half of the
1227  * array is for current counters, which are averaged into the
1228  * first set by task_numa_placement.
1229  */
1230 static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1231 {
1232         return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
1233 }
1234 
1235 static inline unsigned long task_faults(struct task_struct *p, int nid)
1236 {
1237         if (!p->numa_faults)
1238                 return 0;
1239 
1240         return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1241                 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1242 }
1243 
1244 static inline unsigned long group_faults(struct task_struct *p, int nid)
1245 {
1246         struct numa_group *ng = deref_task_numa_group(p);
1247 
1248         if (!ng)
1249                 return 0;
1250 
1251         return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1252                 ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1253 }
1254 
1255 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1256 {
1257         return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
1258                 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
1259 }
1260 
1261 static inline unsigned long group_faults_priv(struct numa_group *ng)
1262 {
1263         unsigned long faults = 0;
1264         int node;
1265 
1266         for_each_online_node(node) {
1267                 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
1268         }
1269 
1270         return faults;
1271 }
1272 
1273 static inline unsigned long group_faults_shared(struct numa_group *ng)
1274 {
1275         unsigned long faults = 0;
1276         int node;
1277 
1278         for_each_online_node(node) {
1279                 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
1280         }
1281 
1282         return faults;
1283 }
1284 
1285 /*
1286  * A node triggering more than 1/3 as many NUMA faults as the maximum is
1287  * considered part of a numa group's pseudo-interleaving set. Migrations
1288  * between these nodes are slowed down, to allow things to settle down.
1289  */
1290 #define ACTIVE_NODE_FRACTION 3
1291 
1292 static bool numa_is_active_node(int nid, struct numa_group *ng)
1293 {
1294         return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1295 }
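/*
 * Editor's note: illustrative sketch, not part of the kernel source.
 * A node counts as "active" when it sees more than 1/3 of the group's
 * maximum per-node CPU faults (ACTIVE_NODE_FRACTION == 3), e.g. with
 * max_faults_cpu == 900 any node above 300 CPU faults is part of the
 * pseudo-interleaving set (numa_is_active_node_sketch is a hypothetical
 * name):
 */
static int numa_is_active_node_sketch(unsigned long node_faults_cpu,
                                      unsigned long max_faults_cpu)
{
        return node_faults_cpu * 3 > max_faults_cpu;
}
/* numa_is_active_node_sketch(301, 900) == 1, numa_is_active_node_sketch(300, 900) == 0 */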
1296 
1297 /* Handle placement on systems where not all nodes are directly connected. */
1298 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1299                                         int maxdist, bool task)
1300 {
1301         unsigned long score = 0;
1302         int node;
1303 
1304         /*
1305          * All nodes are directly connected, and the same distance
1306          * from each other. No need for fancy placement algorithms.
1307          */
1308         if (sched_numa_topology_type == NUMA_DIRECT)
1309                 return 0;
1310 
1311         /*
1312          * This code is called for each node, introducing N^2 complexity,
1313          * which should be ok given the number of nodes rarely exceeds 8.
1314          */
1315         for_each_online_node(node) {
1316                 unsigned long faults;
1317                 int dist = node_distance(nid, node);
1318 
1319                 /*
1320                  * The furthest away nodes in the system are not interesting
1321                  * for placement; nid was already counted.
1322                  */
1323                 if (dist == sched_max_numa_distance || node == nid)
1324                         continue;
1325 
1326                 /*
1327                  * On systems with a backplane NUMA topology, compare groups
1328                  * of nodes, and move tasks towards the group with the most
1329                  * memory accesses. When comparing two nodes at distance
1330                  * "hoplimit", only nodes closer by than "hoplimit" are part
1331                  * of each group. Skip other nodes.
1332                  */
1333                 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1334                                         dist >= maxdist)
1335                         continue;
1336 
1337                 /* Add up the faults from nearby nodes. */
1338                 if (task)
1339                         faults = task_faults(p, node);
1340                 else
1341                         faults = group_faults(p, node);
1342 
1343                 /*
1344                  * On systems with a glueless mesh NUMA topology, there are
1345                  * no fixed "groups of nodes". Instead, nodes that are not
1346                  * directly connected bounce traffic through intermediate
1347                  * nodes; a numa_group can occupy any set of nodes.
1348                  * The further away a node is, the less the faults count.
1349                  * This seems to result in good task placement.
1350                  */
1351                 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1352                         faults *= (sched_max_numa_distance - dist);
1353                         faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1354                 }
1355 
1356                 score += faults;
1357         }
1358 
1359         return score;
1360 }
1361 
1362 /*
1363  * These return the fraction of accesses done by a particular task, or
1364  * task group, on a particular numa node.  The group weight is given a
1365  * larger multiplier, in order to group tasks together that are almost
1366  * evenly spread out between numa nodes.
1367  */
1368 static inline unsigned long task_weight(struct task_struct *p, int nid,
1369                                         int dist)
1370 {
1371         unsigned long faults, total_faults;
1372 
1373         if (!p->numa_faults)
1374                 return 0;
1375 
1376         total_faults = p->total_numa_faults;
1377 
1378         if (!total_faults)
1379                 return 0;
1380 
1381         faults = task_faults(p, nid);
1382         faults += score_nearby_nodes(p, nid, dist, true);
1383 
1384         return 1000 * faults / total_faults;
1385 }
1386 
1387 static inline unsigned long group_weight(struct task_struct *p, int nid,
1388                                          int dist)
1389 {
1390         struct numa_group *ng = deref_task_numa_group(p);
1391         unsigned long faults, total_faults;
1392 
1393         if (!ng)
1394                 return 0;
1395 
1396         total_faults = ng->total_faults;
1397 
1398         if (!total_faults)
1399                 return 0;
1400 
1401         faults = group_faults(p, nid);
1402         faults += score_nearby_nodes(p, nid, dist, false);
1403 
1404         return 1000 * faults / total_faults;
1405 }
1406 
1407 bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1408                                 int src_nid, int dst_cpu)
1409 {
1410         struct numa_group *ng = deref_curr_numa_group(p);
1411         int dst_nid = cpu_to_node(dst_cpu);
1412         int last_cpupid, this_cpupid;
1413 
1414         this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1415         last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1416 
1417         /*
1418          * Allow first faults or private faults to migrate immediately early in
1419          * the lifetime of a task. The magic number 4 is based on waiting for
1420          * two full passes of the "multi-stage node selection" test that is
1421          * executed below.
1422          */
1423         if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
1424             (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
1425                 return true;
1426 
1427         /*
1428          * Multi-stage node selection is used in conjunction with a periodic
1429          * migration fault to build a temporal task<->page relation. By using
1430          * a two-stage filter we remove short/unlikely relations.
1431          *
1432          * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1433          * a task's usage of a particular page (n_p) per total usage of this
1434          * page (n_t) (in a given time-span) to a probability.
1435          *
1436          * Our periodic faults will sample this probability and getting the
1437          * same result twice in a row, given these samples are fully
1438          * independent, is then given by P(n)^2, provided our sample period
1439          * is sufficiently short compared to the usage pattern.
1440          *
1441          * This quadratic squishes small probabilities, making it less likely we
1442          * act on an unlikely task<->page relation.
1443          */
1444         if (!cpupid_pid_unset(last_cpupid) &&
1445                                 cpupid_to_nid(last_cpupid) != dst_nid)
1446                 return false;
1447 
1448         /* Always allow migrate on private faults */
1449         if (cpupid_match_pid(p, last_cpupid))
1450                 return true;
1451 
1452         /* A shared fault, but p->numa_group has not been set up yet. */
1453         if (!ng)
1454                 return true;
1455 
1456         /*
1457          * Destination node is much more heavily used than the source
1458          * node? Allow migration.
1459          */
1460         if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1461                                         ACTIVE_NODE_FRACTION)
1462                 return true;
1463 
1464         /*
1465          * Distribute memory according to CPU & memory use on each node,
1466          * with 3/4 hysteresis to avoid unnecessary memory migrations:
1467          *
1468          * faults_cpu(dst)   3   faults_cpu(src)
1469          * --------------- * - > ---------------
1470          * faults_mem(dst)   4   faults_mem(src)
1471          */
1472         return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1473                group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
1474 }
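/*
 * Editor's note: illustrative sketch, not part of the kernel source.
 * The final comparison above migrates the page only when the destination
 * node's CPU-to-memory fault ratio beats the source node's by more than
 * 4/3, i.e. with the 3/4 hysteresis from the comment.  Cross-multiplied
 * to avoid divisions (numa_migrate_ratio_sketch is a hypothetical name):
 */
static int numa_migrate_ratio_sketch(unsigned long dst_faults_cpu,
                                     unsigned long dst_faults_mem,
                                     unsigned long src_faults_cpu,
                                     unsigned long src_faults_mem)
{
        return dst_faults_cpu * src_faults_mem * 3 >
               src_faults_cpu * dst_faults_mem * 4;
}
/*
 * numa_migrate_ratio_sketch(200, 100, 100, 100) == 1 (dst ratio 2.0 vs src
 * 1.0: migrate), but numa_migrate_ratio_sketch(120, 100, 100, 100) == 0
 * (1.2 vs 1.0 stays inside the hysteresis band).
 */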
1475 
1476 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
1477 
1478 static unsigned long cpu_runnable_load(struct rq *rq)
1479 {
1480         return cfs_rq_runnable_load_avg(&rq->cfs);
1481 }
1482 
1483 /* Cached statistics for all CPUs within a node */
1484 struct numa_stats {
1485         unsigned long load;
1486 
1487         /* Total compute capacity of CPUs on a node */
1488         unsigned long compute_capacity;
1489 };
1490 
1491 /*
1492  * XXX borrowed from update_sg_lb_stats
1493  */
1494 static void update_numa_stats(struct numa_stats *ns, int nid)
1495 {
1496         int cpu;
1497 
1498         memset(ns, 0, sizeof(*ns));
1499         for_each_cpu(cpu, cpumask_of_node(nid)) {
1500                 struct rq *rq = cpu_rq(cpu);
1501 
1502                 ns->load += cpu_runnable_load(rq);
1503                 ns->compute_capacity += capacity_of(cpu);
1504         }
1505 
1506 }
1507 
1508 struct task_numa_env {
1509         struct task_struct *p;
1510 
1511         int src_cpu, src_nid;
1512         int dst_cpu, dst_nid;
1513 
1514         struct numa_stats src_stats, dst_stats;
1515 
1516         int imbalance_pct;
1517         int dist;
1518 
1519         struct task_struct *best_task;
1520         long best_imp;
1521         int best_cpu;
1522 };
1523 
1524 static void task_numa_assign(struct task_numa_env *env,
1525                              struct task_struct *p, long imp)
1526 {
1527         struct rq *rq = cpu_rq(env->dst_cpu);
1528 
1529         /* Bail out if the run-queue is part of an active NUMA balance. */
1530         if (xchg(&rq->numa_migrate_on, 1))
1531                 return;
1532 
1533         /*
1534          * Clear the previous best_cpu/rq numa-migrate flag, since the task has
1535          * now found a better CPU to move to or swap with.
1536          */
1537         if (env->best_cpu != -1) {
1538                 rq = cpu_rq(env->best_cpu);
1539                 WRITE_ONCE(rq->numa_migrate_on, 0);
1540         }
1541 
1542         if (env->best_task)
1543                 put_task_struct(env->best_task);
1544         if (p)
1545                 get_task_struct(p);
1546 
1547         env->best_task = p;
1548         env->best_imp = imp;
1549         env->best_cpu = env->dst_cpu;
1550 }
1551 
1552 static bool load_too_imbalanced(long src_load, long dst_load,
1553                                 struct task_numa_env *env)
1554 {
1555         long imb, old_imb;
1556         long orig_src_load, orig_dst_load;
1557         long src_capacity, dst_capacity;
1558 
1559         /*
1560          * The load is corrected for the CPU capacity available on each node.
1561          *
1562          * src_load        dst_load
1563          * ------------ vs ---------
1564          * src_capacity    dst_capacity
1565          */
1566         src_capacity = env->src_stats.compute_capacity;
1567         dst_capacity = env->dst_stats.compute_capacity;
1568 
1569         imb = abs(dst_load * src_capacity - src_load * dst_capacity);
1570 
1571         orig_src_load = env->src_stats.load;
1572         orig_dst_load = env->dst_stats.load;
1573 
1574         old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
1575 
1576         /* Would this change make things worse? */
1577         return (imb > old_imb);
1578 }
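
/*
 * Standalone sketch of the capacity-corrected comparison in
 * load_too_imbalanced() above, with invented load and capacity values;
 * purely illustrative. The division by capacity is again replaced by
 * cross-multiplication, and a move is refused only if it would leave
 * the scaled imbalance larger than it already is.
 */
#include <stdio.h>
#include <stdlib.h>

static int too_imbalanced(long src_load, long dst_load,
			  long orig_src_load, long orig_dst_load,
			  long src_cap, long dst_cap)
{
	long imb = labs(dst_load * src_cap - src_load * dst_cap);
	long old_imb = labs(orig_dst_load * src_cap - orig_src_load * dst_cap);

	return imb > old_imb;
}

int main(void)
{
	/* Equal capacities; moving 100 units from the 800 node to the 200 node */
	printf("%d\n", too_imbalanced(700, 300, 800, 200, 1024, 1024)); /* 0 */
	/* Moving load from the lighter node onto the heavier one is refused */
	printf("%d\n", too_imbalanced(900, 100, 800, 200, 1024, 1024)); /* 1 */
	return 0;
}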
1579 
1580 /*
1581  * Maximum NUMA importance can be 1998 (2*999);
1582  * SMALLIMP @ 30 would be close to 1998/64.
1583  * Used to deter task migration.
1584  */
1585 #define SMALLIMP        30
1586 
1587 /*
1588  * This checks if the overall compute and NUMA accesses of the system would
1589  * This checks if the overall compute and NUMA accesses of the system would
1590  * be improved if the source task was migrated to the target dst_cpu, taking
1591  * into account that it might be best if the task running on the dst_cpu is
1592  * exchanged with the source task.
1593 static void task_numa_compare(struct task_numa_env *env,
1594                               long taskimp, long groupimp, bool maymove)
1595 {
1596         struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
1597         struct rq *dst_rq = cpu_rq(env->dst_cpu);
1598         long imp = p_ng ? groupimp : taskimp;
1599         struct task_struct *cur;
1600         long src_load, dst_load;
1601         int dist = env->dist;
1602         long moveimp = imp;
1603         long load;
1604 
1605         if (READ_ONCE(dst_rq->numa_migrate_on))
1606                 return;
1607 
1608         rcu_read_lock();
1609         cur = rcu_dereference(dst_rq->curr);
1610         if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
1611                 cur = NULL;
1612 
1613         /*
1614          * Because we have preemption enabled we can get migrated around and
1615          * end up trying to select ourselves (current == env->p) as a swap candidate.
1616          */
1617         if (cur == env->p)
1618                 goto unlock;
1619 
1620         if (!cur) {
1621                 if (maymove && moveimp >= env->best_imp)
1622                         goto assign;
1623                 else
1624                         goto unlock;
1625         }
1626 
1627         /*
1628          * "imp" is the fault differential for the source task between the
1629          * source and destination node. Calculate the total differential for
1630          * the source task and potential destination task. The more negative
1631          * the value is, the more remote accesses that would be expected to
1632          * be incurred if the tasks were swapped.
1633          */
1634         /* Skip this swap candidate if it cannot move to the source CPU */
1635         if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
1636                 goto unlock;
1637 
1638         /*
1639          * If dst and source tasks are in the same NUMA group, or not
1640          * in any group, then look only at task weights.
1641          */
1642         cur_ng = rcu_dereference(cur->numa_group);
1643         if (cur_ng == p_ng) {
1644                 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1645                       task_weight(cur, env->dst_nid, dist);
1646                 /*
1647                  * Add some hysteresis to prevent swapping the
1648                  * tasks within a group over tiny differences.
1649                  */
1650                 if (cur_ng)
1651                         imp -= imp / 16;
1652         } else {
1653                 /*
1654                  * Compare the group weights. If a task is all by itself
1655                  * (not part of a group), use the task weight instead.
1656                  */
1657                 if (cur_ng && p_ng)
1658                         imp += group_weight(cur, env->src_nid, dist) -
1659                                group_weight(cur, env->dst_nid, dist);
1660                 else
1661                         imp += task_weight(cur, env->src_nid, dist) -
1662                                task_weight(cur, env->dst_nid, dist);
1663         }
1664 
1665         if (maymove && moveimp > imp && moveimp > env->best_imp) {
1666                 imp = moveimp;
1667                 cur = NULL;
1668                 goto assign;
1669         }
1670 
1671         /*
1672          * If the NUMA importance is less than SMALLIMP,
1673          * task migration might only result in ping pong
1674          * of tasks and also hurt performance due to cache
1675          * misses.
1676          */
1677         if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
1678                 goto unlock;
1679 
1680         /*
1681          * In the overloaded case, try and keep the load balanced.
1682          */
1683         load = task_h_load(env->p) - task_h_load(cur);
1684         if (!load)
1685                 goto assign;
1686 
1687         dst_load = env->dst_stats.load + load;
1688         src_load = env->src_stats.load - load;
1689 
1690         if (load_too_imbalanced(src_load, dst_load, env))
1691                 goto unlock;
1692 
1693 assign:
1694         /*
1695          * One idle CPU per node is evaluated for a task numa move.
1696          * Call select_idle_sibling to maybe find a better one.
1697          */
1698         if (!cur) {
1699                 /*
1700                  * select_idle_sibling() uses a per-CPU cpumask that
1701                  * can be used from IRQ context.
1702                  */
1703                 local_irq_disable();
1704                 env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1705                                                    env->dst_cpu);
1706                 local_irq_enable();
1707         }
1708 
1709         task_numa_assign(env, cur, imp);
1710 unlock:
1711         rcu_read_unlock();
1712 }
1713 
1714 static void task_numa_find_cpu(struct task_numa_env *env,
1715                                 long taskimp, long groupimp)
1716 {
1717         long src_load, dst_load, load;
1718         bool maymove = false;
1719         int cpu;
1720 
1721         load = task_h_load(env->p);
1722         dst_load = env->dst_stats.load + load;
1723         src_load = env->src_stats.load - load;
1724 
1725         /*
1726          * If the improvement from just moving env->p to dst_nid is better
1727          * than swapping tasks around, check if a move is possible.
1728          */
1729         maymove = !load_too_imbalanced(src_load, dst_load, env);
1730 
1731         for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1732                 /* Skip this CPU if the source task cannot migrate */
1733                 if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
1734                         continue;
1735 
1736                 env->dst_cpu = cpu;
1737                 task_numa_compare(env, taskimp, groupimp, maymove);
1738         }
1739 }
1740 
1741 static int task_numa_migrate(struct task_struct *p)
1742 {
1743         struct task_numa_env env = {
1744                 .p = p,
1745 
1746                 .src_cpu = task_cpu(p),
1747                 .src_nid = task_node(p),
1748 
1749                 .imbalance_pct = 112,
1750 
1751                 .best_task = NULL,
1752                 .best_imp = 0,
1753                 .best_cpu = -1,
1754         };
1755         unsigned long taskweight, groupweight;
1756         struct sched_domain *sd;
1757         long taskimp, groupimp;
1758         struct numa_group *ng;
1759         struct rq *best_rq;
1760         int nid, ret, dist;
1761 
1762         /*
1763          * Pick the lowest SD_NUMA domain, as that would have the smallest
1764          * imbalance and would be the first to start moving tasks about.
1765          *
1766          * And we want to avoid any moving of tasks about, as that would create
1767          * random movement of tasks -- counter to the numa conditions we're trying
1768          * to satisfy here.
1769          */
1770         rcu_read_lock();
1771         sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1772         if (sd)
1773                 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1774         rcu_read_unlock();
1775 
1776         /*
1777          * Cpusets can break the scheduler domain tree into smaller
1778          * balance domains, some of which do not cross NUMA boundaries.
1779          * Tasks that are "trapped" in such domains cannot be migrated
1780          * elsewhere, so there is no point in (re)trying.
1781          */
1782         if (unlikely(!sd)) {
1783                 sched_setnuma(p, task_node(p));
1784                 return -EINVAL;
1785         }
1786 
1787         env.dst_nid = p->numa_preferred_nid;
1788         dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1789         taskweight = task_weight(p, env.src_nid, dist);
1790         groupweight = group_weight(p, env.src_nid, dist);
1791         update_numa_stats(&env.src_stats, env.src_nid);
1792         taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1793         groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1794         update_numa_stats(&env.dst_stats, env.dst_nid);
1795 
1796         /* Try to find a spot on the preferred nid. */
1797         task_numa_find_cpu(&env, taskimp, groupimp);
1798 
1799         /*
1800          * Look at other nodes in these cases:
1801          * - there is no space available on the preferred_nid
1802          * - the task is part of a numa_group that is interleaved across
1803          *   multiple NUMA nodes; in order to better consolidate the group,
1804          *   we need to check other locations.
1805          */
1806         ng = deref_curr_numa_group(p);
1807         if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
1808                 for_each_online_node(nid) {
1809                         if (nid == env.src_nid || nid == p->numa_preferred_nid)
1810                                 continue;
1811 
1812                         dist = node_distance(env.src_nid, env.dst_nid);
1813                         if (sched_numa_topology_type == NUMA_BACKPLANE &&
1814                                                 dist != env.dist) {
1815                                 taskweight = task_weight(p, env.src_nid, dist);
1816                                 groupweight = group_weight(p, env.src_nid, dist);
1817                         }
1818 
1819                         /* Only consider nodes where both the task and the group benefit */
1820                         taskimp = task_weight(p, nid, dist) - taskweight;
1821                         groupimp = group_weight(p, nid, dist) - groupweight;
1822                         if (taskimp < 0 && groupimp < 0)
1823                                 continue;
1824 
1825                         env.dist = dist;
1826                         env.dst_nid = nid;
1827                         update_numa_stats(&env.dst_stats, env.dst_nid);
1828                         task_numa_find_cpu(&env, taskimp, groupimp);
1829                 }
1830         }
1831 
1832         /*
1833          * If the task is part of a workload that spans multiple NUMA nodes,
1834          * and is migrating into one of the workload's active nodes, remember
1835          * this node as the task's preferred numa node, so the workload can
1836          * settle down.
1837          * A task that migrated to a second choice node will be better off
1838          * trying for a better one later. Do not set the preferred node here.
1839          */
1840         if (ng) {
1841                 if (env.best_cpu == -1)
1842                         nid = env.src_nid;
1843                 else
1844                         nid = cpu_to_node(env.best_cpu);
1845 
1846                 if (nid != p->numa_preferred_nid)
1847                         sched_setnuma(p, nid);
1848         }
1849 
1850         /* No better CPU than the current one was found. */
1851         if (env.best_cpu == -1)
1852                 return -EAGAIN;
1853 
1854         best_rq = cpu_rq(env.best_cpu);
1855         if (env.best_task == NULL) {
1856                 ret = migrate_task_to(p, env.best_cpu);
1857                 WRITE_ONCE(best_rq->numa_migrate_on, 0);
1858                 if (ret != 0)
1859                         trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1860                 return ret;
1861         }
1862 
1863         ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
1864         WRITE_ONCE(best_rq->numa_migrate_on, 0);
1865 
1866         if (ret != 0)
1867                 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1868         put_task_struct(env.best_task);
1869         return ret;
1870 }
1871 
1872 /* Attempt to migrate a task to a CPU on the preferred node. */
1873 static void numa_migrate_preferred(struct task_struct *p)
1874 {
1875         unsigned long interval = HZ;
1876 
1877         /* This task has no NUMA fault statistics yet */
1878         if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
1879                 return;
1880 
1881         /* Periodically retry migrating the task to the preferred node */
1882         interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1883         p->numa_migrate_retry = jiffies + interval;
1884 
1885         /* Success if task is already running on preferred CPU */
1886         if (task_node(p) == p->numa_preferred_nid)
1887                 return;
1888 
1889         /* Otherwise, try migrate to a CPU on the preferred node */
1890         task_numa_migrate(p);
1891 }
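
/*
 * Sketch of the retry-interval arithmetic above, assuming HZ == 1000 so
 * that one jiffy is one millisecond; the kernel itself goes through
 * msecs_to_jiffies(). Values are invented; purely illustrative.
 */
#include <stdio.h>

int main(void)
{
	unsigned long hz = 1000;		/* assumed HZ */
	unsigned long scan_period_ms = 4000;	/* p->numa_scan_period */
	unsigned long scan_period_jiffies = scan_period_ms * hz / 1000;
	unsigned long interval = hz;		/* upper bound: one second */

	if (scan_period_jiffies / 16 < interval)
		interval = scan_period_jiffies / 16;

	/* a 4000ms scan period retries the migration every 250 jiffies */
	printf("numa_migrate_retry in %lu jiffies\n", interval);
	return 0;
}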
1892 
1893 /*
1894  * Find out how many nodes the workload is actively running on. Do this by
1895  * tracking the nodes from which NUMA hinting faults are triggered. This can
1896  * be different from the set of nodes where the workload's memory is currently
1897  * located.
1898  */
1899 static void numa_group_count_active_nodes(struct numa_group *numa_group)
1900 {
1901         unsigned long faults, max_faults = 0;
1902         int nid, active_nodes = 0;
1903 
1904         for_each_online_node(nid) {
1905                 faults = group_faults_cpu(numa_group, nid);
1906                 if (faults > max_faults)
1907                         max_faults = faults;
1908         }
1909 
1910         for_each_online_node(nid) {
1911                 faults = group_faults_cpu(numa_group, nid);
1912                 if (faults * ACTIVE_NODE_FRACTION > max_faults)
1913                         active_nodes++;
1914         }
1915 
1916         numa_group->max_faults_cpu = max_faults;
1917         numa_group->active_nodes = active_nodes;
1918 }
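
/*
 * Sketch of the active-node count above with invented per-node CPU fault
 * counts, assuming the ACTIVE_NODE_FRACTION of 3 defined earlier in this
 * file: a node counts as active if it sees more than a third of the
 * faults seen by the busiest node. Purely illustrative.
 */
#include <stdio.h>

#define NR_NODES		4
#define ACTIVE_NODE_FRACTION	3

int main(void)
{
	unsigned long faults[NR_NODES] = { 900, 400, 250, 10 };
	unsigned long max_faults = 0;
	int nid, active_nodes = 0;

	for (nid = 0; nid < NR_NODES; nid++)
		if (faults[nid] > max_faults)
			max_faults = faults[nid];

	for (nid = 0; nid < NR_NODES; nid++)
		if (faults[nid] * ACTIVE_NODE_FRACTION > max_faults)
			active_nodes++;

	/* threshold is 900/3 == 300: only the 900 and 400 nodes qualify */
	printf("active_nodes = %d\n", active_nodes);	/* 2 */
	return 0;
}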
1919 
1920 /*
1921  * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1922  * increments. The more local the fault statistics are, the higher the scan
1923  * period will be for the next scan window. If local/(local+remote) ratio is
1924  * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
1925  * the scan period will decrease. Aim for 70% local accesses.
1926  */
1927 #define NUMA_PERIOD_SLOTS 10
1928 #define NUMA_PERIOD_THRESHOLD 7
1929 
1930 /*
1931  * Increase the scan period (slow down scanning) if the majority of
1932  * our memory is already on our local node, or if the majority of
1933  * the page accesses are shared with other processes.
1934  * Otherwise, decrease the scan period.
1935  */
1936 static void update_task_scan_period(struct task_struct *p,
1937                         unsigned long shared, unsigned long private)
1938 {
1939         unsigned int period_slot;
1940         int lr_ratio, ps_ratio;
1941         int diff;
1942 
1943         unsigned long remote = p->numa_faults_locality[0];
1944         unsigned long local = p->numa_faults_locality[1];
1945 
1946         /*
1947          * If there were no recorded hinting faults then either the task is
1948          * completely idle or all activity is in areas that are not of interest
1949          * to automatic numa balancing. Related to that, if there were failed
1950          * migrations then it implies we are migrating too quickly or the local
1951          * node is overloaded. In either case, scan slower.
1952          */
1953         if (local + shared == 0 || p->numa_faults_locality[2]) {
1954                 p->numa_scan_period = min(p->numa_scan_period_max,
1955                         p->numa_scan_period << 1);
1956 
1957                 p->mm->numa_next_scan = jiffies +
1958                         msecs_to_jiffies(p->numa_scan_period);
1959 
1960                 return;
1961         }
1962 
1963         /*
1964          * Prepare to scale scan period relative to the current period.
1965          *       == NUMA_PERIOD_THRESHOLD scan period stays the same
1966          *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1967          *       >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1968          */
1969         period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1970         lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1971         ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
1972 
1973         if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
1974                 /*
1975                  * Most memory accesses are local. There is no need to
1976                  * do fast NUMA scanning, since memory is already local.
1977                  */
1978                 int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
1979                 if (!slot)
1980                         slot = 1;
1981                 diff = slot * period_slot;
1982         } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
1983                 /*
1984                  * Most memory accesses are shared with other tasks.
1985                  * There is no point in continuing fast NUMA scanning,
1986                  * since other tasks may just move the memory elsewhere.
1987                  */
1988                 int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
1989                 if (!slot)
1990                         slot = 1;
1991                 diff = slot * period_slot;
1992         } else {
1993                 /*
1994                  * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
1995                  * yet they are not on the local NUMA node. Speed up
1996                  * NUMA scanning to get the memory moved over.
1997                  */
1998                 int ratio = max(lr_ratio, ps_ratio);
1999                 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
2000         }
2001 
2002         p->numa_scan_period = clamp(p->numa_scan_period + diff,
2003                         task_scan_min(p), task_scan_max(p));
2004         memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2005 }
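
/*
 * Worked example of the slot arithmetic above with invented fault
 * counters; purely illustrative. It exercises the third branch: mostly
 * remote, mostly shared faults shrink the scan period so scanning speeds
 * up (the kernel then clamps the result to the task's min/max period).
 */
#include <stdio.h>

#define NUMA_PERIOD_SLOTS	10
#define NUMA_PERIOD_THRESHOLD	7

int main(void)
{
	unsigned int scan_period = 1000;	/* ms */
	unsigned long local = 300, remote = 700;
	unsigned long private = 200, shared = 800;
	unsigned int period_slot = (scan_period + NUMA_PERIOD_SLOTS - 1) /
				   NUMA_PERIOD_SLOTS;
	int lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
	int ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
	int ratio = lr_ratio > ps_ratio ? lr_ratio : ps_ratio;
	int diff = -(NUMA_PERIOD_THRESHOLD - ratio) * (int)period_slot;

	/* lr_ratio == 3, ps_ratio == 2, so diff == -(7 - 3) * 100 == -400 */
	printf("new scan period = %d ms\n", (int)scan_period + diff); /* 600 */
	return 0;
}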
2006 
2007 /*
2008  * Get the fraction of time the task has been running since the last
2009  * NUMA placement cycle. The scheduler keeps similar statistics, but
2010  * decays those on a 32ms period, which is orders of magnitude off
2011  * from the dozens-of-seconds NUMA balancing period. Use the scheduler
2012  * stats only if the task is so new there are no NUMA statistics yet.
2013  */
2014 static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
2015 {
2016         u64 runtime, delta, now;
2017         /* Use the start of this time slice to avoid calculations. */
2018         now = p->se.exec_start;
2019         runtime = p->se.sum_exec_runtime;
2020 
2021         if (p->last_task_numa_placement) {
2022                 delta = runtime - p->last_sum_exec_runtime;
2023                 *period = now - p->last_task_numa_placement;
2024 
2025                 /* Avoid time going backwards, prevent potential divide error: */
2026                 if (unlikely((s64)*period < 0))
2027                         *period = 0;
2028         } else {
2029                 delta = p->se.avg.load_sum;
2030                 *period = LOAD_AVG_MAX;
2031         }
2032 
2033         p->last_sum_exec_runtime = runtime;
2034         p->last_task_numa_placement = now;
2035 
2036         return delta;
2037 }
2038 
2039 /*
2040  * Determine the preferred nid for a task in a numa_group. This needs to
2041  * be done in a way that produces consistent results with group_weight,
2042  * otherwise workloads might not converge.
2043  */
2044 static int preferred_group_nid(struct task_struct *p, int nid)
2045 {
2046         nodemask_t nodes;
2047         int dist;
2048 
2049         /* Direct connections between all NUMA nodes. */
2050         if (sched_numa_topology_type == NUMA_DIRECT)
2051                 return nid;
2052 
2053         /*
2054          * On a system with glueless mesh NUMA topology, group_weight
2055          * scores nodes according to the number of NUMA hinting faults on
2056          * both the node itself, and on nearby nodes.
2057          */
2058         if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
2059                 unsigned long score, max_score = 0;
2060                 int node, max_node = nid;
2061 
2062                 dist = sched_max_numa_distance;
2063 
2064                 for_each_online_node(node) {
2065                         score = group_weight(p, node, dist);
2066                         if (score > max_score) {
2067                                 max_score = score;
2068                                 max_node = node;
2069                         }
2070                 }
2071                 return max_node;
2072         }
2073 
2074         /*
2075          * Finding the preferred nid in a system with NUMA backplane
2076          * interconnect topology is more involved. The goal is to locate
2077          * tasks from numa_groups near each other in the system, and
2078          * untangle workloads from different sides of the system. This requires
2079          * searching down the hierarchy of node groups, recursively searching
2080          * inside the highest scoring group of nodes. The nodemask tricks
2081          * keep the complexity of the search down.
2082          */
2083         nodes = node_online_map;
2084         for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
2085                 unsigned long max_faults = 0;
2086                 nodemask_t max_group = NODE_MASK_NONE;
2087                 int a, b;
2088 
2089                 /* Are there nodes at this distance from each other? */
2090                 if (!find_numa_distance(dist))
2091                         continue;
2092 
2093                 for_each_node_mask(a, nodes) {
2094                         unsigned long faults = 0;
2095                         nodemask_t this_group;
2096                         nodes_clear(this_group);
2097 
2098                         /* Sum group's NUMA faults; includes a==b case. */
2099                         for_each_node_mask(b, nodes) {
2100                                 if (node_distance(a, b) < dist) {
2101                                         faults += group_faults(p, b);
2102                                         node_set(b, this_group);
2103                                         node_clear(b, nodes);
2104                                 }
2105                         }
2106 
2107                         /* Remember the top group. */
2108                         if (faults > max_faults) {
2109                                 max_faults = faults;
2110                                 max_group = this_group;
2111                                 /*
2112                                  * subtle: at the smallest distance there is
2113                                  * just one node left in each "group", the
2114                                  * winner is the preferred nid.
2115                                  */
2116                                 nid = a;
2117                         }
2118                 }
2119                 /* Next round, evaluate the nodes within max_group. */
2120                 if (!max_faults)
2121                         break;
2122                 nodes = max_group;
2123         }
2124         return nid;
2125 }
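
/*
 * Minimal userspace sketch of the backplane search above: group nodes
 * that are closer than the current distance, keep the group with the
 * most faults, then search again inside that group with a smaller
 * distance. The 4-node distance table (two "boards" of two nodes each)
 * and the fault counts are invented; purely illustrative.
 */
#include <stdio.h>

#define NR_NODES	4
#define LOCAL_DISTANCE	10
#define MAX_DISTANCE	40

static const int distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 40, 40 },
	{ 20, 10, 40, 40 },
	{ 40, 40, 10, 20 },
	{ 40, 40, 20, 10 },
};
static const unsigned long faults[NR_NODES] = { 10, 20, 50, 5 };

int main(void)
{
	unsigned int nodes = (1u << NR_NODES) - 1;	/* nodemask as a bitmask */
	int nid = 0, dist, a, b;

	for (dist = MAX_DISTANCE; dist > LOCAL_DISTANCE; dist--) {
		unsigned long max_faults = 0;
		unsigned int max_group = 0, left = nodes;

		for (a = 0; a < NR_NODES; a++) {
			unsigned long f = 0;
			unsigned int this_group = 0;

			if (!(left & (1u << a)))
				continue;
			/* Sum the faults of all nodes closer than dist to a */
			for (b = 0; b < NR_NODES; b++) {
				if ((left & (1u << b)) && distance[a][b] < dist) {
					f += faults[b];
					this_group |= 1u << b;
					left &= ~(1u << b);
				}
			}
			if (f > max_faults) {
				max_faults = f;
				max_group = this_group;
				nid = a;
			}
		}
		if (!max_faults)
			break;
		nodes = max_group;
	}
	/* Board {2,3} wins with 55 faults, then node 2 wins within it */
	printf("preferred nid = %d\n", nid);	/* 2 */
	return 0;
}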
2126 
2127 static void task_numa_placement(struct task_struct *p)
2128 {
2129         int seq, nid, max_nid = NUMA_NO_NODE;
2130         unsigned long max_faults = 0;
2131         unsigned long fault_types[2] = { 0, 0 };
2132         unsigned long total_faults;
2133         u64 runtime, period;
2134         spinlock_t *group_lock = NULL;
2135         struct numa_group *ng;
2136 
2137         /*
2138          * The p->mm->numa_scan_seq field gets updated without
2139          * exclusive access. Use READ_ONCE() here to ensure
2140          * that the field is read in a single access:
2141          */
2142         seq = READ_ONCE(p->mm->numa_scan_seq);
2143         if (p->numa_scan_seq == seq)
2144                 return;
2145         p->numa_scan_seq = seq;
2146         p->numa_scan_period_max = task_scan_max(p);
2147 
2148         total_faults = p->numa_faults_locality[0] +
2149                        p->numa_faults_locality[1];
2150         runtime = numa_get_avg_runtime(p, &period);
2151 
2152         /* If the task is part of a group prevent parallel updates to group stats */
2153         ng = deref_curr_numa_group(p);
2154         if (ng) {
2155                 group_lock = &ng->lock;
2156                 spin_lock_irq(group_lock);
2157         }
2158 
2159         /* Find the node with the highest number of faults */
2160         for_each_online_node(nid) {
2161                 /* Keep track of the offsets in numa_faults array */
2162                 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
2163                 unsigned long faults = 0, group_faults = 0;
2164                 int priv;
2165 
2166                 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
2167                         long diff, f_diff, f_weight;
2168 
2169                         mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2170                         membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2171                         cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2172                         cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
2173 
2174                         /* Decay existing window, copy faults since last scan */
2175                         diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2176                         fault_types[priv] += p->numa_faults[membuf_idx];
2177                         p->numa_faults[membuf_idx] = 0;
2178 
2179                         /*
2180                          * Normalize the faults_from, so all tasks in a group
2181                          * count according to CPU use, instead of by the raw
2182                          * number of faults. Tasks with little runtime have
2183                          * little over-all impact on throughput, and thus their
2184                          * faults are less important.
2185                          */
2186                         f_weight = div64_u64(runtime << 16, period + 1);
2187                         f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
2188                                    (total_faults + 1);
2189                         f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2190                         p->numa_faults[cpubuf_idx] = 0;
2191 
2192                         p->numa_faults[mem_idx] += diff;
2193                         p->numa_faults[cpu_idx] += f_diff;
2194                         faults += p->numa_faults[mem_idx];
2195                         p->total_numa_faults += diff;
2196                         if (ng) {
2197                                 /*
2198                                  * safe because we can only change our own group
2199                                  *
2200                                  * mem_idx represents the offset for a given
2201                                  * nid and priv in a specific region because it
2202                                  * is at the beginning of the numa_faults array.
2203                                  */
2204                                 ng->faults[mem_idx] += diff;
2205                                 ng->faults_cpu[mem_idx] += f_diff;
2206                                 ng->total_faults += diff;
2207                                 group_faults += ng->faults[mem_idx];
2208                         }
2209                 }
2210 
2211                 if (!ng) {
2212                         if (faults > max_faults) {
2213                                 max_faults = faults;
2214                                 max_nid = nid;
2215                         }
2216                 } else if (group_faults > max_faults) {
2217                         max_faults = group_faults;
2218                         max_nid = nid;
2219                 }
2220         }
2221 
2222         if (ng) {
2223                 numa_group_count_active_nodes(ng);
2224                 spin_unlock_irq(group_lock);
2225                 max_nid = preferred_group_nid(p, max_nid);
2226         }
2227 
2228         if (max_faults) {
2229                 /* Set the new preferred node */
2230                 if (max_nid != p->numa_preferred_nid)
2231                         sched_setnuma(p, max_nid);
2232         }
2233 
2234         update_task_scan_period(p, fault_types[0], fault_types[1]);
2235 }
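
/*
 * Sketch of the decaying fault window above: each placement pass keeps
 * half of the existing per-node count and adds the faults recorded since
 * the last scan (the *BUF counter) on top. Numbers are invented; purely
 * illustrative.
 */
#include <stdio.h>

int main(void)
{
	long mem = 100;		/* p->numa_faults[mem_idx]: decayed history  */
	long membuf = 60;	/* p->numa_faults[membuf_idx]: new faults    */
	long diff = membuf - mem / 2;

	membuf = 0;		/* the buffer is consumed */
	mem += diff;		/* == old/2 + new == 50 + 60 */

	printf("decayed window = %ld (membuf now %ld)\n", mem, membuf); /* 110, 0 */
	return 0;
}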
2236 
2237 static inline int get_numa_group(struct numa_group *grp)
2238 {
2239         return refcount_inc_not_zero(&grp->refcount);
2240 }
2241 
2242 static inline void put_numa_group(struct numa_group *grp)
2243 {
2244         if (refcount_dec_and_test(&grp->refcount))
2245                 kfree_rcu(grp, rcu);
2246 }
2247 
2248 static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2249                         int *priv)
2250 {
2251         struct numa_group *grp, *my_grp;
2252         struct task_struct *tsk;
2253         bool join = false;
2254         int cpu = cpupid_to_cpu(cpupid);
2255         int i;
2256 
2257         if (unlikely(!deref_curr_numa_group(p))) {
2258                 unsigned int size = sizeof(struct numa_group) +
2259                                     4*nr_node_ids*sizeof(unsigned long);
2260 
2261                 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2262                 if (!grp)
2263                         return;
2264 
2265                 refcount_set(&grp->refcount, 1);
2266                 grp->active_nodes = 1;
2267                 grp->max_faults_cpu = 0;
2268                 spin_lock_init(&grp->lock);
2269                 grp->gid = p->pid;
2270                 /* Second half of the array tracks nids where faults happen */
2271                 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2272                                                 nr_node_ids;
2273 
2274                 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2275                         grp->faults[i] = p->numa_faults[i];
2276 
2277                 grp->total_faults = p->total_numa_faults;
2278 
2279                 grp->nr_tasks++;
2280                 rcu_assign_pointer(p->numa_group, grp);
2281         }
2282 
2283         rcu_read_lock();
2284         tsk = READ_ONCE(cpu_rq(cpu)->curr);
2285 
2286         if (!cpupid_match_pid(tsk, cpupid))
2287                 goto no_join;
2288 
2289         grp = rcu_dereference(tsk->numa_group);
2290         if (!grp)
2291                 goto no_join;
2292 
2293         my_grp = deref_curr_numa_group(p);
2294         if (grp == my_grp)
2295                 goto no_join;
2296 
2297         /*
2298          * Only join the other group if it's bigger; if we're the bigger group,
2299          * the other task will join us.
2300          */
2301         if (my_grp->nr_tasks > grp->nr_tasks)
2302                 goto no_join;
2303 
2304         /*
2305          * Tie-break on the grp address.
2306          */
2307         if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2308                 goto no_join;
2309 
2310         /* Always join threads in the same process. */
2311         if (tsk->mm == current->mm)
2312                 join = true;
2313 
2314         /* Simple filter to avoid false positives due to PID collisions */
2315         if (flags & TNF_SHARED)
2316                 join = true;
2317 
2318         /* Update priv based on whether false sharing was detected */
2319         *priv = !join;
2320 
2321         if (join && !get_numa_group(grp))
2322                 goto no_join;
2323 
2324         rcu_read_unlock();
2325 
2326         if (!join)
2327                 return;
2328 
2329         BUG_ON(irqs_disabled());
2330         double_lock_irq(&my_grp->lock, &grp->lock);
2331 
2332         for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2333                 my_grp->faults[i] -= p->numa_faults[i];
2334                 grp->faults[i] += p->numa_faults[i];
2335         }
2336         my_grp->total_faults -= p->total_numa_faults;
2337         grp->total_faults += p->total_numa_faults;
2338 
2339         my_grp->nr_tasks--;
2340         grp->nr_tasks++;
2341 
2342         spin_unlock(&my_grp->lock);
2343         spin_unlock_irq(&grp->lock);
2344 
2345         rcu_assign_pointer(p->numa_group, grp);
2346 
2347         put_numa_group(my_grp);
2348         return;
2349 
2350 no_join:
2351         rcu_read_unlock();
2352         return;
2353 }
2354 
2355 /*
2356  * Get rid of NUMA statistics associated with a task (either current or dead).
2357  * If @final is set, the task is dead and has reached refcount zero, so we can
2358  * safely free all relevant data structures. Otherwise, there might be
2359  * concurrent reads from places like load balancing and procfs, and we should
2360  * reset the data back to default state without freeing ->numa_faults.
2361  */
2362 void task_numa_free(struct task_struct *p, bool final)
2363 {
2364         /* safe: p either is current or is being freed by current */
2365         struct numa_group *grp = rcu_dereference_raw(p->numa_group);
2366         unsigned long *numa_faults = p->numa_faults;
2367         unsigned long flags;
2368         int i;
2369 
2370         if (!numa_faults)
2371                 return;
2372 
2373         if (grp) {
2374                 spin_lock_irqsave(&grp->lock, flags);
2375                 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2376                         grp->faults[i] -= p->numa_faults[i];
2377                 grp->total_faults -= p->total_numa_faults;
2378 
2379                 grp->nr_tasks--;
2380                 spin_unlock_irqrestore(&grp->lock, flags);
2381                 RCU_INIT_POINTER(p->numa_group, NULL);
2382                 put_numa_group(grp);
2383         }
2384 
2385         if (final) {
2386                 p->numa_faults = NULL;
2387                 kfree(numa_faults);
2388         } else {
2389                 p->total_numa_faults = 0;
2390                 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2391                         numa_faults[i] = 0;
2392         }
2393 }
2394 
2395 /*
2396  * Got a PROT_NONE fault for a page on @node.
2397  */
2398 void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2399 {
2400         struct task_struct *p = current;
2401         bool migrated = flags & TNF_MIGRATED;
2402         int cpu_node = task_node(current);
2403         int local = !!(flags & TNF_FAULT_LOCAL);
2404         struct numa_group *ng;
2405         int priv;
2406 
2407         if (!static_branch_likely(&sched_numa_balancing))
2408                 return;
2409 
2410         /* for example, ksmd faulting in a user's mm */
2411         if (!p->mm)
2412                 return;
2413 
2414         /* Allocate buffer to track faults on a per-node basis */
2415         if (unlikely(!p->numa_faults)) {
2416                 int size = sizeof(*p->numa_faults) *
2417                            NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2418 
2419                 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2420                 if (!p->numa_faults)
2421                         return;
2422 
2423                 p->total_numa_faults = 0;
2424                 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2425         }
2426 
2427         /*
2428          * First accesses are treated as private, otherwise consider accesses
2429          * to be private if the accessing pid has not changed
2430          */
2431         if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2432                 priv = 1;
2433         } else {
2434                 priv = cpupid_match_pid(p, last_cpupid);
2435                 if (!priv && !(flags & TNF_NO_GROUP))
2436                         task_numa_group(p, last_cpupid, flags, &priv);
2437         }
2438 
2439         /*
2440          * If a workload spans multiple NUMA nodes, a shared fault that
2441          * occurs wholly within the set of nodes that the workload is
2442          * actively using should be counted as local. This allows the
2443          * scan rate to slow down when a workload has settled down.
2444          */
2445         ng = deref_curr_numa_group(p);
2446         if (!priv && !local && ng && ng->active_nodes > 1 &&
2447                                 numa_is_active_node(cpu_node, ng) &&
2448                                 numa_is_active_node(mem_node, ng))
2449                 local = 1;
2450 
2451         /*
2452          * Retry to migrate task to preferred node periodically, in case it
2453          * previously failed, or the scheduler moved us.
2454          */
2455         if (time_after(jiffies, p->numa_migrate_retry)) {
2456                 task_numa_placement(p);
2457                 numa_migrate_preferred(p);
2458         }
2459 
2460         if (migrated)
2461                 p->numa_pages_migrated += pages;
2462         if (flags & TNF_MIGRATE_FAIL)
2463                 p->numa_faults_locality[2] += pages;
2464 
2465         p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2466         p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2467         p->numa_faults_locality[local] += pages;
2468 }
2469 
2470 static void reset_ptenuma_scan(struct task_struct *p)
2471 {
2472         /*
2473          * We only did a read acquisition of the mmap sem, so
2474          * p->mm->numa_scan_seq is written to without exclusive access
2475          * and the update is not guaranteed to be atomic. That's not
2476          * much of an issue though, since this is just used for
2477          * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2478          * expensive, to avoid any form of compiler optimizations:
2479          */
2480         WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2481         p->mm->numa_scan_offset = 0;
2482 }
2483 
2484 /*
2485  * The expensive part of numa migration is done from task_work context.
2486  * Triggered from task_tick_numa().
2487  */
2488 static void task_numa_work(struct callback_head *work)
2489 {
2490         unsigned long migrate, next_scan, now = jiffies;
2491         struct task_struct *p = current;
2492         struct mm_struct *mm = p->mm;
2493         u64 runtime = p->se.sum_exec_runtime;
2494         struct vm_area_struct *vma;
2495         unsigned long start, end;
2496         unsigned long nr_pte_updates = 0;
2497         long pages, virtpages;
2498 
2499         SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
2500 
2501         work->next = work;
2502         /*
2503          * Who cares about NUMA placement when they're dying.
2504          *
2505          * NOTE: make sure not to dereference p->mm before this check,
2506          * exit_task_work() happens _after_ exit_mm() so we could be called
2507          * without p->mm even though we still had it when we enqueued this
2508          * work.
2509          */
2510         if (p->flags & PF_EXITING)
2511                 return;
2512 
2513         if (!mm->numa_next_scan) {
2514                 mm->numa_next_scan = now +
2515                         msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2516         }
2517 
2518         /*
2519          * Enforce maximal scan/migration frequency..
2520          */
2521         migrate = mm->numa_next_scan;
2522         if (time_before(now, migrate))
2523                 return;
2524 
2525         if (p->numa_scan_period == 0) {
2526                 p->numa_scan_period_max = task_scan_max(p);
2527                 p->numa_scan_period = task_scan_start(p);
2528         }
2529 
2530         next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2531         if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2532                 return;
2533 
2534         /*
2535          * Delay this task enough that another task of this mm will likely win
2536          * the next time around.
2537          */
2538         p->node_stamp += 2 * TICK_NSEC;
2539 
2540         start = mm->numa_scan_offset;
2541         pages = sysctl_numa_balancing_scan_size;
2542         pages <<= 20 - PAGE_SHIFT; /* MB in pages */
2543         virtpages = pages * 8;     /* Scan up to this much virtual space */
2544         if (!pages)
2545                 return;
2546 
2547 
2548         if (!down_read_trylock(&mm->mmap_sem))
2549                 return;
2550         vma = find_vma(mm, start);
2551         if (!vma) {
2552                 reset_ptenuma_scan(p);
2553                 start = 0;
2554                 vma = mm->mmap;
2555         }
2556         for (; vma; vma = vma->vm_next) {
2557                 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2558                         is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2559                         continue;
2560                 }
2561 
2562                 /*
2563                  * Shared library pages mapped by multiple processes are not
2564                  * migrated as it is expected they are cache replicated. Avoid
2565                  * hinting faults in read-only file-backed mappings or the vdso
2566                  * as migrating the pages will be of marginal benefit.
2567                  */
2568                 if (!vma->vm_mm ||
2569                     (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2570                         continue;
2571 
2572                 /*
2573                  * Skip inaccessible VMAs to avoid any confusion between
2574                  * PROT_NONE and NUMA hinting ptes
2575                  */
2576                 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2577                         continue;
2578 
2579                 do {
2580                         start = max(start, vma->vm_start);
2581                         end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2582                         end = min(end, vma->vm_end);
2583                         nr_pte_updates = change_prot_numa(vma, start, end);
2584 
2585                         /*
2586                          * Try to scan sysctl_numa_balancing_scan_size worth of
2587                          * hpages that have at least one present PTE that
2588                          * is not already pte-numa. If the VMA contains
2589                          * areas that are unused or already full of prot_numa
2590                          * PTEs, scan up to virtpages, to skip through those
2591                          * areas faster.
2592                          */
2593                         if (nr_pte_updates)
2594                                 pages -= (end - start) >> PAGE_SHIFT;
2595                         virtpages -= (end - start) >> PAGE_SHIFT;
2596 
2597                         start = end;
2598                         if (pages <= 0 || virtpages <= 0)
2599                                 goto out;
2600 
2601                         cond_resched();
2602                 } while (end != vma->vm_end);
2603         }
2604 
2605 out:
2606         /*
2607          * It is possible to reach the end of the VMA list but the last few
2608          * VMAs are not guaranteed to be migratable. If they are not, we
2609          * would find the !migratable VMA on the next scan but not reset the
2610          * scanner to the start so check it now.
2611          */
2612         if (vma)
2613                 mm->numa_scan_offset = start;
2614         else
2615                 reset_ptenuma_scan(p);
2616         up_read(&mm->mmap_sem);
2617 
2618         /*
2619          * Make sure tasks use at least 32x as much time to run other code
2620          * than they used here, to limit NUMA PTE scanning overhead to 3% max.
2621          * Usually update_task_scan_period slows down scanning enough; on an
2622          * overloaded system we need to limit overhead on a per task basis.
2623          */
2624         if (unlikely(p->se.sum_exec_runtime != runtime)) {
2625                 u64 diff = p->se.sum_exec_runtime - runtime;
2626                 p->node_stamp += 32 * diff;
2627         }
2628 }
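
/*
 * Sketch of the scan-size arithmetic above, assuming 4KiB pages
 * (PAGE_SHIFT == 12) and the default 256MB sysctl_numa_balancing_scan_size;
 * purely illustrative.
 */
#include <stdio.h>

int main(void)
{
	unsigned int page_shift = 12;	/* assumed 4KiB pages */
	long scan_size_mb = 256;	/* assumed sysctl default */
	long pages = scan_size_mb << (20 - page_shift);
	long virtpages = pages * 8;

	/* 256MB -> scan 65536 present pages, or up to 524288 virtual pages */
	printf("pages=%ld virtpages=%ld\n", pages, virtpages);
	return 0;
}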
2629 
2630 void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
2631 {
2632         int mm_users = 0;
2633         struct mm_struct *mm = p->mm;
2634 
2635         if (mm) {
2636                 mm_users = atomic_read(&mm->mm_users);
2637                 if (mm_users == 1) {
2638                         mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2639                         mm->numa_scan_seq = 0;
2640                 }
2641         }
2642         p->node_stamp                   = 0;
2643         p->numa_scan_seq                = mm ? mm->numa_scan_seq : 0;
2644         p->numa_scan_period             = sysctl_numa_balancing_scan_delay;
2645         /* Protect against double add, see task_tick_numa and task_numa_work */
2646         p->numa_work.next               = &p->numa_work;
2647         p->numa_faults                  = NULL;
2648         RCU_INIT_POINTER(p->numa_group, NULL);
2649         p->last_task_numa_placement     = 0;
2650         p->last_sum_exec_runtime        = 0;
2651 
2652         init_task_work(&p->numa_work, task_numa_work);
2653 
2654         /* New address space, reset the preferred nid */
2655         if (!(clone_flags & CLONE_VM)) {
2656                 p->numa_preferred_nid = NUMA_NO_NODE;
2657                 return;
2658         }
2659 
2660         /*
2661          * New thread, keep the existing numa_preferred_nid which should already
2662          * have been copied by arch_dup_task_struct(), but stagger when scans start.
2663          */
2664         if (mm) {
2665                 unsigned int delay;
2666 
2667                 delay = min_t(unsigned int, task_scan_max(current),
2668                         current->numa_scan_period * mm_users * NSEC_PER_MSEC);
2669                 delay += 2 * TICK_NSEC;
2670                 p->node_stamp = delay;
2671         }
2672 }
2673 
2674 /*
2675  * Drive the periodic memory faults..
2676  */
2677 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2678 {
2679         struct callback_head *work = &curr->numa_work;
2680         u64 period, now;
2681 
2682         /*
2683          * We don't care about NUMA placement if we don't have memory.
2684          */
2685         if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
2686                 return;
2687 
2688         /*
2689          * Using runtime rather than walltime has the dual advantage that
2690          * we (mostly) drive the selection from busy threads and that the
2691          * task needs to have done some actual work before we bother with
2692          * NUMA placement.
2693          */
2694         now = curr->se.sum_exec_runtime;
2695         period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2696 
2697         if (now > curr->node_stamp + period) {
2698                 if (!curr->node_stamp)
2699                         curr->numa_scan_period = task_scan_start(curr);
2700                 curr->node_stamp += period;
2701 
2702                 if (!time_before(jiffies, curr->mm->numa_next_scan))
2703                         task_work_add(curr, work, true);
2704         }
2705 }
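
/*
 * Sketch of the runtime-based trigger above: the scan work is queued once
 * the task has accumulated numa_scan_period worth of CPU time beyond its
 * node_stamp. Values are invented; purely illustrative.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long nsec_per_msec = 1000000ULL;
	unsigned long long sum_exec_runtime = 5200 * nsec_per_msec;
	unsigned long long node_stamp = 4000 * nsec_per_msec;
	unsigned long long period = 1000 * nsec_per_msec; /* scan period: 1s */

	if (sum_exec_runtime > node_stamp + period) {
		node_stamp += period;
		printf("queue task_numa_work, node_stamp now %llu ms\n",
		       node_stamp / nsec_per_msec);	/* 5000 ms */
	}
	return 0;
}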
2706 
2707 static void update_scan_period(struct task_struct *p, int new_cpu)
2708 {
2709         int src_nid = cpu_to_node(task_cpu(p));
2710         int dst_nid = cpu_to_node(new_cpu);
2711 
2712         if (!static_branch_likely(&sched_numa_balancing))
2713                 return;
2714 
2715         if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
2716                 return;
2717 
2718         if (src_nid == dst_nid)
2719                 return;
2720 
2721         /*
2722          * Allow resets if faults have been trapped before one scan
2723          * has completed. This is most likely due to a new task that
2724          * is pulled cross-node due to wakeups or load balancing.
2725          */
2726         if (p->numa_scan_seq) {
2727                 /*
2728                  * Avoid scan adjustments if moving to the preferred
2729                  * node or if the task was not previously running on
2730                  * the preferred node.
2731                  */
2732                 if (dst_nid == p->numa_preferred_nid ||
2733                     (p->numa_preferred_nid != NUMA_NO_NODE &&
2734                         src_nid != p->numa_preferred_nid))
2735                         return;
2736         }
2737 
2738         p->numa_scan_period = task_scan_start(p);
2739 }
2740 
2741 #else
2742 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2743 {
2744 }
2745 
2746 static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2747 {
2748 }
2749 
2750 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2751 {
2752 }
2753 
2754 static inline void update_scan_period(struct task_struct *p, int new_cpu)
2755 {
2756 }
2757 
2758 #endif /* CONFIG_NUMA_BALANCING */
2759 
2760 static void
2761 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2762 {
2763         update_load_add(&cfs_rq->load, se->load.weight);
2764 #ifdef CONFIG_SMP
2765         if (entity_is_task(se)) {
2766                 struct rq *rq = rq_of(cfs_rq);
2767 
2768                 account_numa_enqueue(rq, task_of(se));
2769                 list_add(&se->group_node, &rq->cfs_tasks);
2770         }
2771 #endif
2772         cfs_rq->nr_running++;
2773 }
2774 
2775 static void
2776 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2777 {
2778         update_load_sub(&cfs_rq->load, se->load.weight);
2779 #ifdef CONFIG_SMP
2780         if (entity_is_task(se)) {
2781                 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2782                 list_del_init(&se->group_node);
2783         }
2784 #endif
2785         cfs_rq->nr_running--;
2786 }
2787 
2788 /*
2789  * Signed add and clamp on underflow.
2790  *
2791  * Explicitly do a load-store to ensure the intermediate value never hits
2792  * memory. This allows lockless observations without ever seeing the negative
2793  * values.
2794  */
2795 #define add_positive(_ptr, _val) do {                           \
2796         typeof(_ptr) ptr = (_ptr);                              \
2797         typeof(_val) val = (_val);                              \
2798         typeof(*ptr) res, var = READ_ONCE(*ptr);                \
2799                                                                 \
2800         res = var + val;                                        \
2801                                                                 \
2802         if (val < 0 && res > var)                               \
2803                 res = 0;                                        \
2804                                                                 \
2805         WRITE_ONCE(*ptr, res);                                  \
2806 } while (0)
2807 
2808 /*
2809  * Unsigned subtract and clamp on underflow.
2810  *
2811  * Explicitly do a load-store to ensure the intermediate value never hits
2812  * memory. This allows lockless observations without ever seeing the negative
2813  * values.
2814  */
2815 #define sub_positive(_ptr, _val) do {                           \
2816         typeof(_ptr) ptr = (_ptr);                              \
2817         typeof(*ptr) val = (_val);                              \
2818         typeof(*ptr) res, var = READ_ONCE(*ptr);                \
2819         res = var - val;                                        \
2820         if (res > var)                                          \
2821                 res = 0;                                        \
2822         WRITE_ONCE(*ptr, res);                                  \
2823 } while (0)
2824 
2825 /*
2826  * Remove and clamp on negative, from a local variable.
2827  *
2828  * A variant of sub_positive(), which does not use explicit load-store
2829  * and is thus optimized for local variable updates.
2830  */
2831 #define lsub_positive(_ptr, _val) do {                          \
2832         typeof(_ptr) ptr = (_ptr);                              \
2833         *ptr -= min_t(typeof(*ptr), *ptr, _val);                \
2834 } while (0)
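
/*
 * Userspace sketch of the clamping behaviour of the three helpers above:
 * a signed delta is applied to an unsigned accumulator and any underflow
 * (seen as wrap-around past zero) is clamped to 0. Plain loads and stores
 * stand in for READ_ONCE()/WRITE_ONCE(); purely illustrative.
 */
#include <stdio.h>

int main(void)
{
	unsigned long load_avg = 10;
	unsigned long load_sum = 10;
	unsigned long local = 10;
	long val = -15;
	unsigned long sub = 15, dec = 15, res;

	/* add_positive(&load_avg, -15): 10 + (-15) wraps, so clamp to 0 */
	res = load_avg + val;
	if (val < 0 && res > load_avg)
		res = 0;
	load_avg = res;

	/* sub_positive(&load_sum, 15): 10 - 15 wraps, so clamp to 0 */
	res = load_sum - sub;
	if (res > load_sum)
		res = 0;
	load_sum = res;

	/* lsub_positive(&local, 15): never subtract more than is there */
	local -= (local < dec) ? local : dec;

	printf("%lu %lu %lu\n", load_avg, load_sum, local);	/* 0 0 0 */
	return 0;
}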
2835 
2836 #ifdef CONFIG_SMP
2837 static inline void
2838 enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2839 {
2840         cfs_rq->runnable_weight += se->runnable_weight;
2841 
2842         cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
2843         cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
2844 }
2845 
2846 static inline void
2847 dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2848 {
2849         cfs_rq->runnable_weight -= se->runnable_weight;
2850 
2851         sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
2852         sub_positive(&cfs_rq->avg.runnable_load_sum,
2853                      se_runnable(se) * se->avg.runnable_load_sum);
2854 }
2855 
2856 static inline void
2857 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2858 {
2859         cfs_rq->avg.load_avg += se->avg.load_avg;
2860         cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
2861 }
2862 
2863 static inline void
2864 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2865 {
2866         sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
2867         sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
2868 }
2869 #else
2870 static inline void
2871 enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2872 static inline void
2873 dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2874 static inline void
2875 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2876 static inline void
2877 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2878 #endif
2879 
2880 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2881                             unsigned long weight, unsigned long runnable)
2882 {
2883         if (se->on_rq) {
2884                 /* commit outstanding execution time */
2885                 if (cfs_rq->curr == se)
2886                         update_curr(cfs_rq);
2887                 account_entity_dequeue(cfs_rq, se);
2888                 dequeue_runnable_load_avg(cfs_rq, se);
2889         }
2890         dequeue_load_avg(cfs_rq, se);
2891 
2892         se->runnable_weight = runnable;
2893         update_load_set(&se->load, weight);
2894 
2895 #ifdef CONFIG_SMP
2896         do {
2897                 u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
2898 
2899                 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
2900                 se->avg.runnable_load_avg =
2901                         div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
2902         } while (0);
2903 #endif
2904 
2905         enqueue_load_avg(cfs_rq, se);
2906         if (se->on_rq) {
2907                 account_entity_enqueue(cfs_rq, se);
2908                 enqueue_runnable_load_avg(cfs_rq, se);
2909         }
2910 }
2911 
2912 void reweight_task(struct task_struct *p, int prio)
2913 {
2914         struct sched_entity *se = &p->se;
2915         struct cfs_rq *cfs_rq = cfs_rq_of(se);
2916         struct load_weight *load = &se->load;
2917         unsigned long weight = scale_load(sched_prio_to_weight[prio]);
2918 
2919         reweight_entity(cfs_rq, se, weight, weight);
2920         load->inv_weight = sched_prio_to_wmult[prio];
2921 }
2922 
2923 #ifdef CONFIG_FAIR_GROUP_SCHED
2924 #ifdef CONFIG_SMP
2925 /*
2926  * All this does is approximate the hierarchical proportion which includes that
2927  * global sum we all love to hate.
2928  *
2929  * That is, the weight of a group entity, is the proportional share of the
2930  * group weight based on the group runqueue weights. That is:
2931  *
2932  *                     tg->weight * grq->load.weight
2933  *   ge->load.weight = -----------------------------               (1)
2934  *                        \Sum grq->load.weight
2935  *
2936  * Now, because that sum is prohibitively expensive to compute (been
2937  * there, done that) we approximate it with this average stuff. The average
2938  * moves slower and therefore the approximation is cheaper and more stable.
2939  *
2940  * So instead of the above, we substitute:
2941  *
2942  *   grq->load.weight -> grq->avg.load_avg                         (2)
2943  *
2944  * which yields the following:
2945  *
2946  *                     tg->weight * grq->avg.load_avg
2947  *   ge->load.weight = ------------------------------              (3)
2948  *                              tg->load_avg
2949  *
2950  * Where: tg->load_avg ~= \Sum grq->avg.load_avg
2951  *
2952  * That is shares_avg, and it is right (given the approximation (2)).
2953  *
2954  * The problem with it is that because the average is slow -- it was designed
2955  * to be exactly that of course -- this leads to transients in boundary
2956  * conditions. Specifically, the case where the group was idle and we start
2957  * one task. It takes time for our CPU's grq->avg.load_avg to build up,
2958  * yielding bad latency etc..
2959  *
2960  * Now, in that special case (1) reduces to:
2961  *
2962  *                     tg->weight * grq->load.weight
2963  *   ge->load.weight = ----------------------------- = tg->weight   (4)
2964  *                          grq->load.weight
2965  *
2966  * That is, the sum collapses because all other CPUs are idle; the UP scenario.
2967  *
2968  * So what we do is modify our approximation (3) to approach (4) in the (near)
2969  * UP case, like:
2970  *
2971  *   ge->load.weight =
2972  *
2973  *              tg->weight * grq->load.weight
2974  *     ---------------------------------------------------         (5)
2975  *     tg->load_avg - grq->avg.load_avg + grq->load.weight
2976  *
2977  * But because grq->load.weight can drop to 0, resulting in a divide by zero,
2978  * we need to use grq->avg.load_avg as its lower bound, which then gives:
2979  *
2980  *
2981  *                     tg->weight * grq->load.weight
2982  *   ge->load.weight = -----------------------------               (6)
2983  *                              tg_load_avg'
2984  *
2985  * Where:
2986  *
2987  *   tg_load_avg' = tg->load_avg - grq->avg.load_avg +
2988  *                  max(grq->load.weight, grq->avg.load_avg)
2989  *
2990  * And that is shares_weight and is icky. In the (near) UP case it approaches
2991  * (4) while in the normal case it approaches (3). It consistently
2992  * overestimates the ge->load.weight and therefore:
2993  *
2994  *   \Sum ge->load.weight >= tg->weight
2995  *
2996  * hence icky!
2997  */
2998 static long calc_group_shares(struct cfs_rq *cfs_rq)
2999 {
3000         long tg_weight, tg_shares, load, shares;
3001         struct task_group *tg = cfs_rq->tg;
3002 
3003         tg_shares = READ_ONCE(tg->shares);
3004 
3005         load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
3006 
3007         tg_weight = atomic_long_read(&tg->load_avg);
3008 
3009         /* Ensure tg_weight >= load */
3010         tg_weight -= cfs_rq->tg_load_avg_contrib;
3011         tg_weight += load;
3012 
3013         shares = (tg_shares * load);
3014         if (tg_weight)
3015                 shares /= tg_weight;
3016 
3017         /*
3018          * MIN_SHARES has to be unscaled here to support per-CPU partitioning
3019          * of a group with small tg->shares value. It is a floor value which is
3020          * assigned as a minimum load.weight to the sched_entity representing
3021          * the group on a CPU.
3022          *
3023          * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
3024          * on an 8-core system with 8 tasks each runnable on one CPU shares has
3025          * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
3026          * case no task is runnable on a CPU MIN_SHARES=2 should be returned
3027          * instead of 0.
3028          */
3029         return clamp_t(long, shares, MIN_SHARES, tg_shares);
3030 }
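/*
 * Editorial illustration (not from the original source), using made-up
 * round numbers to walk through (6) as implemented above:
 *
 *   tg->shares = 1024, global tg->load_avg = 900,
 *   this CPU's tg_load_avg_contrib = 300,
 *   load = max(load.weight, avg.load_avg) = 500
 *
 *   tg_weight = 900 - 300 + 500 = 1100
 *   shares    = 1024 * 500 / 1100 ~= 465
 *
 * which is then clamped to [MIN_SHARES, 1024], i.e. 465.
 */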
3031 
3032 /*
3033  * This calculates the effective runnable weight for a group entity based on
3034  * the group entity weight calculated above.
3035  *
3036  * Because of the above approximation (2), our group entity weight is
3037  * a load_avg based ratio (3). This means that it includes blocked load and
3038  * does not represent the runnable weight.
3039  *
3040  * Approximate the group entity's runnable weight per ratio from the group
3041  * runqueue:
3042  *
3043  *                                           grq->avg.runnable_load_avg
3044  *   ge->runnable_weight = ge->load.weight * -------------------------- (7)
3045  *                                               grq->avg.load_avg
3046  *
3047  * However, analogous to above, since the avg numbers are slow, this leads to
3048  * transients in the from-idle case. Instead we use:
3049  *
3050  *   ge->runnable_weight = ge->load.weight *
3051  *
3052  *              max(grq->avg.runnable_load_avg, grq->runnable_weight)
3053  *              -----------------------------------------------------   (8)
3054  *                    max(grq->avg.load_avg, grq->load.weight)
3055  *
3056  * Where these max() serve both to use the 'instant' values to fix the slow
3057  * from-idle and avoid the /0 on to-idle, similar to (6).
3058  */
3059 static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
3060 {
3061         long runnable, load_avg;
3062 
3063         load_avg = max(cfs_rq->avg.load_avg,
3064                        scale_load_down(cfs_rq->load.weight));
3065 
3066         runnable = max(cfs_rq->avg.runnable_load_avg,
3067                        scale_load_down(cfs_rq->runnable_weight));
3068 
3069         runnable *= shares;
3070         if (load_avg)
3071                 runnable /= load_avg;
3072 
3073         return clamp_t(long, runnable, MIN_SHARES, shares);
3074 }
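/*
 * Editorial illustration (not from the original source), continuing the
 * made-up numbers above to walk through (8):
 *
 *   shares   = 465 (from the calc_group_shares() example),
 *   runnable = max(avg.runnable_load_avg, runnable_weight) = 200,
 *   load_avg = max(avg.load_avg, load.weight) = 500
 *
 *   runnable = 465 * 200 / 500 = 186
 *
 * clamped to [MIN_SHARES, 465], i.e. 186.
 */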
3075 #endif /* CONFIG_SMP */
3076 
3077 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
3078 
3079 /*
3080  * Recomputes the group entity based on the current state of its group
3081  * runqueue.
3082  */
3083 static void update_cfs_group(struct sched_entity *se)
3084 {
3085         struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3086         long shares, runnable;
3087 
3088         if (!gcfs_rq)
3089                 return;
3090 
3091         if (throttled_hierarchy(gcfs_rq))
3092                 return;
3093 
3094 #ifndef CONFIG_SMP
3095         runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
3096 
3097         if (likely(se->load.weight == shares))
3098                 return;
3099 #else
3100         shares   = calc_group_shares(gcfs_rq);
3101         runnable = calc_group_runnable(gcfs_rq, shares);
3102 #endif
3103 
3104         reweight_entity(cfs_rq_of(se), se, shares, runnable);
3105 }
3106 
3107 #else /* CONFIG_FAIR_GROUP_SCHED */
3108 static inline void update_cfs_group(struct sched_entity *se)
3109 {
3110 }
3111 #endif /* CONFIG_FAIR_GROUP_SCHED */
3112 
3113 static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
3114 {
3115         struct rq *rq = rq_of(cfs_rq);
3116 
3117         if (&rq->cfs == cfs_rq) {
3118                 /*
3119                  * There are a few boundary cases this might miss but it should
3120                  * get called often enough that that should (hopefully) not be
3121                  * a real problem.
3122                  *
3123                  * It will not get called when we go idle, because the idle
3124                  * thread is a different class (!fair), nor will the utilization
3125                  * number include things like RT tasks.
3126                  *
3127                  * As is, the util number is not freq-invariant (we'd have to
3128                  * implement arch_scale_freq_capacity() for that).
3129                  *
3130                  * See cpu_util().
3131                  */
3132                 cpufreq_update_util(rq, flags);
3133         }
3134 }
3135 
3136 #ifdef CONFIG_SMP
3137 #ifdef CONFIG_FAIR_GROUP_SCHED
3138 /**
3139  * update_tg_load_avg - update the tg's load avg
3140  * @cfs_rq: the cfs_rq whose avg changed
3141  * @force: update regardless of how small the difference
3142  *
3143  * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
3144  * However, because tg->load_avg is a global value there are performance
3145  * considerations.
3146  *
3147  * In order to avoid having to look at the other cfs_rq's, we use a
3148  * differential update where we store the last value we propagated. This in
3149  * turn allows skipping updates if the differential is 'small'.
3150  *
3151  * Updating tg's load_avg is necessary before update_cfs_group().
3152  */
3153 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3154 {
3155         long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
3156 
3157         /*
3158          * No need to update load_avg for root_task_group as it is not used.
3159          */
3160         if (cfs_rq->tg == &root_task_group)
3161                 return;
3162 
3163         if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3164                 atomic_long_add(delta, &cfs_rq->tg->load_avg);
3165                 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
3166         }
3167 }
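/*
 * Editorial note (not from the original source): with the 1/64 filter
 * above, a cfs_rq whose last propagated contribution was e.g. 6400 only
 * touches the shared tg->load_avg once its local load_avg has drifted by
 * more than 100, so small PELT fluctuations do not cause cross-CPU
 * cacheline traffic on every update.
 */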
3168 
3169 /*
3170  * Called within set_task_rq() right before setting a task's CPU. The
3171  * caller only guarantees p->pi_lock is held; no other assumptions,
3172  * including the state of rq->lock, should be made.
3173  */
3174 void set_task_rq_fair(struct sched_entity *se,
3175                       struct cfs_rq *prev, struct cfs_rq *next)
3176 {
3177         u64 p_last_update_time;
3178         u64 n_last_update_time;
3179 
3180         if (!sched_feat(ATTACH_AGE_LOAD))
3181                 return;
3182 
3183         /*
3184          * We are supposed to update the task to "current" time, so that it is
3185          * up to date and ready to go to the new CPU/cfs_rq. But we have
3186          * difficulty in getting what the current time is, so simply throw away
3187          * the out-of-date time. This will result in the wakee task being less
3188          * decayed, but giving the wakee more load does not sound bad.
3189          */
3190         if (!(se->avg.last_update_time && prev))
3191                 return;
3192 
3193 #ifndef CONFIG_64BIT
3194         {
3195                 u64 p_last_update_time_copy;
3196                 u64 n_last_update_time_copy;
3197 
3198                 do {
3199                         p_last_update_time_copy = prev->load_last_update_time_copy;
3200                         n_last_update_time_copy = next->load_last_update_time_copy;
3201 
3202                         smp_rmb();
3203 
3204                         p_last_update_time = prev->avg.last_update_time;
3205                         n_last_update_time = next->avg.last_update_time;
3206 
3207                 } while (p_last_update_time != p_last_update_time_copy ||
3208                          n_last_update_time != n_last_update_time_copy);
3209         }
3210 #else
3211         p_last_update_time = prev->avg.last_update_time;
3212         n_last_update_time = next->avg.last_update_time;
3213 #endif
3214         __update_load_avg_blocked_se(p_last_update_time, se);
3215         se->avg.last_update_time = n_last_update_time;
3216 }
3217 
3218 
3219 /*
3220  * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
3221  * propagate its contribution. The key to this propagation is the invariant
3222  * that for each group:
3223  *
3224  *   ge->avg == grq->avg                                                (1)
3225  *
3226  * _IFF_ we look at the pure running and runnable sums. Because they
3227  * represent the very same entity, just at different points in the hierarchy.
3228  *
3229  * Per the above update_tg_cfs_util() is trivial and simply copies the running
3230  * sum over (but still wrong, because the group entity and group rq do not have
3231  * their PELT windows aligned).
3232  *
3233  * However, update_tg_cfs_runnable() is more complex. So we have:
3234  *
3235  *   ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg          (2)
3236  *
3237  * And since, like util, the runnable part should be directly transferable,
3238  * the following would _appear_ to be the straightforward approach:
3239  *
3240  *   grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg       (3)
3241  *
3242  * And per (1) we have:
3243  *
3244  *   ge->avg.runnable_avg == grq->avg.runnable_avg
3245  *
3246  * Which gives:
3247  *
3248  *                      ge->load.weight * grq->avg.load_avg
3249  *   ge->avg.load_avg = -----------------------------------             (4)
3250  *                               grq->load.weight
3251  *
3252  * Except that is wrong!
3253  *
3254  * Because while historical weight is not important for entities (we really
3255  * only care about their future and can therefore consider a pure runnable
3256  * sum), runqueues can NOT do this.
3257  *
3258  * We specifically want runqueues to have a load_avg that includes
3259  * historical weights. Those represent the blocked load, the load we expect
3260  * to (shortly) return to us. This only works by keeping the weights as
3261  * integral part of the sum. We therefore cannot decompose as per (3).
3262  *
3263  * Another reason this doesn't work is that runnable isn't a 0-sum entity.
3264  * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
3265  * rq itself is runnable anywhere between 2/3 and 1 depending on how the
3266  * runnable section of these tasks overlap (or not). If they were to perfectly
3267  * align the rq as a whole would be runnable 2/3 of the time. If however we
3268  * always have at least 1 runnable task, the rq as a whole is always runnable.
3269  *
3270  * So we'll have to approximate.. :/
3271  *
3272  * Given the constraint:
3273  *
3274  *   ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
3275  *
3276  * We can construct a rule that adds runnable to a rq by assuming minimal
3277  * overlap.
3278  *
3279  * On removal, we'll assume each task is equally runnable; which yields:
3280  *
3281  *   grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
3282  *
3283  * XXX: only do this for the part of runnable > running ?
3284  *
3285  */
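/*
 * Editorial illustration (not from the original source) of the overlap
 * argument above: with two tasks that are each runnable 2/3 of the time,
 * full overlap makes the rq runnable 2/3 of the time, while minimal
 * overlap makes it runnable min(1, 2/3 + 2/3) = all of the time; the rq
 * therefore lies somewhere in [2/3, 1]. This is why a plain sum of the
 * tasks' runnable contributions is clipped at LOAD_AVG_MAX on the add
 * path and estimated as load_sum / load.weight on the removal path.
 */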
3286 
3287 static inline void
3288 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3289 {
3290         long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
3291 
3292         /* Nothing to update */
3293         if (!delta)
3294                 return;
3295 
3296         /*
3297          * The relation between sum and avg is:
3298          *
3299          *   LOAD_AVG_MAX - 1024 + sa->period_contrib
3300          *
3301          * however, the PELT windows are not aligned between grq and gse.
3302          */
3303 
3304         /* Set new sched_entity's utilization */
3305         se->avg.util_avg = gcfs_rq->avg.util_avg;
3306         se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
3307 
3308         /* Update parent cfs_rq utilization */
3309         add_positive(&cfs_rq->avg.util_avg, delta);
3310         cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
3311 }
3312 
3313 static inline void
3314 update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3315 {
3316         long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
3317         unsigned long runnable_load_avg, load_avg;
3318         u64 runnable_load_sum, load_sum = 0;
3319         s64 delta_sum;
3320 
3321         if (!runnable_sum)
3322                 return;
3323 
3324         gcfs_rq->prop_runnable_sum = 0;
3325 
3326         if (runnable_sum >= 0) {
3327                 /*
3328                  * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
3329                  * the CPU is saturated running == runnable.
3330                  */
3331                 runnable_sum += se->avg.load_sum;
3332                 runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
3333         } else {
3334                 /*
3335                  * Estimate the new unweighted runnable_sum of the gcfs_rq by
3336                  * assuming all tasks are equally runnable.
3337                  */
3338                 if (scale_load_down(gcfs_rq->load.weight)) {
3339                         load_sum = div_s64(gcfs_rq->avg.load_sum,
3340                                 scale_load_down(gcfs_rq->load.weight));
3341                 }
3342 
3343                 /* But make sure to not inflate se's runnable */
3344                 runnable_sum = min(se->avg.load_sum, load_sum);
3345         }
3346 
3347         /*
3348          * runnable_sum can't be lower than running_sum.
3349          * Rescale the running sum to be in the same range as the runnable sum:
3350          * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT] while
3351          * runnable_sum is in [0 : LOAD_AVG_MAX].
3352          */
3353         running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
3354         runnable_sum = max(runnable_sum, running_sum);
3355 
3356         load_sum = (s64)se_weight(se) * runnable_sum;
3357         load_avg = div_s64(load_sum, LOAD_AVG_MAX);
3358 
3359         delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
3360         delta_avg = load_avg - se->avg.load_avg;
3361 
3362         se->avg.load_sum = runnable_sum;
3363         se->avg.load_avg = load_avg;
3364         add_positive(&cfs_rq->avg.load_avg, delta_avg);
3365         add_positive(&cfs_rq->avg.load_sum, delta_sum);
3366 
3367         runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
3368         runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
3369 
3370         if (se->on_rq) {
3371                 delta_sum = runnable_load_sum -
3372                                 se_weight(se) * se->avg.runnable_load_sum;
3373                 delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
3374                 add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
3375                 add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
3376         }
3377 
3378         se->avg.runnable_load_sum = runnable_sum;
3379         se->avg.runnable_load_avg = runnable_load_avg;
3380 }
3381 
3382 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
3383 {
3384         cfs_rq->propagate = 1;
3385         cfs_rq->prop_runnable_sum += runnable_sum;
3386 }
3387 
3388 /* Update task and its cfs_rq load average */
3389 static inline int propagate_entity_load_avg(struct sched_entity *se)
3390 {
3391         struct cfs_rq *cfs_rq, *gcfs_rq;
3392 
3393         if (entity_is_task(se))
3394                 return 0;
3395 
3396         gcfs_rq = group_cfs_rq(se);
3397         if (!gcfs_rq->propagate)
3398                 return 0;
3399 
3400         gcfs_rq->propagate = 0;
3401 
3402         cfs_rq = cfs_rq_of(se);
3403 
3404         add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
3405 
3406         update_tg_cfs_util(cfs_rq, se, gcfs_rq);
3407         update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
3408 
3409         trace_pelt_cfs_tp(cfs_rq);
3410         trace_pelt_se_tp(se);
3411 
3412         return 1;
3413 }
3414 
3415 /*
3416  * Check if we need to update the load and the utilization of a blocked
3417  * group_entity:
3418  */
3419 static inline bool skip_blocked_update(struct sched_entity *se)
3420 {
3421         struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3422 
3423         /*
3424          * If the sched_entity still has non-zero load or utilization, we have
3425          * to decay it:
3426          */
3427         if (se->avg.load_avg || se->avg.util_avg)
3428                 return false;
3429 
3430         /*
3431          * If there is a pending propagation, we have to update the load and
3432          * the utilization of the sched_entity:
3433          */
3434         if (gcfs_rq->propagate)
3435                 return false;
3436 
3437         /*
3438          * Otherwise, the load and the utilization of the sched_entity are
3439          * already zero and there is no pending propagation, so it will be a
3440          * waste of time to try to decay it:
3441          */
3442         return true;
3443 }
3444 
3445 #else /* CONFIG_FAIR_GROUP_SCHED */
3446 
3447 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
3448 
3449 static inline int propagate_entity_load_avg(struct sched_entity *se)
3450 {
3451         return 0;
3452 }
3453 
3454 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
3455 
3456 #endif /* CONFIG_FAIR_GROUP_SCHED */
3457 
3458 /**
3459  * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
3460  * @now: current time, as per cfs_rq_clock_pelt()
3461  * @cfs_rq: cfs_rq to update
3462  *
3463  * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
3464  * avg. The immediate corollary is that all (fair) tasks must be attached, see
3465  * post_init_entity_util_avg().
3466  *
3467  * cfs_rq->avg is used for task_h_load() and update_cfs_group(), for example.
3468  *
3469  * Returns true if the load decayed or we removed load.
3470  *
3471  * Since both these conditions indicate a changed cfs_rq->avg.load we should
3472  * call update_tg_load_avg() when this function returns true.
3473  */
3474 static inline int
3475 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3476 {
3477         unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
3478         struct sched_avg *sa = &cfs_rq->avg;
3479         int decayed = 0;
3480 
3481         if (cfs_rq->removed.nr) {
3482                 unsigned long r;
3483                 u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
3484 
3485                 raw_spin_lock(&cfs_rq->removed.lock);
3486                 swap(cfs_rq->removed.util_avg, removed_util);
3487                 swap(cfs_rq->removed.load_avg, removed_load);
3488                 swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
3489                 cfs_rq->removed.nr = 0;
3490                 raw_spin_unlock(&cfs_rq->removed.lock);
3491 
3492                 r = removed_load;
3493                 sub_positive(&sa->load_avg, r);
3494                 sub_positive(&sa->load_sum, r * divider);
3495 
3496                 r = removed_util;
3497                 sub_positive(&sa->util_avg, r);
3498                 sub_positive(&sa->util_sum, r * divider);
3499 
3500                 add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
3501 
3502                 decayed = 1;
3503         }
3504 
3505         decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
3506 
3507 #ifndef CONFIG_64BIT
3508         smp_wmb();
3509         cfs_rq->load_last_update_time_copy = sa->last_update_time;
3510 #endif
3511 
3512         return decayed;
3513 }
3514 
3515 /**
3516  * attach_entity_load_avg - attach this entity to its cfs_rq load avg
3517  * @cfs_rq: cfs_rq to attach to
3518  * @se: sched_entity to attach
3519  *
3520  * Must call update_cfs_rq_load_avg() before this, since we rely on
3521  * cfs_rq->avg.last_update_time being current.
3522  */
3523 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3524 {
3525         u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
3526 
3527         /*
3528          * When we attach the @se to the @cfs_rq, we must align the decay
3529          * window because without that, really weird and wonderful things can
3530          * happen.
3531          *
3532          * XXX illustrate
3533          */
3534         se->avg.last_update_time = cfs_rq->avg.last_update_time;
3535         se->avg.period_contrib = cfs_rq->avg.period_contrib;
3536 
3537         /*
3538          * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
3539          * period_contrib. This isn't strictly correct, but since we're
3540          * entirely outside of the PELT hierarchy, nobody cares if we truncate
3541          * _sum a little.
3542          */
3543         se->avg.util_sum = se->avg.util_avg * divider;
3544 
3545         se->avg.load_sum = divider;
3546         if (se_weight(se)) {
3547                 se->avg.load_sum =
3548                         div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
3549         }
3550 
3551         se->avg.runnable_load_sum = se->avg.load_sum;
3552 
3553         enqueue_load_avg(cfs_rq, se);
3554         cfs_rq->avg.util_avg += se->avg.util_avg;
3555         cfs_rq->avg.util_sum += se->avg.util_sum;
3556 
3557         add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
3558 
3559         cfs_rq_util_change(cfs_rq, 0);
3560 
3561         trace_pelt_cfs_tp(cfs_rq);
3562 }
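/*
 * Editorial sketch (an editor's assumption, not text from the original
 * source) of why the decay windows are aligned above: period_contrib
 * records how far into the current 1024us PELT window the last update
 * got, and divider = LOAD_AVG_MAX - 1024 + period_contrib is the largest
 * value a _sum can hold at that phase. Copying the cfs_rq's
 * last_update_time and period_contrib into the se, and recomputing
 * util_sum as util_avg * divider, keeps avg ~= sum / divider consistent
 * once the se's sums start being folded into the cfs_rq's sums.
 */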
3563 
3564 /**
3565  * detach_entity_load_avg - detach this entity from its cfs_rq load avg
3566  * @cfs_rq: cfs_rq to detach from
3567  * @se: sched_entity to detach
3568  *
3569  * Must call update_cfs_rq_load_avg() before this, since we rely on
3570  * cfs_rq->avg.last_update_time being current.
3571  */
3572 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3573 {
3574         dequeue_load_avg(cfs_rq, se);
3575         sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3576         sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3577 
3578         add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
3579 
3580         cfs_rq_util_change(cfs_rq, 0);
3581 
3582         trace_pelt_cfs_tp(cfs_rq);
3583 }
3584 
3585 /*
3586  * Optional action to be done while updating the load average
3587  */
3588 #define UPDATE_TG       0x1
3589 #define SKIP_AGE_LOAD   0x2
3590 #define DO_ATTACH       0x4
3591 
3592 /* Update task and its cfs_rq load average */
3593 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3594 {
3595         u64 now = cfs_rq_clock_pelt(cfs_rq);
3596         int decayed;
3597 
3598         /*
3599          * Track task load average for carrying it to the new CPU after migration, and
3600          * track group sched_entity load average for task_h_load calc in migration
3601          */
3602         if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
3603                 __update_load_avg_se(now, cfs_rq, se);
3604 
3605         decayed  = update_cfs_rq_load_avg(now, cfs_rq);
3606         decayed |= propagate_entity_load_avg(se);
3607 
3608         if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
3609 
3610                 /*
3611                  * DO_ATTACH means we're here from enqueue_entity().
3612                  * !last_update_time means we've passed through
3613                  * migrate_task_rq_fair() indicating we migrated.
3614                  *
3615                  * IOW we're enqueueing a task on a new CPU.
3616                  */
3617                 attach_entity_load_avg(cfs_rq, se);
3618                 update_tg_load_avg(cfs_rq, 0);
3619 
3620         } else if (decayed) {
3621                 cfs_rq_util_change(cfs_rq, 0);
3622 
3623                 if (flags & UPDATE_TG)
3624                         update_tg_load_avg(cfs_rq, 0);
3625         }
3626 }
3627 
3628 #ifndef CONFIG_64BIT
3629 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3630 {
3631         u64 last_update_time_copy;
3632         u64 last_update_time;
3633 
3634         do {
3635                 last_update_time_copy = cfs_rq->load_last_update_time_copy;
3636                 smp_rmb();
3637                 last_update_time = cfs_rq->avg.last_update_time;
3638         } while (last_update_time != last_update_time_copy);
3639 
3640         return last_update_time;
3641 }
3642 #else
3643 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3644 {
3645         return cfs_rq->avg.last_update_time;
3646 }
3647 #endif
3648 
3649 /*
3650  * Synchronize entity load avg of dequeued entity without locking
3651  * the previous rq.
3652  */
3653 static void sync_entity_load_avg(struct sched_entity *se)
3654 {
3655         struct cfs_rq *cfs_rq = cfs_rq_of(se);
3656         u64 last_update_time;
3657 
3658         last_update_time = cfs_rq_last_update_time(cfs_rq);
3659         __update_load_avg_blocked_se(last_update_time, se);
3660 }
3661 
3662 /*
3663  * Task first catches up with cfs_rq, and then subtracts
3664  * itself from the cfs_rq (task must be off the queue now).
3665  */
3666 static void remove_entity_load_avg(struct sched_entity *se)
3667 {
3668         struct cfs_rq *cfs_rq = cfs_rq_of(se);
3669         unsigned long flags;
3670 
3671         /*
3672          * tasks cannot exit without having gone through wake_up_new_task() ->
3673          * post_init_entity_util_avg() which will have added things to the
3674          * cfs_rq, so we can remove unconditionally.
3675          */
3676 
3677         sync_entity_load_avg(se);
3678 
3679         raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
3680         ++cfs_rq->removed.nr;
3681         cfs_rq->removed.util_avg        += se->avg.util_avg;
3682         cfs_rq->removed.load_avg        += se->avg.load_avg;
3683         cfs_rq->removed.runnable_sum    += se->avg.load_sum; /* == runnable_sum */
3684         raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
3685 }
3686 
3687 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
3688 {
3689         return cfs_rq->avg.runnable_load_avg;
3690 }
3691 
3692 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3693 {
3694         return cfs_rq->avg.load_avg;
3695 }
3696 
3697 static inline unsigned long task_util(struct task_struct *p)
3698 {
3699         return READ_ONCE(p->se.avg.util_avg);
3700 }
3701 
3702 static inline unsigned long _task_util_est(struct task_struct *p)
3703 {
3704         struct util_est ue = READ_ONCE(p->se.avg.util_est);
3705 
3706         return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
3707 }
3708 
3709 static inline unsigned long task_util_est(struct task_struct *p)
3710 {
3711         return max(task_util(p), _task_util_est(p));
3712 }
3713 
3714 #ifdef CONFIG_UCLAMP_TASK
3715 static inline unsigned long uclamp_task_util(struct task_struct *p)
3716 {
3717         return clamp(task_util_est(p),
3718                      uclamp_eff_value(p, UCLAMP_MIN),
3719                      uclamp_eff_value(p, UCLAMP_MAX));
3720 }
3721 #else
3722 static inline unsigned long uclamp_task_util(struct task_struct *p)
3723 {
3724         return task_util_est(p);
3725 }
3726 #endif
3727 
3728 static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
3729                                     struct task_struct *p)
3730 {
3731         unsigned int enqueued;
3732 
3733         if (!sched_feat(UTIL_EST))
3734                 return;
3735 
3736         /* Update root cfs_rq's estimated utilization */
3737         enqueued  = cfs_rq->avg.util_est.enqueued;
3738         enqueued += _task_util_est(p);
3739         WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
3740 }
3741 
3742 /*
3743  * Check if a (signed) value is within a specified (unsigned) margin,
3744  * based on the observation that:
3745  *
3746  *     abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
3747  *
3748  * NOTE: this only works when value + margin < INT_MAX.
3749  */
3750 static inline bool within_margin(int value, int margin)
3751 {
3752         return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
3753 }
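/*
 * Editorial illustration (not from the original source) of the unsigned
 * trick above, with margin = 100 (so 2 * margin - 1 = 199):
 *
 *   value =   -5: (unsigned)(-5 + 99)   =  94 <  199 -> true  (|x| <  100)
 *   value =   99: (unsigned)(99 + 99)   = 198 <  199 -> true  (|x| <  100)
 *   value = -100: (unsigned)(-100 + 99) wraps huge   -> false (|x| >= 100)
 *   value =  150: (unsigned)(150 + 99)  = 249 >= 199 -> false (|x| >= 100)
 */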
3754 
3755 static void
3756 util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
3757 {
3758         long last_ewma_diff;
3759         struct util_est ue;
3760         int cpu;
3761 
3762         if (!sched_feat(UTIL_EST))
3763                 return;
3764 
3765         /* Update root cfs_rq's estimated utilization */
3766         ue.enqueued  = cfs_rq->avg.util_est.enqueued;
3767         ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
3768         WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
3769 
3770         /*
3771          * Skip update of task's estimated utilization when the task has not
3772          * yet completed an activation, e.g. being migrated.
3773          */
3774         if (!task_sleep)
3775                 return;
3776 
3777         /*
3778          * If the PELT values haven't changed since enqueue time,
3779          * skip the util_est update.
3780          */
3781         ue = p->se.avg.util_est;
3782         if (ue.enqueued & UTIL_AVG_UNCHANGED)
3783                 return;
3784 
3785         /*
3786          * Reset the EWMA on utilization increases; the moving average is used only
3787          * to smooth utilization decreases.
3788          */
3789         ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
3790         if (sched_feat(UTIL_EST_FASTUP)) {
3791                 if (ue.ewma < ue.enqueued) {
3792                         ue.ewma = ue.enqueued;
3793                         goto done;
3794                 }
3795         }
3796 
3797         /*
3798          * Skip update of task's estimated utilization when its EWMA is
3799          * already ~1% close to its last activation value.
3800          */
3801         last_ewma_diff = ue.enqueued - ue.ewma;
3802         if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
3803                 return;
3804 
3805         /*
3806          * To avoid overestimation of actual task utilization, skip updates if
3807          * we cannot guarantee that there is idle time on this CPU.
3808          */
3809         cpu = cpu_of(rq_of(cfs_rq));
3810         if (task_util(p) > capacity_orig_of(cpu))
3811                 return;
3812 
3813         /*
3814          * Update Task's estimated utilization
3815          *
3816          * When *p completes an activation we can consolidate another sample
3817          * of the task size. This is done by storing the current PELT value
3818          * as ue.enqueued and by using this value to update the Exponential
3819          * Weighted Moving Average (EWMA):
3820          *
3821          *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
3822          *          = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
3823          *          = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
3824          *          = w * (      last_ewma_diff            ) +     ewma(t-1)
3825          *          = w * (last_ewma_diff  +  ewma(t-1) / w)
3826          *
3827          * Where 'w' is the weight of new samples, which is configured to be
3828          * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
3829          */
3830         ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
3831         ue.ewma  += last_ewma_diff;
3832         ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
3833 done:
3834         WRITE_ONCE(p->se.avg.util_est, ue);
3835 }
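/*
 * Editorial illustration (not from the original source) of the EWMA step
 * above, with UTIL_EST_WEIGHT_SHIFT = 2 (w = 1/4), a previous ewma of 200
 * and a new sample task_util(p) = 280:
 *
 *   last_ewma_diff = 280 - 200 = 80   (outside the ~1% margin, so update)
 *   ewma = (200 << 2) = 800
 *   ewma = 800 + 80   = 880
 *   ewma = 880 >> 2   = 220
 *
 * which matches 0.25 * 280 + 0.75 * 200 = 220.
 */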
3836 
3837 static inline int task_fits_capacity(struct task_struct *p, long capacity)
3838 {
3839         return fits_capacity(uclamp_task_util(p), capacity);
3840 }
3841 
3842 static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
3843 {
3844         if (!static_branch_unlikely(&sched_asym_cpucapacity))
3845                 return;
3846 
3847         if (!p) {
3848                 rq->misfit_task_load = 0;
3849                 return;
3850         }
3851 
3852         if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
3853                 rq->misfit_task_load = 0;
3854                 return;
3855         }
3856 
3857         rq->misfit_task_load = task_h_load(p);
3858 }
3859 
3860 #else /* CONFIG_SMP */
3861 
3862 #define UPDATE_TG       0x0
3863 #define SKIP_AGE_LOAD   0x0
3864 #define DO_ATTACH       0x0
3865 
3866 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
3867 {
3868         cfs_rq_util_change(cfs_rq, 0);
3869 }
3870 
3871 static inline void remove_entity_load_avg(struct sched_entity *se) {}
3872 
3873 static inline void
3874 attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3875 static inline void
3876 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3877 
3878 static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
3879 {
3880         return 0;
3881 }
3882 
3883 static inline void
3884 util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
3885 
3886 static inline void
3887 util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
3888                  bool task_sleep) {}
3889 static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
3890 
3891 #endif /* CONFIG_SMP */
3892 
3893 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3894 {
3895 #ifdef CONFIG_SCHED_DEBUG
3896         s64 d = se->vruntime - cfs_rq->min_vruntime;
3897 
3898         if (d < 0)
3899                 d = -d;
3900 
3901         if (d > 3*sysctl_sched_latency)
3902                 schedstat_inc(cfs_rq->nr_spread_over);
3903 #endif
3904 }
3905 
3906 static void
3907 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
3908 {
3909         u64 vruntime = cfs_rq->min_vruntime;
3910 
3911         /*
3912          * The 'current' period is already promised to the current tasks,
3913          * however the extra weight of the new task will slow them down a
3914          * little; place the new task so that it fits in the slot that
3915          * stays open at the end.
3916          */
3917         if (initial && sched_feat(START_DEBIT))
3918                 vruntime += sched_vslice(cfs_rq, se);
3919 
3920         /* sleeps up to a single latency don't count. */
3921         if (!initial) {
3922                 unsigned long thresh = sysctl_sched_latency;
3923 
3924                 /*
3925                  * Halve their sleep time's effect, to allow
3926                  * for a gentler effect of sleepers:
3927                  */
3928                 if (sched_feat(GENTLE_FAIR_SLEEPERS))
3929                         thresh >>= 1;
3930 
3931                 vruntime -= thresh;
3932         }
3933 
3934         /* ensure we never gain time by being placed backwards. */
3935         se->vruntime = max_vruntime(se->vruntime, vruntime);
3936 }
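/*
 * Editorial illustration (not from the original source), ignoring the
 * (1 + ilog(ncpus)) scaling of the tunables: with a 6ms sched_latency, a
 * task waking from sleep is placed at min_vruntime - 3ms (the threshold
 * halved by GENTLE_FAIR_SLEEPERS), while a freshly forked task with
 * START_DEBIT is placed one vslice after min_vruntime. The final
 * max_vruntime() means a task that only slept briefly keeps its own,
 * larger vruntime instead of gaining credit from the placement.
 */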
3937 
3938 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
3939 
3940 static inline void check_schedstat_required(void)
3941 {
3942 #ifdef CONFIG_SCHEDSTATS
3943         if (schedstat_enabled())
3944                 return;
3945 
3946         /* Force schedstat enabled if a dependent tracepoint is active */
3947         if (trace_sched_stat_wait_enabled()    ||
3948                         trace_sched_stat_sleep_enabled()   ||
3949                         trace_sched_stat_iowait_enabled()  ||
3950                         trace_sched_stat_blocked_enabled() ||
3951                         trace_sched_stat_runtime_enabled())  {
3952                 printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
3953                              "stat_blocked and stat_runtime require the "
3954                              "kernel parameter schedstats=enable or "
3955                              "kernel.sched_schedstats=1\n");
3956         }
3957 #endif
3958 }
3959 
3960 static inline bool cfs_bandwidth_used(void);
3961 
3962 /*
3963  * MIGRATION
3964  *
3965  *      dequeue
3966  *        update_curr()
3967  *          update_min_vruntime()
3968  *        vruntime -= min_vruntime
3969  *
3970  *      enqueue
3971  *        update_curr()
3972  *          update_min_vruntime()
3973  *        vruntime += min_vruntime
3974  *
3975  * this way the vruntime transition between RQs is done when both
3976  * min_vruntime are up-to-date.
3977  *
3978  * WAKEUP (remote)
3979  *
3980  *      ->migrate_task_rq_fair() (p->state == TASK_WAKING)
3981  *        vruntime -= min_vruntime
3982  *
3983  *      enqueue
3984  *        update_curr()
3985  *          update_min_vruntime()
3986  *        vruntime += min_vruntime
3987  *
3988  * this way we don't have the most up-to-date min_vruntime on the originating
3989  * CPU and an up-to-date min_vruntime on the destination CPU.
3990  */
3991 
3992 static void
3993 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3994 {
3995         bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
3996         bool curr = cfs_rq->curr == se;
3997 
3998         /*
3999          * If we're the current task, we must renormalise before calling
4000          * update_curr().
4001          */
4002         if (renorm && curr)
4003                 se->vruntime += cfs_rq->min_vruntime;
4004 
4005         update_curr(cfs_rq);
4006 
4007         /*
4008          * Otherwise, renormalise after, such that we're placed at the current
4009          * moment in time, instead of some random moment in the past. Being
4010          * placed in the past could significantly boost this task to the
4011          * fairness detriment of existing tasks.
4012          */
4013         if (renorm && !curr)
4014                 se->vruntime += cfs_rq->min_vruntime;
4015 
4016         /*
4017          * When enqueuing a sched_entity, we must:
4018          *   - Update loads to have both entity and cfs_rq synced with now.
4019          *   - Add its load to cfs_rq->runnable_avg
4020          *   - For group_entity, update its weight to reflect the new share of
4021          *     its group cfs_rq
4022          *   - Add its new weight to cfs_rq->load.weight
4023          */
4024         update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
4025         update_cfs_group(se);
4026         enqueue_runnable_load_avg(cfs_rq, se);
4027         account_entity_enqueue(cfs_rq, se);
4028 
4029         if (flags & ENQUEUE_WAKEUP)
4030                 place_entity(cfs_rq, se, 0);
4031 
4032         check_schedstat_required();
4033         update_stats_enqueue(cfs_rq, se, flags);
4034         check_spread(cfs_rq, se);
4035         if (!curr)
4036                 __enqueue_entity(cfs_rq, se);
4037         se->on_rq = 1;
4038 
4039         /*
4040          * When bandwidth control is enabled, cfs might have been removed
4041          * because of a parent being throttled even though cfs->nr_running > 1.
4042          * Try to add it unconditionally.
4043          */
4044         if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
4045                 list_add_leaf_cfs_rq(cfs_rq);
4046 
4047         if (cfs_rq->nr_running == 1)
4048                 check_enqueue_throttle(cfs_rq);
4049 }
4050 
4051 static void __clear_buddies_last(struct sched_entity *se)
4052 {
4053         for_each_sched_entity(se) {
4054                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4055                 if (cfs_rq->last != se)
4056                         break;
4057 
4058                 cfs_rq->last = NULL;
4059         }
4060 }
4061 
4062 static void __clear_buddies_next(struct sched_entity *se)
4063 {
4064         for_each_sched_entity(se) {
4065                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4066                 if (cfs_rq->next != se)
4067                         break;
4068 
4069                 cfs_rq->next = NULL;
4070         }
4071 }
4072 
4073 static void __clear_buddies_skip(struct sched_entity *se)
4074 {
4075         for_each_sched_entity(se) {
4076                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4077                 if (cfs_rq->skip != se)
4078                         break;
4079 
4080                 cfs_rq->skip = NULL;
4081         }
4082 }
4083 
4084 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
4085 {
4086         if (cfs_rq->last == se)
4087                 __clear_buddies_last(se);
4088 
4089         if (cfs_rq->next == se)
4090                 __clear_buddies_next(se);
4091 
4092         if (cfs_rq->skip == se)
4093                 __clear_buddies_skip(se);
4094 }
4095 
4096 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4097 
4098 static void
4099 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4100 {
4101         /*
4102          * Update run-time statistics of the 'current'.
4103          */
4104         update_curr(cfs_rq);
4105 
4106         /*
4107          * When dequeuing a sched_entity, we must:
4108          *   - Update loads to have both entity and cfs_rq synced with now.
4109          *   - Subtract its load from the cfs_rq->runnable_avg.
4110          *   - Subtract its previous weight from cfs_rq->load.weight.
4111          *   - For group entity, update its weight to reflect the new share
4112          *     of its group cfs_rq.
4113          */
4114         update_load_avg(cfs_rq, se, UPDATE_TG);
4115         dequeue_runnable_load_avg(cfs_rq, se);
4116 
4117         update_stats_dequeue(cfs_rq, se, flags);
4118 
4119         clear_buddies(cfs_rq, se);
4120 
4121         if (se != cfs_rq->curr)
4122                 __dequeue_entity(cfs_rq, se);
4123         se->on_rq = 0;
4124         account_entity_dequeue(cfs_rq, se);
4125 
4126         /*
4127          * Normalize after update_curr(); which will also have moved
4128          * min_vruntime if @se is the one holding it back. But before doing
4129          * update_min_vruntime() again, which will discount @se's position and
4130          * can move min_vruntime forward still more.
4131          */
4132         if (!(flags & DEQUEUE_SLEEP))
4133                 se->vruntime -= cfs_rq->min_vruntime;
4134 
4135         /* return excess runtime on last dequeue */
4136         return_cfs_rq_runtime(cfs_rq);
4137 
4138         update_cfs_group(se);
4139 
4140         /*
4141          * Now advance min_vruntime if @se was the entity holding it back,
4142          * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
4143          * put back on, and if we advance min_vruntime, we'll be placed back
4144          * further than we started -- ie. we'll be penalized.
4145          */
4146         if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
4147                 update_min_vruntime(cfs_rq);
4148 }
4149 
4150 /*
4151  * Preempt the current task with a newly woken task if needed:
4152  */
4153 static void
4154 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4155 {
4156         unsigned long ideal_runtime, delta_exec;
4157         struct sched_entity *se;
4158         s64 delta;
4159 
4160         ideal_runtime = sched_slice(cfs_rq, curr);
4161         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
4162         if (delta_exec > ideal_runtime) {
4163                 resched_curr(rq_of(cfs_rq));
4164                 /*
4165                  * The current task ran long enough, ensure it doesn't get
4166                  * re-elected due to buddy favours.
4167                  */
4168                 clear_buddies(cfs_rq, curr);
4169                 return;
4170         }
4171 
4172         /*
4173          * Ensure that a task that missed wakeup preemption by a
4174          * narrow margin doesn't have to wait for a full slice.
4175          * This also mitigates buddy induced latencies under load.
4176          */
4177         if (delta_exec < sysctl_sched_min_granularity)
4178                 return;
4179 
4180         se = __pick_first_entity(cfs_rq);
4181         delta = curr->vruntime - se->vruntime;
4182 
4183         if (delta < 0)
4184                 return;
4185 
4186         if (delta > ideal_runtime)
4187                 resched_curr(rq_of(cfs_rq));
4188 }
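/*
 * Editorial illustration (not from the original source), again ignoring
 * the ilog(ncpus) scaling: with two nice-0 tasks and a 6ms sched_latency,
 * sched_slice() gives each task an ideal_runtime of about 3ms. A current
 * task that has run 4ms since being picked is rescheduled outright; one
 * that has run only 0.5ms is left alone because that is below the 0.75ms
 * minimum granularity, even if the leftmost task's vruntime lags far
 * behind. Otherwise preemption also triggers once curr's vruntime leads
 * the leftmost entity's by more than ideal_runtime.
 */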
4189 
4190 static void
4191 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4192 {
4193         /* 'current' is not kept within the tree. */
4194         if (se->on_rq) {
4195                 /*
4196                  * Any task has to be enqueued before it gets to execute on
4197                  * a CPU. So account for the time it spent waiting on the
4198                  * runqueue.
4199                  */
4200                 update_stats_wait_end(cfs_rq, se);
4201                 __dequeue_entity(cfs_rq, se);
4202                 update_load_avg(cfs_rq, se, UPDATE_TG);
4203         }
4204 
4205         update_stats_curr_start(cfs_rq, se);
4206         cfs_rq->curr = se;
4207 
4208         /*
4209          * Track our maximum slice length, if the CPU's load is at
4210          * least twice that of our own weight (i.e. don't track it
4211          * when there are only lesser-weight tasks around):
4212          */
4213         if (schedstat_enabled() &&
4214             rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
4215                 schedstat_set(se->statistics.slice_max,
4216                         max((u64)schedstat_val(se->statistics.slice_max),
4217                             se->sum_exec_runtime - se->prev_sum_exec_runtime));
4218         }
4219 
4220         se->prev_sum_exec_runtime = se->sum_exec_runtime;
4221 }
4222 
4223 static int
4224 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
4225 
4226 /*
4227  * Pick the next process, keeping these things in mind, in this order:
4228  * 1) keep things fair between processes/task groups
4229  * 2) pick the "next" process, since someone really wants that to run
4230  * 3) pick the "last" process, for cache locality
4231  * 4) do not run the "skip" process, if something else is available
4232  */
4233 static struct sched_entity *
4234 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4235 {
4236         struct sched_entity *left = __pick_first_entity(cfs_rq);
4237         struct sched_entity *se;
4238 
4239         /*
4240          * If curr is set we have to see if it's left of the leftmost entity
4241          * still in the tree, provided there was anything in the tree at all.
4242          */
4243         if (!left || (curr && entity_before(curr, left)))
4244                 left = curr;
4245 
4246         se = left; /* ideally we run the leftmost entity */
4247 
4248         /*
4249          * Avoid running the skip buddy, if running something else can
4250          * be done without getting too unfair.
4251          */
4252         if (cfs_rq->skip == se) {
4253                 struct sched_entity *second;
4254 
4255                 if (se == curr) {
4256                         second = __pick_first_entity(cfs_rq);
4257                 } else {
4258                         second = __pick_next_entity(se);
4259                         if (!second || (curr && entity_before(curr, second)))
4260                                 second = curr;
4261                 }
4262 
4263                 if (second && wakeup_preempt_entity(second, left) < 1)
4264                         se = second;
4265         }
4266 
4267         /*
4268          * Prefer last buddy, try to return the CPU to a preempted task.
4269          */
4270         if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
4271                 se = cfs_rq->last;
4272 
4273         /*
4274          * Someone really wants this to run. If it's not unfair, run it.
4275          */
4276         if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
4277                 se = cfs_rq->next;
4278 
4279         clear_buddies(cfs_rq, se);
4280 
4281         return se;
4282 }
4283 
4284 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4285 
4286 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
4287 {
4288         /*
4289          * If still on the runqueue then deactivate_task()
4290          * was not called and update_curr() has to be done:
4291          */
4292         if (prev->on_rq)
4293                 update_curr(cfs_rq);
4294 
4295         /* throttle cfs_rqs exceeding runtime */
4296         check_cfs_rq_runtime(cfs_rq);
4297 
4298         check_spread(cfs_rq, prev);
4299 
4300         if (prev->on_rq) {
4301                 update_stats_wait_start(cfs_rq, prev);
4302                 /* Put 'current' back into the tree. */
4303                 __enqueue_entity(cfs_rq, prev);
4304                 /* in !on_rq case, update occurred at dequeue */
4305                 update_load_avg(cfs_rq, prev, 0);
4306         }
4307         cfs_rq->curr = NULL;
4308 }
4309 
4310 static void
4311 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
4312 {
4313         /*
4314          * Update run-time statistics of the 'current'.
4315          */
4316         update_curr(cfs_rq);
4317 
4318         /*
4319          * Ensure that runnable average is periodically updated.
4320          */
4321         update_load_avg(cfs_rq, curr, UPDATE_TG);
4322         update_cfs_group(curr);
4323 
4324 #ifdef CONFIG_SCHED_HRTICK
4325         /*
4326          * queued ticks are scheduled to match the slice, so don't bother
4327          * validating it and just reschedule.
4328          */
4329         if (queued) {
4330                 resched_curr(rq_of(cfs_rq));
4331                 return;
4332         }
4333         /*
4334          * don't let the period tick interfere with the hrtick preemption
4335          */
4336         if (!sched_feat(DOUBLE_TICK) &&
4337                         hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
4338                 return;
4339 #endif
4340 
4341         if (cfs_rq->nr_running > 1)
4342                 check_preempt_tick(cfs_rq, curr);
4343 }
4344 
4345 
4346 /**************************************************
4347  * CFS bandwidth control machinery
4348  */
4349 
4350 #ifdef CONFIG_CFS_BANDWIDTH
4351 
4352 #ifdef CONFIG_JUMP_LABEL
4353 static struct static_key __cfs_bandwidth_used;
4354 
4355 static inline bool cfs_bandwidth_used(void)
4356 {
4357         return static_key_false(&__cfs_bandwidth_used);
4358 }
4359 
4360 void cfs_bandwidth_usage_inc(void)
4361 {
4362         static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
4363 }
4364 
4365 void cfs_bandwidth_usage_dec(void)
4366 {
4367         static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
4368 }
4369 #else /* CONFIG_JUMP_LABEL */
4370 static bool cfs_bandwidth_used(void)
4371 {
4372         return true;
4373 }
4374 
4375 void cfs_bandwidth_usage_inc(void) {}
4376 void cfs_bandwidth_usage_dec(void) {}
4377 #endif /* CONFIG_JUMP_LABEL */
4378 
4379 /*
4380  * default period for cfs group bandwidth.
4381  * default: 0.1s, units: nanoseconds
4382  */
4383 static inline u64 default_cfs_period(void)
4384 {
4385         return 100000000ULL;
4386 }
4387 
4388 static inline u64 sched_cfs_bandwidth_slice(void)
4389 {
4390         return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
4391 }
4392 
4393 /*
4394  * Replenish runtime according to assigned quota. We use sched_clock_cpu
4395  * directly instead of rq->clock to avoid adding additional synchronization
4396  * around rq->lock.
4397  *
4398  * requires cfs_b->lock
4399  */
4400 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
4401 {
4402         if (cfs_b->quota != RUNTIME_INF)
4403                 cfs_b->runtime = cfs_b->quota;
4404 }
4405 
4406 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4407 {
4408         return &tg->cfs_bandwidth;
4409 }
4410 
4411 /* returns 0 on failure to allocate runtime */
4412 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4413 {
4414         struct task_group *tg = cfs_rq->tg;
4415         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
4416         u64 amount = 0, min_amount;
4417 
4418         /* note: this is a positive sum as runtime_remaining <= 0 */
4419         min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
4420 
4421         raw_spin_lock(&cfs_b->lock);
4422         if (cfs_b->quota == RUNTIME_INF)
4423                 amount = min_amount;
4424         else {
4425                 start_cfs_bandwidth(cfs_b);
4426 
4427                 if (cfs_b->runtime > 0) {
4428                         amount = min(cfs_b->runtime, min_amount);
4429                         cfs_b->runtime -= amount;
4430                         cfs_b->idle = 0;
4431                 }
4432         }
4433         raw_spin_unlock(&cfs_b->lock);
4434 
4435         cfs_rq->runtime_remaining += amount;
4436 
4437         return cfs_rq->runtime_remaining > 0;
4438 }
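/*
 * Editorial illustration (not from the original source), assuming the
 * default 5ms bandwidth slice: a cfs_rq that has overrun its local budget
 * by 1ms (runtime_remaining = -1ms) asks the global pool for
 * min_amount = 5ms - (-1ms) = 6ms. If the pool only has 4ms left it gets
 * those 4ms, leaving runtime_remaining = 3ms and returning success; if
 * the pool is empty, runtime_remaining stays <= 0, the function returns 0
 * and the caller reschedules so the hierarchy can be throttled.
 */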
4439 
4440 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4441 {
4442         /* dock delta_exec before expiring quota (as it could span periods) */
4443         cfs_rq->runtime_remaining -= delta_exec;
4444 
4445         if (likely(cfs_rq->runtime_remaining > 0))
4446                 return;
4447 
4448         if (cfs_rq->throttled)
4449                 return;
4450         /*
4451          * if we're unable to extend our runtime we resched so that the active
4452          * hierarchy can be throttled
4453          */
4454         if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
4455                 resched_curr(rq_of(cfs_rq));
4456 }
4457 
4458 static __always_inline
4459 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4460 {
4461         if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
4462                 return;
4463 
4464         __account_cfs_rq_runtime(cfs_rq, delta_exec);
4465 }
4466 
4467 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4468 {
4469         return cfs_bandwidth_used() && cfs_rq->throttled;
4470 }
4471 
4472 /* check whether cfs_rq, or any parent, is throttled */
4473 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4474 {
4475         return cfs_bandwidth_used() && cfs_rq->throttle_count;
4476 }
4477 
4478 /*
4479  * Ensure that neither of the group entities corresponding to src_cpu nor
4480  * dest_cpu is a member of a throttled hierarchy when performing group
4481  * load-balance operations.
4482  */
4483 static inline int throttled_lb_pair(struct task_group *tg,
4484                                     int src_cpu, int dest_cpu)
4485 {
4486         struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
4487 
4488         src_cfs_rq = tg->cfs_rq[src_cpu];
4489         dest_cfs_rq = tg->cfs_rq[dest_cpu];
4490 
4491         return throttled_hierarchy(src_cfs_rq) ||
4492                throttled_hierarchy(dest_cfs_rq);
4493 }
4494 
4495 static int tg_unthrottle_up(struct task_group *tg, void *data)
4496 {
4497         struct rq *rq = data;
4498         struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4499 
4500         cfs_rq->throttle_count--;
4501         if (!cfs_rq->throttle_count) {
4502                 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
4503                                              cfs_rq->throttled_clock_task;
4504 
4505                 /* Add cfs_rq with an already running entity to the list */
4506                 if (cfs_rq->nr_running >= 1)
4507                         list_add_leaf_cfs_rq(cfs_rq);
4508         }
4509 
4510         return 0;
4511 }
4512 
4513 static int tg_throttle_down(struct task_group *tg, void *data)
4514 {
4515         struct rq *rq = data;
4516         struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4517 
4518         /* group is entering throttled state, stop time */
4519         if (!cfs_rq->throttle_count) {
4520                 cfs_rq->throttled_clock_task = rq_clock_task(rq);
4521                 list_del_leaf_cfs_rq(cfs_rq);
4522         }
4523         cfs_rq->throttle_count++;
4524 
4525         return 0;
4526 }
4527 
4528 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
4529 {
4530         struct rq *rq = rq_of(cfs_rq);
4531         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4532         struct sched_entity *se;
4533         long task_delta, idle_task_delta, dequeue = 1;
4534         bool empty;
4535 
4536         se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
4537 
4538         /* freeze hierarchy runnable averages while throttled */
4539         rcu_read_lock();
4540         walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
4541         rcu_read_unlock();
4542 
4543         task_delta = cfs_rq->h_nr_running;
4544         idle_task_delta = cfs_rq->idle_h_nr_running;
4545         for_each_sched_entity(se) {
4546                 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4547                 /* throttled entity or throttle-on-deactivate */
4548                 if (!se->on_rq)
4549                         break;
4550 
4551                 if (dequeue)
4552                         dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
4553                 qcfs_rq->h_nr_running -= task_delta;
4554                 qcfs_rq->idle_h_nr_running -= idle_task_delta;
4555 
4556                 if (qcfs_rq->load.weight)
4557                         dequeue = 0;
4558         }
4559 
4560         if (!se)
4561                 sub_nr_running(rq, task_delta);
4562 
4563         cfs_rq->throttled = 1;
4564         cfs_rq->throttled_clock = rq_clock(rq);
4565         raw_spin_lock(&cfs_b->lock);
4566         empty = list_empty(&cfs_b->throttled_cfs_rq);
4567 
4568         /*
4569          * Add to the _head_ of the list, so that an already-started
4570          * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
4571          * not running, add to the tail so that later runqueues don't get starved.
4572          */
4573         if (cfs_b->distribute_running)
4574                 list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4575         else
4576                 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4577 
4578         /*
4579          * If we're the first throttled cfs_rq, make sure the bandwidth
4580          * timer is running.
4581          */
4582         if (empty)
4583                 start_cfs_bandwidth(cfs_b);
4584 
4585         raw_spin_unlock(&cfs_b->lock);
4586 }
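
/*
 * Hierarchy note (editorial illustration): if the throttled cfs_rq carries
 * three runnable tasks, task_delta == 3 is subtracted from h_nr_running of
 * every ancestor cfs_rq.  The actual dequeue of a group entity stops at the
 * first ancestor that still has other weight (dequeue is cleared), since
 * that ancestor must stay queued for its remaining children; only the
 * counters keep being adjusted above that point, and rq->nr_running is
 * reduced only when the walk reached the root.
 */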
4587 
4588 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
4589 {
4590         struct rq *rq = rq_of(cfs_rq);
4591         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4592         struct sched_entity *se;
4593         int enqueue = 1;
4594         long task_delta, idle_task_delta;
4595 
4596         se = cfs_rq->tg->se[cpu_of(rq)];
4597 
4598         cfs_rq->throttled = 0;
4599 
4600         update_rq_clock(rq);
4601 
4602         raw_spin_lock(&cfs_b->lock);
4603         cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
4604         list_del_rcu(&cfs_rq->throttled_list);
4605         raw_spin_unlock(&cfs_b->lock);
4606 
4607         /* update hierarchical throttle state */
4608         walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
4609 
4610         if (!cfs_rq->load.weight)
4611                 return;
4612 
4613         task_delta = cfs_rq->h_nr_running;
4614         idle_task_delta = cfs_rq->idle_h_nr_running;
4615         for_each_sched_entity(se) {
4616                 if (se->on_rq)
4617                         enqueue = 0;
4618 
4619                 cfs_rq = cfs_rq_of(se);
4620                 if (enqueue)
4621                         enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
4622                 cfs_rq->h_nr_running += task_delta;
4623                 cfs_rq->idle_h_nr_running += idle_task_delta;
4624 
4625                 if (cfs_rq_throttled(cfs_rq))
4626                         break;
4627         }
4628 
4629         if (!se)
4630                 add_nr_running(rq, task_delta);
4631 
4632         /*
4633          * The cfs_rq_throttled() breaks in the above iteration can result in
4634          * incomplete leaf list maintenance, resulting in triggering the
4635          * assertion below.
4636          */
4637         for_each_sched_entity(se) {
4638                 cfs_rq = cfs_rq_of(se);
4639 
4640                 list_add_leaf_cfs_rq(cfs_rq);
4641         }
4642 
4643         assert_list_leaf_cfs_rq(rq);
4644 
4645         /* Determine whether we need to wake up potentially idle CPU: */
4646         if (rq->curr == rq->idle && rq->cfs.nr_running)
4647                 resched_curr(rq);
4648 }
4649 
4650 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
4651 {
4652         struct cfs_rq *cfs_rq;
4653         u64 runtime;
4654         u64 starting_runtime = remaining;
4655 
4656         rcu_read_lock();
4657         list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
4658                                 throttled_list) {
4659                 struct rq *rq = rq_of(cfs_rq);
4660                 struct rq_flags rf;
4661 
4662                 rq_lock_irqsave(rq, &rf);
4663                 if (!cfs_rq_throttled(cfs_rq))
4664                         goto next;
4665 
4666                 /* By the above check, this should never be true */
4667                 SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
4668 
4669                 runtime = -cfs_rq->runtime_remaining + 1;
4670                 if (runtime > remaining)
4671                         runtime = remaining;
4672                 remaining -= runtime;
4673 
4674                 cfs_rq->runtime_remaining += runtime;
4675 
4676                 /* we check whether we're throttled above */
4677                 if (cfs_rq->runtime_remaining > 0)
4678                         unthrottle_cfs_rq(cfs_rq);
4679 
4680 next:
4681                 rq_unlock_irqrestore(rq, &rf);
4682 
4683                 if (!remaining)
4684                         break;
4685         }
4686         rcu_read_unlock();
4687 
4688         return starting_runtime - remaining;
4689 }
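
/*
 * Worked example (illustrative numbers): with remaining = 10ms and two
 * throttled cfs_rqs whose deficits are 3ms and 8ms, the first receives
 * 3ms + 1ns (just enough to go positive) and is unthrottled; the second
 * receives the 6,999,999ns that are left, its runtime_remaining stays
 * negative and it remains throttled until the next refill.  The function
 * returns the full 10ms that were handed out.
 */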
4690 
4691 /*
4692  * Responsible for refilling a task_group's bandwidth and unthrottling its
4693  * cfs_rqs as appropriate. If there has been no activity within the last
4694  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
4695  * used to track this state.
4696  */
4697 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
4698 {
4699         u64 runtime;
4700         int throttled;
4701 
4702         /* no need to continue the timer with no bandwidth constraint */
4703         if (cfs_b->quota == RUNTIME_INF)
4704                 goto out_deactivate;
4705 
4706         throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4707         cfs_b->nr_periods += overrun;
4708 
4709         /*
4710          * idle depends on !throttled (for the case of a large deficit), and if
4711          * we're going inactive then everything else can be deferred
4712          */
4713         if (cfs_b->idle && !throttled)
4714                 goto out_deactivate;
4715 
4716         __refill_cfs_bandwidth_runtime(cfs_b);
4717 
4718         if (!throttled) {
4719                 /* mark as potentially idle for the upcoming period */
4720                 cfs_b->idle = 1;
4721                 return 0;
4722         }
4723 
4724         /* account preceding periods in which throttling occurred */
4725         cfs_b->nr_throttled += overrun;
4726 
4727         /*
4728          * This check is repeated as we are holding onto the new bandwidth while
4729          * we unthrottle. This can potentially race with an unthrottled group
4730          * trying to acquire new bandwidth from the global pool. This can result
4731          * in us over-using our runtime if it is all used during this loop, but
4732          * only by limited amounts in that extreme case.
4733          */
4734         while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
4735                 runtime = cfs_b->runtime;
4736                 cfs_b->distribute_running = 1;
4737                 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4738                 /* we can't nest cfs_b->lock while distributing bandwidth */
4739                 runtime = distribute_cfs_runtime(cfs_b, runtime);
4740                 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4741 
4742                 cfs_b->distribute_running = 0;
4743                 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4744 
4745                 lsub_positive(&cfs_b->runtime, runtime);
4746         }
4747 
4748         /*
4749          * While we are ensured activity in the period following an
4750          * unthrottle, this also covers the case in which the new bandwidth is
4751          * insufficient to cover the existing bandwidth deficit.  (Forcing the
4752          * timer to remain active while there are any throttled entities.)
4753          */
4754         cfs_b->idle = 0;
4755 
4756         return 0;
4757 
4758 out_deactivate:
4759         return 1;
4760 }
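
/*
 * Usage note (summary, not part of this file): the quota/period pair
 * serviced by this timer is what cgroup userspace configures, e.g. via
 * cpu.cfs_quota_us / cpu.cfs_period_us (cgroup v1) or cpu.max (cgroup v2).
 * quota = 50ms with period = 100ms caps the whole group at half a CPU's
 * worth of time per period; quota = 200ms with period = 100ms allows up to
 * two CPUs' worth, spread across however many CPUs the group runs on.
 */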
4761 
4762 /* a cfs_rq won't donate quota below this amount */
4763 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
4764 /* minimum remaining period time to redistribute slack quota */
4765 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
4766 /* how long we wait to gather additional slack before distributing */
4767 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
4768 
4769 /*
4770  * Are we near the end of the current quota period?
4771  *
4772  * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
4773  * hrtimer base being cleared by hrtimer_start. In the case of
4774  * migrate_hrtimers, base is never cleared, so we are fine.
4775  */
4776 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
4777 {
4778         struct hrtimer *refresh_timer = &cfs_b->period_timer;
4779         u64 remaining;
4780 
4781         /* if the call-back is running a quota refresh is already occurring */
4782         if (hrtimer_callback_running(refresh_timer))
4783                 return 1;
4784 
4785         /* is a quota refresh about to occur? */
4786         remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
4787         if (remaining < min_expire)
4788                 return 1;
4789 
4790         return 0;
4791 }
4792 
4793 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
4794 {
4795         u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
4796 
4797         /* if there's a quota refresh soon don't bother with slack */
4798         if (runtime_refresh_within(cfs_b, min_left))
4799                 return;
4800 
4801         /* don't push forwards an existing deferred unthrottle */
4802         if (cfs_b->slack_started)
4803                 return;
4804         cfs_b->slack_started = true;
4805 
4806         hrtimer_start(&cfs_b->slack_timer,
4807                         ns_to_ktime(cfs_bandwidth_slack_period),
4808                         HRTIMER_MODE_REL);
4809 }
4810 
4811 /* we know any runtime found here is valid as update_curr() precedes return */
4812 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4813 {
4814         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4815         s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
4816 
4817         if (slack_runtime <= 0)
4818                 return;
4819 
4820         raw_spin_lock(&cfs_b->lock);
4821         if (cfs_b->quota != RUNTIME_INF) {
4822                 cfs_b->runtime += slack_runtime;
4823 
4824                 /* we are under rq->lock, defer unthrottling using a timer */
4825                 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
4826                     !list_empty(&cfs_b->throttled_cfs_rq))
4827                         start_cfs_slack_bandwidth(cfs_b);
4828         }
4829         raw_spin_unlock(&cfs_b->lock);
4830 
4831         /* even if it's not valid for return we don't want to try again */
4832         cfs_rq->runtime_remaining -= slack_runtime;
4833 }
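
/*
 * Worked example: a cfs_rq going idle with runtime_remaining = 4ms keeps
 * min_cfs_rq_runtime (1ms) for a quick wakeup and returns 3ms of slack to
 * the global pool.  If that pushes the pool above one bandwidth slice while
 * other cfs_rqs are still throttled, the 5ms slack timer is armed - unless
 * a regular quota refresh would occur within the next 7ms (the 5ms slack
 * delay plus min_bandwidth_expiration), in which case the refresh is left
 * to handle the redistribution.
 */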
4834 
4835 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4836 {
4837         if (!cfs_bandwidth_used())
4838                 return;
4839 
4840         if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
4841                 return;
4842 
4843         __return_cfs_rq_runtime(cfs_rq);
4844 }
4845 
4846 /*
4847  * This is done with a timer (instead of inline with bandwidth return) since
4848  * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
4849  */
4850 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4851 {
4852         u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
4853         unsigned long flags;
4854 
4855         /* confirm we're still not at a refresh boundary */
4856         raw_spin_lock_irqsave(&cfs_b->lock, flags);
4857         cfs_b->slack_started = false;
4858         if (cfs_b->distribute_running) {
4859                 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4860                 return;
4861         }
4862 
4863         if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
4864                 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4865                 return;
4866         }
4867 
4868         if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
4869                 runtime = cfs_b->runtime;
4870 
4871         if (runtime)
4872                 cfs_b->distribute_running = 1;
4873 
4874         raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4875 
4876         if (!runtime)
4877                 return;
4878 
4879         runtime = distribute_cfs_runtime(cfs_b, runtime);
4880 
4881         raw_spin_lock_irqsave(&cfs_b->lock, flags);
4882         lsub_positive(&cfs_b->runtime, runtime);
4883         cfs_b->distribute_running = 0;
4884         raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4885 }
4886 
4887 /*
4888  * When a group wakes up we want to make sure that its quota is not already
4889  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
4890  * runtime as update_curr() throttling cannot trigger until it's on-rq.
4891  */
4892 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
4893 {
4894         if (!cfs_bandwidth_used())
4895                 return;
4896 
4897         /* an active group must be handled by the update_curr()->put() path */
4898         if (!cfs_rq->runtime_enabled || cfs_rq->curr)
4899                 return;
4900 
4901         /* ensure the group is not already throttled */
4902         if (cfs_rq_throttled(cfs_rq))
4903                 return;
4904 
4905         /* update runtime allocation */
4906         account_cfs_rq_runtime(cfs_rq, 0);
4907         if (cfs_rq->runtime_remaining <= 0)
4908                 throttle_cfs_rq(cfs_rq);
4909 }
4910 
4911 static void sync_throttle(struct task_group *tg, int cpu)
4912 {
4913         struct cfs_rq *pcfs_rq, *cfs_rq;
4914 
4915         if (!cfs_bandwidth_used())
4916                 return;
4917 
4918         if (!tg->parent)
4919                 return;
4920 
4921         cfs_rq = tg->cfs_rq[cpu];
4922         pcfs_rq = tg->parent->cfs_rq[cpu];
4923 
4924         cfs_rq->throttle_count = pcfs_rq->throttle_count;
4925         cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
4926 }
4927 
4928 /* conditionally throttle active cfs_rq's from put_prev_entity() */
4929 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4930 {
4931         if (!cfs_bandwidth_used())
4932                 return false;
4933 
4934         if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
4935                 return false;
4936 
4937         /*
4938          * it's possible for a throttled entity to be forced into a running
4939          * state (e.g. set_curr_task); in this case we're finished.
4940          */
4941         if (cfs_rq_throttled(cfs_rq))
4942                 return true;
4943 
4944         throttle_cfs_rq(cfs_rq);
4945         return true;
4946 }
4947 
4948 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
4949 {
4950         struct cfs_bandwidth *cfs_b =
4951                 container_of(timer, struct cfs_bandwidth, slack_timer);
4952 
4953         do_sched_cfs_slack_timer(cfs_b);
4954 
4955         return HRTIMER_NORESTART;
4956 }
4957 
4958 extern const u64 max_cfs_quota_period;
4959 
4960 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4961 {
4962         struct cfs_bandwidth *cfs_b =
4963                 container_of(timer, struct cfs_bandwidth, period_timer);
4964         unsigned long flags;
4965         int overrun;
4966         int idle = 0;
4967         int count = 0;
4968 
4969         raw_spin_lock_irqsave(&cfs_b->lock, flags);
4970         for (;;) {
4971                 overrun = hrtimer_forward_now(timer, cfs_b->period);
4972                 if (!overrun)
4973                         break;
4974 
4975                 if (++count > 3) {
4976                         u64 new, old = ktime_to_ns(cfs_b->period);
4977 
4978                         /*
4979                          * Grow period by a factor of 2 to avoid losing precision.
4980                          * Precision loss in the quota/period ratio can cause __cfs_schedulable
4981                          * to fail.
4982                          */
4983                         new = old * 2;
4984                         if (new < max_cfs_quota_period) {
4985                                 cfs_b->period = ns_to_ktime(new);
4986                                 cfs_b->quota *= 2;
4987 
4988                                 pr_warn_ratelimited(
4989         "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
4990                                         smp_processor_id(),
4991                                         div_u64(new, NSEC_PER_USEC),
4992                                         div_u64(cfs_b->quota, NSEC_PER_USEC));
4993                         } else {
4994                                 pr_warn_ratelimited(
4995         "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
4996                                         smp_processor_id(),
4997                                         div_u64(old, NSEC_PER_USEC),
4998                                         div_u64(cfs_b->quota, NSEC_PER_USEC));
4999                         }
5000 
5001                         /* reset count so we don't come right back in here */
5002                         count = 0;
5003                 }
5004 
5005                 idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
5006         }
5007         if (idle)
5008                 cfs_b->period_active = 0;
5009         raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
5010 
5011         return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
5012 }
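
/*
 * Worked example (assumes max_cfs_quota_period, defined in
 * kernel/sched/core.c, of 1s): if the period timer keeps overrunning more
 * than three times within one callback, period and quota are both doubled,
 * so the configured quota/period ratio - and hence the group's CPU share -
 * is unchanged while the timer fires half as often.  E.g. 100us/50us
 * becomes 200us/100us, then 400us/200us, until the doubled period would
 * exceed max_cfs_quota_period, at which point only the warning is printed.
 */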
5013 
5014 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5015 {
5016         raw_spin_lock_init(&cfs_b->lock);
5017         cfs_b->runtime = 0;
5018         cfs_b->quota = RUNTIME_INF;
5019         cfs_b->period = ns_to_ktime(default_cfs_period());
5020 
5021         INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
5022         hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
5023         cfs_b->period_timer.function = sched_cfs_period_timer;
5024         hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5025         cfs_b->slack_timer.function = sched_cfs_slack_timer;
5026         cfs_b->distribute_running = 0;
5027         cfs_b->slack_started = false;
5028 }
5029 
5030 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5031 {
5032         cfs_rq->runtime_enabled = 0;
5033         INIT_LIST_HEAD(&cfs_rq->throttled_list);
5034 }
5035 
5036 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5037 {
5038         lockdep_assert_held(&cfs_b->lock);
5039 
5040         if (cfs_b->period_active)
5041                 return;
5042 
5043         cfs_b->period_active = 1;
5044         hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
5045         hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
5046 }
5047 
5048 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5049 {
5050         /* init_cfs_bandwidth() was not called */
5051         if (!cfs_b->throttled_cfs_rq.next)
5052                 return;
5053 
5054         hrtimer_cancel(&cfs_b->period_timer);
5055         hrtimer_cancel(&cfs_b->slack_timer);
5056 }
5057 
5058 /*
5059  * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
5060  *
5061  * The race is harmless, since modifying bandwidth settings of unhooked group
5062  * bits doesn't do much.
5063  */
5064 
5065 /* cpu online callback */
5066 static void __maybe_unused update_runtime_enabled(struct rq *rq)
5067 {
5068         struct task_group *tg;
5069 
5070         lockdep_assert_held(&rq->lock);
5071 
5072         rcu_read_lock();
5073         list_for_each_entry_rcu(tg, &task_groups, list) {
5074                 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
5075                 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5076 
5077                 raw_spin_lock(&cfs_b->lock);
5078                 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
5079                 raw_spin_unlock(&cfs_b->lock);
5080         }
5081         rcu_read_unlock();
5082 }
5083 
5084 /* cpu offline callback */
5085 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
5086 {
5087         struct task_group *tg;
5088 
5089         lockdep_assert_held(&rq->lock);
5090 
5091         rcu_read_lock();
5092         list_for_each_entry_rcu(tg, &task_groups, list) {
5093                 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5094 
5095                 if (!cfs_rq->runtime_enabled)
5096                         continue;
5097 
5098                 /*
5099                  * clock_task is not advancing so we just need to make sure
5100                  * there's some valid quota amount
5101                  */
5102                 cfs_rq->runtime_remaining = 1;
5103                 /*
5104                  * Offline rq is schedulable till CPU is completely disabled
5105                  * in take_cpu_down(), so we prevent new cfs throttling here.
5106                  */
5107                 cfs_rq->runtime_enabled = 0;
5108 
5109                 if (cfs_rq_throttled(cfs_rq))
5110                         unthrottle_cfs_rq(cfs_rq);
5111         }
5112         rcu_read_unlock();
5113 }
5114 
5115 #else /* CONFIG_CFS_BANDWIDTH */
5116 
5117 static inline bool cfs_bandwidth_used(void)
5118 {
5119         return false;
5120 }
5121 
5122 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
5123 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
5124 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
5125 static inline void sync_throttle(struct task_group *tg, int cpu) {}
5126 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5127 
5128 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5129 {
5130         return 0;
5131 }
5132 
5133 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5134 {
5135         return 0;
5136 }
5137 
5138 static inline int throttled_lb_pair(struct task_group *tg,
5139                                     int src_cpu, int dest_cpu)
5140 {
5141         return 0;
5142 }
5143 
5144 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5145 
5146 #ifdef CONFIG_FAIR_GROUP_SCHED
5147 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5148 #endif
5149 
5150 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5151 {
5152         return NULL;
5153 }
5154 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5155 static inline void update_runtime_enabled(struct rq *rq) {}
5156 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
5157 
5158 #endif /* CONFIG_CFS_BANDWIDTH */
5159 
5160 /**************************************************
5161  * CFS operations on tasks:
5162  */
5163 
5164 #ifdef CONFIG_SCHED_HRTICK
5165 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
5166 {
5167         struct sched_entity *se = &p->se;
5168         struct cfs_rq *cfs_rq = cfs_rq_of(se);
5169 
5170         SCHED_WARN_ON(task_rq(p) != rq);
5171 
5172         if (rq->cfs.h_nr_running > 1) {
5173                 u64 slice = sched_slice(cfs_rq, se);
5174                 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
5175                 s64 delta = slice - ran;
5176 
5177                 if (delta < 0) {
5178                         if (rq->curr == p)
5179                                 resched_curr(rq);
5180                         return;
5181                 }
5182                 hrtick_start(rq, delta);
5183         }
5184 }
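
/*
 * Worked example: with a computed slice of 6ms and 2ms already consumed
 * since the task was last picked, delta = 4ms and the hrtimer is programmed
 * to fire in 4ms, so the preemption check happens exactly when the slice is
 * used up.  If the task has already run past its slice (delta < 0),
 * resched_curr() is issued right away (when p is the running task) instead
 * of arming the timer.
 */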
5185 
5186 /*
5187  * called from enqueue/dequeue and updates the hrtick when the
5188  * current task is from our class and nr_running is low enough
5189  * to matter.
5190  */
5191 static void hrtick_update(struct rq *rq)
5192 {
5193         struct task_struct *curr = rq->curr;
5194 
5195         if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
5196                 return;
5197 
5198         if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
5199                 hrtick_start_fair(rq, curr);
5200 }
5201 #else /* !CONFIG_SCHED_HRTICK */
5202 static inline void
5203 hrtick_start_fair(struct rq *rq, struct task_struct *p)
5204 {
5205 }
5206 
5207 static inline void hrtick_update(struct rq *rq)
5208 {
5209 }
5210 #endif
5211 
5212 #ifdef CONFIG_SMP
5213 static inline unsigned long cpu_util(int cpu);
5214 
5215 static inline bool cpu_overutilized(int cpu)
5216 {
5217         return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
5218 }
5219 
5220 static inline void update_overutilized_status(struct rq *rq)
5221 {
5222         if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
5223                 WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
5224                 trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
5225         }
5226 }
5227 #else
5228 static inline void update_overutilized_status(struct rq *rq) { }
5229 #endif
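
/*
 * Margin note: fits_capacity(), defined earlier in this file, compares
 * util * 1280 against capacity * 1024, i.e. a CPU counts as overutilized
 * once utilization exceeds roughly 80% of its capacity.  Example: with
 * capacity 1024, util = 820 gives 820 * 1280 = 1,049,600, which is above
 * 1024 * 1024 = 1,048,576, so rd->overutilized gets raised; util = 819
 * still fits.
 */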
5230 
5231 /* Runqueue only has SCHED_IDLE tasks enqueued */
5232 static int sched_idle_rq(struct rq *rq)
5233 {
5234         return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
5235                         rq->nr_running);
5236 }
5237 
5238 #ifdef CONFIG_SMP
5239 static int sched_idle_cpu(int cpu)
5240 {
5241         return sched_idle_rq(cpu_rq(cpu));
5242 }
5243 #endif
5244 
5245 /*
5246  * The enqueue_task method is called before nr_running is
5247  * increased. Here we update the fair scheduling stats and
5248  * then put the task into the rbtree:
5249  */
5250 static void
5251 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5252 {
5253         struct cfs_rq *cfs_rq;
5254         struct sched_entity *se = &p->se;
5255         int idle_h_nr_running = task_has_idle_policy(p);
5256 
5257         /*
5258          * The code below (indirectly) updates schedutil which looks at
5259          * the cfs_rq utilization to select a frequency.
5260          * Let's add the task's estimated utilization to the cfs_rq's
5261          * estimated utilization, before we update schedutil.
5262          */
5263         util_est_enqueue(&rq->cfs, p);
5264 
5265         /*
5266          * If in_iowait is set, the code below may not trigger any cpufreq
5267          * utilization updates, so do it here explicitly with the IOWAIT flag
5268          * passed.
5269          */
5270         if (p->in_iowait)
5271                 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
5272 
5273         for_each_sched_entity(se) {
5274                 if (se->on_rq)
5275                         break;
5276                 cfs_rq = cfs_rq_of(se);
5277                 enqueue_entity(cfs_rq, se, flags);
5278 
5279                 cfs_rq->h_nr_running++;
5280                 cfs_rq->idle_h_nr_running += idle_h_nr_running;
5281 
5282                 /* end evaluation on encountering a throttled cfs_rq */
5283                 if (cfs_rq_throttled(cfs_rq))
5284                         goto enqueue_throttle;
5285 
5286                 flags = ENQUEUE_WAKEUP;
5287         }
5288 
5289         for_each_sched_entity(se) {
5290                 cfs_rq = cfs_rq_of(se);
5291 
5292                 update_load_avg(cfs_rq, se, UPDATE_TG);
5293                 update_cfs_group(se);
5294 
5295                 cfs_rq->h_nr_running++;
5296                 cfs_rq->idle_h_nr_running += idle_h_nr_running;
5297 
5298                 /* end evaluation on encountering a throttled cfs_rq */
5299                 if (cfs_rq_throttled(cfs_rq))
5300                         goto enqueue_throttle;
5301 
5302                /*
5303                 * One parent has been throttled and cfs_rq removed from the
5304                 * list. Add it back to not break the leaf list.
5305                 */
5306                if (throttled_hierarchy(cfs_rq))
5307                        list_add_leaf_cfs_rq(cfs_rq);
5308         }
5309 
5310 enqueue_throttle:
5311         if (!se) {
5312                 add_nr_running(rq, 1);
5313                 /*
5314                  * Since new tasks are assigned an initial util_avg equal to
5315                  * half of the spare capacity of their CPU, tiny tasks have the
5316                  * ability to cross the overutilized threshold, which will
5317                  * result in the load balancer ruining all the task placement
5318                  * done by EAS. As a way to mitigate that effect, do not account
5319                  * for the first enqueue operation of new tasks during the
5320                  * overutilized flag detection.
5321                  *
5322                  * A better way of solving this problem would be to wait for
5323                  * the PELT signals of tasks to converge before taking them
5324                  * into account, but that is not straightforward to implement,
5325                  * and the following generally works well enough in practice.
5326                  */
5327                 if (flags & ENQUEUE_WAKEUP)
5328                         update_overutilized_status(rq);
5329 
5330         }
5331 
5332         if (cfs_bandwidth_used()) {
5333                 /*
5334                  * When bandwidth control is enabled, the cfs_rq_throttled()
5335                  * breaks in the above iteration can result in incomplete
5336                  * leaf list maintenance, resulting in triggering the assertion
5337                  * below.
5338                  */
5339                 for_each_sched_entity(se) {
5340                         cfs_rq = cfs_rq_of(se);
5341 
5342                         if (list_add_leaf_cfs_rq(cfs_rq))
5343                                 break;
5344                 }
5345         }
5346 
5347         assert_list_leaf_cfs_rq(rq);
5348 
5349         hrtick_update(rq);
5350 }
5351 
5352 static void set_next_buddy(struct sched_entity *se);
5353 
5354 /*
5355  * The dequeue_task method is called before nr_running is
5356  * decreased. We remove the task from the rbtree and
5357  * update the fair scheduling stats:
5358  */
5359 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5360 {
5361         struct cfs_rq *cfs_rq;
5362         struct sched_entity *se = &p->se;
5363         int task_sleep = flags & DEQUEUE_SLEEP;
5364         int idle_h_nr_running = task_has_idle_policy(p);
5365         bool was_sched_idle = sched_idle_rq(rq);
5366 
5367         for_each_sched_entity(se) {
5368                 cfs_rq = cfs_rq_of(se);
5369                 dequeue_entity(cfs_rq, se, flags);
5370 
5371                 cfs_rq->h_nr_running--;
5372                 cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5373 
5374                 /* end evaluation on encountering a throttled cfs_rq */
5375                 if (cfs_rq_throttled(cfs_rq))
5376                         goto dequeue_throttle;
5377 
5378                 /* Don't dequeue parent if it has other entities besides us */
5379                 if (cfs_rq->load.weight) {
5380                         /* Avoid re-evaluating load for this entity: */
5381                         se = parent_entity(se);
5382                         /*
5383                          * Bias pick_next to pick a task from this cfs_rq, as
5384                          * p is sleeping when it is within its sched_slice.
5385                          */
5386                         if (task_sleep && se && !throttled_hierarchy(cfs_rq))
5387                                 set_next_buddy(se);
5388                         break;
5389                 }
5390                 flags |= DEQUEUE_SLEEP;
5391         }
5392 
5393         for_each_sched_entity(se) {
5394                 cfs_rq = cfs_rq_of(se);
5395 
5396                 update_load_avg(cfs_rq, se, UPDATE_TG);
5397                 update_cfs_group(se);
5398 
5399                 cfs_rq->h_nr_running--;
5400                 cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5401 
5402                 /* end evaluation on encountering a throttled cfs_rq */
5403                 if (cfs_rq_throttled(cfs_rq))
5404                         goto dequeue_throttle;
5405 
5406         }
5407 
5408 dequeue_throttle:
5409         if (!se)
5410                 sub_nr_running(rq, 1);
5411 
5412         /* balance early to pull high priority tasks */
5413         if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
5414                 rq->next_balance = jiffies;
5415 
5416         util_est_dequeue(&rq->cfs, p, task_sleep);
5417         hrtick_update(rq);
5418 }
5419 
5420 #ifdef CONFIG_SMP
5421 
5422 /* Working cpumask for: load_balance, load_balance_newidle. */
5423 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
5424 DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5425 
5426 #ifdef CONFIG_NO_HZ_COMMON
5427 
5428 static struct {
5429         cpumask_var_t idle_cpus_mask;
5430         atomic_t nr_cpus;
5431         int has_blocked;                /* Idle CPUs have blocked load */
5432         unsigned long next_balance;     /* in jiffy units */
5433         unsigned long next_blocked;     /* Next update of blocked load in jiffies */
5434 } nohz ____cacheline_aligned;
5435 
5436 #endif /* CONFIG_NO_HZ_COMMON */
5437 
5438 static unsigned long cpu_load(struct rq *rq)
5439 {
5440         return cfs_rq_load_avg(&rq->cfs);
5441 }
5442 
5443 /*
5444  * cpu_load_without - compute CPU load without any contributions from *p
5445  * @cpu: the CPU which load is requested
5446  * @p: the task which load should be discounted
5447  *
5448  * The load of a CPU is defined by the load of tasks currently enqueued on that
5449  * CPU as well as tasks which are currently sleeping after an execution on that
5450  * CPU.
5451  *
5452  * This method returns the load of the specified CPU by discounting the load of
5453  * the specified task, whenever the task is currently contributing to the CPU
5454  * load.
5455  */
5456 static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
5457 {
5458         struct cfs_rq *cfs_rq;
5459         unsigned int load;
5460 
5461         /* Task has no contribution or is new */
5462         if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
5463                 return cpu_load(rq);
5464 
5465         cfs_rq = &rq->cfs;
5466         load = READ_ONCE(cfs_rq->avg.load_avg);
5467 
5468         /* Discount task's load from CPU's load */
5469         lsub_positive(&load, task_h_load(p));
5470 
5471         return load;
5472 }
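
/*
 * Worked example: if rq->cfs.avg.load_avg is 2048 and task_h_load(p) is 300
 * while p still counts towards this CPU, cpu_load_without() reports 1748.
 * lsub_positive() clamps the subtraction at zero, so a stale, over-large
 * task contribution can never wrap the unsigned load to a huge value.
 */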
5473 
5474 static unsigned long capacity_of(int cpu)
5475 {
5476         return cpu_rq(cpu)->cpu_capacity;
5477 }
5478 
5479 static void record_wakee(struct task_struct *p)
5480 {
5481         /*
5482          * Only decay a single time; tasks that have less than 1 wakeup per
5483          * jiffy will not have built up many flips.
5484          */
5485         if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
5486                 current->wakee_flips >>= 1;
5487                 current->wakee_flip_decay_ts = jiffies;
5488         }
5489 
5490         if (current->last_wakee != p) {
5491                 current->last_wakee = p;
5492                 current->wakee_flips++;
5493         }
5494 }
5495 
5496 /*
5497  * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
5498  *
5499  * A waker of many should wake a different task than the one last awakened
5500  * at a frequency roughly N times higher than one of its wakees.
5501  *
5502  * In order to determine whether we should let the load spread vs consolidating
5503  * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
5504  * partner, and a factor of llc_size higher frequency in the other.
5505  *
5506  * With both conditions met, we can be relatively sure that the relationship is
5507  * non-monogamous, with partner count exceeding socket size.
5508  *
5509  * Waker/wakee being client/server, worker/dispatcher, interrupt source or
5510  * whatever is irrelevant; the spread criterion is that the apparent partner
5511  * count exceeds the socket size.
5512  */
5513 static int wake_wide(struct task_struct *p)
5514 {
5515         unsigned int master = current->wakee_flips;
5516         unsigned int slave = p->wakee_flips;
5517         int factor = this_cpu_read(sd_llc_size);
5518 
5519         if (master < slave)
5520                 swap(master, slave);
5521         if (slave < factor || master < slave * factor)
5522                 return 0;
5523         return 1;
5524 }
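
/*
 * Worked example: with an LLC of 8 CPUs (factor == 8), wake_wide() returns
 * 1 only when the less flippy side of the pair has switched wakees at least
 * 8 times and the more flippy side at least 8 times more than that, e.g.
 * slave = 8 and master = 64.  A plain 1:1 waker/wakee pair keeps both
 * counters low and stays on the affine (consolidating) path.
 */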
5525 
5526 /*
5527  * The purpose of wake_affine() is to quickly determine on which CPU we can run
5528  * soonest. For the purpose of speed we only consider the waking and previous
5529  * CPU.
5530  *
5531  * wake_affine_idle() - only considers 'now', it checks if the waking CPU is
5532  *                      cache-affine and is (or will be) idle.
5533  *
5534  * wake_affine_weight() - considers the weight to reflect the average
5535  *                        scheduling latency of the CPUs. This seems to work
5536  *                        for the overloaded case.
5537  */
5538 static int
5539 wake_affine_idle(int this_cpu, int prev_cpu, int sync)
5540 {
5541         /*
5542          * If this_cpu is idle, it implies the wakeup is from interrupt
5543          * context. Only allow the move if cache is shared. Otherwise an
5544          * interrupt intensive workload could force all tasks onto one
5545          * node depending on the IO topology or IRQ affinity settings.
5546          *
5547          * If the prev_cpu is idle and cache affine then avoid a migration.
5548          * There is no guarantee that the cache hot data from an interrupt
5549          * is more important than cache hot data on the prev_cpu and from
5550          * a cpufreq perspective, it's better to have higher utilisation
5551          * on one CPU.
5552          */
5553         if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
5554                 return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
5555 
5556         if (sync && cpu_rq(this_cpu)->nr_running == 1)
5557                 return this_cpu;
5558 
5559         return nr_cpumask_bits;
5560 }
5561 
5562 static int
5563 wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5564                    int this_cpu, int prev_cpu, int sync)
5565 {
5566         s64 this_eff_load, prev_eff_load;
5567         unsigned long task_load;
5568 
5569         this_eff_load = cpu_load(cpu_rq(this_cpu));
5570 
5571         if (sync) {
5572                 unsigned long current_load = task_h_load(current);
5573 
5574                 if (current_load > this_eff_load)
5575                         return this_cpu;
5576 
5577                 this_eff_load -= current_load;
5578         }
5579 
5580         task_load = task_h_load(p);
5581 
5582         this_eff_load += task_load;
5583         if (sched_feat(WA_BIAS))
5584                 this_eff_load *= 100;
5585         this_eff_load *= capacity_of(prev_cpu);
5586 
5587         prev_eff_load = cpu_load(cpu_rq(prev_cpu));
5588         prev_eff_load -= task_load;
5589         if (sched_feat(WA_BIAS))
5590                 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
5591         prev_eff_load *= capacity_of(this_cpu);
5592 
5593         /*
5594          * If sync, adjust the weight of prev_eff_load such that if
5595          * prev_eff == this_eff, select_idle_sibling() will consider
5596          * stacking the wakee on top of the waker if no other CPU is
5597          * idle.
5598          */
5599         if (sync)
5600                 prev_eff_load += 1;
5601 
5602         return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
5603 }
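
/*
 * Worked example (assumes a typical sched-domain imbalance_pct of 125): the
 * comparison is a cross-multiplication, so
 *
 *     this_load / capacity(this_cpu)  vs  prev_load / capacity(prev_cpu)
 *
 * is evaluated without a division by comparing
 * this_load * capacity(prev_cpu) against prev_load * capacity(this_cpu).
 * With WA_BIAS, prev_eff_load is scaled by 100 + (125 - 100) / 2 = 112
 * while this_eff_load is scaled by 100, i.e. the previous CPU must look
 * roughly 12% better per unit of capacity before the wakeup is left on
 * prev_cpu instead of being pulled to the waking CPU.
 */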
5604 
5605 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5606                        int this_cpu, int prev_cpu, int sync)
5607 {
5608         int target = nr_cpumask_bits;
5609 
5610         if (sched_feat(WA_IDLE))
5611                 target = wake_affine_idle(this_cpu, prev_cpu, sync);
5612 
5613         if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
5614                 target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
5615 
5616         schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
5617         if (target == nr_cpumask_bits)
5618                 return prev_cpu;
5619 
5620         schedstat_inc(sd->ttwu_move_affine);
5621         schedstat_inc(p->se.statistics.nr_wakeups_affine);
5622         return target;
5623 }
5624 
5625 static struct sched_group *
5626 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5627                   int this_cpu, int sd_flag);
5628 
5629 /*
5630  * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
5631  */
5632 static int
5633 find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5634 {
5635         unsigned long load, min_load = ULONG_MAX;
5636         unsigned int min_exit_latency = UINT_MAX;
5637         u64 latest_idle_timestamp = 0;
5638         int least_loaded_cpu = this_cpu;
5639         int shallowest_idle_cpu = -1;
5640         int i;
5641 
5642         /* Check if we have any choice: */
5643         if (group->group_weight == 1)
5644                 return cpumask_first(sched_group_span(group));
5645 
5646         /* Traverse only the allowed CPUs */
5647         for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
5648                 if (sched_idle_cpu(i))
5649                         return i;
5650 
5651                 if (available_idle_cpu(i)) {
5652                         struct rq *rq = cpu_rq(i);
5653                         struct cpuidle_state *idle = idle_get_state(rq);
5654                         if (idle && idle->exit_latency < min_exit_latency) {
5655                                 /*
5656                                  * We give priority to a CPU whose idle state
5657                                  * has the smallest exit latency irrespective
5658                                  * of any idle timestamp.
5659                                  */
5660                                 min_exit_latency = idle->exit_latency;
5661                                 latest_idle_timestamp = rq->idle_stamp;
5662                                 shallowest_idle_cpu = i;
5663                         } else if ((!idle || idle->exit_latency == min_exit_latency) &&
5664                                    rq->idle_stamp > latest_idle_timestamp) {
5665                                 /*
5666                                  * If equal or no active idle state, then
5667                                  * the most recently idled CPU might have
5668                                  * a warmer cache.
5669                                  */
5670                                 latest_idle_timestamp = rq->idle_stamp;
5671                                 shallowest_idle_cpu = i;
5672                         }
5673                 } else if (shallowest_idle_cpu == -1) {
5674                         load = cpu_load(cpu_rq(i));
5675                         if (load < min_load) {
5676                                 min_load = load;
5677                                 least_loaded_cpu = i;
5678                         }
5679                 }
5680         }
5681 
5682         return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
5683 }
5684 
5685 static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
5686                                   int cpu, int prev_cpu, int sd_flag)
5687 {
5688         int new_cpu = cpu;
5689 
5690         if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
5691                 return prev_cpu;
5692 
5693         /*
5694          * We need task's util for cpu_util_without, sync it up to
5695          * prev_cpu's last_update_time.
5696          */
5697         if (!(sd_flag & SD_BALANCE_FORK))
5698                 sync_entity_load_avg(&p->se);
5699 
5700         while (sd) {
5701                 struct sched_group *group;
5702                 struct sched_domain *tmp;
5703                 int weight;
5704 
5705                 if (!(sd->flags & sd_flag)) {
5706                         sd = sd->child;
5707                         continue;
5708                 }
5709 
5710                 group = find_idlest_group(sd, p, cpu, sd_flag);
5711                 if (!group) {
5712                         sd = sd->child;
5713                         continue;
5714                 }
5715 
5716                 new_cpu = find_idlest_group_cpu(group, p, cpu);
5717                 if (new_cpu == cpu) {
5718                         /* Now try balancing at a lower domain level of 'cpu': */
5719                         sd = sd->child;
5720                         continue;
5721                 }
5722 
5723                 /* Now try balancing at a lower domain level of 'new_cpu': */
5724                 cpu = new_cpu;
5725                 weight = sd->span_weight;
5726                 sd = NULL;
5727                 for_each_domain(cpu, tmp) {
5728                         if (weight <= tmp->span_weight)
5729                                 break;
5730                         if (tmp->flags & sd_flag)
5731                                 sd = tmp;
5732                 }
5733         }
5734 
5735         return new_cpu;
5736 }
5737 
5738 #ifdef CONFIG_SCHED_SMT
5739 DEFINE_STATIC_KEY_FALSE(sched_smt_present);
5740 EXPORT_SYMBOL_GPL(sched_smt_present);
5741 
5742 static inline void set_idle_cores(int cpu, int val)
5743 {
5744         struct sched_domain_shared *sds;
5745 
5746         sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5747         if (sds)
5748                 WRITE_ONCE(sds->has_idle_cores, val);
5749 }
5750 
5751 static inline bool test_idle_cores(int cpu, bool def)
5752 {
5753         struct sched_domain_shared *sds;
5754 
5755         sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5756         if (sds)
5757                 return READ_ONCE(sds->has_idle_cores);
5758 
5759         return def;
5760 }
5761 
5762 /*
5763  * Scans the local SMT mask to see if the entire core is idle, and records this
5764  * information in sd_llc_shared->has_idle_cores.
5765  *
5766  * Since SMT siblings share all cache levels, inspecting this limited remote
5767  * state should be fairly cheap.
5768  */
5769 void __update_idle_core(struct rq *rq)
5770 {
5771         int core = cpu_of(rq);
5772         int cpu;
5773 
5774         rcu_read_lock();
5775         if (test_idle_cores(core, true))
5776                 goto unlock;
5777 
5778         for_each_cpu(cpu, cpu_smt_mask(core)) {
5779                 if (cpu == core)
5780                         continue;
5781 
5782                 if (!available_idle_cpu(cpu))
5783                         goto unlock;
5784         }
5785 
5786         set_idle_cores(core, 1);
5787 unlock:
5788         rcu_read_unlock();
5789 }
5790 
5791 /*
5792  * Scan the entire LLC domain for idle cores; this dynamically switches off if
5793  * there are no idle cores left in the system; tracked through
5794  * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
5795  */
5796 static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
5797 {
5798         struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
5799         int core, cpu;
5800 
5801         if (!static_branch_likely(&sched_smt_present))
5802                 return -1;
5803 
5804         if (!test_idle_cores(target, false))
5805                 return -1;
5806 
5807         cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
5808 
5809         for_each_cpu_wrap(core, cpus, target) {
5810                 bool idle = true;
5811 
5812                 for_each_cpu(cpu, cpu_smt_mask(core)) {
5813                         __cpumask_clear_cpu(cpu, cpus);
5814                         if (!available_idle_cpu(cpu))
5815                                 idle = false;
5816                 }
5817 
5818                 if (idle)
5819                         return core;
5820         }
5821 
5822         /*
5823          * Failed to find an idle core; stop looking for one.
5824          */
5825         set_idle_cores(target, 0);
5826 
5827         return -1;
5828 }
5829 
5830 /*
5831  * Scan the local SMT mask for idle CPUs.
5832  */
5833 static int select_idle_smt(struct task_struct *p, int target)
5834 {
5835         int cpu;
5836 
5837         if (!static_branch_likely(&sched_smt_present))
5838                 return -1;
5839 
5840         for_each_cpu(cpu, cpu_smt_mask(target)) {
5841                 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
5842                         continue;
5843                 if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
5844                         return cpu;
5845         }
5846 
5847         return -1;
5848 }
5849 
5850 #else /* CONFIG_SCHED_SMT */
5851 
5852 static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
5853 {
5854         return -1;
5855 }
5856 
5857 static inline int select_idle_smt(struct task_struct *p, int target)
5858 {
5859         return -1;
5860 }
5861 
5862 #endif /* CONFIG_SCHED_SMT */
5863 
5864 /*
5865  * Scan the LLC domain for idle CPUs; this is dynamically regulated by
5866  * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
5867  * average idle time for this rq (as found in rq->avg_idle).
5868  */
5869 static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
5870 {
5871         struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
5872         struct sched_domain *this_sd;
5873         u64 avg_cost, avg_idle;
5874         u64 time, cost;
5875         s64 delta;
5876         int this = smp_processor_id();
5877         int cpu, nr = INT_MAX;
5878 
5879         this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
5880         if (!this_sd)
5881                 return -1;
5882 
5883         /*
5884          * Due to large variance we need a large fuzz factor; hackbench in
5885          * particular is sensitive here.
5886          */
5887         avg_idle = this_rq()->avg_idle / 512;
5888         avg_cost = this_sd->avg_scan_cost + 1;
5889 
5890         if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
5891                 return -1;
5892 
5893         if (sched_feat(SIS_PROP)) {
5894                 u64 span_avg = sd->span_weight * avg_idle;
5895                 if (span_avg > 4*avg_cost)
5896                         nr = div_u64(span_avg, avg_cost);
5897                 else
5898                         nr = 4;
5899         }
5900 
5901         time = cpu_clock(this);
5902 
5903         cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
5904 
5905         for_each_cpu_wrap(cpu, cpus, target) {
5906                 if (!--nr)
5907                         return -1;
5908                 if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
5909                         break;
5910         }
5911 
5912         time = cpu_clock(this) - time;
5913         cost = this_sd->avg_scan_cost;
5914         delta = (s64)(time - cost) / 8;
5915         this_sd->avg_scan_cost += delta;
5916 
5917         return cpu;
5918 }
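
/*
 * Worked example (illustrative numbers): with SIS_PROP, the scan budget is
 *
 *     nr = span_weight * (rq->avg_idle / 512) / (avg_scan_cost + 1)
 *
 * with a floor of 4.  For a 64-CPU LLC with a scaled avg_idle of 1us and an
 * average per-CPU scan cost of about 4us, span_avg = 64us and nr is roughly
 * 16, so at most a quarter of the LLC is probed before giving up; a
 * nearly-idle machine (large avg_idle) ends up scanning the whole domain.
 */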
5919 
5920 /*
5921  * Try and locate an idle core/thread in the LLC cache domain.
5922  */
5923 static int select_idle_sibling(struct task_struct *p, int prev, int target)
5924 {
5925         struct sched_domain *sd;
5926         int i, recent_used_cpu;
5927 
5928         if (available_idle_cpu(target) || sched_idle_cpu(target))
5929                 return target;
5930 
5931         /*
5932          * If the previous CPU is cache affine and idle, don't be stupid:
5933          */
5934         if (prev != target && cpus_share_cache(prev, target) &&
5935             (available_idle_cpu(prev) || sched_idle_cpu(prev)))
5936                 return prev;
5937 
5938         /*
5939          * Allow a per-cpu kthread to stack with the wakee if the
5940          * kworker thread and the task's previous CPUs are the same.
5941          * The assumption is that the wakee queued work for the
5942          * per-cpu kthread that is now complete and the wakeup is
5943          * essentially a sync wakeup. An obvious example of this
5944          * pattern is IO completions.
5945          */
5946         if (is_per_cpu_kthread(current) &&
5947             prev == smp_processor_id() &&
5948             this_rq()->nr_running <= 1) {
5949                 return prev;
5950         }
5951 
5952         /* Check a recently used CPU as a potential idle candidate: */
5953         recent_used_cpu = p->recent_used_cpu;
5954         if (recent_used_cpu != prev &&
5955             recent_used_cpu != target &&
5956             cpus_share_cache(recent_used_cpu, target) &&
5957             (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
5958             cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
5959                 /*
5960                  * Replace recent_used_cpu with prev as it is a potential
5961                  * candidate for the next wake:
5962                  */
5963                 p->recent_used_cpu = prev;
5964                 return recent_used_cpu;
5965         }
5966 
5967         sd = rcu_dereference(per_cpu(sd_llc, target));
5968         if (!sd)
5969                 return target;
5970 
5971         i = select_idle_core(p, sd, target);
5972         if ((unsigned)i < nr_cpumask_bits)
5973                 return i;
5974 
5975         i = select_idle_cpu(p, sd, target);
5976         if ((unsigned)i < nr_cpumask_bits)
5977                 return i;
5978 
5979         i = select_idle_smt(p, target);
5980         if ((unsigned)i < nr_cpumask_bits)
5981                 return i;
5982 
5983         return target;
5984 }
5985 
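/*
 * Note on the "(unsigned)i < nr_cpumask_bits" checks above: the idle-search
 * helpers return -1 when they find nothing.  Cast to unsigned, -1 becomes
 * UINT_MAX, which can never be a valid CPU index, so a single unsigned
 * comparison covers both "valid CPU found" and "nothing found" without a
 * separate i >= 0 test.
 */
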
5986 /**
5987  * cpu_util - Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
5988  * @cpu: the CPU to get the utilization of
5989  *
5990  * The unit of the return value must be the one of capacity so we can compare
5991  * the utilization with the capacity of the CPU that is available for CFS tasks
5992  * (i.e. cpu_capacity).
5993  *
5994  * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
5995  * recent utilization of currently non-runnable tasks on a CPU. It represents
5996  * the amount of utilization of a CPU in the range [0..capacity_orig] where
5997  * capacity_orig is the cpu_capacity available at the highest frequency
5998  * (arch_scale_freq_capacity()).
5999  * The utilization of a CPU converges towards a sum equal to or less than the
6000  * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
6001  * the running time on this CPU scaled by capacity_curr.
6002  *
6003  * The estimated utilization of a CPU is defined to be the maximum between its
6004  * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
6005  * currently RUNNABLE on that CPU.
6006  * This allows us to properly represent the expected utilization of a CPU which
6007  * has just started running a big task after a long sleep period. At the same
6008  * time, however, it preserves the benefits of the "blocked utilization" in
6009  * describing the potential for other tasks waking up on the same CPU.
6010  *
6011  * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
6012  * higher than capacity_orig because of unfortunate rounding in
6013  * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
6014  * the average stabilizes with the new running time. We need to check that the
6015  * utilization stays within the range of [0..capacity_orig] and cap it if
6016  * necessary. Without utilization capping, a group could be seen as overloaded
6017  * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
6018  * available capacity. We allow utilization to overshoot capacity_curr (but not
6019  * capacity_orig) as it is useful for predicting the capacity required after task
6020  * migrations (scheduler-driven DVFS).
6021  *
6022  * Return: the (estimated) utilization for the specified CPU
6023  */
6024 static inline unsigned long cpu_util(int cpu)
6025 {
6026         struct cfs_rq *cfs_rq;
6027         unsigned int util;
6028 
6029         cfs_rq = &cpu_rq(cpu)->cfs;
6030         util = READ_ONCE(cfs_rq->avg.util_avg);
6031 
6032         if (sched_feat(UTIL_EST))
6033                 util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
6034 
6035         return min_t(unsigned long, util, capacity_orig_of(cpu));
6036 }
6037 
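/*
 * Worked example for cpu_util() (illustrative numbers): with
 * cfs_rq->avg.util_avg = 300 and util_est.enqueued = 450 on a CPU whose
 * capacity_orig is 1024, the estimated utilization is max(300, 450) = 450.
 * Right after a migration util_avg may transiently overshoot, e.g. 1100,
 * in which case the final min() clamps the result to 1024.
 */
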
6038 /*
6039  * cpu_util_without: compute cpu utilization without any contributions from *p
6040  * @cpu: the CPU whose utilization is requested
6041  * @p: the task whose utilization should be discounted
6042  *
6043  * The utilization of a CPU is defined by the utilization of tasks currently
6044  * enqueued on that CPU as well as tasks which are currently sleeping after an
6045  * execution on that CPU.
6046  *
6047  * This method returns the utilization of the specified CPU by discounting the
6048  * utilization of the specified task, whenever the task is currently
6049  * contributing to the CPU utilization.
6050  */
6051 static unsigned long cpu_util_without(int cpu, struct task_struct *p)
6052 {
6053         struct cfs_rq *cfs_rq;
6054         unsigned int util;
6055 
6056         /* Task has no contribution or is new */
6057         if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
6058                 return cpu_util(cpu);
6059 
6060         cfs_rq = &cpu_rq(cpu)->cfs;
6061         util = READ_ONCE(cfs_rq->avg.util_avg);
6062 
6063         /* Discount task's util from CPU's util */
6064         lsub_positive(&util, task_util(p));
6065 
6066         /*
6067          * Covered cases:
6068          *
6069          * a) if *p is the only task sleeping on this CPU, then:
6070          *      cpu_util (== task_util) > util_est (== 0)
6071          *    and thus we return:
6072          *      cpu_util_without = (cpu_util - task_util) = 0
6073          *
6074          * b) if other tasks are SLEEPING on this CPU, which is now exiting
6075          *    IDLE, then:
6076          *      cpu_util >= task_util
6077          *      cpu_util > util_est (== 0)
6078          *    and thus we discount *p's blocked utilization to return:
6079          *      cpu_util_without = (cpu_util - task_util) >= 0
6080          *
6081          * c) if other tasks are RUNNABLE on that CPU and
6082          *      util_est > cpu_util
6083          *    then we use util_est since it returns a more restrictive
6084          *    estimation of the spare capacity on that CPU, by just
6085          *    considering the expected utilization of tasks already
6086          *    runnable on that CPU.
6087          *
6088          * Cases a) and b) are covered by the above code, while case c) is
6089          * covered by the following code when estimated utilization is
6090          * enabled.
6091          */
6092         if (sched_feat(UTIL_EST)) {
6093                 unsigned int estimated =
6094                         READ_ONCE(cfs_rq->avg.util_est.enqueued);
6095 
6096                 /*
6097                  * Despite the following checks we still have a small window
6098                  * for a possible race, when an execl's select_task_rq_fair()
6099                  * races with LB's detach_task():
6100                  *
6101                  *   detach_task()
6102                  *     p->on_rq = TASK_ON_RQ_MIGRATING;
6103                  *     ---------------------------------- A
6104                  *     deactivate_task()                   \
6105                  *       dequeue_task()                     + RaceTime
6106                  *         util_est_dequeue()              /
6107                  *     ---------------------------------- B
6108                  *
6109                  * The additional check on "current == p" is required to
6110                  * properly fix the execl regression and it helps further
6111                  * reduce the chances of the above race.
6112                  */
6113                 if (unlikely(task_on_rq_queued(p) || current == p))
6114                         lsub_positive(&estimated, _task_util_est(p));
6115 
6116                 util = max(util, estimated);
6117         }
6118 
6119         /*
6120          * Utilization (estimated) can exceed the CPU capacity, thus let's
6121          * clamp to the maximum CPU capacity to ensure consistency with
6122          * the cpu_util call.
6123          */
6124         return min_t(unsigned long, util, capacity_orig_of(cpu));
6125 }
6126 
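/*
 * Worked example for cpu_util_without() (illustrative numbers): assume *p
 * has task_util(p) = 200 and is the only task that recently ran on @cpu
 * (case a above): util_avg is roughly 200 and util_est.enqueued is 0, so
 * lsub_positive() brings util down to 0 and the CPU is reported as having
 * no CFS utilization once *p is discounted.  lsub_positive() subtracts but
 * never goes below zero, which avoids underflowing the unsigned value when
 * util_avg has not yet caught up with task_util(p).
 */
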
6127 /*
6128  * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
6129  * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
6130  *
6131  * In that case WAKE_AFFINE doesn't make sense and we'll let
6132  * BALANCE_WAKE sort things out.
6133  */
6134 static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
6135 {
6136         long min_cap, max_cap;
6137 
6138         if (!static_branch_unlikely(&sched_asym_cpucapacity))
6139                 return 0;
6140 
6141         min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
6142         max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
6143 
6144         /* Minimum capacity is close to max, no need to abort wake_affine */
6145         if (max_cap - min_cap < max_cap >> 3)
6146                 return 0;
6147 
6148         /* Bring task utilization in sync with prev_cpu */
6149         sync_entity_load_avg(&p->se);
6150 
6151         return !task_fits_capacity(p, min_cap);
6152 }
6153 
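/*
 * Worked example for the capacity-margin check above (illustrative
 * capacities): wake_affine is kept whenever the smaller of the two CPUs is
 * within 1/8 (12.5%) of the biggest CPU in the root domain.  On a symmetric
 * system min_cap == max_cap, the difference 0 is below max_cap >> 3, and
 * wake_cap() returns 0.  On a big.LITTLE system with max_cap = 1024 and a
 * little CPU of capacity 446, the difference 578 is not below
 * 1024 >> 3 = 128, so the decision falls through to task_fits_capacity()
 * against min_cap.
 */
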
6154 /*
6155  * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
6156  * to @dst_cpu.
6157  */
6158 static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
6159 {
6160         struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
6161         unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
6162 
6163         /*
6164          * If @p migrates from @cpu to another, remove its contribution. Or,
6165          * if @p migrates from another CPU to @cpu, add its contribution. In
6166          * the other cases, @cpu is not impacted by the migration, so the
6167          * util_avg should already be correct.
6168          */
6169         if (task_cpu(p) == cpu && dst_cpu != cpu)
6170                 sub_positive(&util, task_util(p));
6171         else if (task_cpu(p) != cpu && dst_cpu == cpu)
6172                 util += task_util(p);
6173 
6174         if (sched_feat(UTIL_EST)) {
6175                 util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
6176 
6177                 /*
6178                  * During wake-up, the task isn't enqueued yet and doesn't
6179                  * appear in the cfs_rq->avg.util_est.enqueued of any rq,
6180                  * so just add it (if needed) to "simulate" what will be
6181                  * cpu_util() after the task has been enqueued.
6182                  */
6183                 if (dst_cpu == cpu)
6184                         util_est += _task_util_est(p);
6185 
6186                 util = max(util, util_est);
6187         }
6188 
6189         return min(util, capacity_orig_of(cpu));
6190 }
6191 
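/*
 * Worked example for cpu_util_next() (illustrative numbers): task *p with
 * task_util(p) = 150 currently sits on CPU2 (util_avg = 500) and a
 * migration to dst_cpu = 5 (util_avg = 300) is being evaluated.  For
 * cpu = 2 the prediction is 500 - 150 = 350; for cpu = 5 it is
 * 300 + 150 = 450, and with UTIL_EST enabled _task_util_est(p) is also
 * added on the util_est side before taking the max.  Any other CPU in the
 * domain is unaffected by the move.
 */
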
6192 /*
6193  * compute_energy(): Estimates the energy that @pd would consume if @p was
6194  * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
6195  * landscape of @pd's CPUs after the task migration, and uses the Energy Model
6196  * to compute what would be the energy if we decided to actually migrate that
6197  * task.
6198  */
6199 static long
6200 compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
6201 {
6202         struct cpumask *pd_mask = perf_domain_span(pd);
6203         unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
6204         unsigned long max_util = 0, sum_util = 0;
6205         int cpu;
6206 
6207         /*
6208          * The capacity state of CPUs of the current rd can be driven by CPUs
6209          * of another rd if they belong to the same pd. So, account for the
6210          * utilization of these CPUs too by masking pd with cpu_online_mask
6211          * instead of the rd span.
6212          *
6213          * If an entire pd is outside of the current rd, it will not appear in
6214          * its pd list and will not be accounted by compute_energy().
6215          */
6216         for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
6217                 unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
6218                 struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
6219 
6220                 /*
6221                  * Busy time computation: utilization clamping is not
6222                  * required since the ratio (sum_util / cpu_capacity)
6223                  * is already enough to scale the EM reported power
6224                  * consumption at the (possibly clamped) cpu_capacity.
6225                  */
6226                 sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6227                                                ENERGY_UTIL, NULL);
6228 
6229                 /*
6230                  * Performance domain frequency: utilization clamping
6231                  * must be considered since it affects the selection
6232                  * of the performance domain frequency.
6233                  * NOTE: when RT tasks are running, the FREQUENCY_UTIL
6234                  * utilization can default to the maximum OPP.
6235                  */
6236                 cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6237                                               FREQUENCY_UTIL, tsk);
6238                 max_util = max(max_util, cpu_util);
6239         }
6240 
6241         return em_pd_energy(pd->em_pd, max_util, sum_util);
6242 }
6243 
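/*
 * Illustration of the two aggregates above (illustrative numbers; the
 * description of em_pd_energy() is only a rough sketch of the Energy
 * Model): for a 4-CPU performance domain with ENERGY_UTIL values 100, 200,
 * 150 and 50, sum_util = 500 drives the estimated busy time, while the
 * largest FREQUENCY_UTIL value (say 300, after clamping) selects the
 * performance state the whole domain would run at.  em_pd_energy() then
 * returns, roughly, the cost of that performance state scaled by sum_util,
 * so adding utilization to a domain that stays at a low OPP is cheaper
 * than raising max_util on a domain that must jump to a high OPP.
 */
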
6244 /*
6245  * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
6246  * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
6247  * spare capacity in each performance domain and uses it as a potential
6248  * candidate to execute the task. Then, it uses the Energy Model to figure
6249  * out which of the CPU candidates is the most energy-efficient.
6250  *
6251  * The rationale for this heuristic is as follows. In a performance domain,
6252  * all the most energy efficient CPU candidates (according to the Energy
6253  * Model) are those for which we'll request a low frequency. When there are
6254  * several CPUs for which the frequency request will be the same, we don't
6255  * have enough data to break the tie between them, because the Energy Model
6256  * only includes active power costs. With this model, if we assume that
6257  * frequency requests follow utilization (e.g. using schedutil), the CPU with
6258  * the maximum spare capacity in a performance domain is guaranteed to be among
6259  * the best candidates of the performance domain.
6260  *
6261  * In practice, it could be preferable from an energy standpoint to pack
6262  * small tasks on a CPU in order to let other CPUs go in deeper idle states,
6263  * but that could also hurt our chances to go cluster idle, and we have no
6264  * ways to tell with the current Energy Model if this is actually a good
6265  * idea or not. So, find_energy_efficient_cpu() basically favors
6266  * cluster-packing, and spreading inside a cluster. That should at least be
6267  * a good thing for latency, and this is consistent with the idea that most
6268  * of the energy savings of EAS come from the asymmetry of the system, and
6269  * not so much from breaking the tie between identical CPUs. That's also the
6270  * reason why EAS is enabled in the topology code only for systems where
6271  * SD_ASYM_CPUCAPACITY is set.
6272  *
6273  * NOTE: Forkees are not accepted in the energy-aware wake-up path because
6274  * they don't have any useful utilization data yet and it's not possible to
6275  * forecast their impact on energy consumption. Consequently, they will be
6276  * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
6277  * to be energy-inefficient in some use-cases. The alternative would be to
6278  * bias new tasks towards specific types of CPUs first, or to try to infer
6279  * their util_avg from the parent task, but those heuristics could hurt
6280  * other use-cases too. So, until someone finds a better way to solve this,
6281  * let's keep things simple by re-using the existing slow path.
6282  */
6283 static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
6284 {
6285         unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
6286         struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
6287         unsigned long cpu_cap, util, base_energy = 0;
6288         int cpu, best_energy_cpu = prev_cpu;
6289         struct sched_domain *sd;
6290         struct perf_domain *pd;
6291 
6292         rcu_read_lock();
6293         pd = rcu_dereference(rd->pd);
6294         if (!pd || READ_ONCE(rd->overutilized))
6295                 goto fail;
6296 
6297         /*
6298          * Energy-aware wake-up happens on the lowest sched_domain, starting
6299          * from sd_asym_cpucapacity, that spans both this_cpu and prev_cpu.
6300          */
6301         sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
6302         while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
6303                 sd = sd->parent;
6304         if (!sd)
6305                 goto fail;
6306 
6307         sync_entity_load_avg(&p->se);
6308         if (!task_util_est(p))
6309                 goto unlock;
6310 
6311         for (; pd; pd = pd->next) {
6312                 unsigned long cur_delta, spare_cap, max_spare_cap = 0;
6313                 unsigned long base_energy_pd;
6314                 int max_spare_cap_cpu = -1;
6315 
6316                 /* Compute the 'base' energy of the pd, without @p */
6317                 base_energy_pd = compute_energy(p, -1, pd);
6318                 base_energy += base_energy_pd;
6319 
6320                 for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
6321                         if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6322                                 continue;
6323 
6324                         util = cpu_util_next(cpu, p, cpu);
6325                         cpu_cap = capacity_of(cpu);
6326                         spare_cap = cpu_cap - util;
6327 
6328                         /*
6329                          * Skip CPUs that cannot satisfy the capacity request.
6330                          * IOW, placing the task there would make the CPU
6331                          * overutilized. Take uclamp into account to see how
6332                          * much capacity we can get out of the CPU; this is
6333                          * aligned with schedutil_cpu_util().
6334                          */
6335                         util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
6336                         if (!fits_capacity(util, cpu_cap))
6337                                 continue;
6338 
6339                         /* Always use prev_cpu as a candidate. */
6340                         if (cpu == prev_cpu) {
6341                                 prev_delta = compute_energy(p, prev_cpu, pd);
6342                                 prev_delta -= base_energy_pd;
6343                                 best_delta = min(best_delta, prev_delta);
6344                         }
6345 
6346                         /*
6347                          * Find the CPU with the maximum spare capacity in
6348                          * the performance domain
6349                          */
6350                         if (spare_cap > max_spare_cap) {
6351                                 max_spare_cap = spare_cap;
6352                                 max_spare_cap_cpu = cpu;
6353                         }
6354                 }
6355 
6356                 /* Evaluate the energy impact of using this CPU. */
6357                 if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) {
6358                         cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
6359                         cur_delta -= base_energy_pd;
6360                         if (cur_delta < best_delta) {
6361                                 best_delta = cur_delta;
6362                                 best_energy_cpu = max_spare_cap_cpu;
6363                         }
6364                 }
6365         }
6366 unlock:
6367         rcu_read_unlock();
6368 
6369         /*
6370          * Pick the best CPU if prev_cpu cannot be used, or if it saves at
6371          * least ~6% (1/16) of the total estimated energy (prev_delta + base_energy).
6372          */
6373         if (prev_delta == ULONG_MAX)
6374                 return best_energy_cpu;
6375 
6376         if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
6377                 return best_energy_cpu;
6378 
6379         return prev_cpu;
6380 
6381 fail:
6382         rcu_read_unlock();
6383 
6384         return -1;
6385 }
6386 
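/*
 * Worked example for the ~6% rule above (illustrative numbers): with
 * base_energy = 1000, prev_delta = 200 and best_delta = 120, the estimated
 * total energy with the task on prev_cpu is 1200 and the threshold is
 * 1200 >> 4 = 75.  The best candidate saves 200 - 120 = 80 > 75, so
 * best_energy_cpu is returned.  Had best_delta been 140, the saving of 60
 * would not clear the threshold and the task would stay on prev_cpu,
 * avoiding a migration for a marginal energy gain.
 */
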
6387 /*
6388  * select_task_rq_fair: Select target runqueue for the waking task in domains
6389  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
6390  * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
6391  *
6392  * Balances load by selecting the idlest CPU in the idlest group, or under
6393  * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
6394  *
6395  * Returns the target CPU number.
6396  *
6397  * preempt must be disabled.
6398  */
6399 static int
6400 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
6401 {
6402         struct sched_domain *tmp, *sd = NULL;
6403         int cpu = smp_processor_id();
6404         int new_cpu = prev_cpu;
6405         int want_affine = 0;
6406         int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
6407 
6408         if (sd_flag & SD_BALANCE_WAKE) {
6409                 record_wakee(p);
6410 
6411                 if (sched_energy_enabled()) {
6412                         new_cpu = find_energy_efficient_cpu(p, prev_cpu);
6413                         if (new_cpu >= 0)
6414                                 return new_cpu;
6415                         new_cpu = prev_cpu;
6416                 }
6417 
6418                 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
6419                               cpumask_test_cpu(cpu, p->cpus_ptr);
6420         }
6421 
6422         rcu_read_lock();
6423         for_each_domain(cpu, tmp) {
6424                 if (!(tmp->flags & SD_LOAD_BALANCE))
6425                         break;
6426 
6427                 /*
6428                  * If both 'cpu' and 'prev_cpu' are part of this domain,
6429                  * cpu is a valid SD_WAKE_AFFINE target.
6430                  */
6431                 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
6432                     cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
6433                         if (cpu != prev_cpu)
6434                                 new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
6435 
6436                         sd = NULL; /* Prefer wake_affine over balance flags */
6437                         break;
6438                 }
6439 
6440                 if (tmp->flags & sd_flag)
6441                         sd = tmp;