TOMOYO Linux Cross Reference
Linux/kernel/sched/fair.c

  1 /*
  2  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
  3  *
  4  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  5  *
  6  *  Interactivity improvements by Mike Galbraith
  7  *  (C) 2007 Mike Galbraith <efault@gmx.de>
  8  *
  9  *  Various enhancements by Dmitry Adamushko.
 10  *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
 11  *
 12  *  Group scheduling enhancements by Srivatsa Vaddagiri
 13  *  Copyright IBM Corporation, 2007
 14  *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
 15  *
 16  *  Scaled math optimizations by Thomas Gleixner
 17  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
 18  *
 19  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
 20  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
 21  */
 22 
 23 #include <linux/latencytop.h>
 24 #include <linux/sched.h>
 25 #include <linux/cpumask.h>
 26 #include <linux/cpuidle.h>
 27 #include <linux/slab.h>
 28 #include <linux/profile.h>
 29 #include <linux/interrupt.h>
 30 #include <linux/mempolicy.h>
 31 #include <linux/migrate.h>
 32 #include <linux/task_work.h>
 33 
 34 #include <trace/events/sched.h>
 35 
 36 #include "sched.h"
 37 
 38 /*
 39  * Targeted preemption latency for CPU-bound tasks:
 40  * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 41  *
 42  * NOTE: this latency value is not the same as the concept of
 43  * 'timeslice length' - timeslices in CFS are of variable length
 44  * and there is no persistent notion of a timeslice, unlike in
 45  * traditional, time-slice based scheduling.
 46  *
 47  * (to see the precise effective timeslice length of your workload,
 48  *  run vmstat and monitor the context-switches (cs) field)
 49  */
 50 unsigned int sysctl_sched_latency = 6000000ULL;
 51 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
 52 
 53 /*
 54  * The initial scaling and re-scaling of tunables is configurable
 55  * (default: SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
 56  *
 57  * Options are:
 58  * SCHED_TUNABLESCALING_NONE - unscaled, always *1
 59  * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *(1+ilog(ncpus))
 60  * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
 61  */
 62 enum sched_tunable_scaling sysctl_sched_tunable_scaling
 63         = SCHED_TUNABLESCALING_LOG;
 64 
 65 /*
 66  * Minimal preemption granularity for CPU-bound tasks:
 67  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 68  */
 69 unsigned int sysctl_sched_min_granularity = 750000ULL;
 70 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
 71 
 72 /*
 73  * This is kept at sysctl_sched_latency / sysctl_sched_min_granularity
 74  */
 75 static unsigned int sched_nr_latency = 8;
 76 
 77 /*
 78  * After fork, child runs first. If set to 0 (default) then
 79  * parent will (try to) run first.
 80  */
 81 unsigned int sysctl_sched_child_runs_first __read_mostly;
 82 
 83 /*
 84  * SCHED_OTHER wake-up granularity.
 85  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 86  *
 87  * This option delays the preemption effects of decoupled workloads
 88  * and reduces their over-scheduling. Synchronous workloads will still
 89  * have immediate wakeup/sleep latencies.
 90  */
 91 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
 92 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 93 
 94 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 95 
 96 /*
 97  * The exponential sliding window over which load is averaged for shares
 98  * distribution.
 99  * (default: 10msec)
100  */
101 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
102 
103 #ifdef CONFIG_CFS_BANDWIDTH
104 /*
105  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
106  * each time a cfs_rq requests quota.
107  *
108  * Note: in the case that the slice exceeds the runtime remaining (either due
109  * to consumption or the quota being specified to be smaller than the slice)
110  * we will always only issue the remaining available time.
111  *
112  * default: 5 msec, units: microseconds
113  */
114 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
115 #endif
116 
117 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
118 {
119         lw->weight += inc;
120         lw->inv_weight = 0;
121 }
122 
123 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
124 {
125         lw->weight -= dec;
126         lw->inv_weight = 0;
127 }
128 
129 static inline void update_load_set(struct load_weight *lw, unsigned long w)
130 {
131         lw->weight = w;
132         lw->inv_weight = 0;
133 }
134 
135 /*
136  * Increase the granularity value when there are more CPUs,
137  * because with more CPUs the 'effective latency' as visible
138  * to users decreases. But the relationship is not linear,
139  * so pick a second-best guess by going with the log2 of the
140  * number of CPUs.
141  *
142  * This idea comes from the SD scheduler of Con Kolivas:
143  */
144 static unsigned int get_update_sysctl_factor(void)
145 {
146         unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
147         unsigned int factor;
148 
149         switch (sysctl_sched_tunable_scaling) {
150         case SCHED_TUNABLESCALING_NONE:
151                 factor = 1;
152                 break;
153         case SCHED_TUNABLESCALING_LINEAR:
154                 factor = cpus;
155                 break;
156         case SCHED_TUNABLESCALING_LOG:
157         default:
158                 factor = 1 + ilog2(cpus);
159                 break;
160         }
161 
162         return factor;
163 }
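
/*
 * Worked example (illustrative): with the default SCHED_TUNABLESCALING_LOG
 * policy on a machine with 8 or more online CPUs, cpus is clamped to 8 and
 * factor = 1 + ilog2(8) = 4, so update_sysctl() below scales the defaults
 * to roughly 24ms latency, 3ms minimum granularity and 4ms wakeup
 * granularity.
 */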
164 
165 static void update_sysctl(void)
166 {
167         unsigned int factor = get_update_sysctl_factor();
168 
169 #define SET_SYSCTL(name) \
170         (sysctl_##name = (factor) * normalized_sysctl_##name)
171         SET_SYSCTL(sched_min_granularity);
172         SET_SYSCTL(sched_latency);
173         SET_SYSCTL(sched_wakeup_granularity);
174 #undef SET_SYSCTL
175 }
176 
177 void sched_init_granularity(void)
178 {
179         update_sysctl();
180 }
181 
182 #define WMULT_CONST     (~0U)
183 #define WMULT_SHIFT     32
184 
185 static void __update_inv_weight(struct load_weight *lw)
186 {
187         unsigned long w;
188 
189         if (likely(lw->inv_weight))
190                 return;
191 
192         w = scale_load_down(lw->weight);
193 
194         if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
195                 lw->inv_weight = 1;
196         else if (unlikely(!w))
197                 lw->inv_weight = WMULT_CONST;
198         else
199                 lw->inv_weight = WMULT_CONST / w;
200 }
201 
202 /*
203  * delta_exec * weight / lw.weight
204  *   OR
205  * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
206  *
207  * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
208  * we're guaranteed shift stays positive because inv_weight is guaranteed to
209  * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
210  *
211  * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
212  * weight/lw.weight <= 1, and therefore our shift will also be positive.
213  */
214 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
215 {
216         u64 fact = scale_load_down(weight);
217         int shift = WMULT_SHIFT;
218 
219         __update_inv_weight(lw);
220 
221         if (unlikely(fact >> 32)) {
222                 while (fact >> 32) {
223                         fact >>= 1;
224                         shift--;
225                 }
226         }
227 
228         /* hint to use a 32x32->64 mul */
229         fact = (u64)(u32)fact * lw->inv_weight;
230 
231         while (fact >> 32) {
232                 fact >>= 1;
233                 shift--;
234         }
235 
236         return mul_u64_u32_shr(delta_exec, fact, shift);
237 }
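
/*
 * Worked example (illustrative): for a nice-0 entity (weight 1024 after
 * scale_load_down()) on a queue whose total load weight is 3072,
 * __update_inv_weight() gives inv_weight = (2^32 - 1) / 3072 ~= 1398101,
 * and fact = 1024 * 1398101 ~= 1431655424 still fits in 32 bits, so shift
 * stays at 32 and the result is delta_exec * fact >> 32 ~= delta_exec / 3.
 */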
238 
239 
240 const struct sched_class fair_sched_class;
241 
242 /**************************************************************
243  * CFS operations on generic schedulable entities:
244  */
245 
246 #ifdef CONFIG_FAIR_GROUP_SCHED
247 
248 /* cpu runqueue to which this cfs_rq is attached */
249 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
250 {
251         return cfs_rq->rq;
252 }
253 
254 /* An entity is a task if it doesn't "own" a runqueue */
255 #define entity_is_task(se)      (!se->my_q)
256 
257 static inline struct task_struct *task_of(struct sched_entity *se)
258 {
259 #ifdef CONFIG_SCHED_DEBUG
260         WARN_ON_ONCE(!entity_is_task(se));
261 #endif
262         return container_of(se, struct task_struct, se);
263 }
264 
265 /* Walk up scheduling entities hierarchy */
266 #define for_each_sched_entity(se) \
267                 for (; se; se = se->parent)
268 
269 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
270 {
271         return p->se.cfs_rq;
272 }
273 
274 /* runqueue on which this entity is (to be) queued */
275 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
276 {
277         return se->cfs_rq;
278 }
279 
280 /* runqueue "owned" by this group */
281 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
282 {
283         return grp->my_q;
284 }
285 
286 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
287 {
288         if (!cfs_rq->on_list) {
289                 /*
290                  * Ensure we either appear before our parent (if already
291                  * enqueued) or force our parent to appear after us when it is
292                  * enqueued.  The fact that we always enqueue bottom-up
293                  * reduces this to two cases.
294                  */
295                 if (cfs_rq->tg->parent &&
296                     cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
297                         list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
298                                 &rq_of(cfs_rq)->leaf_cfs_rq_list);
299                 } else {
300                         list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
301                                 &rq_of(cfs_rq)->leaf_cfs_rq_list);
302                 }
303 
304                 cfs_rq->on_list = 1;
305         }
306 }
307 
308 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
309 {
310         if (cfs_rq->on_list) {
311                 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
312                 cfs_rq->on_list = 0;
313         }
314 }
315 
316 /* Iterate through all leaf cfs_rq's on a runqueue */
317 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
318         list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
319 
320 /* Do the two (enqueued) entities belong to the same group? */
321 static inline struct cfs_rq *
322 is_same_group(struct sched_entity *se, struct sched_entity *pse)
323 {
324         if (se->cfs_rq == pse->cfs_rq)
325                 return se->cfs_rq;
326 
327         return NULL;
328 }
329 
330 static inline struct sched_entity *parent_entity(struct sched_entity *se)
331 {
332         return se->parent;
333 }
334 
335 static void
336 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
337 {
338         int se_depth, pse_depth;
339 
340         /*
341          * A preemption test can be made between sibling entities that are
342          * in the same cfs_rq, i.e. that have a common parent. Walk up the
343          * hierarchy of both tasks until we find ancestors that are siblings
344          * under a common parent.
345          */
346 
347         /* First walk up until both entities are at same depth */
348         se_depth = (*se)->depth;
349         pse_depth = (*pse)->depth;
350 
351         while (se_depth > pse_depth) {
352                 se_depth--;
353                 *se = parent_entity(*se);
354         }
355 
356         while (pse_depth > se_depth) {
357                 pse_depth--;
358                 *pse = parent_entity(*pse);
359         }
360 
361         while (!is_same_group(*se, *pse)) {
362                 *se = parent_entity(*se);
363                 *pse = parent_entity(*pse);
364         }
365 }
366 
367 #else   /* !CONFIG_FAIR_GROUP_SCHED */
368 
369 static inline struct task_struct *task_of(struct sched_entity *se)
370 {
371         return container_of(se, struct task_struct, se);
372 }
373 
374 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
375 {
376         return container_of(cfs_rq, struct rq, cfs);
377 }
378 
379 #define entity_is_task(se)      1
380 
381 #define for_each_sched_entity(se) \
382                 for (; se; se = NULL)
383 
384 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
385 {
386         return &task_rq(p)->cfs;
387 }
388 
389 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
390 {
391         struct task_struct *p = task_of(se);
392         struct rq *rq = task_rq(p);
393 
394         return &rq->cfs;
395 }
396 
397 /* runqueue "owned" by this group */
398 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
399 {
400         return NULL;
401 }
402 
403 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
404 {
405 }
406 
407 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
408 {
409 }
410 
411 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
412                 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
413 
414 static inline struct sched_entity *parent_entity(struct sched_entity *se)
415 {
416         return NULL;
417 }
418 
419 static inline void
420 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
421 {
422 }
423 
424 #endif  /* CONFIG_FAIR_GROUP_SCHED */
425 
426 static __always_inline
427 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
428 
429 /**************************************************************
430  * Scheduling class tree data structure manipulation methods:
431  */
432 
433 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
434 {
435         s64 delta = (s64)(vruntime - max_vruntime);
436         if (delta > 0)
437                 max_vruntime = vruntime;
438 
439         return max_vruntime;
440 }
441 
442 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
443 {
444         s64 delta = (s64)(vruntime - min_vruntime);
445         if (delta < 0)
446                 min_vruntime = vruntime;
447 
448         return min_vruntime;
449 }
450 
451 static inline int entity_before(struct sched_entity *a,
452                                 struct sched_entity *b)
453 {
454         return (s64)(a->vruntime - b->vruntime) < 0;
455 }
456 
457 static void update_min_vruntime(struct cfs_rq *cfs_rq)
458 {
459         u64 vruntime = cfs_rq->min_vruntime;
460 
461         if (cfs_rq->curr)
462                 vruntime = cfs_rq->curr->vruntime;
463 
464         if (cfs_rq->rb_leftmost) {
465                 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
466                                                    struct sched_entity,
467                                                    run_node);
468 
469                 if (!cfs_rq->curr)
470                         vruntime = se->vruntime;
471                 else
472                         vruntime = min_vruntime(vruntime, se->vruntime);
473         }
474 
475         /* ensure we never gain time by being placed backwards. */
476         cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
477 #ifndef CONFIG_64BIT
478         smp_wmb();
479         cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
480 #endif
481 }
482 
483 /*
484  * Enqueue an entity into the rb-tree:
485  */
486 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
487 {
488         struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
489         struct rb_node *parent = NULL;
490         struct sched_entity *entry;
491         int leftmost = 1;
492 
493         /*
494          * Find the right place in the rbtree:
495          */
496         while (*link) {
497                 parent = *link;
498                 entry = rb_entry(parent, struct sched_entity, run_node);
499                 /*
500                  * We don't care about collisions. Nodes with
501                  * the same key stay together.
502                  */
503                 if (entity_before(se, entry)) {
504                         link = &parent->rb_left;
505                 } else {
506                         link = &parent->rb_right;
507                         leftmost = 0;
508                 }
509         }
510 
511         /*
512          * Maintain a cache of leftmost tree entries (it is frequently
513          * used):
514          */
515         if (leftmost)
516                 cfs_rq->rb_leftmost = &se->run_node;
517 
518         rb_link_node(&se->run_node, parent, link);
519         rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
520 }
521 
522 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
523 {
524         if (cfs_rq->rb_leftmost == &se->run_node) {
525                 struct rb_node *next_node;
526 
527                 next_node = rb_next(&se->run_node);
528                 cfs_rq->rb_leftmost = next_node;
529         }
530 
531         rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
532 }
533 
534 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
535 {
536         struct rb_node *left = cfs_rq->rb_leftmost;
537 
538         if (!left)
539                 return NULL;
540 
541         return rb_entry(left, struct sched_entity, run_node);
542 }
543 
544 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
545 {
546         struct rb_node *next = rb_next(&se->run_node);
547 
548         if (!next)
549                 return NULL;
550 
551         return rb_entry(next, struct sched_entity, run_node);
552 }
553 
554 #ifdef CONFIG_SCHED_DEBUG
555 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
556 {
557         struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
558 
559         if (!last)
560                 return NULL;
561 
562         return rb_entry(last, struct sched_entity, run_node);
563 }
564 
565 /**************************************************************
566  * Scheduling class statistics methods:
567  */
568 
569 int sched_proc_update_handler(struct ctl_table *table, int write,
570                 void __user *buffer, size_t *lenp,
571                 loff_t *ppos)
572 {
573         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
574         unsigned int factor = get_update_sysctl_factor();
575 
576         if (ret || !write)
577                 return ret;
578 
579         sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
580                                         sysctl_sched_min_granularity);
581 
582 #define WRT_SYSCTL(name) \
583         (normalized_sysctl_##name = sysctl_##name / (factor))
584         WRT_SYSCTL(sched_min_granularity);
585         WRT_SYSCTL(sched_latency);
586         WRT_SYSCTL(sched_wakeup_granularity);
587 #undef WRT_SYSCTL
588 
589         return 0;
590 }
591 #endif
592 
593 /*
594  * delta /= w
595  */
596 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
597 {
598         if (unlikely(se->load.weight != NICE_0_LOAD))
599                 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
600 
601         return delta;
602 }
603 
604 /*
605  * The idea is to set a period in which each task runs once.
606  *
607  * When there are too many tasks (sched_nr_latency) we have to stretch
 608  * When there are too many tasks (more than sched_nr_latency) we have to
 609  * stretch this period because otherwise the slices get too small.
610  * p = (nr <= nl) ? l : l*nr/nl
611  */
612 static u64 __sched_period(unsigned long nr_running)
613 {
614         if (unlikely(nr_running > sched_nr_latency))
615                 return nr_running * sysctl_sched_min_granularity;
616         else
617                 return sysctl_sched_latency;
618 }
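
/*
 * Worked example (illustrative, using the unscaled defaults of 6ms latency,
 * 0.75ms min granularity and sched_nr_latency = 8): with 6 runnable tasks
 * the period is 6ms; with 16 runnable tasks it is stretched to
 * 16 * 0.75ms = 12ms so that every task still gets at least the minimum
 * granularity.
 */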
619 
620 /*
621  * We calculate the wall-time slice from the period by taking a part
622  * proportional to the weight.
623  *
624  * s = p*P[w/rw]
625  */
626 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
627 {
628         u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
629 
630         for_each_sched_entity(se) {
631                 struct load_weight *load;
632                 struct load_weight lw;
633 
634                 cfs_rq = cfs_rq_of(se);
635                 load = &cfs_rq->load;
636 
637                 if (unlikely(!se->on_rq)) {
638                         lw = cfs_rq->load;
639 
640                         update_load_add(&lw, se->load.weight);
641                         load = &lw;
642                 }
643                 slice = __calc_delta(slice, se->load.weight, load);
644         }
645         return slice;
646 }
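
/*
 * Worked example (illustrative): two runnable tasks on one cfs_rq, one at
 * nice 0 (weight 1024) and one at nice 5 (weight 335), share a 6ms period
 * in proportion to weight: roughly 6ms * 1024/1359 ~= 4.5ms for the nice-0
 * task and 6ms * 335/1359 ~= 1.5ms for the nice-5 task.
 */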
647 
648 /*
649  * We calculate the vruntime slice of a to-be-inserted task.
650  *
651  * vs = s/w
652  */
653 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
654 {
655         return calc_delta_fair(sched_slice(cfs_rq, se), se);
656 }
657 
658 #ifdef CONFIG_SMP
659 static int select_idle_sibling(struct task_struct *p, int cpu);
660 static unsigned long task_h_load(struct task_struct *p);
661 
662 /*
663  * We choose a half-life close to 1 scheduling period.
664  * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
665  * dependent on this value.
666  */
667 #define LOAD_AVG_PERIOD 32
668 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
669 #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
670 
 671 /* Give a new sched_entity initial runnable-average values so its load is weighted heavily during its infancy */
672 void init_entity_runnable_average(struct sched_entity *se)
673 {
674         struct sched_avg *sa = &se->avg;
675 
676         sa->last_update_time = 0;
677         /*
 678          * sched_avg's period_contrib should be strictly less than 1024, so
 679          * we give it 1023 to make sure it is almost a full period (1024us), and
 680          * will definitely be updated (after enqueue).
681          */
682         sa->period_contrib = 1023;
683         sa->load_avg = scale_load_down(se->load.weight);
684         sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
685         sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
686         sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
 687         /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
688 }
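
/*
 * Rough derivation of LOAD_AVG_MAX (illustrative): each 1024us period
 * contributes up to 1024 and is decayed by y per period with y^32 = 0.5
 * (a half-life of 32 periods), so the geometric sum tends towards
 * 1024 / (1 - y) ~= 47788; integer decay over LOAD_AVG_MAX_N = 345 full
 * periods gives the 47742 used above. Seeding load_sum and util_sum with
 * this maximum makes a new entity look fully loaded until real history
 * accumulates.
 */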
689 
690 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
691 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
692 #else
693 void init_entity_runnable_average(struct sched_entity *se)
694 {
695 }
696 #endif
697 
698 /*
699  * Update the current task's runtime statistics.
700  */
701 static void update_curr(struct cfs_rq *cfs_rq)
702 {
703         struct sched_entity *curr = cfs_rq->curr;
704         u64 now = rq_clock_task(rq_of(cfs_rq));
705         u64 delta_exec;
706 
707         if (unlikely(!curr))
708                 return;
709 
710         delta_exec = now - curr->exec_start;
711         if (unlikely((s64)delta_exec <= 0))
712                 return;
713 
714         curr->exec_start = now;
715 
716         schedstat_set(curr->statistics.exec_max,
717                       max(delta_exec, curr->statistics.exec_max));
718 
719         curr->sum_exec_runtime += delta_exec;
720         schedstat_add(cfs_rq, exec_clock, delta_exec);
721 
722         curr->vruntime += calc_delta_fair(delta_exec, curr);
723         update_min_vruntime(cfs_rq);
724 
725         if (entity_is_task(curr)) {
726                 struct task_struct *curtask = task_of(curr);
727 
728                 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
729                 cpuacct_charge(curtask, delta_exec);
730                 account_group_exec_runtime(curtask, delta_exec);
731         }
732 
733         account_cfs_rq_runtime(cfs_rq, delta_exec);
734 }
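
/*
 * Illustrative note: since calc_delta_fair() scales delta_exec by
 * NICE_0_LOAD / se->load.weight, a heavier task accumulates vruntime more
 * slowly. For example, a nice -5 task (weight 3121) sees its vruntime
 * advance at roughly 1024/3121 ~= 1/3 of wall-clock time, so it keeps the
 * leftmost position in the timeline longer than a nice-0 task would.
 */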
735 
736 static void update_curr_fair(struct rq *rq)
737 {
738         update_curr(cfs_rq_of(&rq->curr->se));
739 }
740 
741 #ifdef CONFIG_SCHEDSTATS
742 static inline void
743 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
744 {
745         u64 wait_start = rq_clock(rq_of(cfs_rq));
746 
747         if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
748             likely(wait_start > se->statistics.wait_start))
749                 wait_start -= se->statistics.wait_start;
750 
751         se->statistics.wait_start = wait_start;
752 }
753 
754 static void
755 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
756 {
757         struct task_struct *p;
758         u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
759 
760         if (entity_is_task(se)) {
761                 p = task_of(se);
762                 if (task_on_rq_migrating(p)) {
763                         /*
764                          * Preserve migrating task's wait time so wait_start
765                          * time stamp can be adjusted to accumulate wait time
766                          * prior to migration.
767                          */
768                         se->statistics.wait_start = delta;
769                         return;
770                 }
771                 trace_sched_stat_wait(p, delta);
772         }
773 
774         se->statistics.wait_max = max(se->statistics.wait_max, delta);
775         se->statistics.wait_count++;
776         se->statistics.wait_sum += delta;
777         se->statistics.wait_start = 0;
778 }
779 #else
780 static inline void
781 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
782 {
783 }
784 
785 static inline void
786 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
787 {
788 }
789 #endif
790 
791 /*
792  * Task is being enqueued - update stats:
793  */
794 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
795 {
796         /*
797          * Are we enqueueing a waiting task? (for current tasks
798          * a dequeue/enqueue event is a NOP)
799          */
800         if (se != cfs_rq->curr)
801                 update_stats_wait_start(cfs_rq, se);
802 }
803 
804 static inline void
805 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
806 {
807         /*
808          * Mark the end of the wait period if dequeueing a
809          * waiting task:
810          */
811         if (se != cfs_rq->curr)
812                 update_stats_wait_end(cfs_rq, se);
813 }
814 
815 /*
816  * We are picking a new current task - update its stats:
817  */
818 static inline void
819 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
820 {
821         /*
822          * We are starting a new run period:
823          */
824         se->exec_start = rq_clock_task(rq_of(cfs_rq));
825 }
826 
827 /**************************************************
828  * Scheduling class queueing methods:
829  */
830 
831 #ifdef CONFIG_NUMA_BALANCING
832 /*
833  * Approximate time to scan a task's full address space, in ms. The task scan
834  * period is calculated based on the task's virtual memory size and
835  * numa_balancing_scan_size.
836  */
837 unsigned int sysctl_numa_balancing_scan_period_min = 1000;
838 unsigned int sysctl_numa_balancing_scan_period_max = 60000;
839 
840 /* Portion of address space to scan in MB */
841 unsigned int sysctl_numa_balancing_scan_size = 256;
842 
843 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
844 unsigned int sysctl_numa_balancing_scan_delay = 1000;
845 
846 static unsigned int task_nr_scan_windows(struct task_struct *p)
847 {
848         unsigned long rss = 0;
849         unsigned long nr_scan_pages;
850 
851         /*
852          * Calculations based on RSS as non-present and empty pages are skipped
853          * by the PTE scanner and NUMA hinting faults should be trapped based
854          * on resident pages
855          */
856         nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
857         rss = get_mm_rss(p->mm);
858         if (!rss)
859                 rss = nr_scan_pages;
860 
861         rss = round_up(rss, nr_scan_pages);
862         return rss / nr_scan_pages;
863 }
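
/*
 * Worked example (illustrative): with the default scan size of 256MB and
 * 4KB pages, nr_scan_pages = 256 << (20 - 12) = 65536 pages. A task with
 * an RSS of 1GB (262144 pages) therefore needs
 * round_up(262144, 65536) / 65536 = 4 scan windows to cover its resident
 * memory.
 */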
864 
865 /* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
866 #define MAX_SCAN_WINDOW 2560
867 
868 static unsigned int task_scan_min(struct task_struct *p)
869 {
870         unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
871         unsigned int scan, floor;
872         unsigned int windows = 1;
873 
874         if (scan_size < MAX_SCAN_WINDOW)
875                 windows = MAX_SCAN_WINDOW / scan_size;
876         floor = 1000 / windows;
877 
878         scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
879         return max_t(unsigned int, floor, scan);
880 }
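
/*
 * Worked example (illustrative, continuing the 1GB-RSS case above): with
 * scan_size = 256, windows = 2560/256 = 10 and floor = 1000/10 = 100ms;
 * scan = 1000ms / 4 windows = 250ms, so task_scan_min() returns 250ms.
 */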
881 
882 static unsigned int task_scan_max(struct task_struct *p)
883 {
884         unsigned int smin = task_scan_min(p);
885         unsigned int smax;
886 
887         /* Watch for min being lower than max due to floor calculations */
888         smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
889         return max(smin, smax);
890 }
891 
892 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
893 {
894         rq->nr_numa_running += (p->numa_preferred_nid != -1);
895         rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
896 }
897 
898 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
899 {
900         rq->nr_numa_running -= (p->numa_preferred_nid != -1);
901         rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
902 }
903 
904 struct numa_group {
905         atomic_t refcount;
906 
907         spinlock_t lock; /* nr_tasks, tasks */
908         int nr_tasks;
909         pid_t gid;
910 
911         struct rcu_head rcu;
912         nodemask_t active_nodes;
913         unsigned long total_faults;
914         /*
915          * Faults_cpu is used to decide whether memory should move
916          * towards the CPU. As a consequence, these stats are weighted
917          * more by CPU use than by memory faults.
918          */
919         unsigned long *faults_cpu;
920         unsigned long faults[0];
921 };
922 
923 /* Shared or private faults. */
924 #define NR_NUMA_HINT_FAULT_TYPES 2
925 
926 /* Memory and CPU locality */
927 #define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
928 
929 /* Averaged statistics, and temporary buffers. */
930 #define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
931 
932 pid_t task_numa_group_id(struct task_struct *p)
933 {
934         return p->numa_group ? p->numa_group->gid : 0;
935 }
936 
937 /*
938  * The averaged statistics, shared & private, memory & cpu,
939  * occupy the first half of the array. The second half of the
940  * array is for current counters, which are averaged into the
941  * first set by task_numa_placement.
942  */
943 static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
944 {
945         return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
946 }
947 
948 static inline unsigned long task_faults(struct task_struct *p, int nid)
949 {
950         if (!p->numa_faults)
951                 return 0;
952 
953         return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
954                 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
955 }
956 
957 static inline unsigned long group_faults(struct task_struct *p, int nid)
958 {
959         if (!p->numa_group)
960                 return 0;
961 
962         return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
963                 p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
964 }
965 
966 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
967 {
968         return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
969                 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
970 }
971 
972 /* Handle placement on systems where not all nodes are directly connected. */
973 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
974                                         int maxdist, bool task)
975 {
976         unsigned long score = 0;
977         int node;
978 
979         /*
980          * All nodes are directly connected, and the same distance
981          * from each other. No need for fancy placement algorithms.
982          */
983         if (sched_numa_topology_type == NUMA_DIRECT)
984                 return 0;
985 
986         /*
987          * This code is called for each node, introducing N^2 complexity,
988          * which should be ok given the number of nodes rarely exceeds 8.
989          */
990         for_each_online_node(node) {
991                 unsigned long faults;
992                 int dist = node_distance(nid, node);
993 
994                 /*
995                  * The furthest away nodes in the system are not interesting
996                  * for placement; nid was already counted.
997                  */
998                 if (dist == sched_max_numa_distance || node == nid)
999                         continue;
1000 
1001                 /*
1002                  * On systems with a backplane NUMA topology, compare groups
1003                  * of nodes, and move tasks towards the group with the most
1004                  * memory accesses. When comparing two nodes at distance
1005                  * "hoplimit", only nodes closer by than "hoplimit" are part
1006                  * of each group. Skip other nodes.
1007                  */
1008                 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1009                                         dist > maxdist)
1010                         continue;
1011 
1012                 /* Add up the faults from nearby nodes. */
1013                 if (task)
1014                         faults = task_faults(p, node);
1015                 else
1016                         faults = group_faults(p, node);
1017 
1018                 /*
1019                  * On systems with a glueless mesh NUMA topology, there are
1020                  * no fixed "groups of nodes". Instead, nodes that are not
1021                  * directly connected bounce traffic through intermediate
1022                  * nodes; a numa_group can occupy any set of nodes.
1023                  * The further away a node is, the less the faults count.
1024                  * This seems to result in good task placement.
1025                  */
1026                 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1027                         faults *= (sched_max_numa_distance - dist);
1028                         faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1029                 }
1030 
1031                 score += faults;
1032         }
1033 
1034         return score;
1035 }
1036 
1037 /*
1038  * These return the fraction of accesses done by a particular task, or
1039  * task group, on a particular numa node.  The group weight is given a
1040  * larger multiplier, in order to group tasks together that are almost
1041  * evenly spread out between numa nodes.
1042  */
1043 static inline unsigned long task_weight(struct task_struct *p, int nid,
1044                                         int dist)
1045 {
1046         unsigned long faults, total_faults;
1047 
1048         if (!p->numa_faults)
1049                 return 0;
1050 
1051         total_faults = p->total_numa_faults;
1052 
1053         if (!total_faults)
1054                 return 0;
1055 
1056         faults = task_faults(p, nid);
1057         faults += score_nearby_nodes(p, nid, dist, true);
1058 
1059         return 1000 * faults / total_faults;
1060 }
1061 
1062 static inline unsigned long group_weight(struct task_struct *p, int nid,
1063                                          int dist)
1064 {
1065         unsigned long faults, total_faults;
1066 
1067         if (!p->numa_group)
1068                 return 0;
1069 
1070         total_faults = p->numa_group->total_faults;
1071 
1072         if (!total_faults)
1073                 return 0;
1074 
1075         faults = group_faults(p, nid);
1076         faults += score_nearby_nodes(p, nid, dist, false);
1077 
1078         return 1000 * faults / total_faults;
1079 }
1080 
1081 bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1082                                 int src_nid, int dst_cpu)
1083 {
1084         struct numa_group *ng = p->numa_group;
1085         int dst_nid = cpu_to_node(dst_cpu);
1086         int last_cpupid, this_cpupid;
1087 
1088         this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1089 
1090         /*
1091          * Multi-stage node selection is used in conjunction with a periodic
1092          * migration fault to build a temporal task<->page relation. By using
1093          * a two-stage filter we remove short/unlikely relations.
1094          *
1095          * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1096          * a task's usage of a particular page (n_p) per total usage of this
1097          * page (n_t) (in a given time-span) to a probability.
1098          *
1099          * Our periodic faults will sample this probability and getting the
1100          * same result twice in a row, given these samples are fully
1101          * independent, is then given by P(p)^2, provided our sample period
1102          * is sufficiently short compared to the usage pattern.
1103          *
1104          * This squaring squishes small probabilities, making it less likely we
1105          * act on an unlikely task<->page relation.
1106          */
1107         last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1108         if (!cpupid_pid_unset(last_cpupid) &&
1109                                 cpupid_to_nid(last_cpupid) != dst_nid)
1110                 return false;
1111 
1112         /* Always allow migrate on private faults */
1113         if (cpupid_match_pid(p, last_cpupid))
1114                 return true;
1115 
1116         /* A shared fault, but p->numa_group has not been set up yet. */
1117         if (!ng)
1118                 return true;
1119 
1120         /*
1121          * Do not migrate if the destination is not a node that
1122          * is actively used by this numa group.
1123          */
1124         if (!node_isset(dst_nid, ng->active_nodes))
1125                 return false;
1126 
1127         /*
1128          * Source is a node that is not actively used by this
1129          * numa group, while the destination is. Migrate.
1130          */
1131         if (!node_isset(src_nid, ng->active_nodes))
1132                 return true;
1133 
1134         /*
1135          * Both source and destination are nodes in active
1136          * use by this numa group. Maximize memory bandwidth
1137          * by migrating from more heavily used groups, to less
1138          * heavily used ones, spreading the load around.
1139          * Use a 1/4 hysteresis to avoid spurious page movement.
1140          */
1141         return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
1142 }
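
/*
 * Worked example for the hysteresis above (illustrative): if the group has
 * recorded 400 faults on src_nid and 250 on dst_nid, then 250 < 400 * 3/4 = 300
 * and the page is migrated; with 350 faults on dst_nid it stays put, which
 * avoids bouncing pages between two almost equally busy nodes.
 */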
1143 
1144 static unsigned long weighted_cpuload(const int cpu);
1145 static unsigned long source_load(int cpu, int type);
1146 static unsigned long target_load(int cpu, int type);
1147 static unsigned long capacity_of(int cpu);
1148 static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
1149 
1150 /* Cached statistics for all CPUs within a node */
1151 struct numa_stats {
1152         unsigned long nr_running;
1153         unsigned long load;
1154 
1155         /* Total compute capacity of CPUs on a node */
1156         unsigned long compute_capacity;
1157 
1158         /* Approximate capacity in terms of runnable tasks on a node */
1159         unsigned long task_capacity;
1160         int has_free_capacity;
1161 };
1162 
1163 /*
1164  * XXX borrowed from update_sg_lb_stats
1165  */
1166 static void update_numa_stats(struct numa_stats *ns, int nid)
1167 {
1168         int smt, cpu, cpus = 0;
1169         unsigned long capacity;
1170 
1171         memset(ns, 0, sizeof(*ns));
1172         for_each_cpu(cpu, cpumask_of_node(nid)) {
1173                 struct rq *rq = cpu_rq(cpu);
1174 
1175                 ns->nr_running += rq->nr_running;
1176                 ns->load += weighted_cpuload(cpu);
1177                 ns->compute_capacity += capacity_of(cpu);
1178 
1179                 cpus++;
1180         }
1181 
1182         /*
1183          * If we raced with hotplug and there are no CPUs left in our mask
1184          * the @ns structure stays zeroed and task_numa_compare() will
1185          * not find this node attractive.
1186          *
1187          * We'll either bail at !has_free_capacity, or we'll detect a huge
1188          * imbalance and bail there.
1189          */
1190         if (!cpus)
1191                 return;
1192 
1193         /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1194         smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1195         capacity = cpus / smt; /* cores */
1196 
1197         ns->task_capacity = min_t(unsigned, capacity,
1198                 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1199         ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1200 }
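
/*
 * Worked example (illustrative): a node with 4 non-SMT CPUs, each reporting
 * the full capacity of SCHED_CAPACITY_SCALE (1024), ends up with
 * compute_capacity = 4096, smt = DIV_ROUND_UP(1024 * 4, 4096) = 1,
 * capacity = 4 cores and task_capacity = min(4, 4096/1024) = 4, so
 * has_free_capacity is set while fewer than 4 tasks run on the node.
 */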
1201 
1202 struct task_numa_env {
1203         struct task_struct *p;
1204 
1205         int src_cpu, src_nid;
1206         int dst_cpu, dst_nid;
1207 
1208         struct numa_stats src_stats, dst_stats;
1209 
1210         int imbalance_pct;
1211         int dist;
1212 
1213         struct task_struct *best_task;
1214         long best_imp;
1215         int best_cpu;
1216 };
1217 
1218 static void task_numa_assign(struct task_numa_env *env,
1219                              struct task_struct *p, long imp)
1220 {
1221         if (env->best_task)
1222                 put_task_struct(env->best_task);
1223 
1224         env->best_task = p;
1225         env->best_imp = imp;
1226         env->best_cpu = env->dst_cpu;
1227 }
1228 
1229 static bool load_too_imbalanced(long src_load, long dst_load,
1230                                 struct task_numa_env *env)
1231 {
1232         long imb, old_imb;
1233         long orig_src_load, orig_dst_load;
1234         long src_capacity, dst_capacity;
1235 
1236         /*
1237          * The load is corrected for the CPU capacity available on each node.
1238          *
1239          * src_load        dst_load
1240          * ------------ vs ---------
1241          * src_capacity    dst_capacity
1242          */
1243         src_capacity = env->src_stats.compute_capacity;
1244         dst_capacity = env->dst_stats.compute_capacity;
1245 
1246         /* We care about the slope of the imbalance, not the direction. */
1247         if (dst_load < src_load)
1248                 swap(dst_load, src_load);
1249 
1250         /* Is the difference below the threshold? */
1251         imb = dst_load * src_capacity * 100 -
1252               src_load * dst_capacity * env->imbalance_pct;
1253         if (imb <= 0)
1254                 return false;
1255 
1256         /*
1257          * The imbalance is above the allowed threshold.
1258          * Compare it with the old imbalance.
1259          */
1260         orig_src_load = env->src_stats.load;
1261         orig_dst_load = env->dst_stats.load;
1262 
1263         if (orig_dst_load < orig_src_load)
1264                 swap(orig_dst_load, orig_src_load);
1265 
1266         old_imb = orig_dst_load * src_capacity * 100 -
1267                   orig_src_load * dst_capacity * env->imbalance_pct;
1268 
1269         /* Would this change make things worse? */
1270         return (imb > old_imb);
1271 }
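
/*
 * Worked example (illustrative): with equal node capacities and the
 * imbalance_pct of 112 used by task_numa_migrate(), loads of 1000 vs 1100
 * give imb = 1100 * 100 - 1000 * 112 < 0, so the move is allowed; loads of
 * 1000 vs 1150 are over the threshold and are only accepted if the
 * pre-existing imbalance (old_imb) was already at least as bad.
 */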
1272 
1273 /*
1274  * This checks if the overall compute and NUMA accesses of the system would
1275  * be improved if the source task was migrated to the target dst_cpu, taking
1276  * into account that it might be best if the task running on the dst_cpu were
1277  * exchanged with the source task.
1278  */
1279 static void task_numa_compare(struct task_numa_env *env,
1280                               long taskimp, long groupimp)
1281 {
1282         struct rq *src_rq = cpu_rq(env->src_cpu);
1283         struct rq *dst_rq = cpu_rq(env->dst_cpu);
1284         struct task_struct *cur;
1285         long src_load, dst_load;
1286         long load;
1287         long imp = env->p->numa_group ? groupimp : taskimp;
1288         long moveimp = imp;
1289         int dist = env->dist;
1290         bool assigned = false;
1291 
1292         rcu_read_lock();
1293 
1294         raw_spin_lock_irq(&dst_rq->lock);
1295         cur = dst_rq->curr;
1296         /*
1297          * No need to move the exiting task or idle task.
1298          */
1299         if ((cur->flags & PF_EXITING) || is_idle_task(cur))
1300                 cur = NULL;
1301         else {
1302                 /*
1303                  * The task_struct must be protected here to protect the
1304                  * p->numa_faults access in the task_weight since the
1305                  * numa_faults could already be freed in the following path:
1306                  * finish_task_switch()
1307                  *     --> put_task_struct()
1308                  *         --> __put_task_struct()
1309                  *             --> task_numa_free()
1310                  */
1311                 get_task_struct(cur);
1312         }
1313 
1314         raw_spin_unlock_irq(&dst_rq->lock);
1315 
1316         /*
1317          * Because we have preemption enabled we can get migrated around and
1318          * end up trying to select ourselves (current == env->p) as a swap candidate.
1319          */
1320         if (cur == env->p)
1321                 goto unlock;
1322 
1323         /*
1324          * "imp" is the fault differential for the source task between the
1325          * source and destination node. Calculate the total differential for
1326          * the source task and potential destination task. The more negative
1327          * the value is, the more remote accesses would be expected to
1328          * be incurred if the tasks were swapped.
1329          */
1330         if (cur) {
1331                 /* Skip this swap candidate if it cannot move to the source cpu */
1332                 if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1333                         goto unlock;
1334 
1335                 /*
1336                  * If dst and source tasks are in the same NUMA group, or not
1337                  * in any group then look only at task weights.
1338                  */
1339                 if (cur->numa_group == env->p->numa_group) {
1340                         imp = taskimp + task_weight(cur, env->src_nid, dist) -
1341                               task_weight(cur, env->dst_nid, dist);
1342                         /*
1343                          * Add some hysteresis to prevent swapping the
1344                          * tasks within a group over tiny differences.
1345                          */
1346                         if (cur->numa_group)
1347                                 imp -= imp/16;
1348                 } else {
1349                         /*
1350                          * Compare the group weights. If a task is all by
1351                          * itself (not part of a group), use the task weight
1352                          * instead.
1353                          */
1354                         if (cur->numa_group)
1355                                 imp += group_weight(cur, env->src_nid, dist) -
1356                                        group_weight(cur, env->dst_nid, dist);
1357                         else
1358                                 imp += task_weight(cur, env->src_nid, dist) -
1359                                        task_weight(cur, env->dst_nid, dist);
1360                 }
1361         }
1362 
1363         if (imp <= env->best_imp && moveimp <= env->best_imp)
1364                 goto unlock;
1365 
1366         if (!cur) {
1367                 /* Is there capacity at our destination? */
1368                 if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
1369                     !env->dst_stats.has_free_capacity)
1370                         goto unlock;
1371 
1372                 goto balance;
1373         }
1374 
1375         /* Balance doesn't matter much if we're running a task per cpu */
1376         if (imp > env->best_imp && src_rq->nr_running == 1 &&
1377                         dst_rq->nr_running == 1)
1378                 goto assign;
1379 
1380         /*
1381          * In the overloaded case, try and keep the load balanced.
1382          */
1383 balance:
1384         load = task_h_load(env->p);
1385         dst_load = env->dst_stats.load + load;
1386         src_load = env->src_stats.load - load;
1387 
1388         if (moveimp > imp && moveimp > env->best_imp) {
1389                 /*
1390                  * If the improvement from just moving env->p (without a swap) is
1391                  * better than swapping tasks around, check if a move is
1392                  * possible. Store a slightly smaller score than moveimp,
1393                  * so an actually idle CPU will win.
1394                  */
1395                 if (!load_too_imbalanced(src_load, dst_load, env)) {
1396                         imp = moveimp - 1;
1397                         put_task_struct(cur);
1398                         cur = NULL;
1399                         goto assign;
1400                 }
1401         }
1402 
1403         if (imp <= env->best_imp)
1404                 goto unlock;
1405 
1406         if (cur) {
1407                 load = task_h_load(cur);
1408                 dst_load -= load;
1409                 src_load += load;
1410         }
1411 
1412         if (load_too_imbalanced(src_load, dst_load, env))
1413                 goto unlock;
1414 
1415         /*
1416          * One idle CPU per node is evaluated for a task numa move.
1417          * Call select_idle_sibling to maybe find a better one.
1418          */
1419         if (!cur)
1420                 env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
1421 
1422 assign:
1423         assigned = true;
1424         task_numa_assign(env, cur, imp);
1425 unlock:
1426         rcu_read_unlock();
1427         /*
1428          * If cur was never assigned as the best task, the reference taken
1429          * above to protect its task_struct is no longer needed; drop it.
1430          */
1431         if (cur && !assigned)
1432                 put_task_struct(cur);
1433 }
1434 
1435 static void task_numa_find_cpu(struct task_numa_env *env,
1436                                 long taskimp, long groupimp)
1437 {
1438         int cpu;
1439 
1440         for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1441                 /* Skip this CPU if the source task cannot migrate */
1442                 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1443                         continue;
1444 
1445                 env->dst_cpu = cpu;
1446                 task_numa_compare(env, taskimp, groupimp);
1447         }
1448 }
1449 
1450 /* Only move tasks to a NUMA node less busy than the current node. */
1451 static bool numa_has_capacity(struct task_numa_env *env)
1452 {
1453         struct numa_stats *src = &env->src_stats;
1454         struct numa_stats *dst = &env->dst_stats;
1455 
1456         if (src->has_free_capacity && !dst->has_free_capacity)
1457                 return false;
1458 
1459         /*
1460          * Only consider a task move if the source has a higher load
1461          * than the destination, corrected for CPU capacity on each node.
1462          *
1463          *      src->load                dst->load
1464          * --------------------- vs ---------------------
1465          * src->compute_capacity    dst->compute_capacity
1466          */
1467         if (src->load * dst->compute_capacity * env->imbalance_pct >
1468             dst->load * src->compute_capacity * 100)
1470                 return true;
1471 
1472         return false;
1473 }
1474 
1475 static int task_numa_migrate(struct task_struct *p)
1476 {
1477         struct task_numa_env env = {
1478                 .p = p,
1479 
1480                 .src_cpu = task_cpu(p),
1481                 .src_nid = task_node(p),
1482 
1483                 .imbalance_pct = 112,
1484 
1485                 .best_task = NULL,
1486                 .best_imp = 0,
1487                 .best_cpu = -1
1488         };
1489         struct sched_domain *sd;
1490         unsigned long taskweight, groupweight;
1491         int nid, ret, dist;
1492         long taskimp, groupimp;
1493 
1494         /*
1495          * Pick the lowest SD_NUMA domain, as that would have the smallest
1496          * imbalance and would be the first to start moving tasks about.
1497          *
1498          * And we want to avoid any moving of tasks about, as that would create
1499          * random movement of tasks -- countering the numa conditions we're trying
1500          * to satisfy here.
1501          */
1502         rcu_read_lock();
1503         sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1504         if (sd)
1505                 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1506         rcu_read_unlock();
1507 
1508         /*
1509          * Cpusets can break the scheduler domain tree into smaller
1510          * balance domains, some of which do not cross NUMA boundaries.
1511          * Tasks that are "trapped" in such domains cannot be migrated
1512          * elsewhere, so there is no point in (re)trying.
1513          */
1514         if (unlikely(!sd)) {
1515                 p->numa_preferred_nid = task_node(p);
1516                 return -EINVAL;
1517         }
1518 
1519         env.dst_nid = p->numa_preferred_nid;
1520         dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1521         taskweight = task_weight(p, env.src_nid, dist);
1522         groupweight = group_weight(p, env.src_nid, dist);
1523         update_numa_stats(&env.src_stats, env.src_nid);
1524         taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1525         groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1526         update_numa_stats(&env.dst_stats, env.dst_nid);
1527 
1528         /* Try to find a spot on the preferred nid. */
1529         if (numa_has_capacity(&env))
1530                 task_numa_find_cpu(&env, taskimp, groupimp);
1531 
1532         /*
1533          * Look at other nodes in these cases:
1534          * - there is no space available on the preferred_nid
1535          * - the task is part of a numa_group that is interleaved across
1536          *   multiple NUMA nodes; in order to better consolidate the group,
1537          *   we need to check other locations.
1538          */
1539         if (env.best_cpu == -1 || (p->numa_group &&
1540                         nodes_weight(p->numa_group->active_nodes) > 1)) {
1541                 for_each_online_node(nid) {
1542                         if (nid == env.src_nid || nid == p->numa_preferred_nid)
1543                                 continue;
1544 
1545                         dist = node_distance(env.src_nid, env.dst_nid);
1546                         if (sched_numa_topology_type == NUMA_BACKPLANE &&
1547                                                 dist != env.dist) {
1548                                 taskweight = task_weight(p, env.src_nid, dist);
1549                                 groupweight = group_weight(p, env.src_nid, dist);
1550                         }
1551 
1552                         /* Only consider nodes where both the task and the group benefit */
1553                         taskimp = task_weight(p, nid, dist) - taskweight;
1554                         groupimp = group_weight(p, nid, dist) - groupweight;
1555                         if (taskimp < 0 && groupimp < 0)
1556                                 continue;
1557 
1558                         env.dist = dist;
1559                         env.dst_nid = nid;
1560                         update_numa_stats(&env.dst_stats, env.dst_nid);
1561                         if (numa_has_capacity(&env))
1562                                 task_numa_find_cpu(&env, taskimp, groupimp);
1563                 }
1564         }
1565 
1566         /*
1567          * If the task is part of a workload that spans multiple NUMA nodes,
1568          * and is migrating into one of the workload's active nodes, remember
1569          * this node as the task's preferred numa node, so the workload can
1570          * settle down.
1571          * A task that migrated to a second choice node will be better off
1572          * trying for a better one later. Do not set the preferred node here.
1573          */
1574         if (p->numa_group) {
1575                 if (env.best_cpu == -1)
1576                         nid = env.src_nid;
1577                 else
1578                         nid = env.dst_nid;
1579 
1580                 if (node_isset(nid, p->numa_group->active_nodes))
1581                         sched_setnuma(p, env.dst_nid);
1582         }
1583 
1584         /* No better CPU than the current one was found. */
1585         if (env.best_cpu == -1)
1586                 return -EAGAIN;
1587 
1588         /*
1589          * Reset the scan period if the task is being rescheduled on an
1590          * alternative node to recheck if the task is now properly placed.
1591          */
1592         p->numa_scan_period = task_scan_min(p);
1593 
1594         if (env.best_task == NULL) {
1595                 ret = migrate_task_to(p, env.best_cpu);
1596                 if (ret != 0)
1597                         trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1598                 return ret;
1599         }
1600 
1601         ret = migrate_swap(p, env.best_task);
1602         if (ret != 0)
1603                 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1604         put_task_struct(env.best_task);
1605         return ret;
1606 }
1607 
1608 /* Attempt to migrate a task to a CPU on the preferred node. */
1609 static void numa_migrate_preferred(struct task_struct *p)
1610 {
1611         unsigned long interval = HZ;
1612 
1613         /* This task has no NUMA fault statistics yet */
1614         if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1615                 return;
1616 
1617         /* Periodically retry migrating the task to the preferred node */
1618         interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1619         p->numa_migrate_retry = jiffies + interval;
1620 
1621         /* Success if task is already running on preferred CPU */
1622         if (task_node(p) == p->numa_preferred_nid)
1623                 return;
1624 
1625         /* Otherwise, try migrate to a CPU on the preferred node */
1626         task_numa_migrate(p);
1627 }
1628 
1629 /*
1630  * Find the nodes on which the workload is actively running. We do this by
1631  * tracking the nodes from which NUMA hinting faults are triggered. This can
1632  * be different from the set of nodes where the workload's memory is currently
1633  * located.
1634  *
1635  * The bitmask is used to make smarter decisions on when to do NUMA page
1636  * migrations. To prevent flip-flopping and excessive page migrations, nodes
1637  * are added when they cause over 6/16 of the maximum number of faults, but
1638  * only removed when they drop below 3/16.
1639  */
1640 static void update_numa_active_node_mask(struct numa_group *numa_group)
1641 {
1642         unsigned long faults, max_faults = 0;
1643         int nid;
1644 
1645         for_each_online_node(nid) {
1646                 faults = group_faults_cpu(numa_group, nid);
1647                 if (faults > max_faults)
1648                         max_faults = faults;
1649         }
1650 
1651         for_each_online_node(nid) {
1652                 faults = group_faults_cpu(numa_group, nid);
1653                 if (!node_isset(nid, numa_group->active_nodes)) {
1654                         if (faults > max_faults * 6 / 16)
1655                                 node_set(nid, numa_group->active_nodes);
1656                 } else if (faults < max_faults * 3 / 16)
1657                         node_clear(nid, numa_group->active_nodes);
1658         }
1659 }
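/*
 * Illustrative example (numbers are assumed, not taken from any trace):
 * with max_faults = 1600, a node must exceed 1600 * 6/16 = 600 faults to
 * be added to active_nodes, but is only dropped again once it falls
 * below 1600 * 3/16 = 300. A node bouncing between 400 and 550 faults
 * therefore keeps whatever state it already has, which is the intended
 * hysteresis.
 */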
1660 
1661 /*
1662  * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1663  * increments. The more local the fault statistics are, the higher the scan
1664  * period will be for the next scan window. If local/(local+remote) ratio is
1665  * below NUMA_PERIOD_THRESHOLD (where the ratio ranges over 1..NUMA_PERIOD_SLOTS)
1666  * the scan period will decrease. Aim for 70% local accesses.
1667  */
1668 #define NUMA_PERIOD_SLOTS 10
1669 #define NUMA_PERIOD_THRESHOLD 7
1670 
1671 /*
1672  * Increase the scan period (slow down scanning) if the majority of
1673  * our memory is already on our local node, or if the majority of
1674  * the page accesses are shared with other processes.
1675  * Otherwise, decrease the scan period.
1676  */
1677 static void update_task_scan_period(struct task_struct *p,
1678                         unsigned long shared, unsigned long private)
1679 {
1680         unsigned int period_slot;
1681         int ratio;
1682         int diff;
1683 
1684         unsigned long remote = p->numa_faults_locality[0];
1685         unsigned long local = p->numa_faults_locality[1];
1686 
1687         /*
1688          * If there were no recorded hinting faults then either the task is
1689          * completely idle or all activity is in areas that are not of interest
1690          * to automatic NUMA balancing. Related to that, if there were failed
1691          * migrations then it implies we are migrating too quickly or the local
1692          * node is overloaded. In either case, scan slower.
1693          */
1694         if (local + shared == 0 || p->numa_faults_locality[2]) {
1695                 p->numa_scan_period = min(p->numa_scan_period_max,
1696                         p->numa_scan_period << 1);
1697 
1698                 p->mm->numa_next_scan = jiffies +
1699                         msecs_to_jiffies(p->numa_scan_period);
1700 
1701                 return;
1702         }
1703 
1704         /*
1705          * Prepare to scale scan period relative to the current period.
1706          *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1707          *       >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1708          *          (at exactly the threshold the period still grows by one slot)
1709          */
1710         period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1711         ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1712         if (ratio >= NUMA_PERIOD_THRESHOLD) {
1713                 int slot = ratio - NUMA_PERIOD_THRESHOLD;
1714                 if (!slot)
1715                         slot = 1;
1716                 diff = slot * period_slot;
1717         } else {
1718                 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1719 
1720                 /*
1721                  * Scale scan rate increases based on sharing. There is an
1722                  * inverse relationship between the degree of sharing and
1723                  * the adjustment made to the scanning period. Broadly
1724                  * speaking, the intent is that there is little point
1725                  * scanning faster if shared accesses dominate, as it may
1726                  * simply bounce migrations uselessly.
1727                  */
1728                 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
1729                 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1730         }
1731 
1732         p->numa_scan_period = clamp(p->numa_scan_period + diff,
1733                         task_scan_min(p), task_scan_max(p));
1734         memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1735 }
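/*
 * Worked example with assumed values: for numa_scan_period = 1000ms,
 * period_slot = DIV_ROUND_UP(1000, 10) = 100ms. With local = 900 and
 * remote = 100, ratio = 9000 / 1000 = 9 >= NUMA_PERIOD_THRESHOLD, so
 * slot = 9 - 7 = 2 and the period grows by 2 * 100ms = 200ms (scan
 * slower). With local = 300 and remote = 700, ratio = 3 and
 * diff = -(7 - 3) * 100ms = -400ms, which is then damped by the
 * private / (private + shared + 1) fraction when sharing dominates.
 */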
1736 
1737 /*
1738  * Get the fraction of time the task has been running since the last
1739  * NUMA placement cycle. The scheduler keeps similar statistics, but
1740  * decays those on a 32ms period, which is orders of magnitude off
1741  * from the dozens-of-seconds NUMA balancing period. Use the scheduler
1742  * stats only if the task is so new there are no NUMA statistics yet.
1743  */
1744 static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1745 {
1746         u64 runtime, delta, now;
1747         /* Use the start of this time slice to avoid calculations. */
1748         now = p->se.exec_start;
1749         runtime = p->se.sum_exec_runtime;
1750 
1751         if (p->last_task_numa_placement) {
1752                 delta = runtime - p->last_sum_exec_runtime;
1753                 *period = now - p->last_task_numa_placement;
1754         } else {
1755                 delta = p->se.avg.load_sum / p->se.load.weight;
1756                 *period = LOAD_AVG_MAX;
1757         }
1758 
1759         p->last_sum_exec_runtime = runtime;
1760         p->last_task_numa_placement = now;
1761 
1762         return delta;
1763 }
1764 
1765 /*
1766  * Determine the preferred nid for a task in a numa_group. This needs to
1767  * be done in a way that produces consistent results with group_weight,
1768  * otherwise workloads might not converge.
1769  */
1770 static int preferred_group_nid(struct task_struct *p, int nid)
1771 {
1772         nodemask_t nodes;
1773         int dist;
1774 
1775         /* Direct connections between all NUMA nodes. */
1776         if (sched_numa_topology_type == NUMA_DIRECT)
1777                 return nid;
1778 
1779         /*
1780          * On a system with glueless mesh NUMA topology, group_weight
1781          * scores nodes according to the number of NUMA hinting faults on
1782          * both the node itself, and on nearby nodes.
1783          */
1784         if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1785                 unsigned long score, max_score = 0;
1786                 int node, max_node = nid;
1787 
1788                 dist = sched_max_numa_distance;
1789 
1790                 for_each_online_node(node) {
1791                         score = group_weight(p, node, dist);
1792                         if (score > max_score) {
1793                                 max_score = score;
1794                                 max_node = node;
1795                         }
1796                 }
1797                 return max_node;
1798         }
1799 
1800         /*
1801          * Finding the preferred nid in a system with NUMA backplane
1802          * interconnect topology is more involved. The goal is to locate
1803          * tasks from numa_groups near each other in the system, and
1804          * untangle workloads from different sides of the system. This requires
1805          * searching down the hierarchy of node groups, recursively searching
1806          * inside the highest scoring group of nodes. The nodemask tricks
1807          * keep the complexity of the search down.
1808          */
1809         nodes = node_online_map;
1810         for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
1811                 unsigned long max_faults = 0;
1812                 nodemask_t max_group = NODE_MASK_NONE;
1813                 int a, b;
1814 
1815                 /* Are there nodes at this distance from each other? */
1816                 if (!find_numa_distance(dist))
1817                         continue;
1818 
1819                 for_each_node_mask(a, nodes) {
1820                         unsigned long faults = 0;
1821                         nodemask_t this_group;
1822                         nodes_clear(this_group);
1823 
1824                         /* Sum group's NUMA faults; includes a==b case. */
1825                         for_each_node_mask(b, nodes) {
1826                                 if (node_distance(a, b) < dist) {
1827                                         faults += group_faults(p, b);
1828                                         node_set(b, this_group);
1829                                         node_clear(b, nodes);
1830                                 }
1831                         }
1832 
1833                         /* Remember the top group. */
1834                         if (faults > max_faults) {
1835                                 max_faults = faults;
1836                                 max_group = this_group;
1837                                 /*
1838                                  * subtle: at the smallest distance there is
1839                                  * just one node left in each "group", the
1840                                  * winner is the preferred nid.
1841                                  */
1842                                 nid = a;
1843                         }
1844                 }
1845                 /* Next round, evaluate the nodes within max_group. */
1846                 if (!max_faults)
1847                         break;
1848                 nodes = max_group;
1849         }
1850         return nid;
1851 }
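/*
 * Illustration on a hypothetical 4-node backplane system: the first
 * pass, at the largest distance, might partition the online nodes into
 * {0,1} and {2,3}. If {0,1} accumulated more group faults, the next
 * pass only searches nodes 0 and 1 at a smaller distance, and once each
 * "group" has shrunk to a single node the surviving node becomes the
 * preferred nid.
 */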
1852 
1853 static void task_numa_placement(struct task_struct *p)
1854 {
1855         int seq, nid, max_nid = -1, max_group_nid = -1;
1856         unsigned long max_faults = 0, max_group_faults = 0;
1857         unsigned long fault_types[2] = { 0, 0 };
1858         unsigned long total_faults;
1859         u64 runtime, period;
1860         spinlock_t *group_lock = NULL;
1861 
1862         /*
1863          * The p->mm->numa_scan_seq field gets updated without
1864          * exclusive access. Use READ_ONCE() here to ensure
1865          * that the field is read in a single access:
1866          */
1867         seq = READ_ONCE(p->mm->numa_scan_seq);
1868         if (p->numa_scan_seq == seq)
1869                 return;
1870         p->numa_scan_seq = seq;
1871         p->numa_scan_period_max = task_scan_max(p);
1872 
1873         total_faults = p->numa_faults_locality[0] +
1874                        p->numa_faults_locality[1];
1875         runtime = numa_get_avg_runtime(p, &period);
1876 
1877         /* If the task is part of a group, prevent parallel updates to group stats */
1878         if (p->numa_group) {
1879                 group_lock = &p->numa_group->lock;
1880                 spin_lock_irq(group_lock);
1881         }
1882 
1883         /* Find the node with the highest number of faults */
1884         for_each_online_node(nid) {
1885                 /* Keep track of the offsets in numa_faults array */
1886                 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
1887                 unsigned long faults = 0, group_faults = 0;
1888                 int priv;
1889 
1890                 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
1891                         long diff, f_diff, f_weight;
1892 
1893                         mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
1894                         membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
1895                         cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
1896                         cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
1897 
1898                         /* Decay existing window, copy faults since last scan */
1899                         diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
1900                         fault_types[priv] += p->numa_faults[membuf_idx];
1901                         p->numa_faults[membuf_idx] = 0;
1902 
1903                         /*
1904                          * Normalize the faults_from, so all tasks in a group
1905                          * count according to CPU use, instead of by the raw
1906                          * number of faults. Tasks with little runtime have
1907                          * little over-all impact on throughput, and thus their
1908                          * faults are less important.
1909                          */
1910                         f_weight = div64_u64(runtime << 16, period + 1);
1911                         f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
1912                                    (total_faults + 1);
1913                         f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
1914                         p->numa_faults[cpubuf_idx] = 0;
1915 
1916                         p->numa_faults[mem_idx] += diff;
1917                         p->numa_faults[cpu_idx] += f_diff;
1918                         faults += p->numa_faults[mem_idx];
1919                         p->total_numa_faults += diff;
1920                         if (p->numa_group) {
1921                                 /*
1922                                  * safe because we can only change our own group
1923                                  *
1924                                  * mem_idx represents the offset for a given
1925                                  * nid and priv in a specific region because it
1926                                  * is at the beginning of the numa_faults array.
1927                                  */
1928                                 p->numa_group->faults[mem_idx] += diff;
1929                                 p->numa_group->faults_cpu[mem_idx] += f_diff;
1930                                 p->numa_group->total_faults += diff;
1931                                 group_faults += p->numa_group->faults[mem_idx];
1932                         }
1933                 }
1934 
1935                 if (faults > max_faults) {
1936                         max_faults = faults;
1937                         max_nid = nid;
1938                 }
1939 
1940                 if (group_faults > max_group_faults) {
1941                         max_group_faults = group_faults;
1942                         max_group_nid = nid;
1943                 }
1944         }
1945 
1946         update_task_scan_period(p, fault_types[0], fault_types[1]);
1947 
1948         if (p->numa_group) {
1949                 update_numa_active_node_mask(p->numa_group);
1950                 spin_unlock_irq(group_lock);
1951                 max_nid = preferred_group_nid(p, max_group_nid);
1952         }
1953 
1954         if (max_faults) {
1955                 /* Set the new preferred node */
1956                 if (max_nid != p->numa_preferred_nid)
1957                         sched_setnuma(p, max_nid);
1958 
1959                 if (task_node(p) != p->numa_preferred_nid)
1960                         numa_migrate_preferred(p);
1961         }
1962 }
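/*
 * Numerically (assumed counts): if numa_faults[mem_idx] was 60 and 100
 * new faults were buffered in numa_faults[membuf_idx] since the last
 * scan, then diff = 100 - 60/2 = 70 and the decayed window becomes
 * 60 + 70 = 130, i.e. half of the old window plus all of the new
 * faults.
 */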
1963 
1964 static inline int get_numa_group(struct numa_group *grp)
1965 {
1966         return atomic_inc_not_zero(&grp->refcount);
1967 }
1968 
1969 static inline void put_numa_group(struct numa_group *grp)
1970 {
1971         if (atomic_dec_and_test(&grp->refcount))
1972                 kfree_rcu(grp, rcu);
1973 }
1974 
1975 static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1976                         int *priv)
1977 {
1978         struct numa_group *grp, *my_grp;
1979         struct task_struct *tsk;
1980         bool join = false;
1981         int cpu = cpupid_to_cpu(cpupid);
1982         int i;
1983 
1984         if (unlikely(!p->numa_group)) {
1985                 unsigned int size = sizeof(struct numa_group) +
1986                                     4*nr_node_ids*sizeof(unsigned long);
1987 
1988                 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1989                 if (!grp)
1990                         return;
1991 
1992                 atomic_set(&grp->refcount, 1);
1993                 spin_lock_init(&grp->lock);
1994                 grp->gid = p->pid;
1995                 /* Second half of the array tracks nids where faults happen */
1996                 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
1997                                                 nr_node_ids;
1998 
1999                 node_set(task_node(current), grp->active_nodes);
2000 
2001                 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2002                         grp->faults[i] = p->numa_faults[i];
2003 
2004                 grp->total_faults = p->total_numa_faults;
2005 
2006                 grp->nr_tasks++;
2007                 rcu_assign_pointer(p->numa_group, grp);
2008         }
2009 
2010         rcu_read_lock();
2011         tsk = READ_ONCE(cpu_rq(cpu)->curr);
2012 
2013         if (!cpupid_match_pid(tsk, cpupid))
2014                 goto no_join;
2015 
2016         grp = rcu_dereference(tsk->numa_group);
2017         if (!grp)
2018                 goto no_join;
2019 
2020         my_grp = p->numa_group;
2021         if (grp == my_grp)
2022                 goto no_join;
2023 
2024         /*
2025          * Only join the other group if it's bigger; if we're the bigger group,
2026          * the other task will join us.
2027          */
2028         if (my_grp->nr_tasks > grp->nr_tasks)
2029                 goto no_join;
2030 
2031         /*
2032          * Tie-break on the grp address.
2033          */
2034         if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2035                 goto no_join;
2036 
2037         /* Always join threads in the same process. */
2038         if (tsk->mm == current->mm)
2039                 join = true;
2040 
2041         /* Simple filter to avoid false positives due to PID collisions */
2042         if (flags & TNF_SHARED)
2043                 join = true;
2044 
2045         /* Update priv based on whether false sharing was detected */
2046         *priv = !join;
2047 
2048         if (join && !get_numa_group(grp))
2049                 goto no_join;
2050 
2051         rcu_read_unlock();
2052 
2053         if (!join)
2054                 return;
2055 
2056         BUG_ON(irqs_disabled());
2057         double_lock_irq(&my_grp->lock, &grp->lock);
2058 
2059         for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2060                 my_grp->faults[i] -= p->numa_faults[i];
2061                 grp->faults[i] += p->numa_faults[i];
2062         }
2063         my_grp->total_faults -= p->total_numa_faults;
2064         grp->total_faults += p->total_numa_faults;
2065 
2066         my_grp->nr_tasks--;
2067         grp->nr_tasks++;
2068 
2069         spin_unlock(&my_grp->lock);
2070         spin_unlock_irq(&grp->lock);
2071 
2072         rcu_assign_pointer(p->numa_group, grp);
2073 
2074         put_numa_group(my_grp);
2075         return;
2076 
2077 no_join:
2078         rcu_read_unlock();
2079         return;
2080 }
2081 
2082 void task_numa_free(struct task_struct *p)
2083 {
2084         struct numa_group *grp = p->numa_group;
2085         void *numa_faults = p->numa_faults;
2086         unsigned long flags;
2087         int i;
2088 
2089         if (grp) {
2090                 spin_lock_irqsave(&grp->lock, flags);
2091                 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2092                         grp->faults[i] -= p->numa_faults[i];
2093                 grp->total_faults -= p->total_numa_faults;
2094 
2095                 grp->nr_tasks--;
2096                 spin_unlock_irqrestore(&grp->lock, flags);
2097                 RCU_INIT_POINTER(p->numa_group, NULL);
2098                 put_numa_group(grp);
2099         }
2100 
2101         p->numa_faults = NULL;
2102         kfree(numa_faults);
2103 }
2104 
2105 /*
2106  * Got a PROT_NONE fault for a page on @mem_node.
2107  */
2108 void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2109 {
2110         struct task_struct *p = current;
2111         bool migrated = flags & TNF_MIGRATED;
2112         int cpu_node = task_node(current);
2113         int local = !!(flags & TNF_FAULT_LOCAL);
2114         int priv;
2115 
2116         if (!static_branch_likely(&sched_numa_balancing))
2117                 return;
2118 
2119         /* for example, ksmd faulting in a user's mm */
2120         if (!p->mm)
2121                 return;
2122 
2123         /* Allocate buffer to track faults on a per-node basis */
2124         if (unlikely(!p->numa_faults)) {
2125                 int size = sizeof(*p->numa_faults) *
2126                            NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2127 
2128                 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2129                 if (!p->numa_faults)
2130                         return;
2131 
2132                 p->total_numa_faults = 0;
2133                 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2134         }
2135 
2136         /*
2137          * First accesses are treated as private; otherwise, consider accesses
2138          * to be private if the accessing pid has not changed.
2139          */
2140         if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2141                 priv = 1;
2142         } else {
2143                 priv = cpupid_match_pid(p, last_cpupid);
2144                 if (!priv && !(flags & TNF_NO_GROUP))
2145                         task_numa_group(p, last_cpupid, flags, &priv);
2146         }
2147 
2148         /*
2149          * If a workload spans multiple NUMA nodes, a shared fault that
2150          * occurs wholly within the set of nodes that the workload is
2151          * actively using should be counted as local. This allows the
2152          * scan rate to slow down when a workload has settled down.
2153          */
2154         if (!priv && !local && p->numa_group &&
2155                         node_isset(cpu_node, p->numa_group->active_nodes) &&
2156                         node_isset(mem_node, p->numa_group->active_nodes))
2157                 local = 1;
2158 
2159         task_numa_placement(p);
2160 
2161         /*
2162          * Retry task-to-preferred-node migration periodically, in case it
2163          * previously failed, or the scheduler moved us.
2164          */
2165         if (time_after(jiffies, p->numa_migrate_retry))
2166                 numa_migrate_preferred(p);
2167 
2168         if (migrated)
2169                 p->numa_pages_migrated += pages;
2170         if (flags & TNF_MIGRATE_FAIL)
2171                 p->numa_faults_locality[2] += pages;
2172 
2173         p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2174         p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2175         p->numa_faults_locality[local] += pages;
2176 }
2177 
2178 static void reset_ptenuma_scan(struct task_struct *p)
2179 {
2180         /*
2181          * We only did a read acquisition of the mmap sem, so
2182          * p->mm->numa_scan_seq is written to without exclusive access
2183          * and the update is not guaranteed to be atomic. That's not
2184          * much of an issue though, since this is just used for
2185          * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2186          * expensive, to avoid any form of compiler optimizations:
2187          */
2188         WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2189         p->mm->numa_scan_offset = 0;
2190 }
2191 
2192 /*
2193  * The expensive part of numa migration is done from task_work context.
2194  * Triggered from task_tick_numa().
2195  */
2196 void task_numa_work(struct callback_head *work)
2197 {
2198         unsigned long migrate, next_scan, now = jiffies;
2199         struct task_struct *p = current;
2200         struct mm_struct *mm = p->mm;
2201         u64 runtime = p->se.sum_exec_runtime;
2202         struct vm_area_struct *vma;
2203         unsigned long start, end;
2204         unsigned long nr_pte_updates = 0;
2205         long pages, virtpages;
2206 
2207         WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
2208 
2209         work->next = work; /* protect against double add */
2210         /*
2211          * Who cares about NUMA placement when they're dying.
2212          *
2213          * NOTE: make sure not to dereference p->mm before this check,
2214          * exit_task_work() happens _after_ exit_mm() so we could be called
2215          * without p->mm even though we still had it when we enqueued this
2216          * work.
2217          */
2218         if (p->flags & PF_EXITING)
2219                 return;
2220 
2221         if (!mm->numa_next_scan) {
2222                 mm->numa_next_scan = now +
2223                         msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2224         }
2225 
2226         /*
2227          * Enforce maximal scan/migration frequency.
2228          */
2229         migrate = mm->numa_next_scan;
2230         if (time_before(now, migrate))
2231                 return;
2232 
2233         if (p->numa_scan_period == 0) {
2234                 p->numa_scan_period_max = task_scan_max(p);
2235                 p->numa_scan_period = task_scan_min(p);
2236         }
2237 
2238         next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2239         if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2240                 return;
2241 
2242         /*
2243          * Delay this task enough that another task of this mm will likely win
2244          * the next time around.
2245          */
2246         p->node_stamp += 2 * TICK_NSEC;
2247 
2248         start = mm->numa_scan_offset;
2249         pages = sysctl_numa_balancing_scan_size;
2250         pages <<= 20 - PAGE_SHIFT; /* MB in pages */
2251         virtpages = pages * 8;     /* Scan up to this much virtual space */
2252         if (!pages)
2253                 return;
2254 
2255 
2256         down_read(&mm->mmap_sem);
2257         vma = find_vma(mm, start);
2258         if (!vma) {
2259                 reset_ptenuma_scan(p);
2260                 start = 0;
2261                 vma = mm->mmap;
2262         }
2263         for (; vma; vma = vma->vm_next) {
2264                 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2265                         is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2266                         continue;
2267                 }
2268 
2269                 /*
2270                  * Shared library pages mapped by multiple processes are not
2271                  * migrated as it is expected they are cache replicated. Avoid
2272                  * hinting faults in read-only file-backed mappings or the vdso
2273                  * as migrating the pages will be of marginal benefit.
2274                  */
2275                 if (!vma->vm_mm ||
2276                     (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2277                         continue;
2278 
2279                 /*
2280                  * Skip inaccessible VMAs to avoid any confusion between
2281                  * PROT_NONE and NUMA hinting ptes
2282                  */
2283                 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2284                         continue;
2285 
2286                 do {
2287                         start = max(start, vma->vm_start);
2288                         end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2289                         end = min(end, vma->vm_end);
2290                         nr_pte_updates = change_prot_numa(vma, start, end);
2291 
2292                         /*
2293                          * Try to scan sysctl_numa_balancing_scan_size worth of
2294                          * hpages that have at least one present PTE that
2295                          * is not already pte-numa. If the VMA contains
2296                          * areas that are unused or already full of prot_numa
2297                          * PTEs, scan up to virtpages, to skip through those
2298                          * areas faster.
2299                          */
2300                         if (nr_pte_updates)
2301                                 pages -= (end - start) >> PAGE_SHIFT;
2302                         virtpages -= (end - start) >> PAGE_SHIFT;
2303 
2304                         start = end;
2305                         if (pages <= 0 || virtpages <= 0)
2306                                 goto out;
2307 
2308                         cond_resched();
2309                 } while (end != vma->vm_end);
2310         }
2311 
2312 out:
2313         /*
2314          * It is possible to reach the end of the VMA list but the last few
2315          * VMAs are not guaranteed to be vma_migratable. If they are not, we
2316          * would find the !migratable VMA on the next scan but not reset the
2317          * scanner to the start, so check it now.
2318          */
2319         if (vma)
2320                 mm->numa_scan_offset = start;
2321         else
2322                 reset_ptenuma_scan(p);
2323         up_read(&mm->mmap_sem);
2324 
2325         /*
2326          * Make sure tasks use at least 32x as much time to run other code
2327          * than they used here, to limit NUMA PTE scanning overhead to 3% max.
2328          * Usually update_task_scan_period slows down scanning enough; on an
2329          * overloaded system we need to limit overhead on a per task basis.
2330          */
2331         if (unlikely(p->se.sum_exec_runtime != runtime)) {
2332                 u64 diff = p->se.sum_exec_runtime - runtime;
2333                 p->node_stamp += 32 * diff;
2334         }
2335 }
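/*
 * Example of the 3% cap with an assumed cost: if this scan pass
 * consumed 1ms of runtime (diff = 1ms), node_stamp is pushed forward by
 * 32ms, so the next pass cannot fire until the task has run roughly 32
 * times as long as the scan itself took, i.e. about 1/33 ~= 3% scanning
 * overhead.
 */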
2336 
2337 /*
2338  * Drive the periodic memory faults.
2339  */
2340 void task_tick_numa(struct rq *rq, struct task_struct *curr)
2341 {
2342         struct callback_head *work = &curr->numa_work;
2343         u64 period, now;
2344 
2345         /*
2346          * We don't care about NUMA placement if we don't have memory.
2347          */
2348         if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
2349                 return;
2350 
2351         /*
2352          * Using runtime rather than walltime has the dual advantage that
2353          * we (mostly) drive the selection from busy threads and that the
2354          * task needs to have done some actual work before we bother with
2355          * NUMA placement.
2356          */
2357         now = curr->se.sum_exec_runtime;
2358         period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2359 
2360         if (now > curr->node_stamp + period) {
2361                 if (!curr->node_stamp)
2362                         curr->numa_scan_period = task_scan_min(curr);
2363                 curr->node_stamp += period;
2364 
2365                 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2366                         init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2367                         task_work_add(curr, work, true);
2368                 }
2369         }
2370 }
2371 #else
2372 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2373 {
2374 }
2375 
2376 static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2377 {
2378 }
2379 
2380 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2381 {
2382 }
2383 #endif /* CONFIG_NUMA_BALANCING */
2384 
2385 static void
2386 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2387 {
2388         update_load_add(&cfs_rq->load, se->load.weight);
2389         if (!parent_entity(se))
2390                 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
2391 #ifdef CONFIG_SMP
2392         if (entity_is_task(se)) {
2393                 struct rq *rq = rq_of(cfs_rq);
2394 
2395                 account_numa_enqueue(rq, task_of(se));
2396                 list_add(&se->group_node, &rq->cfs_tasks);
2397         }
2398 #endif
2399         cfs_rq->nr_running++;
2400 }
2401 
2402 static void
2403 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2404 {
2405         update_load_sub(&cfs_rq->load, se->load.weight);
2406         if (!parent_entity(se))
2407                 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
2408         if (entity_is_task(se)) {
2409                 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2410                 list_del_init(&se->group_node);
2411         }
2412         cfs_rq->nr_running--;
2413 }
2414 
2415 #ifdef CONFIG_FAIR_GROUP_SCHED
2416 # ifdef CONFIG_SMP
2417 static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
2418 {
2419         long tg_weight;
2420 
2421         /*
2422          * Use this CPU's real-time load instead of the last load contribution
2423          * as the updating of the contribution is delayed, and we will use
2424          * the real-time load to calculate the share. See update_tg_load_avg().
2425          */
2426         tg_weight = atomic_long_read(&tg->load_avg);
2427         tg_weight -= cfs_rq->tg_load_avg_contrib;
2428         tg_weight += cfs_rq->load.weight;
2429 
2430         return tg_weight;
2431 }
2432 
2433 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2434 {
2435         long tg_weight, load, shares;
2436 
2437         tg_weight = calc_tg_weight(tg, cfs_rq);
2438         load = cfs_rq->load.weight;
2439 
2440         shares = (tg->shares * load);
2441         if (tg_weight)
2442                 shares /= tg_weight;
2443 
2444         if (shares < MIN_SHARES)
2445                 shares = MIN_SHARES;
2446         if (shares > tg->shares)
2447                 shares = tg->shares;
2448 
2449         return shares;
2450 }
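/*
 * Rough example with made-up numbers: for tg->shares = 1024, a cfs_rq
 * carrying load.weight = 512 out of a total tg_weight = 2048 gets
 * shares = 1024 * 512 / 2048 = 256, clamped to [MIN_SHARES, tg->shares].
 * A group whose load sits entirely on one CPU converges towards the
 * full tg->shares on that CPU.
 */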
2451 # else /* CONFIG_SMP */
2452 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2453 {
2454         return tg->shares;
2455 }
2456 # endif /* CONFIG_SMP */
2457 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2458                             unsigned long weight)
2459 {
2460         if (se->on_rq) {
2461                 /* commit outstanding execution time */
2462                 if (cfs_rq->curr == se)
2463                         update_curr(cfs_rq);
2464                 account_entity_dequeue(cfs_rq, se);
2465         }
2466 
2467         update_load_set(&se->load, weight);
2468 
2469         if (se->on_rq)
2470                 account_entity_enqueue(cfs_rq, se);
2471 }
2472 
2473 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2474 
2475 static void update_cfs_shares(struct cfs_rq *cfs_rq)
2476 {
2477         struct task_group *tg;
2478         struct sched_entity *se;
2479         long shares;
2480 
2481         tg = cfs_rq->tg;
2482         se = tg->se[cpu_of(rq_of(cfs_rq))];
2483         if (!se || throttled_hierarchy(cfs_rq))
2484                 return;
2485 #ifndef CONFIG_SMP
2486         if (likely(se->load.weight == tg->shares))
2487                 return;
2488 #endif
2489         shares = calc_cfs_shares(cfs_rq, tg);
2490 
2491         reweight_entity(cfs_rq_of(se), se, shares);
2492 }
2493 #else /* CONFIG_FAIR_GROUP_SCHED */
2494 static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
2495 {
2496 }
2497 #endif /* CONFIG_FAIR_GROUP_SCHED */
2498 
2499 #ifdef CONFIG_SMP
2500 /* Precomputed fixed inverse multiplies for multiplication by y^n */
2501 static const u32 runnable_avg_yN_inv[] = {
2502         0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
2503         0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
2504         0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
2505         0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
2506         0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
2507         0x85aac367, 0x82cd8698,
2508 };
2509 
2510 /*
2511  * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
2512  * over-estimates when re-combining.
2513  */
2514 static const u32 runnable_avg_yN_sum[] = {
2515             0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
2516          9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
2517         17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
2518 };
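/*
 * Sanity check on the table (derived, not measured): y = 0.5^(1/32)
 * ~= 0.97857, so the first entry is floor(1024 * y) = 1002, the second
 * is floor(1024 * (y + y^2)) = 1982, and by n = 32 the oldest slice in
 * the sum is weighted at only half of the newest one.
 */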
2519 
2520 /*
2521  * Approximate:
2522  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
2523  */
2524 static __always_inline u64 decay_load(u64 val, u64 n)
2525 {
2526         unsigned int local_n;
2527 
2528         if (!n)
2529                 return val;
2530         else if (unlikely(n > LOAD_AVG_PERIOD * 63))
2531                 return 0;
2532 
2533         /* after bounds checking we can collapse to 32-bit */
2534         local_n = n;
2535 
2536         /*
2537          * As y^PERIOD = 1/2, we can combine
2538          *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
2539          * With a look-up table which covers y^n (n<PERIOD)
2540          *
2541          * To achieve constant time decay_load.
2542          */
2543         if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
2544                 val >>= local_n / LOAD_AVG_PERIOD;
2545                 local_n %= LOAD_AVG_PERIOD;
2546         }
2547 
2548         val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
2549         return val;
2550 }
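/*
 * For illustration (assumed inputs): decay_load(1024, 32) shifts val
 * right by 32 / LOAD_AVG_PERIOD = 1 (halving it, since y^32 = 1/2),
 * leaving local_n = 0 and a multiply by runnable_avg_yN_inv[0] (~1.0),
 * so the result is about 511. decay_load(1024, 16) multiplies by
 * runnable_avg_yN_inv[16] = 0xb504f333 (~0.7071), giving 724, i.e.
 * y^16 = sqrt(1/2).
 */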
2551 
2552 /*
2553  * For updates fully spanning n periods, the contribution to runnable
2554  * average will be: \Sum 1024*y^k { 1<=k<=n }
2555  *
2556  * We can compute this reasonably efficiently by combining:
2557  *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^k { for k < PERIOD }
2558  */
2559 static u32 __compute_runnable_contrib(u64 n)
2560 {
2561         u32 contrib = 0;
2562 
2563         if (likely(n <= LOAD_AVG_PERIOD))
2564                 return runnable_avg_yN_sum[n];
2565         else if (unlikely(n >= LOAD_AVG_MAX_N))
2566                 return LOAD_AVG_MAX;
2567 
2568         /* Compute \Sum y^n by combining precomputed values for y^i and \Sum y^j */
2569         do {
2570                 contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
2571                 contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
2572 
2573                 n -= LOAD_AVG_PERIOD;
2574         } while (n > LOAD_AVG_PERIOD);
2575 
2576         contrib = decay_load(contrib, n);
2577         return contrib + runnable_avg_yN_sum[n];
2578 }
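/*
 * Sketch of the recurrence for an assumed n = 48: the loop runs once,
 * contrib = 0/2 + runnable_avg_yN_sum[32] = 23371 and n drops to 16;
 * that partial sum is then decayed by y^16 (~0.7071) to about 16525,
 * and runnable_avg_yN_sum[16] = 13690 is added, giving roughly 30215.
 */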
2579 
2580 #if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
2581 #error "load tracking assumes 2^10 as unit"
2582 #endif
2583 
2584 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
2585 
2586 /*
2587  * We can represent the historical contribution to runnable average as the
2588  * coefficients of a geometric series.  To do this we sub-divide our runnable
2589  * history into segments of approximately 1ms (1024us); label the segment that
2590  * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
2591  *
2592  * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
2593  *      p0            p1           p2
2594  *     (now)       (~1ms ago)  (~2ms ago)
2595  *
2596  * Let u_i denote the fraction of p_i that the entity was runnable.
2597  *
2598  * We then designate the fractions u_i as our coefficients, yielding the
2599  * following representation of historical load:
2600  *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
2601  *
2602  * We choose y based on the width of a reasonable scheduling period, fixing:
2603  *   y^32 = 0.5
2604  *
2605  * This means that the contribution to load ~32ms ago (u_32) will be weighted
2606  * approximately half as much as the contribution to load within the last ms
2607  * (u_0).
2608  *
2609  * When a period "rolls over" and we have new u_0`, multiplying the previous
2610  * sum again by y is sufficient to update:
2611  *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
2612  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
2613  */
2614 static __always_inline int
2615 __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2616                   unsigned long weight, int running, struct cfs_rq *cfs_rq)
2617 {
2618         u64 delta, scaled_delta, periods;
2619         u32 contrib;
2620         unsigned int delta_w, scaled_delta_w, decayed = 0;
2621         unsigned long scale_freq, scale_cpu;
2622 
2623         delta = now - sa->last_update_time;
2624         /*
2625          * This should only happen when time goes backwards, which it
2626          * unfortunately does during sched clock init when we swap over to TSC.
2627          */
2628         if ((s64)delta < 0) {
2629                 sa->last_update_time = now;
2630                 return 0;
2631         }
2632 
2633         /*
2634          * Use 1024ns as the unit of measurement since it's a reasonable
2635          * approximation of 1us and fast to compute.
2636          */
2637         delta >>= 10;
2638         if (!delta)
2639                 return 0;
2640         sa->last_update_time = now;
2641 
2642         scale_freq = arch_scale_freq_capacity(NULL, cpu);
2643         scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
2644 
2645         /* delta_w is the amount already accumulated against our next period */
2646         delta_w = sa->period_contrib;
2647         if (delta + delta_w >= 1024) {
2648                 decayed = 1;
2649 
2650                 /* whatever is left for the next period will start over; we don't know how much yet */
2651                 sa->period_contrib = 0;
2652 
2653                 /*
2654                  * Now that we know we're crossing a period boundary, figure
2655                  * out how much from delta we need to complete the current
2656                  * period and accrue it.
2657                  */
2658                 delta_w = 1024 - delta_w;
2659                 scaled_delta_w = cap_scale(delta_w, scale_freq);
2660                 if (weight) {
2661                         sa->load_sum += weight * scaled_delta_w;
2662                         if (cfs_rq) {
2663                                 cfs_rq->runnable_load_sum +=
2664                                                 weight * scaled_delta_w;
2665                         }
2666                 }
2667                 if (running)
2668                         sa->util_sum += scaled_delta_w * scale_cpu;
2669 
2670                 delta -= delta_w;
2671 
2672                 /* Figure out how many additional periods this update spans */
2673                 periods = delta / 1024;
2674                 delta %= 1024;
2675 
2676                 sa->load_sum = decay_load(sa->load_sum, periods + 1);
2677                 if (cfs_rq) {
2678                         cfs_rq->runnable_load_sum =
2679                                 decay_load(cfs_rq->runnable_load_sum, periods + 1);
2680                 }
2681                 sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
2682 
2683                 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
2684                 contrib = __compute_runnable_contrib(periods);
2685                 contrib = cap_scale(contrib, scale_freq);
2686                 if (weight) {
2687                         sa->load_sum += weight * contrib;
2688                         if (cfs_rq)
2689                                 cfs_rq->runnable_load_sum += weight * contrib;
2690                 }
2691                 if (running)
2692                         sa->util_sum += contrib * scale_cpu;
2693         }
2694 
2695         /* Remainder of delta accrued against u_0` */
2696         scaled_delta = cap_scale(delta, scale_freq);
2697         if (weight) {
2698                 sa->load_sum += weight * scaled_delta;
2699                 if (cfs_rq)
2700                         cfs_rq->runnable_load_sum += weight * scaled_delta;
2701         }
2702         if (running)
2703                 sa->util_sum += scaled_delta * scale_cpu;
2704 
2705         sa->period_contrib += delta;
2706 
2707         if (decayed) {
2708                 sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
2709                 if (cfs_rq) {
2710                         cfs_rq->runnable_load_avg =
2711                                 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
2712                 }
2713                 sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
2714         }
2715 
2716         return decayed;
2717 }
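/*
 * Illustrative walk-through with assumed numbers: for period_contrib =
 * 300 and delta = 2900 (both in ~1us units), the first 724us complete
 * the current period, periods = 2176 / 1024 = 2 whole periods follow,
 * and the remaining 128us accrue against the new u_0. The old sums are
 * decayed by y^3 before the __compute_runnable_contrib(2) slice and the
 * 128us remainder are added, and finally the *_avg values are refreshed
 * by dividing the sums by LOAD_AVG_MAX.
 */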
2718 
2719 #ifdef CONFIG_FAIR_GROUP_SCHED
2720 /*
2721  * Updating tg's load_avg is necessary before update_cfs_shares() (which is done)
2722  * and before effective_load() (which is not done because it is too costly).
2723  */
2724 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
2725 {
2726         long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
2727 
2728         /*
2729          * No need to update load_avg for root_task_group as it is not used.
2730          */
2731         if (cfs_rq->tg == &root_task_group)
2732                 return;
2733 
2734         if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
2735                 atomic_long_add(delta, &cfs_rq->tg->load_avg);
2736                 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
2737         }
2738 }
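/*
 * For example (assumed values): with tg_load_avg_contrib = 6400, the
 * shared tg->load_avg is only touched again once this cfs_rq's load_avg
 * has drifted by more than 6400 / 64 = 100, which keeps cross-CPU
 * atomic traffic on the task_group low.
 */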
2739 
2740 /*
2741  * Called within set_task_rq() right before setting a task's cpu. The
2742  * caller only guarantees p->pi_lock is held; no other assumptions,
2743  * including the state of rq->lock, should be made.
2744  */
2745 void set_task_rq_fair(struct sched_entity *se,
2746                       struct cfs_rq *prev, struct cfs_rq *next)
2747 {
2748         if (!sched_feat(ATTACH_AGE_LOAD))
2749                 return;
2750 
2751         /*
2752          * We are supposed to update the task to "current" time, so that it is
2753          * up to date and ready to go to the new CPU/cfs_rq. But we have difficulty
2754          * getting what the current time is, so simply throw away the out-of-date
2755          * time. This results in the wakee task being less decayed, but giving
2756          * the wakee more load does not sound bad.
2757          */
2758         if (se->avg.last_update_time && prev) {
2759                 u64 p_last_update_time;
2760                 u64 n_last_update_time;
2761 
2762 #ifndef CONFIG_64BIT
2763                 u64 p_last_update_time_copy;
2764                 u64 n_last_update_time_copy;
2765 
2766                 do {
2767                         p_last_update_time_copy = prev->load_last_update_time_copy;
2768                         n_last_update_time_copy = next->load_last_update_time_copy;
2769 
2770                         smp_rmb();
2771 
2772                         p_last_update_time = prev->avg.last_update_time;
2773                         n_last_update_time = next->avg.last_update_time;
2774 
2775                 } while (p_last_update_time != p_last_update_time_copy ||
2776                          n_last_update_time != n_last_update_time_copy);
2777 #else
2778                 p_last_update_time = prev->avg.last_update_time;
2779                 n_last_update_time = next->avg.last_update_time;
2780 #endif
2781                 __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
2782                                   &se->avg, 0, 0, NULL);
2783                 se->avg.last_update_time = n_last_update_time;
2784         }
2785 }
2786 #else /* CONFIG_FAIR_GROUP_SCHED */
2787 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
2788 #endif /* CONFIG_FAIR_GROUP_SCHED */
2789 
2790 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
2791 
2792 /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_shares() */
2793 static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
2794 {
2795         struct sched_avg *sa = &cfs_rq->avg;
2796         int decayed, removed = 0;
2797 
2798         if (atomic_long_read(&cfs_rq->removed_load_avg)) {
2799                 s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
2800                 sa->load_avg = max_t(long, sa->load_avg - r, 0);
2801                 sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
2802                 removed = 1;
2803         }
2804 
2805         if (atomic_long_read(&cfs_rq->removed_util_avg)) {
2806                 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
2807                 sa->util_avg = max_t(long, sa->util_avg - r, 0);
2808                 sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
2809         }
2810 
2811         decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
2812                 scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
2813 
2814 #ifndef CONFIG_64BIT
2815         smp_wmb();
2816         cfs_rq->load_last_update_time_copy = sa->last_update_time;
2817 #endif
2818 
2819         return decayed || removed;
2820 }
2821 
2822 /* Update task and its cfs_rq load average */
2823 static inline void update_load_avg(struct sched_entity *se, int update_tg)
2824 {
2825         struct cfs_rq *cfs_rq = cfs_rq_of(se);
2826         u64 now = cfs_rq_clock_task(cfs_rq);
2827         int cpu = cpu_of(rq_of(cfs_rq));
2828 
2829         /*
2830          * Track task load average for carrying it to the new CPU after it is
2831          * migrated, and group sched_entity load average for the task_h_load calculation in migration
2832          */
2833         __update_load_avg(now, cpu, &se->avg,
2834                           se->on_rq * scale_load_down(se->load.weight),
2835                           cfs_rq->curr == se, NULL);
2836 
2837         if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
2838                 update_tg_load_avg(cfs_rq, 0);
2839 }
2840 
2841 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2842 {
2843         if (!sched_feat(ATTACH_AGE_LOAD))
2844                 goto skip_aging;
2845 
2846         /*
2847          * If we got migrated (either between CPUs or between cgroups) we'll
2848          * have aged the average right before clearing @last_update_time.
2849          */
2850         if (se->avg.last_update_time) {
2851                 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
2852                                   &se->avg, 0, 0, NULL);
2853 
2854                 /*
2855                  * XXX: we could have just aged the entire load away if we've been
2856                  * absent from the fair class for too long.
2857                  */
2858         }
2859 
2860 skip_aging:
2861         se->avg.last_update_time = cfs_rq->avg.last_update_time;
2862         cfs_rq->avg.load_avg += se->avg.load_avg;
2863         cfs_rq->avg.load_sum += se->avg.load_sum;
2864         cfs_rq->avg.util_avg += se->avg.util_avg;
2865         cfs_rq->avg.util_sum += se->avg.util_sum;
2866 }
2867 
2868 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2869 {
2870         __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
2871                           &se->avg, se->on_rq * scale_load_down(se->load.weight),
2872                           cfs_rq->curr == se, NULL);
2873 
2874         cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
2875         cfs_rq->avg.load_sum = max_t(s64,  cfs_rq->avg.load_sum - se->avg.load_sum, 0);
2876         cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
2877         cfs_rq->avg.util_sum = max_t(s32,  cfs_rq->avg.util_sum - se->avg.util_sum, 0);
2878 }
2879 
2880 /* Add the load generated by se into cfs_rq's load average */
2881 static inline void
2882 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2883 {
2884         struct sched_avg *sa = &se->avg;
2885         u64 now = cfs_rq_clock_task(cfs_rq);
2886         int migrated, decayed;
2887 
2888         migrated = !sa->last_update_time;
2889         if (!migrated) {
2890                 __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
2891                         se->on_rq * scale_load_down(se->load.weight),
2892                         cfs_rq->curr == se, NULL);
2893         }
2894 
2895         decayed = update_cfs_rq_load_avg(now, cfs_rq);
2896 
2897         cfs_rq->runnable_load_avg += sa->load_avg;
2898         cfs_rq->runnable_load_sum += sa->load_sum;
2899 
2900         if (migrated)
2901                 attach_entity_load_avg(cfs_rq, se);
2902 
2903         if (decayed || migrated)
2904                 update_tg_load_avg(cfs_rq, 0);
2905 }
2906 
2907 /* Remove the runnable load generated by se from cfs_rq's runnable load average */
2908 static inline void
2909 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2910 {
2911         update_load_avg(se, 1);
2912 
2913         cfs_rq->runnable_load_avg =
2914                 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
2915         cfs_rq->runnable_load_sum =
2916                 max_t(s64,  cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
2917 }
2918 
2919 #ifndef CONFIG_64BIT
2920 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
2921 {
2922         u64 last_update_time_copy;
2923         u64 last_update_time;
2924 
2925         do {
2926                 last_update_time_copy = cfs_rq->load_last_update_time_copy;
2927                 smp_rmb();
2928                 last_update_time = cfs_rq->avg.last_update_time;
2929         } while (last_update_time != last_update_time_copy);
2930 
2931         return last_update_time;
2932 }
2933 #else
2934 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
2935 {
2936         return cfs_rq->avg.last_update_time;
2937 }
2938 #endif
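     /*
      * For the 32-bit path above: a 64-bit load can tear, so the reader
      * re-reads until the copy matches the value. The update side writes the
      * copy last, behind a write barrier that pairs with the smp_rmb() here,
      * giving a lock-free, seqcount-like retry loop.
      */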
2939 
2940 /*
2941  * The task first catches up with the cfs_rq, and then subtracts
2942  * itself from the cfs_rq (the task must already be off the queue).
2943  */
2944 void remove_entity_load_avg(struct sched_entity *se)
2945 {
2946         struct cfs_rq *cfs_rq = cfs_rq_of(se);
2947         u64 last_update_time;
2948 
2949         /*
2950          * A newly created task or a never-used group entity should not be removed
2951          * from its (source) cfs_rq
2952          */
2953         if (se->avg.last_update_time == 0)
2954                 return;
2955 
2956         last_update_time = cfs_rq_last_update_time(cfs_rq);
2957 
2958         __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
2959         atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
2960         atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
2961 }
2962 
2963 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
2964 {
2965         return cfs_rq->runnable_load_avg;
2966 }
2967 
2968 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
2969 {
2970         return cfs_rq->avg.load_avg;
2971 }
2972 
2973 static int idle_balance(struct rq *this_rq);
2974 
2975 #else /* CONFIG_SMP */
2976 
2977 static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
2978 static inline void
2979 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2980 static inline void
2981 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2982 static inline void remove_entity_load_avg(struct sched_entity *se) {}
2983 
2984 static inline void
2985 attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2986 static inline void
2987 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2988 
2989 static inline int idle_balance(struct rq *rq)
2990 {
2991         return 0;
2992 }
2993 
2994 #endif /* CONFIG_SMP */
2995 
2996 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
2997 {
2998 #ifdef CONFIG_SCHEDSTATS
2999         struct task_struct *tsk = NULL;
3000 
3001         if (entity_is_task(se))
3002                 tsk = task_of(se);
3003 
3004         if (se->statistics.sleep_start) {
3005                 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
3006 
3007                 if ((s64)delta < 0)
3008                         delta = 0;
3009 
3010                 if (unlikely(delta > se->statistics.sleep_max))
3011                         se->statistics.sleep_max = delta;
3012 
3013                 se->statistics.sleep_start = 0;
3014                 se->statistics.sum_sleep_runtime += delta;
3015 
3016                 if (tsk) {
3017                         account_scheduler_latency(tsk, delta >> 10, 1);
3018                         trace_sched_stat_sleep(tsk, delta);
3019                 }
3020         }
3021         if (se->statistics.block_start) {
3022                 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
3023 
3024                 if ((s64)delta < 0)
3025                         delta = 0;
3026 
3027                 if (unlikely(delta > se->statistics.block_max))
3028                         se->statistics.block_max = delta;
3029 
3030                 se->statistics.block_start = 0;
3031                 se->statistics.sum_sleep_runtime += delta;
3032 
3033                 if (tsk) {
3034                         if (tsk->in_iowait) {
3035                                 se->statistics.iowait_sum += delta;
3036                                 se->statistics.iowait_count++;
3037                                 trace_sched_stat_iowait(tsk, delta);
3038                         }
3039 
3040                         trace_sched_stat_blocked(tsk, delta);
3041 
3042                         /*
3043                          * Blocking time is in units of nanosecs, so shift by
3044                          * 20 to get a milliseconds-range estimation of the
3045                          * amount of time that the task spent sleeping:
3046                          */
3047                         if (unlikely(prof_on == SLEEP_PROFILING)) {
3048                                 profile_hits(SLEEP_PROFILING,
3049                                                 (void *)get_wchan(tsk),
3050                                                 delta >> 20);
3051                         }
3052                         account_scheduler_latency(tsk, delta >> 10, 0);
3053                 }
3054         }
3055 #endif
3056 }
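     /*
      * Shift arithmetic used above, for reference: delta is in nanoseconds,
      * so delta >> 20 divides by 2^20 = 1048576 (~1e6) and yields a
      * milliseconds-range value for the sleep profiler, while delta >> 10
      * divides by 1024 (~1e3) and hands a microseconds-range value to
      * account_scheduler_latency().
      */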
3057 
3058 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3059 {
3060 #ifdef CONFIG_SCHED_DEBUG
3061         s64 d = se->vruntime - cfs_rq->min_vruntime;
3062 
3063         if (d < 0)
3064                 d = -d;
3065 
3066         if (d > 3*sysctl_sched_latency)
3067                 schedstat_inc(cfs_rq, nr_spread_over);
3068 #endif
3069 }
3070 
3071 static void
3072 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
3073 {
3074         u64 vruntime = cfs_rq->min_vruntime;
3075 
3076         /*
3077          * The 'current' period is already promised to the current tasks;
3078          * however, the extra weight of the new task will slow them down a
3079          * little, so place the new task such that it fits in the slot that
3080          * stays open at the end.
3081          */
3082         if (initial && sched_feat(START_DEBIT))
3083                 vruntime += sched_vslice(cfs_rq, se);
3084 
3085         /* sleeps up to a single latency don't count. */
3086         if (!initial) {
3087                 unsigned long thresh = sysctl_sched_latency;
3088 
3089                 /*
3090                  * Halve their sleep time's credit, to allow
3091                  * for a gentler treatment of sleepers:
3092                  */
3093                 if (sched_feat(GENTLE_FAIR_SLEEPERS))
3094                         thresh >>= 1;
3095 
3096                 vruntime -= thresh;
3097         }
3098 
3099         /* ensure we never gain time by being placed backwards. */
3100         se->vruntime = max_vruntime(se->vruntime, vruntime);
3101 }
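     /*
      * Worked example with the default 6ms sysctl_sched_latency: an entity
      * waking from a long sleep is placed roughly 3ms of vruntime behind
      * min_vruntime (the 6ms threshold halved by GENTLE_FAIR_SLEEPERS),
      * while a forked entity under START_DEBIT starts one vslice ahead of
      * min_vruntime; max_vruntime() then keeps entities that slept only
      * briefly at their old, larger vruntime.
      */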
3102 
3103 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
3104 
3105 static void
3106 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3107 {
3108         /*
3109          * Update the normalized vruntime before updating min_vruntime
3110          * by calling update_curr().
3111          */
3112         if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
3113                 se->vruntime += cfs_rq->min_vruntime;
3114 
3115         /*
3116          * Update run-time statistics of the 'current'.
3117          */
3118         update_curr(cfs_rq);
3119         enqueue_entity_load_avg(cfs_rq, se);
3120         account_entity_enqueue(cfs_rq, se);
3121         update_cfs_shares(cfs_rq);
3122 
3123         if (flags & ENQUEUE_WAKEUP) {
3124                 place_entity(cfs_rq, se, 0);
3125                 enqueue_sleeper(cfs_rq, se);
3126         }
3127 
3128         update_stats_enqueue(cfs_rq, se);
3129         check_spread(cfs_rq, se);
3130         if (se != cfs_rq->curr)
3131                 __enqueue_entity(cfs_rq, se);
3132         se->on_rq = 1;
3133 
3134         if (cfs_rq->nr_running == 1) {
3135                 list_add_leaf_cfs_rq(cfs_rq);
3136                 check_enqueue_throttle(cfs_rq);
3137         }
3138 }
3139 
3140 static void __clear_buddies_last(struct sched_entity *se)
3141 {
3142         for_each_sched_entity(se) {
3143                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3144                 if (cfs_rq->last != se)
3145                         break;
3146 
3147                 cfs_rq->last = NULL;
3148         }
3149 }
3150 
3151 static void __clear_buddies_next(struct sched_entity *se)
3152 {
3153         for_each_sched_entity(se) {
3154                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3155                 if (cfs_rq->next != se)
3156                         break;
3157 
3158                 cfs_rq->next = NULL;
3159         }
3160 }
3161 
3162 static void __clear_buddies_skip(struct sched_entity *se)
3163 {
3164         for_each_sched_entity(se) {
3165                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3166                 if (cfs_rq->skip != se)
3167                         break;
3168 
3169                 cfs_rq->skip = NULL;
3170         }
3171 }
3172 
3173 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
3174 {
3175         if (cfs_rq->last == se)
3176                 __clear_buddies_last(se);
3177 
3178         if (cfs_rq->next == se)
3179                 __clear_buddies_next(se);
3180 
3181         if (cfs_rq->skip == se)
3182                 __clear_buddies_skip(se);
3183 }
3184 
3185 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3186 
3187 static void
3188 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3189 {
3190         /*
3191          * Update run-time statistics of the 'current'.
3192          */
3193         update_curr(cfs_rq);
3194         dequeue_entity_load_avg(cfs_rq, se);
3195 
3196         update_stats_dequeue(cfs_rq, se);
3197         if (flags & DEQUEUE_SLEEP) {
3198 #ifdef CONFIG_SCHEDSTATS
3199                 if (entity_is_task(se)) {
3200                         struct task_struct *tsk = task_of(se);
3201 
3202                         if (tsk->state & TASK_INTERRUPTIBLE)
3203                                 se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
3204                         if (tsk->state & TASK_UNINTERRUPTIBLE)
3205                                 se->statistics.block_start = rq_clock(rq_of(cfs_rq));
3206                 }
3207 #endif
3208         }
3209 
3210         clear_buddies(cfs_rq, se);
3211 
3212         if (se != cfs_rq->curr)
3213                 __dequeue_entity(cfs_rq, se);
3214         se->on_rq = 0;
3215         account_entity_dequeue(cfs_rq, se);
3216 
3217         /*
3218          * Normalize the entity after updating the min_vruntime because the
3219          * update can refer to the ->curr item and we need to reflect this
3220          * movement in our normalized position.
3221          */
3222         if (!(flags & DEQUEUE_SLEEP))
3223                 se->vruntime -= cfs_rq->min_vruntime;
3224 
3225         /* return excess runtime on last dequeue */
3226         return_cfs_rq_runtime(cfs_rq);
3227 
3228         update_min_vruntime(cfs_rq);
3229         update_cfs_shares(cfs_rq);
3230 }
3231 
3232 /*
3233  * Preempt the current task with a newly woken task if needed:
3234  */
3235 static void
3236 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3237 {
3238         unsigned long ideal_runtime, delta_exec;
3239         struct sched_entity *se;
3240         s64 delta;
3241 
3242         ideal_runtime = sched_slice(cfs_rq, curr);
3243         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
3244         if (delta_exec > ideal_runtime) {
3245                 resched_curr(rq_of(cfs_rq));
3246                 /*
3247                  * The current task ran long enough, ensure it doesn't get
3248                  * re-elected due to buddy favours.
3249                  */
3250                 clear_buddies(cfs_rq, curr);
3251                 return;
3252         }
3253 
3254         /*
3255          * Ensure that a task that missed wakeup preemption by a
3256          * narrow margin doesn't have to wait for a full slice.
3257          * This also mitigates buddy-induced latencies under load.
3258          */
3259         if (delta_exec < sysctl_sched_min_granularity)
3260                 return;
3261 
3262         se = __pick_first_entity(cfs_rq);
3263         delta = curr->vruntime - se->vruntime;
3264 
3265         if (delta < 0)
3266                 return;
3267 
3268         if (delta > ideal_runtime)
3269                 resched_curr(rq_of(cfs_rq));
3270 }
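     /*
      * Two resched triggers above: the task has run for its full
      * sched_slice() of wall-clock time, or (once at least
      * sysctl_sched_min_granularity has elapsed) its vruntime has drifted
      * more than a slice ahead of the leftmost queued entity.
      */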
3271 
3272 static void
3273 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
3274 {
3275         /* 'current' is not kept within the tree. */
3276         if (se->on_rq) {
3277                 /*
3278                  * Any task has to be enqueued before it gets to execute on
3279                  * a CPU. So account for the time it spent waiting on the
3280                  * runqueue.
3281                  */
3282                 update_stats_wait_end(cfs_rq, se);
3283                 __dequeue_entity(cfs_rq, se);
3284                 update_load_avg(se, 1);
3285         }
3286 
3287         update_stats_curr_start(cfs_rq, se);
3288         cfs_rq->curr = se;
3289 #ifdef CONFIG_SCHEDSTATS
3290         /*
3291          * Track our maximum slice length, if the CPU's load is at
3292          * least twice that of our own weight (i.e. don't track it
3293          * when there are only lesser-weight tasks around):
3294          */
3295         if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
3296                 se->statistics.slice_max = max(se->statistics.slice_max,
3297                         se->sum_exec_runtime - se->prev_sum_exec_runtime);
3298         }
3299 #endif
3300         se->prev_sum_exec_runtime = se->sum_exec_runtime;
3301 }
3302 
3303 static int
3304 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
3305 
3306 /*
3307  * Pick the next process, keeping these things in mind, in this order:
3308  * 1) keep things fair between processes/task groups
3309  * 2) pick the "next" process, since someone really wants that to run
3310  * 3) pick the "last" process, for cache locality
3311  * 4) do not run the "skip" process, if something else is available
3312  */
3313 static struct sched_entity *
3314 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3315 {
3316         struct sched_entity *left = __pick_first_entity(cfs_rq);
3317         struct sched_entity *se;
3318 
3319         /*
3320          * If curr is set we have to see if it's left of the leftmost entity
3321          * still in the tree, provided there was anything in the tree at all.
3322          */
3323         if (!left || (curr && entity_before(curr, left)))
3324                 left = curr;
3325 
3326         se = left; /* ideally we run the leftmost entity */
3327 
3328         /*
3329          * Avoid running the skip buddy, if running something else can
3330          * be done without getting too unfair.
3331          */
3332         if (cfs_rq->skip == se) {
3333                 struct sched_entity *second;
3334 
3335                 if (se == curr) {
3336                         second = __pick_first_entity(cfs_rq);
3337                 } else {
3338                         second = __pick_next_entity(se);
3339                         if (!second || (curr && entity_before(curr, second)))
3340                                 second = curr;
3341                 }
3342 
3343                 if (second && wakeup_preempt_entity(second, left) < 1)
3344                         se = second;
3345         }
3346 
3347         /*
3348          * Prefer last buddy, try to return the CPU to a preempted task.
3349          */
3350         if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
3351                 se = cfs_rq->last;
3352 
3353         /*
3354          * Someone really wants this to run. If it's not unfair, run it.
3355          */
3356         if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
3357                 se = cfs_rq->next;
3358 
3359         clear_buddies(cfs_rq, se);
3360 
3361         return se;
3362 }
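     /*
      * Note on the "< 1" tests above: wakeup_preempt_entity() returns 1 only
      * when its first argument trails the second by more than a wakeup
      * granularity, so a skip/last/next buddy is chosen only while the
      * unfairness relative to the leftmost entity stays within that bound.
      */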
3363 
3364 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3365 
3366 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
3367 {
3368         /*
3369          * If still on the runqueue then deactivate_task()
3370          * was not called and update_curr() has to be done:
3371          */
3372         if (prev->on_rq)
3373                 update_curr(cfs_rq);
3374 
3375         /* throttle cfs_rqs exceeding runtime */
3376         check_cfs_rq_runtime(cfs_rq);
3377 
3378         check_spread(cfs_rq, prev);
3379         if (prev->on_rq) {
3380                 update_stats_wait_start(cfs_rq, prev);
3381                 /* Put 'current' back into the tree. */
3382                 __enqueue_entity(cfs_rq, prev);
3383                 /* in !on_rq case, update occurred at dequeue */
3384                 update_load_avg(prev, 0);
3385         }
3386         cfs_rq->curr = NULL;
3387 }
3388 
3389 static void
3390 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
3391 {
3392         /*
3393          * Update run-time statistics of the 'current'.
3394          */
3395         update_curr(cfs_rq);
3396 
3397         /*
3398          * Ensure that runnable average is periodically updated.
3399          */
3400         update_load_avg(curr, 1);
3401         update_cfs_shares(cfs_rq);
3402 
3403 #ifdef CONFIG_SCHED_HRTICK
3404         /*
3405          * queued ticks are scheduled to match the slice, so don't bother
3406          * validating it and just reschedule.
3407          */
3408         if (queued) {
3409                 resched_curr(rq_of(cfs_rq));
3410                 return;
3411         }
3412         /*
3413          * don't let the period tick interfere with the hrtick preemption
3414          */
3415         if (!sched_feat(DOUBLE_TICK) &&
3416                         hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
3417                 return;
3418 #endif
3419 
3420         if (cfs_rq->nr_running > 1)
3421                 check_preempt_tick(cfs_rq, curr);
3422 }
3423 
3424 
3425 /**************************************************
3426  * CFS bandwidth control machinery
3427  */
3428 
3429 #ifdef CONFIG_CFS_BANDWIDTH
3430 
3431 #ifdef HAVE_JUMP_LABEL
3432 static struct static_key __cfs_bandwidth_used;
3433 
3434 static inline bool cfs_bandwidth_used(void)
3435 {
3436         return static_key_false(&__cfs_bandwidth_used);
3437 }
3438 
3439 void cfs_bandwidth_usage_inc(void)
3440 {
3441         static_key_slow_inc(&__cfs_bandwidth_used);
3442 }
3443 
3444 void cfs_bandwidth_usage_dec(void)
3445 {
3446         static_key_slow_dec(&__cfs_bandwidth_used);
3447 }
3448 #else /* HAVE_JUMP_LABEL */
3449 static bool cfs_bandwidth_used(void)
3450 {
3451         return true;
3452 }
3453 
3454 void cfs_bandwidth_usage_inc(void) {}
3455 void cfs_bandwidth_usage_dec(void) {}
3456 #endif /* HAVE_JUMP_LABEL */
3457 
3458 /*
3459  * default period for cfs group bandwidth.
3460  * default: 0.1s, units: nanoseconds
3461  */
3462 static inline u64 default_cfs_period(void)
3463 {
3464         return 100000000ULL;
3465 }
3466 
3467 static inline u64 sched_cfs_bandwidth_slice(void)
3468 {
3469         return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
3470 }
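     /*
      * With the default 5ms (5000us) sysctl_sched_cfs_bandwidth_slice, global
      * quota is handed to a cfs_rq in 5ms chunks: a larger slice means fewer
      * trips to the global cfs_b->lock, at the cost of more quota potentially
      * stranded on CPUs that go idle before using it.
      */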
3471 
3472 /*
3473  * Replenish runtime according to assigned quota and update expiration time.
3474  * We use sched_clock_cpu directly instead of rq->clock to avoid adding
3475  * additional synchronization around rq->lock.
3476  *
3477  * requires cfs_b->lock
3478  */
3479 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
3480 {
3481         u64 now;
3482 
3483         if (cfs_b->quota == RUNTIME_INF)
3484                 return;
3485 
3486         now = sched_clock_cpu(smp_processor_id());
3487         cfs_b->runtime = cfs_b->quota;
3488         cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
3489 }
3490 
3491 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3492 {
3493         return &tg->cfs_bandwidth;
3494 }
3495 
3496 /* rq->clock_task normalized against any time this cfs_rq has spent throttled */
3497 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3498 {
3499         if (unlikely(cfs_rq->throttle_count))
3500                 return cfs_rq->throttled_clock_task;
3501 
3502         return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
3503 }
3504 
3505 /* returns 0 on failure to allocate runtime */
3506 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3507 {
3508         struct task_group *tg = cfs_rq->tg;
3509         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
3510         u64 amount = 0, min_amount, expires;
3511 
3512         /* note: this is a positive sum as runtime_remaining <= 0 */
3513         min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
3514 
3515         raw_spin_lock(&cfs_b->lock);
3516         if (cfs_b->quota == RUNTIME_INF)
3517                 amount = min_amount;
3518         else {
3519                 start_cfs_bandwidth(cfs_b);
3520 
3521                 if (cfs_b->runtime > 0) {
3522                         amount = min(cfs_b->runtime, min_amount);
3523                         cfs_b->runtime -= amount;
3524                         cfs_b->idle = 0;
3525                 }
3526         }
3527         expires = cfs_b->runtime_expires;
3528         raw_spin_unlock(&cfs_b->lock);
3529 
3530         cfs_rq->runtime_remaining += amount;
3531         /*
3532          * we may have advanced our local expiration to account for allowed
3533          * spread between our sched_clock and the one on which runtime was
3534          * issued.
3535          */
3536         if ((s64)(expires - cfs_rq->runtime_expires) > 0)
3537                 cfs_rq->runtime_expires = expires;
3538 
3539         return cfs_rq->runtime_remaining > 0;
3540 }
3541 
3542 /*
3543  * Note: This depends on the synchronization provided by sched_clock and the
3544  * fact that rq->clock snapshots this value.
3545  */
3546 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3547 {
3548         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3549 
3550         /* if the deadline is ahead of our clock, nothing to do */
3551         if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
3552                 return;
3553 
3554         if (cfs_rq->runtime_remaining < 0)
3555                 return;
3556 
3557         /*
3558          * If the local deadline has passed we have to consider the
3559          * possibility that our sched_clock is 'fast' and the global deadline
3560          * has not truly expired.
3561          *
3562          * Fortunately we can determine whether this is the case by checking
3563          * whether the global deadline has advanced. It is valid to compare
3564          * cfs_b->runtime_expires without any locks since we only care about
3565          * exact equality, so a partial write will still work.
3566          */
3567 
3568         if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
3569                 /* extend local deadline, drift is bounded above by 2 ticks */
3570                 cfs_rq->runtime_expires += TICK_NSEC;
3571         } else {
3572                 /* global deadline is ahead, expiration has passed */
3573                 cfs_rq->runtime_remaining = 0;
3574         }
3575 }
3576 
3577 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3578 {
3579         /* dock delta_exec before expiring quota (as it could span periods) */
3580         cfs_rq->runtime_remaining -= delta_exec;
3581         expire_cfs_rq_runtime(cfs_rq);
3582 
3583         if (likely(cfs_rq->runtime_remaining > 0))
3584                 return;
3585 
3586         /*
3587          * if we're unable to extend our runtime we resched so that the active
3588          * hierarchy can be throttled
3589          */
3590         if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
3591                 resched_curr(rq_of(cfs_rq));
3592 }
3593 
3594 static __always_inline
3595 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3596 {
3597         if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
3598                 return;
3599 
3600         __account_cfs_rq_runtime(cfs_rq, delta_exec);
3601 }
3602 
3603 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
3604 {
3605         return cfs_bandwidth_used() && cfs_rq->throttled;
3606 }
3607 
3608 /* check whether cfs_rq, or any parent, is throttled */
3609 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
3610 {
3611         return cfs_bandwidth_used() && cfs_rq->throttle_count;
3612 }
3613 
3614 /*
3615  * Ensure that neither of the group entities corresponding to src_cpu or
3616  * dest_cpu are members of a throttled hierarchy when performing group
3617  * load-balance operations.
3618  */
3619 static inline int throttled_lb_pair(struct task_group *tg,
3620                                     int src_cpu, int dest_cpu)
3621 {
3622         struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
3623 
3624         src_cfs_rq = tg->cfs_rq[src_cpu];
3625         dest_cfs_rq = tg->cfs_rq[dest_cpu];
3626 
3627         return throttled_hierarchy(src_cfs_rq) ||
3628                throttled_hierarchy(dest_cfs_rq);
3629 }
3630 
3631 /* updated child weight may affect parent so we have to do this bottom up */
3632 static int tg_unthrottle_up(struct task_group *tg, void *data)
3633 {
3634         struct rq *rq = data;
3635         struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3636 
3637         cfs_rq->throttle_count--;
3638 #ifdef CONFIG_SMP
3639         if (!cfs_rq->throttle_count) {
3640                 /* adjust cfs_rq_clock_task() */
3641                 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
3642                                              cfs_rq->throttled_clock_task;
3643         }
3644 #endif
3645 
3646         return 0;
3647 }
3648 
3649 static int tg_throttle_down(struct task_group *tg, void *data)
3650 {
3651         struct rq *rq = data;
3652         struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3653 
3654         /* group is entering throttled state, stop time */
3655         if (!cfs_rq->throttle_count)
3656                 cfs_rq->throttled_clock_task = rq_clock_task(rq);
3657         cfs_rq->throttle_count++;
3658 
3659         return 0;
3660 }
3661 
3662 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3663 {
3664         struct rq *rq = rq_of(cfs_rq);
3665         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3666         struct sched_entity *se;
3667         long task_delta, dequeue = 1;
3668         bool empty;
3669 
3670         se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
3671 
3672         /* freeze hierarchy runnable averages while throttled */
3673         rcu_read_lock();
3674         walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
3675         rcu_read_unlock();
3676 
3677         task_delta = cfs_rq->h_nr_running;
3678         for_each_sched_entity(se) {
3679                 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
3680                 /* throttled entity or throttle-on-deactivate */
3681                 if (!se->on_rq)
3682                         break;
3683 
3684                 if (dequeue)
3685                         dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
3686                 qcfs_rq->h_nr_running -= task_delta;
3687 
3688                 if (qcfs_rq->load.weight)
3689                         dequeue = 0;
3690         }
3691 
3692         if (!se)
3693                 sub_nr_running(rq, task_delta);
3694 
3695         cfs_rq->throttled = 1;
3696         cfs_rq->throttled_clock = rq_clock(rq);
3697         raw_spin_lock(&cfs_b->lock);
3698         empty = list_empty(&cfs_b->throttled_cfs_rq);
3699 
3700         /*
3701          * Add to the _head_ of the list, so that an already-started
3702          * distribute_cfs_runtime will not see us
3703          */
3704         list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3705 
3706         /*
3707          * If we're the first throttled cfs_rq, make sure the bandwidth
3708          * timer is running.
3709          */
3710         if (empty)
3711                 start_cfs_bandwidth(cfs_b);
3712 
3713         raw_spin_unlock(&cfs_b->lock);
3714 }
3715 
3716 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3717 {
3718         struct rq *rq = rq_of(cfs_rq);
3719         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3720         struct sched_entity *se;
3721         int enqueue = 1;
3722         long task_delta;
3723 
3724         se = cfs_rq->tg->se[cpu_of(rq)];
3725 
3726         cfs_rq->throttled = 0;
3727 
3728         update_rq_clock(rq);
3729 
3730         raw_spin_lock(&cfs_b->lock);
3731         cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
3732         list_del_rcu(&cfs_rq->throttled_list);
3733         raw_spin_unlock(&cfs_b->lock);
3734 
3735         /* update hierarchical throttle state */
3736         walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
3737 
3738         if (!cfs_rq->load.weight)
3739                 return;
3740 
3741         task_delta = cfs_rq->h_nr_running;
3742         for_each_sched_entity(se) {
3743                 if (se->on_rq)
3744                         enqueue = 0;
3745 
3746                 cfs_rq = cfs_rq_of(se);
3747                 if (enqueue)
3748                         enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
3749                 cfs_rq->h_nr_running += task_delta;
3750 
3751                 if (cfs_rq_throttled(cfs_rq))
3752                         break;
3753         }
3754 
3755         if (!se)
3756                 add_nr_running(rq, task_delta);
3757 
3758         /* determine whether we need to wake up a potentially idle cpu */
3759         if (rq->curr == rq->idle && rq->cfs.nr_running)
3760                 resched_curr(rq);
3761 }
3762 
3763 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
3764                 u64 remaining, u64 expires)
3765 {
3766         struct cfs_rq *cfs_rq;
3767         u64 runtime;
3768         u64 starting_runtime = remaining;
3769 
3770         rcu_read_lock();
3771         list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
3772                                 throttled_list) {
3773                 struct rq *rq = rq_of(cfs_rq);
3774 
3775                 raw_spin_lock(&rq->lock);
3776                 if (!cfs_rq_throttled(cfs_rq))
3777                         goto next;
3778 
3779                 runtime = -cfs_rq->runtime_remaining + 1;
3780                 if (runtime > remaining)
3781                         runtime = remaining;
3782                 remaining -= runtime;
3783 
3784                 cfs_rq->runtime_remaining += runtime;
3785                 cfs_rq->runtime_expires = expires;
3786 
3787                 /* we check whether we're throttled above */
3788                 if (cfs_rq->runtime_remaining > 0)
3789                         unthrottle_cfs_rq(cfs_rq);
3790 
3791 next:
3792                 raw_spin_unlock(&rq->lock);
3793 
3794                 if (!remaining)
3795                         break;
3796         }
3797         rcu_read_unlock();
3798 
3799         return starting_runtime - remaining;
3800 }
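     /*
      * The "-cfs_rq->runtime_remaining + 1" above grants each throttled
      * cfs_rq just enough to bring runtime_remaining up to 1ns (capped by
      * the quota left to distribute), the minimum needed to unthrottle it;
      * it will typically pull a full slice via assign_cfs_rq_runtime() once
      * it runs again.
      */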
3801 
3802 /*
3803  * Responsible for refilling a task_group's bandwidth and unthrottling its
3804  * cfs_rqs as appropriate. If there has been no activity within the last
3805  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
3806  * used to track this state.
3807  */
3808 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3809 {
3810         u64 runtime, runtime_expires;
3811         int throttled;
3812 
3813         /* no need to continue the timer with no bandwidth constraint */
3814         if (cfs_b->quota == RUNTIME_INF)
3815                 goto out_deactivate;
3816 
3817         throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3818         cfs_b->nr_periods += overrun;
3819 
3820         /*
3821          * idle depends on !throttled (for the case of a large deficit), and if
3822          * we're going inactive then everything else can be deferred
3823          */
3824         if (cfs_b->idle && !throttled)
3825                 goto out_deactivate;
3826 
3827         __refill_cfs_bandwidth_runtime(cfs_b);
3828 
3829         if (!throttled) {
3830                 /* mark as potentially idle for the upcoming period */
3831                 cfs_b->idle = 1;
3832                 return 0;
3833         }
3834 
3835         /* account preceding periods in which throttling occurred */
3836         cfs_b->nr_throttled += overrun;
3837 
3838         runtime_expires = cfs_b->runtime_expires;
3839 
3840         /*
3841          * This check is repeated as we are holding onto the new bandwidth while
3842          * we unthrottle. This can potentially race with an unthrottled group
3843          * trying to acquire new bandwidth from the global pool. This can result
3844          * in us over-using our runtime if it is all used during this loop, but
3845          * only by limited amounts in that extreme case.
3846          */
3847         while (throttled && cfs_b->runtime > 0) {
3848                 runtime = cfs_b->runtime;
3849                 raw_spin_unlock(&cfs_b->lock);
3850                 /* we can't nest cfs_b->lock while distributing bandwidth */
3851                 runtime = distribute_cfs_runtime(cfs_b, runtime,
3852                                                  runtime_expires);
3853                 raw_spin_lock(&cfs_b->lock);
3854 
3855                 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3856 
3857                 cfs_b->runtime -= min(runtime, cfs_b->runtime);
3858         }
3859 
3860         /*
3861          * While we are ensured activity in the period following an
3862          * unthrottle, this also covers the case in which the new bandwidth is
3863          * insufficient to cover the existing bandwidth deficit.  (Forcing the
3864          * timer to remain active while there are any throttled entities.)
3865          */
3866         cfs_b->idle = 0;
3867 
3868         return 0;
3869 
3870 out_deactivate:
3871         return 1;
3872 }
3873 
3874 /* a cfs_rq won't donate quota below this amount */
3875 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
3876 /* minimum remaining period time to redistribute slack quota */
3877 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
3878 /* how long we wait to gather additional slack before distributing */
3879 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
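     /*
      * Taken together: a cfs_rq keeps min_cfs_rq_runtime (1ms) for itself on
      * its last dequeue and returns the rest; the returned slack is handed
      * out by a timer that fires cfs_bandwidth_slack_period (5ms) later, and
      * the timer is not armed at all when the next period refresh is less
      * than 5ms + 2ms away, since the refresh will redistribute anyway.
      */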
3880 
3881 /*
3882  * Are we near the end of the current quota period?
3883  *
3884  * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
3885  * hrtimer base being cleared by hrtimer_start. In the case of
3886  * migrate_hrtimers, base is never cleared, so we are fine.
3887  */
3888 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
3889 {
3890         struct hrtimer *refresh_timer = &cfs_b->period_timer;
3891         u64 remaining;
3892 
3893         /* if the call-back is running a quota refresh is already occurring */
3894         if (hrtimer_callback_running(refresh_timer))
3895                 return 1;
3896 
3897         /* is a quota refresh about to occur? */
3898         remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
3899         if (remaining < min_expire)
3900                 return 1;
3901 
3902         return 0;
3903 }
3904 
3905 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
3906 {
3907         u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
3908 
3909         /* if there's a quota refresh soon don't bother with slack */
3910         if (runtime_refresh_within(cfs_b, min_left))
3911                 return;
3912 
3913         hrtimer_start(&cfs_b->slack_timer,
3914                         ns_to_ktime(cfs_bandwidth_slack_period),
3915                         HRTIMER_MODE_REL);
3916 }
3917 
3918 /* we know any runtime found here is valid as update_curr() precedes return */
3919 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3920 {
3921         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3922         s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
3923 
3924         if (slack_runtime <= 0)
3925                 return;
3926 
3927         raw_spin_lock(&cfs_b->lock);
3928         if (cfs_b->quota != RUNTIME_INF &&
3929             cfs_rq->runtime_expires == cfs_b->runtime_expires) {
3930                 cfs_b->runtime += slack_runtime;
3931 
3932                 /* we are under rq->lock, defer unthrottling using a timer */
3933                 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
3934                     !list_empty(&cfs_b->throttled_cfs_rq))
3935                         start_cfs_slack_bandwidth(cfs_b);
3936         }
3937         raw_spin_unlock(&cfs_b->lock);
3938 
3939         /* even if it's not valid for return we don't want to try again */
3940         cfs_rq->runtime_remaining -= slack_runtime;
3941 }
3942 
3943 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3944 {
3945         if (!cfs_bandwidth_used())
3946                 return;
3947 
3948         if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
3949                 return;
3950 
3951         __return_cfs_rq_runtime(cfs_rq);
3952 }
3953 
3954 /*
3955  * This is done with a timer (instead of inline with bandwidth return) since
3956  * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
3957  */
3958 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
3959 {
3960         u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
3961         u64 expires;
3962 
3963         /* confirm we're still not at a refresh boundary */
3964         raw_spin_lock(&cfs_b->lock);
3965         if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
3966                 raw_spin_unlock(&cfs_b->lock);
3967                 return;
3968         }
3969 
3970         if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
3971                 runtime = cfs_b->runtime;
3972 
3973         expires = cfs_b->runtime_expires;
3974         raw_spin_unlock(&cfs_b->lock);
3975 
3976         if (!runtime)
3977                 return;
3978 
3979         runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
3980 
3981         raw_spin_lock(&cfs_b->lock);
3982         if (expires == cfs_b->runtime_expires)
3983                 cfs_b->runtime -= min(runtime, cfs_b->runtime);
3984         raw_spin_unlock(&cfs_b->lock);
3985 }
3986 
3987 /*
3988  * When a group wakes up we want to make sure that its quota is not already
3989  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
3990  * runtime, as update_curr() throttling cannot trigger until it's on-rq.
3991  */
3992 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
3993 {
3994         if (!cfs_bandwidth_used())
3995                 return;
3996 
3997         /* an active group must be handled by the update_curr()->put() path */
3998         if (!cfs_rq->runtime_enabled || cfs_rq->curr)
3999                 return;
4000 
4001         /* ensure the group is not already throttled */
4002         if (cfs_rq_throttled(cfs_rq))
4003                 return;
4004 
4005         /* update runtime allocation */
4006         account_cfs_rq_runtime(cfs_rq, 0);
4007         if (cfs_rq->runtime_remaining <= 0)
4008                 throttle_cfs_rq(cfs_rq);
4009 }
4010 
4011 /* conditionally throttle active cfs_rq's from put_prev_entity() */
4012 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4013 {
4014         if (!cfs_bandwidth_used())
4015                 return false;
4016 
4017         if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
4018                 return false;
4019 
4020         /*
4021          * it's possible for a throttled entity to be forced into a running
4022          * state (e.g. set_curr_task); in this case we're finished.
4023          */
4024         if (cfs_rq_throttled(cfs_rq))
4025                 return true;
4026 
4027         throttle_cfs_rq(cfs_rq);
4028         return true;
4029 }
4030 
4031 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
4032 {
4033         struct cfs_bandwidth *cfs_b =
4034                 container_of(timer, struct cfs_bandwidth, slack_timer);
4035 
4036         do_sched_cfs_slack_timer(cfs_b);
4037 
4038         return HRTIMER_NORESTART;
4039 }
4040 
4041 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4042 {
4043         struct cfs_bandwidth *cfs_b =
4044                 container_of(timer, struct cfs_bandwidth, period_timer);
4045         int overrun;
4046         int idle = 0;
4047 
4048         raw_spin_lock(&cfs_b->lock);
4049         for (;;) {
4050                 overrun = hrtimer_forward_now(timer, cfs_b->period);
4051                 if (!overrun)
4052                         break;
4053 
4054                 idle = do_sched_cfs_period_timer(cfs_b, overrun);
4055         }
4056         if (idle)
4057                 cfs_b->period_active = 0;
4058         raw_spin_unlock(&cfs_b->lock);
4059 
4060         return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
4061 }
4062 
4063 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4064 {
4065         raw_spin_lock_init(&cfs_b->lock);
4066         cfs_b->runtime = 0;
4067         cfs_b->quota = RUNTIME_INF;
4068         cfs_b->period = ns_to_ktime(default_cfs_period());
4069 
4070         INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
4071         hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
4072         cfs_b->period_timer.function = sched_cfs_period_timer;
4073         hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4074         cfs_b->slack_timer.function = sched_cfs_slack_timer;
4075 }
4076 
4077 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4078 {
4079         cfs_rq->runtime_enabled = 0;
4080         INIT_LIST_HEAD(&cfs_rq->throttled_list);
4081 }
4082 
4083 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4084 {
4085         lockdep_assert_held(&cfs_b->lock);
4086 
4087         if (!cfs_b->period_active) {
4088                 cfs_b->period_active = 1;
4089                 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
4090                 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
4091         }
4092 }
4093 
4094 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4095 {
4096         /* init_cfs_bandwidth() was not called */
4097         if (!cfs_b->throttled_cfs_rq.next)
4098                 return;
4099 
4100         hrtimer_cancel(&cfs_b->period_timer);
4101         hrtimer_cancel(&cfs_b->slack_timer);
4102 }
4103 
4104 static void __maybe_unused update_runtime_enabled(struct rq *rq)
4105 {
4106         struct cfs_rq *cfs_rq;
4107 
4108         for_each_leaf_cfs_rq(rq, cfs_rq) {
4109                 struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
4110 
4111                 raw_spin_lock(&cfs_b->lock);
4112                 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
4113                 raw_spin_unlock(&cfs_b->lock);
4114         }
4115 }
4116 
4117 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
4118 {
4119         struct cfs_rq *cfs_rq;
4120 
4121         for_each_leaf_cfs_rq(rq, cfs_rq) {
4122                 if (!cfs_rq->runtime_enabled)
4123                         continue;
4124 
4125                 /*
4126                  * clock_task is not advancing so we just need to make sure
4127                  * there's some valid quota amount
4128                  */
4129                 cfs_rq->runtime_remaining = 1;
4130                 /*
4131                  * Offline rq is schedulable till cpu is completely disabled
4132                  * in take_cpu_down(), so we prevent new cfs throttling here.
4133                  */
4134                 cfs_rq->runtime_enabled = 0;
4135 
4136                 if (cfs_rq_throttled(cfs_rq))
4137                         unthrottle_cfs_rq(cfs_rq);
4138         }
4139 }
4140 
4141 #else /* CONFIG_CFS_BANDWIDTH */
4142 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4143 {
4144         return rq_clock_task(rq_of(cfs_rq));
4145 }
4146 
4147 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
4148 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
4149 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
4150 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4151 
4152 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4153 {
4154         return 0;
4155 }
4156 
4157 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4158 {
4159         return 0;
4160 }
4161 
4162 static inline int throttled_lb_pair(struct task_group *tg,
4163                                     int src_cpu, int dest_cpu)
4164 {
4165         return 0;
4166 }
4167 
4168 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4169 
4170 #ifdef CONFIG_FAIR_GROUP_SCHED
4171 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4172 #endif
4173 
4174 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4175 {
4176         return NULL;
4177 }
4178 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4179 static inline void update_runtime_enabled(struct rq *rq) {}
4180 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
4181 
4182 #endif /* CONFIG_CFS_BANDWIDTH */
4183 
4184 /**************************************************
4185  * CFS operations on tasks:
4186  */
4187 
4188 #ifdef CONFIG_SCHED_HRTICK
4189 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
4190 {
4191         struct sched_entity *se = &p->se;
4192         struct cfs_rq *cfs_rq = cfs_rq_of(se);
4193 
4194         WARN_ON(task_rq(p) != rq);
4195 
4196         if (cfs_rq->nr_running > 1) {
4197                 u64 slice = sched_slice(cfs_rq, se);
4198                 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
4199                 s64 delta = slice - ran;
4200 
4201                 if (delta < 0) {
4202                         if (rq->curr == p)
4203                                 resched_curr(rq);
4204                         return;
4205                 }
4206                 hrtick_start(rq, delta);
4207         }
4208 }
4209 
4210 /*
4211  * called from enqueue/dequeue and updates the hrtick when the
4212  * current task is from our class and nr_running is low enough
4213  * to matter.
4214  */
4215 static void hrtick_update(struct rq *rq)
4216 {
4217         struct task_struct *curr = rq->curr;
4218 
4219         if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
4220                 return;
4221 
4222         if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
4223                 hrtick_start_fair(rq, curr);
4224 }
4225 #else /* !CONFIG_SCHED_HRTICK */
4226 static inline void
4227 hrtick_start_fair(struct rq *rq, struct task_struct *p)
4228 {
4229 }
4230 
4231 static inline void hrtick_update(struct rq *rq)
4232 {
4233 }
4234 #endif
4235 
4236 /*
4237  * The enqueue_task method is called before nr_running is
4238  * increased. Here we update the fair scheduling stats and
4239  * then put the task into the rbtree:
4240  */
4241 static void
4242 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4243 {
4244         struct cfs_rq *cfs_rq;
4245         struct sched_entity *se = &p->se;
4246 
4247         for_each_sched_entity(se) {
4248                 if (se->on_rq)
4249                         break;
4250                 cfs_rq = cfs_rq_of(se);
4251                 enqueue_entity(cfs_rq, se, flags);
4252 
4253                 /*
4254                  * end evaluation on encountering a throttled cfs_rq
4255                  *
4256                  * note: in the case of encountering a throttled cfs_rq we will
4257                  * post the final h_nr_running increment below.
4258                 */
4259                 if (cfs_rq_throttled(cfs_rq))
4260                         break;
4261                 cfs_rq->h_nr_running++;
4262 
4263                 flags = ENQUEUE_WAKEUP;
4264         }
4265 
4266         for_each_sched_entity(se) {
4267                 cfs_rq = cfs_rq_of(se);
4268                 cfs_rq->h_nr_running++;
4269 
4270                 if (cfs_rq_throttled(cfs_rq))
4271                         break;
4272 
4273                 update_load_avg(se, 1);
4274                 update_cfs_shares(cfs_rq);
4275         }
4276 
4277         if (!se)
4278                 add_nr_running(rq, 1);
4279 
4280         hrtick_update(rq);
4281 }
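     /*
      * Structure note: the first loop above enqueues entities only until it
      * reaches an ancestor that is already on the runqueue (or a throttled
      * cfs_rq); the second loop walks the remaining hierarchy, where only
      * h_nr_running and the load averages need refreshing. se ending up NULL
      * means no throttled cfs_rq was met on the way to the root, so the task
      * is genuinely runnable and the rq-wide nr_running is bumped.
      */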
4282 
4283 static void set_next_buddy(struct sched_entity *se);
4284 
4285 /*
4286  * The dequeue_task method is called before nr_running is
4287  * decreased. We remove the task from the rbtree and
4288  * update the fair scheduling stats:
4289  */
4290 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4291 {
4292         struct cfs_rq *cfs_rq;
4293         struct sched_entity *se = &p->se;
4294         int task_sleep = flags & DEQUEUE_SLEEP;
4295 
4296         for_each_sched_entity(se) {
4297                 cfs_rq = cfs_rq_of(se);
4298                 dequeue_entity(cfs_rq, se, flags);
4299 
4300                 /*
4301                  * end evaluation on encountering a throttled cfs_rq
4302                  *
4303                  * note: in the case of encountering a throttled cfs_rq we will
4304                  * post the final h_nr_running decrement below.
4305                 */
4306                 if (cfs_rq_throttled(cfs_rq))
4307                         break;
4308                 cfs_rq->h_nr_running--;
4309 
4310                 /* Don't dequeue parent if it has other entities besides us */
4311                 if (cfs_rq->load.weight) {
4312                         /*
4313                          * Bias pick_next to pick a task from this cfs_rq, as
4314                          * p is sleeping when it is within its sched_slice.
4315                          */
4316                         if (task_sleep && parent_entity(se))
4317                                 set_next_buddy(parent_entity(se));
4318 
4319                         /* avoid re-evaluating load for this entity */
4320                         se = parent_entity(se);
4321                         break;
4322                 }
4323                 flags |= DEQUEUE_SLEEP;
4324         }
4325 
4326         for_each_sched_entity(se) {
4327                 cfs_rq = cfs_rq_of(se);
4328                 cfs_rq->h_nr_running--;
4329 
4330                 if (cfs_rq_throttled(cfs_rq))
4331                         break;
4332 
4333                 update_load_avg(se, 1);
4334                 update_cfs_shares(cfs_rq);
4335         }
4336 
4337         if (!se)
4338                 sub_nr_running(rq, 1);
4339 
4340         hrtick_update(rq);
4341 }
4342 
4343 #ifdef CONFIG_SMP
4344 
4345 /*
4346  * per rq 'load' array crap; XXX kill this.
4347  */
4348 
4349 /*
4350  * The exact cpuload calculated at every tick would be:
4351  *
4352  *   load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
4353  *
4354  * If a cpu misses updates for n ticks (as it was idle) and update gets
4355  * called on the n+1-th tick when cpu may be busy, then we have:
4356  *
4357  *   load_n   = (1 - 1/2^i)^n * load_0
4358  *   load_n+1 = (1 - 1/2^i)   * load_n + (1/2^i) * cur_load
4359  *
4360  * decay_load_missed() below does efficient calculation of
4361  *
4362  *   load' = (1 - 1/2^i)^n * load
4363  *
4364  * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
4365  * This allows us to precompute the above in said factors, thereby allowing the
4366  * reduction of an arbitrary n in O(log_2 n) steps. (See also
4367  * fixed_power_int())
4368  *
4369  * The calculation is approximated on a 128 point scale.
4370  */
4371 #define DEGRADE_SHIFT           7
4372 
4373 static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
4374 static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
4375         {   0,   0,  0,  0,  0,  0, 0, 0 },
4376         {  64,  32,  8,  0,  0,  0, 0, 0 },
4377         {  96,  72, 40, 12,  1,  0, 0, 0 },
4378         { 112,  98, 75, 43, 15,  1, 0, 0 },
4379         { 120, 112, 98, 76, 45, 16, 2, 0 }
4380 };
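     /*
      * Worked example: degrade_factor[i][j] approximates
      * (1 - 1/2^i)^(2^j) scaled by 128. Missing 5 ticks at idx 2 decomposes
      * as 5 = 4 + 1, so the load is scaled by (96/128) * (40/128) ~= 0.23,
      * close to the exact (3/4)^5 ~= 0.237, using two multiplies instead of
      * five.
      */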
4381 
4382 /*
4383  * Update cpu_load for any missed ticks, due to tickless idle. The backlog
4384  * builds up while the CPU is idle, so we just decay the old load without
4385  * adding any new load.
4386  */
4387 static unsigned long
4388 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
4389 {
4390         int j = 0;
4391 
4392         if (!missed_updates)
4393                 return load;
4394 
4395         if (missed_updates >= degrade_zero_ticks[idx])
4396                 return 0;
4397 
4398         if (idx == 1)
4399                 return load >> missed_updates;
4400 
4401         while (missed_updates) {
4402                 if (missed_updates % 2)
4403                         load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
4404 
4405                 missed_updates >>= 1;
4406                 j++;
4407         }
4408         return load;
4409 }
4410 
4411 /**
4412  * __update_cpu_load - update the rq->cpu_load[] statistics
4413  * @this_rq: The rq to update statistics for
4414  * @this_load: The current load
4415  * @pending_updates: The number of missed updates
4416  * @active: !0 for NOHZ_FULL
4417  *
4418  * Update rq->cpu_load[] statistics. This function is usually called every
4419  * scheduler tick (TICK_NSEC).
4420  *
4421  * This function computes a decaying average:
4422  *
4423  *   load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
4424  *
4425  * Because of NOHZ it might not get called on every tick, which creates the need for
4426  * the @pending_updates argument.
4427  *
4428  *   load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
4429  *             = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
4430  *             = A * (A * load[i]_n-2 + B) + B
4431  *             = A * (A * (A * load[i]_n-3 + B) + B) + B
4432  *             = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
4433  *             = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
4434  *             = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
4435  *             = (1 - 1/2^i)^n * (load[i]_0 - load) + load
4436  *
4437  * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
4438  * any change in load would have resulted in the tick being turned back on.
4439  *
4440  * For regular NOHZ, this reduces to:
4441  *
4442  *   load[i]_n = (1 - 1/2^i)^n * load[i]_0
4443  *
4444  * see decay_load_missed(). For NOHZ_FULL we get to subtract and add the extra
4445  * term. See the @active parameter.
4446  */
4447 static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
4448                               unsigned long pending_updates, int active)
4449 {
4450         unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0;
4451         int i, scale;
4452 
4453         this_rq->nr_load_updates++;
4454 
4455         /* Update our load: */
4456         this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
4457         for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
4458                 unsigned long old_load, new_load;
4459 
4460                 /* scale is effectively 1 << i now, and >> i divides by scale */
4461 
4462                 old_load = this_rq->cpu_load[i];
4463                 old_load = decay_load_missed(old_load, pending_updates - 1, i);
4464                 if (tickless_load) {
4465                         old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
4466                         /*
4467                          * old_load can never be a negative value because a
4468                          * decayed tickless_load cannot be greater than the
4469                          * original tickless_load.
4470                          */
4471                         old_load += tickless_load;
4472                 }
4473                 new_load = this_load;
4474                 /*
4475                  * Round up the averaging division if load is increasing. This
4476                  * prevents us from getting stuck on 9 if the load is 10, for
4477                  * example.
4478                  */
4479                 if (new_load > old_load)
4480                         new_load += scale - 1;
4481 
4482                 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
4483         }
4484 
4485         sched_avg_update(this_rq);
4486 }
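
/*
 * Illustrative userspace sketch (not part of fair.c): one step of the
 * decaying average above, including the round-up that keeps the average from
 * getting stuck just below an increased load (e.g. at 9 when the load is 10).
 * All values are hypothetical.
 */
#include <stdio.h>

static unsigned long cpu_load_step(unsigned long old_load,
				   unsigned long new_load, int i)
{
	unsigned long scale = 1UL << i;

	if (new_load > old_load)		/* round up when increasing */
		new_load += scale - 1;

	return (old_load * (scale - 1) + new_load) >> i;
}

int main(void)
{
	unsigned long load = 9;
	int t;

	for (t = 0; t < 5; t++)
		load = cpu_load_step(load, 10, 2);
	printf("%lu\n", load);			/* converges to 10, not 9 */
	return 0;
}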
4487 
4488 /* Used instead of source_load when we know the type == 0 */
4489 static unsigned long weighted_cpuload(const int cpu)
4490 {
4491         return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
4492 }
4493 
4494 #ifdef CONFIG_NO_HZ_COMMON
4495 /*
4496  * There is no sane way to deal with nohz on smp when using jiffies because the
4497  * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
4498  * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
4499  *
4500  * Therefore we cannot use the delta approach from the regular tick since that
4501  * would seriously skew the load calculation. However we'll make do for those
4502  * updates happening while idle (nohz_idle_balance) or coming out of idle
4503  * (tick_nohz_idle_exit).
4504  *
4505  * This means we might still be one tick off for nohz periods.
4506  */
4507 
4508 /*
4509  * Called from nohz_idle_balance() to update the load ratings before doing the
4510  * idle balance.
4511  */
4512 static void update_idle_cpu_load(struct rq *this_rq)
4513 {
4514         unsigned long curr_jiffies = READ_ONCE(jiffies);
4515         unsigned long load = weighted_cpuload(cpu_of(this_rq));
4516         unsigned long pending_updates;
4517 
4518         /*
4519          * bail if there's load or we're actually up-to-date.
4520          */
4521         if (load || curr_jiffies == this_rq->last_load_update_tick)
4522                 return;
4523 
4524         pending_updates = curr_jiffies - this_rq->last_load_update_tick;
4525         this_rq->last_load_update_tick = curr_jiffies;
4526 
4527         __update_cpu_load(this_rq, load, pending_updates, 0);
4528 }
4529 
4530 /*
4531  * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
4532  */
4533 void update_cpu_load_nohz(int active)
4534 {
4535         struct rq *this_rq = this_rq();
4536         unsigned long curr_jiffies = READ_ONCE(jiffies);
4537         unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;
4538         unsigned long pending_updates;
4539 
4540         if (curr_jiffies == this_rq->last_load_update_tick)
4541                 return;
4542 
4543         raw_spin_lock(&this_rq->lock);
4544         pending_updates = curr_jiffies - this_rq->last_load_update_tick;
4545         if (pending_updates) {
4546                 this_rq->last_load_update_tick = curr_jiffies;
4547                 /*
4548                  * In the regular NOHZ case, we were idle, this means load 0.
4549                  * In the NOHZ_FULL case, we were non-idle, we should consider
4550                  * its weighted load.
4551                  */
4552                 __update_cpu_load(this_rq, load, pending_updates, active);
4553         }
4554         raw_spin_unlock(&this_rq->lock);
4555 }
4556 #endif /* CONFIG_NO_HZ_COMMON */
4557 
4558 /*
4559  * Called from scheduler_tick()
4560  */
4561 void update_cpu_load_active(struct rq *this_rq)
4562 {
4563         unsigned long load = weighted_cpuload(cpu_of(this_rq));
4564         /*
4565          * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
4566          */
4567         this_rq->last_load_update_tick = jiffies;
4568         __update_cpu_load(this_rq, load, 1, 1);
4569 }
4570 
4571 /*
4572  * Return a low guess at the load of a migration-source cpu weighted
4573  * according to the scheduling class and "nice" value.
4574  *
4575  * We want to under-estimate the load of migration sources, to
4576  * balance conservatively.
4577  */
4578 static unsigned long source_load(int cpu, int type)
4579 {
4580         struct rq *rq = cpu_rq(cpu);
4581         unsigned long total = weighted_cpuload(cpu);
4582 
4583         if (type == 0 || !sched_feat(LB_BIAS))
4584                 return total;
4585 
4586         return min(rq->cpu_load[type-1], total);
4587 }
4588 
4589 /*
4590  * Return a high guess at the load of a migration-target cpu weighted
4591  * according to the scheduling class and "nice" value.
4592  */
4593 static unsigned long target_load(int cpu, int type)
4594 {
4595         struct rq *rq = cpu_rq(cpu);
4596         unsigned long total = weighted_cpuload(cpu);
4597 
4598         if (type == 0 || !sched_feat(LB_BIAS))
4599                 return total;
4600 
4601         return max(rq->cpu_load[type-1], total);
4602 }
4603 
4604 static unsigned long capacity_of(int cpu)
4605 {
4606         return cpu_rq(cpu)->cpu_capacity;
4607 }
4608 
4609 static unsigned long capacity_orig_of(int cpu)
4610 {
4611         return cpu_rq(cpu)->cpu_capacity_orig;
4612 }
4613 
4614 static unsigned long cpu_avg_load_per_task(int cpu)
4615 {
4616         struct rq *rq = cpu_rq(cpu);
4617         unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
4618         unsigned long load_avg = weighted_cpuload(cpu);
4619 
4620         if (nr_running)
4621                 return load_avg / nr_running;
4622 
4623         return 0;
4624 }
4625 
4626 static void record_wakee(struct task_struct *p)
4627 {
4628         /*
4629          * Rough decay (wiping) for cost saving; don't worry
4630          * about the boundary, a really active task won't care
4631          * about the loss.
4632          */
4633         if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
4634                 current->wakee_flips >>= 1;
4635                 current->wakee_flip_decay_ts = jiffies;
4636         }
4637 
4638         if (current->last_wakee != p) {
4639                 current->last_wakee = p;
4640                 current->wakee_flips++;
4641         }
4642 }
4643 
4644 static void task_waking_fair(struct task_struct *p)
4645 {
4646         struct sched_entity *se = &p->se;
4647         struct cfs_rq *cfs_rq = cfs_rq_of(se);
4648         u64 min_vruntime;
4649 
4650 #ifndef CONFIG_64BIT
4651         u64 min_vruntime_copy;
4652 
4653         do {
4654                 min_vruntime_copy = cfs_rq->min_vruntime_copy;
4655                 smp_rmb();
4656                 min_vruntime = cfs_rq->min_vruntime;
4657         } while (min_vruntime != min_vruntime_copy);
4658 #else
4659         min_vruntime = cfs_rq->min_vruntime;
4660 #endif
4661 
4662         se->vruntime -= min_vruntime;
4663         record_wakee(p);
4664 }
4665 
4666 #ifdef CONFIG_FAIR_GROUP_SCHED
4667 /*
4668  * effective_load() calculates the load change as seen from the root_task_group
4669  *
4670  * Adding load to a group doesn't make a group heavier, but can cause movement
4671  * of group shares between cpus. Assuming the shares were perfectly aligned one
4672  * can calculate the shift in shares.
4673  *
4674  * Calculate the effective load difference if @wl is added (subtracted) to @tg
4675  * on this @cpu and results in a total addition (subtraction) of @wg to the
4676  * total group weight.
4677  *
4678  * Given a runqueue weight distribution (rw_i) we can compute a shares
4679  * distribution (s_i) using:
4680  *
4681  *   s_i = rw_i / \Sum rw_j                                             (1)
4682  *
4683  * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
4684  * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
4685  * shares distribution (s_i):
4686  *
4687  *   rw_i = {   2,   4,   1,   0 }
4688  *   s_i  = { 2/7, 4/7, 1/7,   0 }
4689  *
4690  * As per wake_affine() we're interested in the load of two CPUs (the CPU the
4691  * task used to run on and the CPU the waker is running on), we need to
4692  * compute the effect of waking a task on either CPU and, in case of a sync
4693  * wakeup, compute the effect of the current task going to sleep.
4694  *
4695  * So for a change of @wl to the local @cpu with an overall group weight change
4696  * of @wg we can compute the new shares distribution (s'_i) using:
4697  *
4698  *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)                            (2)
4699  *
4700  * Suppose we're interested in CPUs 0 and 1, and want to compute the load
4701  * differences in waking a task to CPU 0. The additional task changes the
4702  * weight and shares distributions like:
4703  *
4704  *   rw'_i = {   3,   4,   1,   0 }
4705  *   s'_i  = { 3/8, 4/8, 1/8,   0 }
4706  *
4707  * We can then compute the difference in effective weight by using:
4708  *
4709  *   dw_i = S * (s'_i - s_i)                                            (3)
4710  *
4711  * Where 'S' is the group weight as seen by its parent.
4712  *
4713  * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
4714  * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
4715  * 4/7) times the weight of the group.
4716  */
4717 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4718 {
4719         struct sched_entity *se = tg->se[cpu];
4720 
4721         if (!tg->parent)        /* the trivial, non-cgroup case */
4722                 return wl;
4723 
4724         for_each_sched_entity(se) {
4725                 long w, W;
4726 
4727                 tg = se->my_q->tg;
4728 
4729                 /*
4730                  * W = @wg + \Sum rw_j
4731                  */
4732                 W = wg + calc_tg_weight(tg, se->my_q);
4733 
4734                 /*
4735                  * w = rw_i + @wl
4736                  */
4737                 w = cfs_rq_load_avg(se->my_q) + wl;
4738 
4739                 /*
4740                  * wl = S * s'_i; see (2)
4741                  */
4742                 if (W > 0 && w < W)
4743                         wl = (w * (long)tg->shares) / W;
4744                 else
4745                         wl = tg->shares;
4746 
4747                 /*
4748                  * Per the above, wl is the new se->load.weight value; since
4749                  * those are clipped to [MIN_SHARES, ...) do so now. See
4750                  * calc_cfs_shares().
4751                  */
4752                 if (wl < MIN_SHARES)
4753                         wl = MIN_SHARES;
4754 
4755                 /*
4756                  * wl = dw_i = S * (s'_i - s_i); see (3)
4757                  */
4758                 wl -= se->avg.load_avg;
4759 
4760                 /*
4761                  * Recursively apply this logic to all parent groups to compute
4762                  * the final effective load change on the root group. Since
4763                  * only the @tg group gets extra weight, all parent groups can
4764                  * only redistribute existing shares. @wl is the shift in shares
4765                  * resulting from this level per the above.
4766                  */
4767                 wg = 0;
4768         }
4769 
4770         return wl;
4771 }
4772 #else
4773 
4774 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4775 {
4776         return wl;
4777 }
4778 
4779 #endif
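
/*
 * Illustrative userspace sketch (not part of fair.c): the worked example from
 * the effective_load() comment above, rw_i = {2, 4, 1, 0} with one task of
 * weight 1 woken on CPU 0, computed directly from equations (1)-(3). The
 * group weight S is a hypothetical value.
 */
#include <stdio.h>

int main(void)
{
	double rw[4] = { 2, 4, 1, 0 };
	double S = 1024;			/* hypothetical group weight */
	double wl = 1, wg = 1;			/* one task of weight 1 added */
	double sum = rw[0] + rw[1] + rw[2] + rw[3];
	int i;

	for (i = 0; i < 2; i++) {
		double s_old = rw[i] / sum;				/* (1) */
		double s_new = (rw[i] + (i == 0 ? wl : 0)) / (sum + wg);/* (2) */

		printf("cpu%d: dw = %+.1f\n", i, S * (s_new - s_old));	/* (3) */
	}
	return 0;	/* cpu0: +5/56 * S ~= +91.4, cpu1: -4/56 * S ~= -73.1 */
}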
4780 
4781 /*
4782  * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
4783  * A waker of many should wake a different task than the one last awakened
4784  * at a frequency roughly N times higher than one of its wakees.  In order
4785  * to determine whether we should let the load spread vs consolidating to
4786  * shared cache, we look for a minimum 'flip' frequency of llc_size in one
4787  * partner, and a factor of llc_size higher frequency in the other.  With
4788  * both conditions met, we can be relatively sure that the relationship is
4789  * non-monogamous, with partner count exceeding socket size.  Waker/wakee
4790  * being client/server, worker/dispatcher, interrupt source or whatever is
4791  * irrelevant; the spread criterion is apparent partner count exceeding socket size.
4792  */
4793 static int wake_wide(struct task_struct *p)
4794 {
4795         unsigned int master = current->wakee_flips;
4796         unsigned int slave = p->wakee_flips;
4797         int factor = this_cpu_read(sd_llc_size);
4798 
4799         if (master < slave)
4800                 swap(master, slave);
4801         if (slave < factor || master < slave * factor)
4802                 return 0;
4803         return 1;
4804 }
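
/*
 * Illustrative userspace sketch (not part of fair.c): the wake_wide() test
 * with hypothetical flip counts and an assumed LLC size of 4. A 1:N
 * dispatcher/worker pattern trips the "spread" path once the apparent partner
 * count exceeds the LLC, while a 1:1 pair stays affine.
 */
#include <stdio.h>

static int wake_wide_ref(unsigned int master, unsigned int slave, int factor)
{
	unsigned int tmp;

	if (master < slave) {
		tmp = master;
		master = slave;
		slave = tmp;
	}
	return !(slave < factor || master < slave * factor);
}

int main(void)
{
	/* dispatcher flipping across ~20 wakees, each wakee flipping ~5 times */
	printf("%d\n", wake_wide_ref(20, 5, 4));	/* 1: spread the load */
	/* a 1:1 ping-pong pair */
	printf("%d\n", wake_wide_ref(6, 6, 4));		/* 0: keep it affine */
	return 0;
}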
4805 
4806 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4807 {
4808         s64 this_load, load;
4809         s64 this_eff_load, prev_eff_load;
4810         int idx, this_cpu, prev_cpu;
4811         struct task_group *tg;
4812         unsigned long weight;
4813         int balanced;
4814 
4815         idx       = sd->wake_idx;
4816         this_cpu  = smp_processor_id();
4817         prev_cpu  = task_cpu(p);
4818         load      = source_load(prev_cpu, idx);
4819         this_load = target_load(this_cpu, idx);
4820 
4821         /*
4822          * If sync wakeup then subtract the (maximum possible)
4823          * effect of the currently running task from the load
4824          * of the current CPU:
4825          */
4826         if (sync) {
4827                 tg = task_group(current);
4828                 weight = current->se.avg.load_avg;
4829 
4830                 this_load += effective_load(tg, this_cpu, -weight, -weight);
4831                 load += effective_load(tg, prev_cpu, 0, -weight);
4832         }
4833 
4834         tg = task_group(p);
4835         weight = p->se.avg.load_avg;
4836 
4837         /*
4838          * In low-load situations, where prev_cpu is idle and this_cpu is idle
4839          * due to the sync cause above having dropped this_load to 0, we'll
4840          * always have an imbalance, but there's really nothing you can do
4841          * about that, so that's good too.
4842          *
4843          * Otherwise check if either cpus are near enough in load to allow this
4844          * task to be woken on this_cpu.
4845          */
4846         this_eff_load = 100;
4847         this_eff_load *= capacity_of(prev_cpu);
4848 
4849         prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4850         prev_eff_load *= capacity_of(this_cpu);
4851 
4852         if (this_load > 0) {
4853                 this_eff_load *= this_load +
4854                         effective_load(tg, this_cpu, weight, weight);
4855 
4856                 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
4857         }
4858 
4859         balanced = this_eff_load <= prev_eff_load;
4860 
4861         schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
4862 
4863         if (!balanced)
4864                 return 0;
4865 
4866         schedstat_inc(sd, ttwu_move_affine);
4867         schedstat_inc(p, se.statistics.nr_wakeups_affine);
4868 
4869         return 1;
4870 }
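
/*
 * Illustrative userspace sketch (not part of fair.c): the final balance test
 * in wake_affine(), with the effective_load() adjustments already folded into
 * two hypothetical load figures and equal capacities. An imbalance_pct of 125
 * gives the previous CPU a 12.5% handicap, biasing the decision toward waking
 * on the waker's CPU.
 */
#include <stdio.h>

int main(void)
{
	long this_load = 400, prev_load = 420;	/* hypothetical effective loads */
	long cap_this = 1024, cap_prev = 1024;
	int imbalance_pct = 125;

	long this_eff = 100 * cap_prev * this_load;
	long prev_eff = (100 + (imbalance_pct - 100) / 2) * cap_this * prev_load;

	printf("balanced = %d\n", this_eff <= prev_eff);	/* 1: pull to waker */
	return 0;
}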
4871 
4872 /*
4873  * find_idlest_group finds and returns the least busy CPU group within the
4874  * domain.
4875  */
4876 static struct sched_group *
4877 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
4878                   int this_cpu, int sd_flag)
4879 {
4880         struct sched_group *idlest = NULL, *group = sd->groups;
4881         unsigned long min_load = ULONG_MAX, this_load = 0;
4882         int load_idx = sd->forkexec_idx;
4883         int imbalance = 100 + (sd->imbalance_pct-100)/2;
4884 
4885         if (sd_flag & SD_BALANCE_WAKE)
4886                 load_idx = sd->wake_idx;
4887 
4888         do {
4889                 unsigned long load, avg_load;
4890                 int local_group;
4891                 int i;
4892 
4893                 /* Skip over this group if it has no CPUs allowed */
4894                 if (!cpumask_intersects(sched_group_cpus(group),
4895                                         tsk_cpus_allowed(p)))
4896                         continue;
4897 
4898                 local_group = cpumask_test_cpu(this_cpu,
4899                                                sched_group_cpus(group));
4900 
4901                 /* Tally up the load of all CPUs in the group */
4902                 avg_load = 0;
4903 
4904                 for_each_cpu(i, sched_group_cpus(group)) {
4905                         /* Bias balancing toward cpus of our domain */
4906                         if (local_group)
4907                                 load = source_load(i, load_idx);
4908                         else
4909                                 load = target_load(i, load_idx);
4910 
4911                         avg_load += load;
4912                 }
4913 
4914                 /* Adjust by relative CPU capacity of the group */
4915                 avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
4916 
4917                 if (local_group) {
4918                         this_load = avg_load;
4919                 } else if (avg_load < min_load) {
4920                         min_load = avg_load;
4921                         idlest = group;
4922                 }
4923         } while (group = group->next, group != sd->groups);
4924 
4925         if (!idlest || 100*this_load < imbalance*min_load)
4926                 return NULL;
4927         return idlest;
4928 }
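
/*
 * Illustrative userspace sketch (not part of fair.c): the capacity-adjusted
 * group load used above. A larger raw load spread over a proportionally
 * larger capacity is not "busier". Group loads and capacities below are
 * hypothetical; SCHED_CAPACITY_SCALE is redefined locally.
 */
#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024UL

static unsigned long group_avg_load(unsigned long sum_load, unsigned long capacity)
{
	return sum_load * SCHED_CAPACITY_SCALE / capacity;
}

int main(void)
{
	/* group A: load 3000 over capacity 4096; group B: load 1800 over 2048 */
	printf("A=%lu B=%lu\n",
	       group_avg_load(3000, 4096), group_avg_load(1800, 2048));
	/* prints A=750 B=900: group A is the idler despite more raw load */
	return 0;
}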
4929 
4930 /*
4931  * find_idlest_cpu - find the idlest cpu among the cpus in group.
4932  */
4933 static int
4934 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
4935 {
4936         unsigned long load, min_load = ULONG_MAX;
4937         unsigned int min_exit_latency = UINT_MAX;
4938         u64 latest_idle_timestamp = 0;
4939         int least_loaded_cpu = this_cpu;
4940         int shallowest_idle_cpu = -1;
4941         int i;
4942 
4943         /* Traverse only the allowed CPUs */
4944         for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
4945                 if (idle_cpu(i)) {
4946                         struct rq *rq = cpu_rq(i);
4947                         struct cpuidle_state *idle = idle_get_state(rq);
4948                         if (idle && idle->exit_latency < min_exit_latency) {
4949                                 /*
4950                                  * We give priority to a CPU whose idle state
4951                                  * has the smallest exit latency irrespective
4952                                  * of any idle timestamp.
4953                                  */
4954                                 min_exit_latency = idle->exit_latency;
4955                                 latest_idle_timestamp = rq->idle_stamp;
4956                                 shallowest_idle_cpu = i;
4957                         } else if ((!idle || idle->exit_latency == min_exit_latency) &&
4958                                    rq->idle_stamp > latest_idle_timestamp) {
4959                                 /*
4960                                  * If equal or no active idle state, then
4961                                  * the most recently idled CPU might have
4962                                  * a warmer cache.
4963                                  */
4964                                 latest_idle_timestamp = rq->idle_stamp;
4965                                 shallowest_idle_cpu = i;
4966                         }
4967                 } else if (shallowest_idle_cpu == -1) {
4968                         load = weighted_cpuload(i);
4969                         if (load < min_load || (load == min_load && i == this_cpu)) {
4970                                 min_load = load;
4971                                 least_loaded_cpu = i;
4972                         }
4973                 }
4974         }
4975 
4976         return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
4977 }
4978 
4979 /*
4980  * Try and locate an idle CPU in the sched_domain.
4981  */
4982 static int select_idle_sibling(struct task_struct *p, int target)
4983 {
4984         struct sched_domain *sd;
4985         struct sched_group *sg;
4986         int i = task_cpu(p);
4987 
4988         if (idle_cpu(target))
4989                 return target;
4990 
4991         /*
4992          * If the previous cpu is cache affine and idle, don't be stupid.
4993          */
4994         if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
4995                 return i;
4996 
4997         /*
4998          * Otherwise, iterate the domains and find an eligible idle cpu.
4999          */
5000         sd = rcu_dereference(per_cpu(sd_llc, target));
5001         for_each_lower_domain(sd) {
5002                 sg = sd->groups;
5003                 do {
5004                         if (!cpumask_intersects(sched_group_cpus(sg),
5005                                                 tsk_cpus_allowed(p)))
5006                                 goto next;
5007 
5008                         for_each_cpu(i, sched_group_cpus(sg)) {
5009                                 if (i == target || !idle_cpu(i))
5010                                         goto next;
5011                         }
5012 
5013                         target = cpumask_first_and(sched_group_cpus(sg),
5014                                         tsk_cpus_allowed(p));
5015                         goto done;
5016 next:
5017                         sg = sg->next;
5018                 } while (sg != sd->groups);
5019         }
5020 done:
5021         return target;
5022 }
5023 
5024 /*
5025  * cpu_util returns the amount of capacity of a CPU that is used by CFS
5026  * tasks. The unit of the return value must be the one of capacity so we can
5027  * compare the utilization with the capacity of the CPU that is available for
5028  * CFS task (ie cpu_capacity).
5029  *
5030  * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
5031  * recent utilization of currently non-runnable tasks on a CPU. It represents
5032  * the amount of utilization of a CPU in the range [0..capacity_orig] where
5033  * capacity_orig is the cpu_capacity available at the highest frequency
5034  * (arch_scale_freq_capacity()).
5035  * The utilization of a CPU converges towards a sum equal to or less than the
5036  * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
5037  * the running time on this CPU scaled by capacity_curr.
5038  *
5039  * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
5040  * higher than capacity_orig because of unfortunate rounding in
5041  * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
5042  * the average stabilizes with the new running time. We need to check that the
5043  * utilization stays within the range of [0..capacity_orig] and cap it if
5044  * necessary. Without utilization capping, a group could be seen as overloaded
5045  * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
5046  * available capacity. We allow utilization to overshoot capacity_curr (but not
5047  * capacity_orig) as it is useful for predicting the capacity required after task
5048  * migrations (scheduler-driven DVFS).
5049  */
5050 static int cpu_util(int cpu)
5051 {
5052         unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
5053         unsigned long capacity = capacity_orig_of(cpu);
5054 
5055         return (util >= capacity) ? capacity : util;
5056 }
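
/*
 * Illustrative userspace sketch (not part of fair.c): the capping performed
 * by cpu_util(), using the hypothetical numbers from the comment above
 * (~121% and ~80% of a 1024 capacity). Without the cap the pair would appear
 * to need more than two full CPUs.
 */
#include <stdio.h>

static unsigned long cpu_util_ref(unsigned long util, unsigned long capacity)
{
	return util >= capacity ? capacity : util;
}

int main(void)
{
	unsigned long capacity = 1024;
	unsigned long u0 = 1239, u1 = 819;	/* ~121% and ~80% of capacity */

	printf("%lu\n", cpu_util_ref(u0, capacity) + cpu_util_ref(u1, capacity));
	return 0;				/* prints 1843, i.e. <= 2048 */
}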
5057 
5058 /*
5059  * select_task_rq_fair: Select target runqueue for the waking task in domains
5060  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
5061  * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
5062  *
5063  * Balances load by selecting the idlest cpu in the idlest group, or under
5064  * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
5065  *
5066  * Returns the target cpu number.
5067  *
5068  * preempt must be disabled.
5069  */
5070 static int
5071 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
5072 {
5073         struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
5074         int cpu = smp_processor_id();
5075         int new_cpu = prev_cpu;
5076         int want_affine = 0;
5077         int sync = wake_flags & WF_SYNC;
5078 
5079         if (sd_flag & SD_BALANCE_WAKE)
5080                 want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
5081 
5082         rcu_read_lock();
5083         for_each_domain(cpu, tmp) {
5084                 if (!(tmp->flags & SD_LOAD_BALANCE))
5085                         break;
5086 
5087                 /*
5088                  * If both cpu and prev_cpu are part of this domain,
5089                  * cpu is a valid SD_WAKE_AFFINE target.
5090                  */
5091                 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
5092                     cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
5093                         affine_sd = tmp;
5094                         break;
5095                 }
5096 
5097                 if (tmp->flags & sd_flag)
5098                         sd = tmp;
5099                 else if (!want_affine)
5100                         break;
5101         }
5102 
5103         if (affine_sd) {
5104                 sd = NULL; /* Prefer wake_affine over balance flags */
5105                 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
5106                         new_cpu = cpu;
5107         }
5108 
5109         if (!sd) {
5110                 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
5111                         new_cpu = select_idle_sibling(p, new_cpu);
5112 
5113         } else while (sd) {
5114                 struct sched_group *group;
5115                 int weight;
5116 
5117                 if (!(sd->flags & sd_flag)) {
5118                         sd = sd->child;
5119                         continue;
5120                 }
5121 
5122                 group = find_idlest_group(sd, p, cpu, sd_flag);
5123                 if (!group) {
5124                         sd = sd->child;
5125                         continue;
5126                 }
5127 
5128                 new_cpu = find_idlest_cpu(group, p, cpu);
5129                 if (new_cpu == -1 || new_cpu == cpu) {
5130                         /* Now try balancing at a lower domain level of cpu */
5131                         sd = sd->child;
5132                         continue;
5133                 }
5134 
5135                 /* Now try balancing at a lower domain level of new_cpu */
5136                 cpu = new_cpu;
5137                 weight = sd->span_weight;
5138                 sd = NULL;
5139                 for_each_domain(cpu, tmp) {
5140                         if (weight <= tmp->span_weight)
5141                                 break;
5142                         if (tmp->flags & sd_flag)
5143                                 sd = tmp;
5144                 }
5145                 /* while loop will break here if sd == NULL */
5146         }
5147         rcu_read_unlock();
5148 
5149         return new_cpu;
5150 }
5151 
5152 /*
5153  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
5154  * cfs_rq_of(p) references at time of call are still valid and identify the
5155  * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
5156  */
5157 static void migrate_task_rq_fair(struct task_struct *p)
5158 {
5159         /*
5160          * We are supposed to update the task to "current" time, so it is up to
5161          * date and ready to go to the new CPU/cfs_rq. But we have difficulty
5162          * getting what the current time is, so simply throw away the out-of-date
5163          * time. This will result in the wakee task being less decayed, but giving
5164          * the wakee more load does not sound bad.
5165          */
5166         remove_entity_load_avg(&p->se);
5167 
5168         /* Tell new CPU we are migrated */
5169         p->se.avg.last_update_time = 0;
5170 
5171         /* We have migrated, no longer consider this task hot */
5172         p->se.exec_start = 0;
5173 }
5174 
5175 static void task_dead_fair(struct task_struct *p)
5176 {
5177         remove_entity_load_avg(&p->se);
5178 }
5179 #endif /* CONFIG_SMP */
5180 
5181 static unsigned long
5182 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
5183 {
5184         unsigned long gran = sysctl_sched_wakeup_granularity;
5185 
5186         /*
5187          * Since it is curr that is running now, convert the gran from real-time
5188          * to virtual-time in its units.
5189          *
5190          * By using 'se' instead of 'curr' we penalize light tasks, so
5191          * they get preempted easier. That is, if 'se' < 'curr' then
5192          * the resulting gran will be larger, therefore penalizing the
5193          * lighter, if otoh 'se' > 'curr' then the resulting gran will
5194          * be smaller, again penalizing the lighter task.
5195          *
5196          * This is especially important for buddies when the leftmost
5197          * task is higher priority than the buddy.
5198          */
5199         return calc_delta_fair(gran, se);
5200 }
5201 
5202 /*
5203  * Should 'se' preempt 'curr'.
5204  *
5205  *             |s1
5206  *        |s2
5207  *   |s3
5208  *         g
5209  *      |<--->|c
5210  *
5211  *  w(c, s1) = -1
5212  *  w(c, s2) =  0
5213  *  w(c, s3) =  1
5214  *
5215  */
5216 static int
5217 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
5218 {
5219         s64 gran, vdiff = curr->vruntime - se->vruntime;
5220 
5221         if (vdiff <= 0)
5222                 return -1;
5223 
5224         gran = wakeup_gran(curr, se);
5225         if (vdiff > gran)
5226                 return 1;
5227 
5228         return 0;
5229 }
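
/*
 * Illustrative userspace sketch (not part of fair.c): the three-way result of
 * wakeup_preempt_entity() with a hypothetical wakeup granularity already
 * converted to virtual time (the calc_delta_fair() step from wakeup_gran() is
 * omitted here).
 */
#include <stdio.h>

static int wakeup_preempt_ref(long long curr_vruntime, long long se_vruntime,
			      long long vgran)
{
	long long vdiff = curr_vruntime - se_vruntime;

	if (vdiff <= 0)
		return -1;	/* curr is not ahead of se: no preemption */
	if (vdiff > vgran)
		return 1;	/* se lags by more than the granularity: preempt */
	return 0;
}

int main(void)
{
	long long vgran = 1000000;	/* 1ms of virtual time, hypothetical */

	printf("%d %d %d\n",
	       wakeup_preempt_ref(100, 500, vgran),		/* -1 */
	       wakeup_preempt_ref(500, 100, vgran),		/*  0 */
	       wakeup_preempt_ref(2000000, 100, vgran));	/*  1 */
	return 0;
}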
5230 
5231 static void set_last_buddy(struct sched_entity *se)
5232 {
5233         if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5234                 return;
5235 
5236         for_each_sched_entity(se)
5237                 cfs_rq_of(se)->last = se;
5238 }
5239 
5240 static void set_next_buddy(struct sched_entity *se)
5241 {
5242         if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5243                 return;
5244 
5245         for_each_sched_entity(se)
5246                 cfs_rq_of(se)->next = se;
5247 }
5248 
5249 static void set_skip_buddy(struct sched_entity *se)
5250 {
5251         for_each_sched_entity(se)
5252                 cfs_rq_of(se)->skip = se;
5253 }
5254 
5255 /*
5256  * Preempt the current task with a newly woken task if needed:
5257  */
5258 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
5259 {
5260         struct task_struct *curr = rq->curr;
5261         struct sched_entity *se = &curr->se, *pse = &p->se;
5262         struct cfs_rq *cfs_rq = task_cfs_rq(curr);
5263         int scale = cfs_rq->nr_running >= sched_nr_latency;
5264         int next_buddy_marked = 0;
5265 
5266         if (unlikely(se == pse))
5267                 return;
5268 
5269         /*
5270          * This is possible from callers such as attach_tasks(), in which we
5271          * unconditionally check_preempt_curr() after an enqueue (which may have
5272          * lead to a throttle).  This both saves work and prevents false
5273          * next-buddy nomination below.
5274          */
5275         if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
5276                 return;
5277 
5278         if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
5279                 set_next_buddy(pse);
5280                 next_buddy_marked = 1;
5281         }
5282 
5283         /*
5284          * We can come here with TIF_NEED_RESCHED already set from new task
5285          * wake up path.
5286          *
5287          * Note: this also catches the edge-case of curr being in a throttled
5288          * group (e.g. via set_curr_task), since update_curr() (in the
5289          * enqueue of curr) will have resulted in resched being set.  This
5290          * prevents us from potentially nominating it as a false LAST_BUDDY
5291          * below.
5292          */
5293         if (test_tsk_need_resched(curr))
5294                 return;
5295 
5296         /* Idle tasks are by definition preempted by non-idle tasks. */
5297         if (unlikely(curr->policy == SCHED_IDLE) &&
5298             likely(p->policy != SCHED_IDLE))
5299                 goto preempt;
5300 
5301         /*
5302          * Batch and idle tasks do not preempt non-idle tasks (their preemption
5303          * is driven by the tick):
5304          */
5305         if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
5306                 return;
5307 
5308         find_matching_se(&se, &pse);
5309         update_curr(cfs_rq_of(se));
5310         BUG_ON(!pse);
5311         if (wakeup_preempt_entity(se, pse) == 1) {
5312                 /*
5313                  * Bias pick_next to pick the sched entity that is
5314                  * triggering this preemption.
5315                  */
5316                 if (!next_buddy_marked)
5317                         set_next_buddy(pse);
5318                 goto preempt;
5319         }
5320 
5321         return;
5322 
5323 preempt:
5324         resched_curr(rq);
5325         /*
5326          * Only set the backward buddy when the current task is still
5327          * on the rq. This can happen when a wakeup gets interleaved
5328          * with schedule on the ->pre_schedule() or idle_balance()
5329          * point, either of which can drop the rq lock.
5330          *
5331          * Also, during early boot the idle thread is in the fair class,
5332          * for obvious reasons it's a bad idea to schedule back to it.
5333          */
5334         if (unlikely(!se->on_rq || curr == rq->idle))
5335                 return;
5336 
5337         if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
5338                 set_last_buddy(se);
5339 }
5340 
5341 static struct task_struct *
5342 pick_next_task_fair(struct rq *rq, struct task_struct *prev)
5343 {
5344         struct cfs_rq *cfs_rq = &rq->cfs;
5345         struct sched_entity *se;
5346         struct task_struct *p;
5347         int new_tasks;
5348 
5349 again:
5350 #ifdef CONFIG_FAIR_GROUP_SCHED
5351         if (!cfs_rq->nr_running)
5352                 goto idle;
5353 
5354         if (prev->sched_class != &fair_sched_class)
5355                 goto simple;
5356 
5357         /*
5358          * Because of the set_next_buddy() in dequeue_task_fair() it is rather
5359          * likely that the next task is from the same cgroup as the current.
5360          *
5361          * Therefore attempt to avoid putting and setting the entire cgroup
5362          * hierarchy, only change the part that actually changes.
5363          */
5364 
5365         do {
5366                 struct sched_entity *curr = cfs_rq->curr;
5367 
5368                 /*
5369                  * Since we got here without doing put_prev_entity() we also
5370                  * have to consider cfs_rq->curr. If it is still a runnable
5371                  * entity, update_curr() will update its vruntime, otherwise
5372                  * forget we've ever seen it.
5373                  */
5374                 if (curr) {
5375                         if (curr->on_rq)
5376                                 update_curr(cfs_rq);
5377                         else
5378                                 curr = NULL;
5379 
5380                         /*
5381                          * This call to check_cfs_rq_runtime() will do the
5382                          * throttle and dequeue its entity in the parent(s).
5383                          * Therefore the 'simple' nr_running test will indeed
5384                          * be correct.
5385                          */
5386                         if (unlikely(check_cfs_rq_runtime(cfs_rq)))
5387                                 goto simple;
5388                 }
5389 
5390                 se = pick_next_entity(cfs_rq, curr);
5391                 cfs_rq = group_cfs_rq(se);
5392         } while (cfs_rq);
5393 
5394         p = task_of(se);
5395 
5396         /*
5397          * Since we haven't yet done put_prev_entity and if the selected task
5398          * is a different task than we started out with, try to touch the
5399          * fewest possible cfs_rqs.
5400          */
5401         if (prev != p) {
5402                 struct sched_entity *pse = &prev->se;
5403 
5404                 while (!(cfs_rq = is_same_group(se, pse))) {
5405                         int se_depth = se->depth;
5406                         int pse_depth = pse->depth;
5407 
5408                         if (se_depth <= pse_depth) {
5409                                 put_prev_entity(cfs_rq_of(pse), pse);
5410                                 pse = parent_entity(pse);
5411                         }
5412                         if (se_depth >= pse_depth) {
5413                                 set_next_entity(cfs_rq_of(se), se);
5414                                 se = parent_entity(se);
5415                         }
5416                 }
5417 
5418                 put_prev_entity(cfs_rq, pse);
5419                 set_next_entity(cfs_rq, se);
5420         }
5421 
5422         if (hrtick_enabled(rq))
5423                 hrtick_start_fair(rq, p);
5424 
5425         return p;
5426 simple:
5427         cfs_rq = &rq->cfs;
5428 #endif
5429 
5430         if (!cfs_rq->nr_running)
5431                 goto idle;
5432 
5433         put_prev_task(rq, prev);
5434 
5435         do {
5436                 se = pick_next_entity(cfs_rq, NULL);
5437                 set_next_entity(cfs_rq, se);
5438                 cfs_rq = group_cfs_rq(se);
5439         } while (cfs_rq);
5440 
5441         p = task_of(se);
5442 
5443         if (hrtick_enabled(rq))
5444                 hrtick_start_fair(rq, p);
5445 
5446         return p;
5447 
5448 idle:
5449         /*
5450          * This is OK because current is on_cpu, which avoids it being picked
5451          * for load-balance; preemption/IRQs are still disabled, avoiding
5452          * further scheduler activity on it, and we're being very careful to
5453          * re-start the picking loop.
5454          */
5455         lockdep_unpin_lock(&rq->lock);
5456         new_tasks = idle_balance(rq);
5457         lockdep_pin_lock(&rq->lock);
5458         /*
5459          * Because idle_balance() releases (and re-acquires) rq->lock, it is
5460          * possible for any higher priority task to appear. In that case we
5461          * must re-start the pick_next_entity() loop.
5462          */
5463         if (new_tasks < 0)
5464                 return RETRY_TASK;
5465 
5466         if (new_tasks > 0)
5467                 goto again;
5468 
5469         return NULL;
5470 }
5471 
5472 /*
5473  * Account for a descheduled task:
5474  */
5475 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
5476 {
5477         struct sched_entity *se = &prev->se;
5478         struct cfs_rq *cfs_rq;
5479 
5480         for_each_sched_entity(se) {
5481                 cfs_rq = cfs_rq_of(se);
5482                 put_prev_entity(cfs_rq, se);
5483         }
5484 }
5485 
5486 /*
5487  * sched_yield() is very simple
5488  *
5489  * The magic of dealing with the ->skip buddy is in pick_next_entity.
5490  */
5491 static void yield_task_fair(struct rq *rq)
5492 {
5493         struct task_struct *curr = rq->curr;
5494         struct cfs_rq *cfs_rq = task_cfs_rq(curr);
5495         struct sched_entity *se = &curr->se;
5496 
5497         /*
5498          * Are we the only task in the tree?
5499          */
5500         if (unlikely(rq->nr_running == 1))
5501                 return;
5502 
5503         clear_buddies(cfs_rq, se);
5504 
5505         if (curr->policy != SCHED_BATCH) {
5506                 update_rq_clock(rq);
5507                 /*
5508                  * Update run-time statistics of the 'current'.
5509                  */
5510                 update_curr(cfs_rq);
5511                 /*
5512                  * Tell update_rq_clock() that we've just updated,
5513                  * so we don't do microscopic update in schedule()
5514                  * and double the fastpath cost.
5515                  */
5516                 rq_clock_skip_update(rq, true);
5517         }
5518 
5519         set_skip_buddy(se);
5520 }
5521 
5522 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
5523 {
5524         struct sched_entity *se = &p->se;
5525 
5526         /* throttled hierarchies are not runnable */
5527         if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
5528                 return false;
5529 
5530         /* Tell the scheduler that we'd really like pse to run next. */
5531         set_next_buddy(se);
5532 
5533         yield_task_fair(rq);
5534 
5535         return true;
5536 }
5537 
5538 #ifdef CONFIG_SMP
5539 /**************************************************
5540  * Fair scheduling class load-balancing methods.
5541  *
5542  * BASICS
5543  *
5544  * The purpose of load-balancing is to achieve the same basic fairness the
5545  * per-cpu scheduler provides, namely provide a proportional amount of compute
5546  * time to each task. This is expressed in the following equation:
5547  *
5548  *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
5549  *
5550  * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
5551  * W_i,0 is defined as:
5552  *
5553  *   W_i,0 = \Sum_j w_i,j                                             (2)
5554  *
5555  * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
5556  * is derived from the nice value as per prio_to_weight[].
5557  *
5558  * The weight average is an exponential decay average of the instantaneous
5559  * weight:
5560  *
5561  *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
5562  *
5563  * C_i is the compute capacity of cpu i; typically it is the
5564  * fraction of 'recent' time available for SCHED_OTHER task execution. But it
5565  * can also include other factors [XXX].
5566  *
5567  * To achieve this balance we define a measure of imbalance which follows
5568  * directly from (1):
5569  *
5570  *   imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j }    (4)
5571  *
5572  * We then move tasks around to minimize the imbalance. In the continuous
5573  * function space it is obvious this converges, in the discrete case we get
5574  * a few fun cases generally called infeasible weight scenarios.
5575  *
5576  * [XXX expand on:
5577  *     - infeasible weights;
5578  *     - local vs global optima in the discrete case. ]
5579  *
5580  *
5581  * SCHED DOMAINS
5582  *
5583  * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
5584  * for all i,j solution, we create a tree of cpus that follows the hardware
5585  * topology where each level pairs two lower groups (or better). This results
5586  * in O(log n) layers. Furthermore we reduce the number of cpus going up the
5587  * tree to only the first of the previous level and we decrease the frequency
5588  * of load-balance at each level inv. proportional to the number of cpus in
5589  * the groups.
5590  *
5591  * This yields:
5592  *
5593  *     log_2 n     1     n
5594  *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
5595  *     i = 0      2^i   2^i
5596  *                               `- size of each group
5597  *         |         |     `- number of cpus doing load-balance
5598  *         |         `- freq
5599  *         `- sum over all levels
5600  *
5601  * Coupled with a limit on how many tasks we can migrate every balance pass,
5602  * this makes (5) the runtime complexity of the balancer.
5603  *
5604  * An important property here is that each CPU is still (indirectly) connected
5605  * to every other cpu in at most O(log n) steps:
5606  *
5607  * The adjacency matrix of the resulting graph is given by:
5608  *
5609  *             log_2 n     
5610  *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
5611  *             k = 0
5612  *
5613  * And you'll find that:
5614  *
5615  *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
5616  *
5617  * Showing there's indeed a path between every cpu in at most O(log n) steps.
5618  * The task movement gives a factor of O(m), giving a convergence complexity
5619  * of:
5620  *
5621  *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
5622  *
5623  *
5624  * WORK CONSERVING
5625  *
5626  * In order to avoid CPUs going idle while there's still work to do, new idle
5627  * balancing is more aggressive and has the newly idle cpu iterate up the domain
5628  * tree itself instead of relying on other CPUs to bring it work.
5629  *
5630  * This adds some complexity to both (5) and (8) but it reduces the total idle
5631  * time.
5632  *
5633  * [XXX more?]
5634  *
5635  *
5636  * CGROUPS
5637  *
5638  * Cgroups make a horror show out of (2), instead of a simple sum we get:
5639  *
5640  *                                s_k,i
5641  *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
5642  *                                 S_k
5643  *
5644  * Where
5645  *
5646  *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
5647  *
5648  * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
5649  *
5650  * The big problem is S_k; it's a global sum needed to compute a local (W_i)
5651  * property.
5652  *
5653  * [XXX write more on how we solve this.. _after_ merging pjt's patches that
5654  *      rewrite all of this once again.]
5655  */ 
5656 
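/*
 * Illustrative userspace sketch (not part of fair.c): a numeric check of the
 * O(n) claim in (5). Level i has n/2^i CPUs doing load-balance at 1/2^i of
 * the base frequency over groups of 2^i CPUs, so the per-level cost is n/2^i
 * and the total stays below 2n. n is a hypothetical CPU count.
 */
#include <stdio.h>

int main(void)
{
	int n = 64;
	double total = 0.0;
	int i;

	for (i = 0; (1 << i) <= n; i++)
		total += (1.0 / (1 << i)) * ((double)n / (1 << i)) * (1 << i);

	printf("total = %.1f (2n = %d)\n", total, 2 * n);	/* 127.0 vs 128 */
	return 0;
}
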
5657 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
5658 
5659 enum fbq_type { regular, remote, all };
5660 
5661 #define LBF_ALL_PINNED  0x01
5662 #define LBF_NEED_BREAK  0x02
5663 #define LBF_DST_PINNED  0x04
5664 #define LBF_SOME_PINNED 0x08
5665 
5666 struct lb_env {
5667         struct sched_domain     *sd;
5668 
5669         struct rq               *src_rq;
5670         int                     src_cpu;
5671 
5672         int                     dst_cpu;
5673         struct rq               *dst_rq;
5674 
5675         struct cpumask          *dst_grpmask;
5676         int                     new_dst_cpu;
5677         enum cpu_idle_type      idle;
5678         long                    imbalance;
5679         /* The set of CPUs under consideration for load-balancing */
5680         struct cpumask          *cpus;
5681 
5682         unsigned int            flags;
5683 
5684         unsigned int            loop;
5685         unsigned int            loop_break;
5686         unsigned int            loop_max;
5687 
5688         enum fbq_type           fbq_type;
5689         struct list_head        tasks;
5690 };
5691 
5692 /*
5693  * Is this task likely cache-hot:
5694  */
5695 static int task_hot(struct task_struct *p, struct lb_env *env)
5696 {
5697         s64 delta;
5698 
5699         lockdep_assert_held(&env->src_rq->lock);
5700 
5701         if (p->sched_class != &fair_sched_class)
5702                 return 0;
5703 
5704         if (unlikely(p->policy == SCHED_IDLE))
5705                 return 0;
5706 
5707         /*
5708          * Buddy candidates are cache hot:
5709          */
5710         if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
5711                         (&p->se == cfs_rq_of(&p->se)->next ||
5712                          &p->se == cfs_rq_of(&p->se)->last))
5713                 return 1;
5714 
5715         if (sysctl_sched_migration_cost == -1)
5716                 return 1;
5717         if (sysctl_sched_migration_cost == 0)
5718                 return 0;
5719 
5720         delta = rq_clock_task(env->src_rq) - p->se.exec_start;
5721 
5722         return delta < (s64)sysctl_sched_migration_cost;
5723 }
5724 
5725 #ifdef CONFIG_NUMA_BALANCING
5726 /*
5727  * Returns 1, if task migration degrades locality
5728  * Returns 0, if task migration improves locality i.e migration preferred.
5729  * Returns -1, if task migration is not affected by locality.
5730  */
5731 static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5732 {
5733         struct numa_group *numa_group = rcu_dereference(p->numa_group);
5734         unsigned long src_faults, dst_faults;
5735         int src_nid, dst_nid;
5736 
5737         if (!static_branch_likely(&sched_numa_balancing))
5738                 return -1;
5739 
5740         if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
5741                 return -1;
5742 
5743         src_nid = cpu_to_node(env->src_cpu);
5744         dst_nid = cpu_to_node(env->dst_cpu);
5745 
5746         if (src_nid == dst_nid)
5747                 return -1;
5748 
5749         /* Migrating away from the preferred node is always bad. */
5750         if (src_nid == p->numa_preferred_nid) {
5751                 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
5752                         return 1;
5753                 else
5754                         return -1;
5755         }
5756 
5757         /* Encourage migration to the preferred node. */
5758         if (dst_nid == p->numa_preferred_nid)
5759                 return 0;
5760 
5761         if (numa_group) {
5762                 src_faults = group_faults(p, src_nid);
5763                 dst_faults = group_faults(p, dst_nid);
5764         } else {
5765                 src_faults = task_faults(p, src_nid);
5766                 dst_faults = task_faults(p, dst_nid);
5767         }
5768 
5769         return dst_faults < src_faults;
5770 }
5771 
5772 #else
5773 static inline int migrate_degrades_locality(struct task_struct *p,
5774                                              struct lb_env *env)
5775 {
5776         return -1;
5777 }
5778 #endif
5779 
5780 /*
5781  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
5782  */
5783 static
5784 int can_migrate_task(struct task_struct *p, struct lb_env *env)
5785 {
5786         int tsk_cache_hot;
5787 
5788         lockdep_assert_held(&env->src_rq->lock);
5789 
5790         /*
5791          * We do not migrate tasks that are:
5792          * 1) throttled_lb_pair, or
5793          * 2) cannot be migrated to this CPU due to cpus_allowed, or
5794          * 3) running (obviously), or
5795          * 4) are cache-hot on their current CPU.
5796          */
5797         if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
5798                 return 0;
5799 
5800         if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
5801                 int cpu;
5802 
5803                 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
5804 
5805                 env->flags |= LBF_SOME_PINNED;
5806 
5807                 /*
5808                  * Remember if this task can be migrated to any other cpu in
5809                  * our sched_group. We may want to revisit it if we couldn't
5810                  * meet load balance goals by pulling other tasks on src_cpu.
5811                  *
5812                  * Also avoid computing new_dst_cpu if we have already computed
5813                  * one in current iteration.
5814                  */
5815                 if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
5816                         return 0;
5817 
5818                 /* Prevent to re-select dst_cpu via env's cpus */
5819                 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
5820                         if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
5821                                 env->flags |= LBF_DST_PINNED;
5822                                 env->new_dst_cpu = cpu;
5823                                 break;
5824                         }
5825                 }
5826 
5827                 return 0;
5828         }
5829 
5830         /* Record that we found at least one task that could run on dst_cpu */
5831         env->flags &= ~LBF_ALL_PINNED;
5832 
5833         if (task_running(env->src_rq, p)) {
5834                 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
5835                 return 0;
5836         }
5837 
5838         /*
5839          * Aggressive migration if:
5840          * 1) destination numa node is preferred, or
5841          * 2) task is cache cold, or
5842          * 3) too many balance attempts have failed.
5843          */
5844         tsk_cache_hot = migrate_degrades_locality(p, env);
5845         if (tsk_cache_hot == -1)
5846                 tsk_cache_hot = task_hot(p, env);
5847 
5848         if (tsk_cache_hot <= 0 ||
5849             env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
5850                 if (tsk_cache_hot == 1) {
5851                         schedstat_inc(env->sd, lb_hot_gained[env->idle]);
5852                         schedstat_inc(p, se.statistics.nr_forced_migrations);
5853                 }
5854                 return 1;
5855         }
5856 
5857         schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
5858         return 0;
5859 }
5860 
5861 /*
5862  * detach_task() -- detach the task for the migration specified in env
5863  */
5864 static void detach_task(struct task_struct *p, struct lb_env *env)
5865 {
5866         lockdep_assert_held(&env->src_rq->lock);
5867 
5868         p->on_rq = TASK_ON_RQ_MIGRATING;
5869         deactivate_task(env->src_rq, p, 0);
5870         set_task_cpu(p, env->dst_cpu);
5871 }
5872 
5873 /*
5874  * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
5875  * part of active balancing operations within "domain".
5876  *
5877  * Returns a task if successful and NULL otherwise.
5878  */
5879 static struct task_struct *detach_one_task(struct lb_env *env)
5880 {
5881         struct task_struct *p, *n;
5882 
5883         lockdep_assert_held(&env->src_rq->lock);
5884 
5885         list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
5886                 if (!can_migrate_task(p, env))
5887                         continue;
5888 
5889                 detach_task(p, env);
5890 
5891                 /*
5892                  * Right now, this is only the second place where
5893                  * lb_gained[env->idle] is updated (other is detach_tasks)
5894                  * so we can safely collect stats here rather than
5895                  * inside detach_tasks().
5896                  */
5897                 schedstat_inc(env->sd, lb_gained[env->idle]);
5898                 return p;
5899         }
5900         return NULL;
5901 }
5902 
5903 static const unsigned int sched_nr_migrate_break = 32;
5904 
5905 /*
5906  * detach_tasks() -- tries to detach up to imbalance weighted load from
5907  * busiest_rq, as part of a balancing operation within domain "sd".
5908  *
5909  * Returns number of detached tasks if successful and 0 otherwise.
5910  */
5911 static int detach_tasks(struct lb_env *env)
5912 {
5913         struct list_head *tasks = &env->src_rq->cfs_tasks;
5914         struct task_struct *p;
5915         unsigned long load;
5916         int detached = 0;
5917 
5918         lockdep_assert_held(&env->src_rq->lock);
5919 
5920         if (env->imbalance <= 0)
5921                 return 0;
5922 
5923         while (!list_empty(tasks)) {
5924                 /*
5925                  * We don't want to steal all the tasks, otherwise we may be
5926                  * treated likewise, which could at worst lead to a livelock.
5927                  */
5928                 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
5929                         break;
5930 
5931                 p = list_first_entry(tasks, struct task_struct, se.group_node);
5932 
5933                 env->loop++;
5934                 /* We've more or less seen every task there is, call it quits */
5935                 if (env->loop > env->loop_max)
5936                         break;
5937 
5938                 /* take a breather every nr_migrate tasks */
5939                 if (env->loop > env->loop_break) {
5940                         env->loop_break += sched_nr_migrate_break;
5941                         env->flags |= LBF_NEED_BREAK;
5942                         break;
5943                 }
5944 
5945                 if (!can_migrate_task(p, env))
5946                         goto next;
5947 
5948                 load = task_h_load(p);
5949 
5950                 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
5951                         goto next;
5952 
5953                 if ((load / 2) > env->imbalance)
5954                         goto next;
5955 
5956                 detach_task(p, env);
5957                 list_add(&p->se.group_node, &env->tasks);
5958 
5959                 detached++;
5960                 env->imbalance -= load;
5961 
5962 #ifdef CONFIG_PREEMPT
5963                 /*
5964                  * NEWIDLE balancing is a source of latency, so preemptible
5965                  * kernels will stop after the first task is detached to minimize
5966                  * the critical section.
5967                  */
5968                 if (env->idle == CPU_NEWLY_IDLE)
5969                         break;
5970 #endif
5971 
5972                 /*
5973                  * We only want to steal up to the prescribed amount of
5974                  * weighted load.
5975                  */
5976                 if (env->imbalance <= 0)
5977                         break;
5978 
5979                 continue;
5980 next:
5981                 list_move_tail(&p->se.group_node, tasks);
5982         }
5983 
5984         /*
5985          * lb_gained[env->idle] is only updated here and in detach_one_task(),
5986          * so it is safe to account for all detached tasks in one go here
5987          * rather than per task inside detach_task().
5988          */
5989         schedstat_add(env->sd, lb_gained[env->idle], detached);
5990 
5991         return detached;
5992 }
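
The selection policy above is easier to follow in isolation: a task is skipped when half its weighted load already exceeds the remaining imbalance, and the loop stops once the imbalance has been pulled. The stand-alone user-space sketch below reproduces just that arithmetic; the function name, the task-load array, and the values are hypothetical and not part of fair.c.

#include <stdio.h>

/*
 * Hypothetical user-space sketch of the detach_tasks() selection policy
 * (editor's illustration, not kernel code): skip a task when half its
 * load already exceeds the remaining imbalance, stop once the imbalance
 * has been pulled.
 */
static int pick_tasks(const unsigned long *loads, int nr, long imbalance)
{
        int detached = 0;

        for (int i = 0; i < nr && imbalance > 0; i++) {
                unsigned long load = loads[i];

                /* mirrors: if ((load / 2) > env->imbalance) goto next; */
                if ((long)(load / 2) > imbalance)
                        continue;

                detached++;
                imbalance -= load;
        }
        return detached;
}

int main(void)
{
        unsigned long loads[] = { 1024, 512, 2048, 256 };

        /* with an imbalance of 1500: takes 1024 and 512, then stops */
        printf("detached %d tasks\n", pick_tasks(loads, 4, 1500));
        return 0;
}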
5993 
5994 /*
5995  * attach_task() -- attach the task detached by detach_task() to its new rq.
5996  */
5997 static void attach_task(struct rq *rq, struct task_struct *p)
5998 {
5999         lockdep_assert_held(&rq->lock);
6000 
6001         BUG_ON(task_rq(p) != rq);
6002         activate_task(rq, p, 0);
6003         p->on_rq = TASK_ON_RQ_QUEUED;
6004         check_preempt_curr(rq, p, 0);
6005 }
6006 
6007 /*
6008  * attach_one_task() -- attaches the task returned from detach_one_task() to
6009  * its new rq.
6010  */
6011 static void attach_one_task(struct rq *rq, struct task_struct *p)
6012 {
6013         raw_spin_lock(&rq->lock);
6014         attach_task(rq, p);
6015         raw_spin_unlock(&rq->lock);
6016 }
6017 
6018 /*
6019  * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
6020  * new rq.
6021  */
6022 static void attach_tasks(struct lb_env *env)
6023 {
6024         struct list_head *tasks = &env->tasks;
6025         struct task_struct *p;
6026 
6027         raw_spin_lock(&env->dst_rq->lock);
6028 
6029         while (!list_empty(tasks)) {
6030                 p = list_first_entry(tasks, struct task_struct, se.group_node);
6031                 list_del_init(&p->se.group_node);
6032 
6033                 attach_task(env->dst_rq, p);
6034         }
6035 
6036         raw_spin_unlock(&env->dst_rq->lock);
6037 }
6038 
6039 #ifdef CONFIG_FAIR_GROUP_SCHED
6040 static void update_blocked_averages(int cpu)
6041 {
6042         struct rq *rq = cpu_rq(cpu);
6043         struct cfs_rq *cfs_rq;
6044         unsigned long flags;
6045 
6046         raw_spin_lock_irqsave(&rq->lock, flags);
6047         update_rq_clock(rq);
6048 
6049         /*
6050          * Iterates the task_group tree in a bottom up fashion, see
6051          * list_add_leaf_cfs_rq() for details.
6052          */
6053         for_each_leaf_cfs_rq(rq, cfs_rq) {
6054                 /* throttled entities do not contribute to load */
6055                 if (throttled_hierarchy(cfs_rq))
6056                         continue;
6057 
6058                 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
6059                         update_tg_load_avg(cfs_rq, 0);
6060         }
6061         raw_spin_unlock_irqrestore(&rq->lock, flags);
6062 }
6063 
6064 /*
6065  * Compute the hierarchical load factor for cfs_rq and all its ancestors.
6066  * This needs to be done in a top-down fashion because the load of a child
6067  * group is a fraction of its parent's load.
6068  */
6069 static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
6070 {
6071         struct rq *rq = rq_of(cfs_rq);
6072         struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
6073         unsigned long now = jiffies;
6074         unsigned long load;
6075 
6076         if (cfs_rq->last_h_load_update == now)
6077                 return;
6078 
6079         cfs_rq->h_load_next = NULL;
6080         for_each_sched_entity(se) {
6081                 cfs_rq = cfs_rq_of(se);
6082                 cfs_rq->h_load_next = se;
6083                 if (cfs_rq->last_h_load_update == now)
6084                         break;
6085         }
6086 
6087         if (!se) {
6088                 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
6089                 cfs_rq->last_h_load_update = now;
6090         }
6091 
6092         while ((se = cfs_rq->h_load_next) != NULL) {
6093                 load = cfs_rq->h_load;
6094                 load = div64_ul(load * se->avg.load_avg,
6095                         cfs_rq_load_avg(cfs_rq) + 1);
6096                 cfs_rq = group_cfs_rq(se);
6097                 cfs_rq->h_load = load;
6098                 cfs_rq->last_h_load_update = now;
6099         }
6100 }
6101 
6102 static unsigned long task_h_load(struct task_struct *p)
6103 {
6104         struct cfs_rq *cfs_rq = task_cfs_rq(p);
6105 
6106         update_cfs_rq_h_load(cfs_rq);
6107         return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
6108                         cfs_rq_load_avg(cfs_rq) + 1);
6109 }
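
As a rough illustration of the h_load propagation used by update_cfs_rq_h_load() and task_h_load() above, the following stand-alone sketch (editor's addition; variable names and numbers are invented) walks one group level: each step scales the parent's h_load by the entity's share of the parent cfs_rq's load, and the task's contribution is scaled the same way.

#include <stdio.h>

/*
 * Hypothetical worked example of hierarchical load propagation
 * (editor's illustration, not kernel code). At the top level h_load
 * equals the cfs_rq's own load_avg; each child level is the parent
 * h_load scaled by the child entity's share of the parent load.
 */
int main(void)
{
        unsigned long root_load  = 2048; /* root cfs_rq load_avg (== its h_load) */
        unsigned long group_se   = 512;  /* group entity load_avg on the root    */
        unsigned long group_load = 1024; /* load_avg inside the group's cfs_rq   */
        unsigned long task_load  = 256;  /* the task's se.avg.load_avg           */

        /* child h_load = parent h_load * se load / (parent load_avg + 1) */
        unsigned long group_h_load = root_load * group_se / (root_load + 1);

        /* task_h_load() = task load * cfs_rq h_load / (cfs_rq load_avg + 1) */
        unsigned long task_h_load = task_load * group_h_load / (group_load + 1);

        printf("group h_load = %lu, task h_load = %lu\n",
               group_h_load, task_h_load);
        return 0;
}
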
6110 #else
6111 static inline void update_blocked_averages(int cpu)
6112 {
6113         struct rq *rq = cpu_rq(cpu);
6114         struct cfs_rq *cfs_rq = &rq->cfs;
6115         unsigned long flags;
6116 
6117         raw_spin_lock_irqsave(&rq->lock, flags);
6118         update_rq_clock(rq);
6119         update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
6120         raw_spin_unlock_irqrestore(&rq->lock, flags);
6121 }
6122 
6123 static unsigned long task_h_load(struct task_struct *p)
6124 {
6125         return p->se.avg.load_avg;
6126 }
6127 #endif
6128 
6129 /********** Helpers for find_busiest_group ************************/
6130 
6131 enum group_type {
6132         group_other = 0,
6133         group_imbalanced,
6134         group_overloaded,
6135 };
6136 
6137 /*
6138  * sg_lb_stats - stats of a sched_group required for load_balancing
6139  */
6140 struct sg_lb_stats {
6141         unsigned long avg_load; /* Avg load across the CPUs of the group */
6142         unsigned long group_load; /* Total load over the CPUs of the group */
6143         unsigned long sum_weighted_load; /* Weighted load of group's tasks */
6144         unsigned long load_per_task;
6145         unsigned long group_capacity;
6146         unsigned long group_util; /* Total utilization of the group */
6147         unsigned int sum_nr_running; /* Nr tasks running in the group */
6148         unsigned int idle_cpus;
6149         unsigned int group_weight;
6150         enum group_type group_type;
6151         int group_no_capacity;
6152 #ifdef CONFIG_NUMA_BALANCING
6153         unsigned int nr_numa_running;
6154         unsigned int nr_preferred_running;
6155 #endif
6156 };
6157 
6158 /*
6159  * sd_lb_stats - Structure to store the statistics of a sched_domain
6160  *               during load balancing.
6161  */
6162 struct sd_lb_stats {
6163         struct sched_group *busiest;    /* Busiest group in this sd */
6164         struct sched_group *local;      /* Local group in this sd */
6165         unsigned long total_load;       /* Total load of all groups in sd */
6166         unsigned long total_capacity;   /* Total capacity of all groups in sd */
6167         unsigned long avg_load; /* Average load across all groups in sd */
6168 
6169         struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
6170         struct sg_lb_stats local_stat;  /* Statistics of the local group */
6171 };
6172 
6173 static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
6174 {
6175         /*
6176          * Skimp on the clearing to avoid duplicate work. We can avoid clearing
6177          * local_stat because update_sg_lb_stats() does a full clear/assignment.
6178          * We must however clear busiest_stat::avg_load because
6179          * update_sd_pick_busiest() reads this before assignment.
6180          */
6181         *sds = (struct sd_lb_stats){
6182                 .busiest = NULL,
6183                 .local = NULL,
6184                 .total_load = 0UL,
6185                 .total_capacity = 0UL,
6186                 .busiest_stat = {
6187                         .avg_load = 0UL,
6188                         .sum_nr_running = 0,
6189                         .group_type = group_other,
6190                 },
6191         };
6192 }
6193 
6194 /**
6195  * get_sd_load_idx - Obtain the load index for a given sched domain.
6196  * @sd: The sched_domain whose load_idx is to be obtained.
6197  * @idle: The idle status of the CPU for whose sd load_idx is obtained.
6198  *
6199  * Return: The load index.
6200  */
6201 static inline int get_sd_load_idx(struct sched_domain *sd,
6202                                         enum cpu_idle_type idle)
6203 {
6204         int load_idx;
6205 
6206         switch (idle) {
6207         case CPU_NOT_IDLE:
6208                 load_idx = sd->busy_idx;
6209                 break;
6210 
6211         case CPU_NEWLY_IDLE:
6212                 load_idx = sd->newidle_idx;
6213                 break;
6214         default:
6215                 load_idx = sd->idle_idx;
6216                 break;
6217         }
6218 
6219         return load_idx;
6220 }
6221 
6222 static unsigned long scale_rt_capacity(int cpu)
6223 {
6224         struct rq *rq = cpu_rq(cpu);
6225         u64 total, used, age_stamp, avg;
6226         s64 delta;
6227 
6228         /*
6229          * Since we're reading these variables without serialization, make sure
6230          * we read them once before doing sanity checks on them.
6231          */
6232         age_stamp = READ_ONCE(rq->age_stamp);
6233         avg = READ_ONCE(rq->rt_avg);
6234         delta = __rq_clock_broken(rq) - age_stamp;
6235 
6236         if (unlikely(delta < 0))
6237                 delta = 0;
6238 
6239         total = sched_avg_period() + delta;
6240 
6241         used = div_u64(avg, total);
6242 
6243         if (likely(used < SCHED_CAPACITY_SCALE))
6244                 return SCHED_CAPACITY_SCALE - used;
6245 
6246         return 1;
6247 }
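
A simplified view of the arithmetic in scale_rt_capacity(): the fraction of the averaging window already consumed by non-CFS activity is subtracted from the full capacity scale, and the result is clamped so it never reaches zero. The sketch below is an editor's addition with simplified unit handling and invented inputs.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

/*
 * Hypothetical sketch of the scale_rt_capacity() arithmetic
 * (editor's illustration, not kernel code): whatever fraction of
 * capacity RT/irq activity has used is removed, and the result is
 * clamped to at least 1 so later divisions never see zero.
 */
static unsigned long cfs_capacity_left(unsigned long rt_avg, unsigned long total)
{
        unsigned long used = rt_avg / total; /* assumed already in capacity units */

        if (used < SCHED_CAPACITY_SCALE)
                return SCHED_CAPACITY_SCALE - used;
        return 1; /* never report zero capacity */
}

int main(void)
{
        /* rt_avg chosen so that "used" comes out as ~25% of the scale */
        printf("capacity left for CFS = %lu\n",
               cfs_capacity_left(256UL * 1000000UL, 1000000UL));
        return 0;
}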
6248 
6249 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
6250 {
6251         unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
6252         struct sched_group *sdg = sd->groups;
6253 
6254         cpu_rq(cpu)->cpu_capacity_orig = capacity;
6255 
6256         capacity *= scale_rt_capacity(cpu);
6257         capacity >>= SCHED_CAPACITY_SHIFT;
6258 
6259         if (!capacity)
6260                 capacity = 1;
6261 
6262         cpu_rq(cpu)->cpu_capacity = capacity;
6263         sdg->sgc->capacity = capacity;
6264 }
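
Putting the two steps together, the per-CPU capacity is the architecture-provided capacity scaled by the fraction left over after RT/irq activity, with the right shift by SCHED_CAPACITY_SHIFT undoing the fixed-point multiplication. A hypothetical worked example (editor's addition, invented numbers):

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT 10

/*
 * Hypothetical example of the update_cpu_capacity() fixed-point step
 * (editor's illustration, not kernel code): a core reporting an arch
 * capacity of 1024 with 25% of its time eaten by RT work ends up
 * advertising 768 to the CFS load balancer.
 */
int main(void)
{
        unsigned long capacity = 1024;          /* arch_scale_cpu_capacity()  */
        unsigned long rt_free  = 768;           /* scale_rt_capacity() result */

        capacity *= rt_free;                    /* 1024 * 768 = 786432        */
        capacity >>= SCHED_CAPACITY_SHIFT;      /* 786432 >> 10 = 768         */

        if (!capacity)
                capacity = 1;

        printf("cpu_capacity = %lu\n", capacity);
        return 0;
}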
6265 
6266 void update_group_capacity(struct sched_domain *sd, int cpu)
6267 {
6268         struct sched_domain *child = sd->child;
6269         struct sched_group *group, *sdg = sd->groups;
6270         unsigned long capacity;
6271         unsigned long interval;
6272 
6273         interval = msecs_to_jiffies(sd->balance_interval);
6274         interval = clamp(interval, 1UL, max_load_balance_interval);
6275         sdg->sgc->next_update = jiffies + interval;
6276 
6277         if (!child) {
6278                 update_cpu_capacity(sd, cpu);
6279                 return;
6280         }
6281 
6282         capacity = 0;
6283 
6284         if (child->flags & SD_OVERLAP) {
6285                 /*
6286                  * SD_OVERLAP domains cannot assume that child groups
6287                  * span the current group.
6288                  */
6289 
6290                 for_each_cpu(cpu, sched_group_cpus(sdg)) {
6291                         struct sched_group_capacity *sgc;
6292                         struct rq *rq = cpu_rq(cpu);
6293 
6294                         /*
6295                          * build_sched_domains() -> init_sched_groups_capacity()
6296                          * gets here before we've attached the domains to the
6297                          * runqueues.
6298                          *
6299                          * Use capacity_of(), which is set irrespective of domains
6300                          * in update_cpu_capacity().
6301                          *
6302                          * This avoids capacity from being 0 and
6303                          * causing divide-by-zero issues on boot.
6304                          */
6305                         if (unlikely(!rq->sd)) {
6306                                 capacity += capacity_of(cpu);
6307                                 continue;
6308                         }
6309 
6310                         sgc = rq->sd->groups->sgc;
6311                         capacity += sgc->capacity;
6312                 }
6313         } else {
6314                 /*
6315                  * !SD_OVERLAP domains can assume that child groups
6316                  * span the current group.
6317                  */ 
6318 
6319                 group = child->groups;
6320                 do {
6321                         capacity += group->sgc->capacity;
6322                         group = group->next;
6323                 } while (group != child->groups);
6324         }
6325 
6326         sdg->sgc->capacity = capacity;
6327 }
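
For the common !SD_OVERLAP case, the parent group's capacity is simply the sum of its child groups' capacities, gathered by walking the circular ->next list. The stand-alone sketch below (editor's addition; the struct and function names are simplified stand-ins, not the kernel's struct sched_group) shows the same traversal pattern.

#include <stdio.h>

/*
 * Hypothetical sketch of the !SD_OVERLAP path in update_group_capacity()
 * (editor's illustration, not kernel code): sum the capacities of the
 * child groups by walking their circular ->next list.
 */
struct toy_group {
        unsigned long capacity;
        struct toy_group *next;
};

static unsigned long sum_capacity(struct toy_group *first)
{
        unsigned long capacity = 0;
        struct toy_group *g = first;

        do {
                capacity += g->capacity;
                g = g->next;
        } while (g != first);

        return capacity;
}

int main(void)
{
        struct toy_group a = { 1024, NULL }, b = { 768, NULL };

        a.next = &b;
        b.next = &a;
        printf("parent capacity = %lu\n", sum_capacity(&a));
        return 0;
}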
6328 
6329 /*
6330  * Check whether the capacity of the rq has been noticeably reduced by side
6331  * activity. The imbalance_pct is used for the threshold.
6332  * Return true if the capacity is reduced.
6333  */
6334 static inline int
6335 check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
6336 {
6337         return ((rq->cpu_capacity * sd->imbalance_pct) <
6338                                 (rq->cpu_capacity_orig * 100));
6339 }
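
The imbalance_pct test reads more naturally as a percentage: with an imbalance_pct of 125 (a common value for non-SMT domains; the exact default depends on the topology level) the rq is flagged once less than 100/125 = 80% of its original capacity remains for CFS. A hypothetical numeric check (editor's addition):

#include <stdio.h>

/*
 * Hypothetical stand-alone version of the check_cpu_capacity() test
 * (editor's illustration, not kernel code). The capacity values and
 * the imbalance_pct of 125 are example inputs.
 */
static int capacity_reduced(unsigned long cap, unsigned long cap_orig,
                            unsigned int imbalance_pct)
{
        return (cap * imbalance_pct) < (cap_orig * 100);
}

int main(void)
{
        printf("%d\n", capacity_reduced(800, 1024, 125)); /* 1: only ~78% left */
        printf("%d\n", capacity_reduced(900, 1024, 125)); /* 0: ~88% still left */
        return 0;
}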
6340 
6341 /*
6342  * Group imbalance indicates (and tries to solve) the problem where balancing
6343  * groups is inadequate due to tsk_cpus_allowed() constraints.
6344  *
6345  * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
6346  * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
6347  * Something like:
6348  *
6349  *      { 0 1 2 3 } { 4 5 6 7 }
6350  *              *     * * *
6351  *
6352  * If we were to balance group-wise we'd place two tasks in the first group and
6353  * two tasks in the second group. Clearly this is undesired as it will overload
6354  * cpu 3 and leave one of the cpus in the second group unused.
6355  *
6356  * The current solution to this issue is detecting the skew in the first group
6357  * by noticing the lower domain failed to reach balance and had difficulty
6358  * moving tasks due to affinity constraints.
6359  *
6360  * When such a skew is detected, this group becomes a candidate for busiest;
6361  * see update_sd_pick_busiest(). calculate_imbalance() and
6362  * find_busiest_group() then relax some of the usual balance conditions to
6363  * allow it to create an effective group imbalance.
6364  *
6365  * This is a somewhat tricky proposition since the next run might not find the
6366  * group imbalance and decide the groups need to be balanced again. A most
6367  * subtle and fragile situation.
6368  */
6369 
6370 static inline int sg_imbalanced(struct sched_group *group)
6371 {
6372         return group->sgc->imbalance;
6373 }
6374 
6375 /*
6376  * group_has_capacity returns true if the group has spare capacity that could
6377  * be used by some tasks.
6378  * We consider that a group has spare capacity if the number of tasks is
6379  * smaller than the number of CPUs or if the utilization is lower than the
6380  * available capacity for CFS tasks.
6381  * For the latter, we use a threshold to stabilize the state, to take into
6382  * account the variance of the tasks' load and to return true only if the
6383  * available capacity is meaningful for the load balancer.
6384  * For example, an available capacity of 1% may show up, but it brings no
6385  * benefit for the load balancer.
6386  */
6387 static inline bool
6388 group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
6389 {
6390         if (sgs->sum_nr_running < sgs->group_weight)
6391                 return true;
6392 
6393         if ((sgs->group_capacity * 100) >
6394                         (sgs->group_util * env->sd->imbalance_pct))
6395                 return true;
6396 
6397         return false;
6398 }
6399 
6400 /*
6401  * group_is_overloaded returns true if the group has more tasks than it can
6402  * handle.
6403  * group_is_overloaded is not equal to !group_has_capacity because a group
6404  * with exactly the right number of tasks has no spare capacity left but is
6405  * not overloaded, so both group_has_capacity and group_is_overloaded return
6406  * false.
6407  */
6408 static inline bool
6409 group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
6410 {
6411         if (sgs->sum_nr_running <= sgs->group_weight)
6412                 return false;
6413 
6414         if ((sgs->group_capacity * 100) <
6415                         (sgs->group_util * env->sd->imbalance_pct))
6416                 return true;
6417 
6418         return false;
6419 }
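
group_has_capacity() and group_is_overloaded() are deliberately not exact complements: a group running exactly as many tasks as it has CPUs, at a utilization near its available capacity, reports neither spare capacity nor overload. The sketch below (editor's addition; the struct, field, and function names are simplified stand-ins for the kernel structures) evaluates both predicates on such a boundary case.

#include <stdio.h>
#include <stdbool.h>

/*
 * Hypothetical stand-alone version of the two predicates
 * (editor's illustration, not kernel code), using simplified
 * stand-ins for struct sg_lb_stats and the sched_domain fields.
 */
struct toy_stats {
        unsigned long group_capacity;
        unsigned long group_util;
        unsigned int sum_nr_running;
        unsigned int group_weight;
};

static bool has_capacity(const struct toy_stats *s, unsigned int imbalance_pct)
{
        if (s->sum_nr_running < s->group_weight)
                return true;
        return (s->group_capacity * 100) > (s->group_util * imbalance_pct);
}

static bool is_overloaded(const struct toy_stats *s, unsigned int imbalance_pct)
{
        if (s->sum_nr_running <= s->group_weight)
                return false;
        return (s->group_capacity * 100) < (s->group_util * imbalance_pct);
}

int main(void)
{
        /* 4 CPUs, 4 tasks, utilization close to the available capacity */
        struct toy_stats s = {
                .group_capacity = 4096,
                .group_util     = 3900,
                .sum_nr_running = 4,
                .group_weight   = 4,
        };

        /* both print 0: no spare capacity, but not overloaded either */
        printf("has_capacity=%d is_overloaded=%d\n",
               has_capacity(&s, 125), is_overloaded(&s, 125));
        return 0;
}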