TOMOYO Linux Cross Reference
Linux/net/sched/sch_cbq.c

  1 // SPDX-License-Identifier: GPL-2.0-or-later
  2 /*
  3  * net/sched/sch_cbq.c  Class-Based Queueing discipline.
  4  *
  5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  6  */
  7 
  8 #include <linux/module.h>
  9 #include <linux/slab.h>
 10 #include <linux/types.h>
 11 #include <linux/kernel.h>
 12 #include <linux/string.h>
 13 #include <linux/errno.h>
 14 #include <linux/skbuff.h>
 15 #include <net/netlink.h>
 16 #include <net/pkt_sched.h>
 17 #include <net/pkt_cls.h>
 18 
 19 
 20 /*      Class-Based Queueing (CBQ) algorithm.
 21         =======================================
 22 
 23         Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
 24                  Management Models for Packet Networks",
 25                  IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
 26 
 27                  [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
 28 
 29                  [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
 30                  Parameters", 1996
 31 
 32                  [4] Sally Floyd and Michael Speer, "Experimental Results
 33                  for Class-Based Queueing", 1998, not published.
 34 
 35         -----------------------------------------------------------------------
 36 
 37         The algorithm skeleton was taken from the NS simulator's cbq.cc.
 38         Anyone who wants to check this code against the LBL version
 39         should take into account that ONLY the skeleton was borrowed;
 40         the implementation is different. In particular:
 41 
 42         --- The WRR algorithm is different. Our version looks more
 43         reasonable (I hope) and works when quanta are allowed to be
 44         less than MTU, which is always the case when real time classes
 45         have small rates. Note that the statement of [3] is
 46         incomplete: delay may actually be estimated even if the class's
 47         per-round allotment is less than the MTU. Namely, if the per-round
 48         allotment is W*r_i, and r_1+...+r_k = r < 1, then
 49 
 50         delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
 51 
 52         In the worst case we have the IntServ estimate with D = W*r+k*MTU
 53         and C = MTU*r. The proof (if correct at all) is trivial; see the
            numeric sketch after this comment.
 54 
 55 
 56         --- It seems that cbq-2.0 is not very accurate. At least, I cannot
 57         interpret some places, which look like wrong translations
 58         from NS. Anyone is advised to find these differences
 59         and explain to me why I am wrong 8).
 60 
 61         --- Linux has no EOI event, so we cannot estimate true class
 62         idle time. The workaround is to treat the next dequeue event
 63         as a sign that the previous packet has finished. This is wrong because of
 64         internal device queueing, but on a permanently loaded link it holds.
 65         Moreover, combined with the clock integrator, this scheme comes
 66         very close to an ideal solution.  */
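
A quick way to get a feel for the delay bound quoted in the comment above is to
plug concrete numbers into it. The snippet below is a standalone userspace
sketch, not part of the qdisc; every value in it is made up for illustration,
and the bracketed term is read as a ceiling.

/* Worst-case delay bound from the header comment:
 *   delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
 * Build with: cc -o bound bound.c -lm
 */
#include <math.h>
#include <stdio.h>

int main(void)
{
        double B   = 125e6;   /* link bandwidth in bytes/sec (1 Gbit/s) */
        double mtu = 1500.0;  /* bytes */
        double W   = 4000.0;  /* per-round allotment scale, bytes */
        double r_i = 0.05;    /* relative rate of the class of interest */
        double r   = 0.6;     /* sum of all relative rates, must stay < 1 */
        int    k   = 8;       /* number of classes sharing the link */

        double rounds = ceil(mtu / (W * r_i));   /* the [MTU/(W*r_i)] term */
        double bound  = (rounds * W * r + W * r + k * mtu) / B;

        printf("worst-case delay bound: %.6f sec\n", bound);
        return 0;
}
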
 67 
 68 struct cbq_sched_data;
 69 
 70 
 71 struct cbq_class {
 72         struct Qdisc_class_common common;
 73         struct cbq_class        *next_alive;    /* next class with backlog in this priority band */
 74 
 75 /* Parameters */
 76         unsigned char           priority;       /* class priority */
 77         unsigned char           priority2;      /* priority to be used after overlimit */
 78         unsigned char           ewma_log;       /* time constant for idle time calculation */
 79 
 80         u32                     defmap;
 81 
 82         /* Link-sharing scheduler parameters */
 83         long                    maxidle;        /* Class parameters: see below. */
 84         long                    offtime;
 85         long                    minidle;
 86         u32                     avpkt;
 87         struct qdisc_rate_table *R_tab;
 88 
 89         /* General scheduler (WRR) parameters */
 90         long                    allot;
 91         long                    quantum;        /* Allotment per WRR round */
 92         long                    weight;         /* Relative allotment: see below */
 93 
 94         struct Qdisc            *qdisc;         /* Ptr to CBQ discipline */
 95         struct cbq_class        *split;         /* Ptr to split node */
 96         struct cbq_class        *share;         /* Ptr to LS parent in the class tree */
 97         struct cbq_class        *tparent;       /* Ptr to tree parent in the class tree */
 98         struct cbq_class        *borrow;        /* NULL if class is bandwidth limited;
 99                                                    parent otherwise */
100         struct cbq_class        *sibling;       /* Sibling chain */
101         struct cbq_class        *children;      /* Pointer to children chain */
102 
103         struct Qdisc            *q;             /* Elementary queueing discipline */
104 
105 
106 /* Variables */
107         unsigned char           cpriority;      /* Effective priority */
108         unsigned char           delayed;
109         unsigned char           level;          /* level of the class in hierarchy:
110                                                    0 for leaf classes, and maximal
111                                                    level of children + 1 for nodes.
112                                                  */
113 
114         psched_time_t           last;           /* Last end of service */
115         psched_time_t           undertime;
116         long                    avgidle;
117         long                    deficit;        /* Saved deficit for WRR */
118         psched_time_t           penalized;
119         struct gnet_stats_basic_sync bstats;
120         struct gnet_stats_queue qstats;
121         struct net_rate_estimator __rcu *rate_est;
122         struct tc_cbq_xstats    xstats;
123 
124         struct tcf_proto __rcu  *filter_list;
125         struct tcf_block        *block;
126 
127         int                     filters;
128 
129         struct cbq_class        *defaults[TC_PRIO_MAX + 1];
130 };
131 
132 struct cbq_sched_data {
133         struct Qdisc_class_hash clhash;                 /* Hash table of all classes */
134         int                     nclasses[TC_CBQ_MAXPRIO + 1];
135         unsigned int            quanta[TC_CBQ_MAXPRIO + 1];
136 
137         struct cbq_class        link;
138 
139         unsigned int            activemask;
140         struct cbq_class        *active[TC_CBQ_MAXPRIO + 1];    /* Tail of the circular list
141                                                                    of backlogged classes, per band */
142 
143 #ifdef CONFIG_NET_CLS_ACT
144         struct cbq_class        *rx_class;
145 #endif
146         struct cbq_class        *tx_class;
147         struct cbq_class        *tx_borrowed;
148         int                     tx_len;
149         psched_time_t           now;            /* Cached timestamp */
150         unsigned int            pmask;
151 
152         struct hrtimer          delay_timer;
153         struct qdisc_watchdog   watchdog;       /* Watchdog timer,
154                                                    started when CBQ has
155                                                    backlog, but cannot
156                                                    transmit just now */
157         psched_tdiff_t          wd_expires;
158         int                     toplevel;
159         u32                     hgenerator;
160 };
161 
162 
163 #define L2T(cl, len)    qdisc_l2t((cl)->R_tab, len)
164 
165 static inline struct cbq_class *
166 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
167 {
168         struct Qdisc_class_common *clc;
169 
170         clc = qdisc_class_find(&q->clhash, classid);
171         if (clc == NULL)
172                 return NULL;
173         return container_of(clc, struct cbq_class, common);
174 }
175 
176 #ifdef CONFIG_NET_CLS_ACT
177 
178 static struct cbq_class *
179 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
180 {
181         struct cbq_class *cl;
182 
183         for (cl = this->tparent; cl; cl = cl->tparent) {
184                 struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
185 
186                 if (new != NULL && new != this)
187                         return new;
188         }
189         return NULL;
190 }
191 
192 #endif
193 
194 /* Classify packet. The procedure is pretty complicated, but
195  * it allows us to combine link sharing and priority scheduling
196  * transparently.
197  *
198  * Namely, you can put link-sharing rules (e.g. route based) at the root of
199  * the CBQ, so that they resolve to split nodes. Then packets are classified
200  * by logical priority, or a more specific classifier may be attached
201  * to the split node.
202  */
203 
204 static struct cbq_class *
205 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
206 {
207         struct cbq_sched_data *q = qdisc_priv(sch);
208         struct cbq_class *head = &q->link;
209         struct cbq_class **defmap;
210         struct cbq_class *cl = NULL;
211         u32 prio = skb->priority;
212         struct tcf_proto *fl;
213         struct tcf_result res;
214 
215         /*
216          *  Step 1. If skb->priority points to one of our classes, use it.
217          */
218         if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
219             (cl = cbq_class_lookup(q, prio)) != NULL)
220                 return cl;
221 
222         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
223         for (;;) {
224                 int result = 0;
225                 defmap = head->defaults;
226 
227                 fl = rcu_dereference_bh(head->filter_list);
228                 /*
229                  * Step 2+n. Apply classifier.
230                  */
231                 result = tcf_classify(skb, NULL, fl, &res, true);
232                 if (!fl || result < 0)
233                         goto fallback;
234 
235                 cl = (void *)res.class;
236                 if (!cl) {
237                         if (TC_H_MAJ(res.classid))
238                                 cl = cbq_class_lookup(q, res.classid);
239                         else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
240                                 cl = defmap[TC_PRIO_BESTEFFORT];
241 
242                         if (cl == NULL)
243                                 goto fallback;
244                 }
245                 if (cl->level >= head->level)
246                         goto fallback;
247 #ifdef CONFIG_NET_CLS_ACT
248                 switch (result) {
249                 case TC_ACT_QUEUED:
250                 case TC_ACT_STOLEN:
251                 case TC_ACT_TRAP:
252                         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
253                         fallthrough;
254                 case TC_ACT_SHOT:
255                         return NULL;
256                 case TC_ACT_RECLASSIFY:
257                         return cbq_reclassify(skb, cl);
258                 }
259 #endif
260                 if (cl->level == 0)
261                         return cl;
262 
263                 /*
264                  * Step 3+n. If the classifier selected a link-sharing class,
265                  *         apply the agency-specific classifier.
266                  *         Repeat this procedure until we hit a leaf node.
267                  */
268                 head = cl;
269         }
270 
271 fallback:
272         cl = head;
273 
274         /*
275          * Step 4. No success...
276          */
277         if (TC_H_MAJ(prio) == 0 &&
278             !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
279             !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
280                 return head;
281 
282         return cl;
283 }
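
Step 1 of cbq_classify() above only honours skb->priority when its 16-bit
major part matches this qdisc's handle. The standalone snippet below
illustrates that check; the mask values mirror the kernel's uapi
TC_H_MAJ/TC_H_MIN definitions, and the specific handle and priority values
are invented for the example.

#include <stdint.h>
#include <stdio.h>

/* Same bit layout as the uapi macros: a 32-bit handle is "major:minor",
 * 16 bits each. */
#define TC_H_MAJ(h) ((h) & 0xFFFF0000U)
#define TC_H_MIN(h) ((h) & 0x0000FFFFU)

int main(void)
{
        uint32_t sch_handle = 0x00010000;  /* qdisc handle "1:" */
        uint32_t prio       = 0x00010003;  /* skb->priority set to classid "1:3" */

        /* Mirrors Step 1: the majors must match exactly. */
        if (TC_H_MAJ(prio ^ sch_handle) == 0)
                printf("priority 0x%08x selects minor class %u of this qdisc\n",
                       (unsigned)prio, (unsigned)TC_H_MIN(prio));
        return 0;
}
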
284 
285 /*
286  * A packet has just been enqueued on an empty class.
287  * cbq_activate_class adds the class to the tail of the circular active-class
288  * list of its priority band (q->active[prio] points at the tail).
289  */
290 
291 static inline void cbq_activate_class(struct cbq_class *cl)
292 {
293         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
294         int prio = cl->cpriority;
295         struct cbq_class *cl_tail;
296 
297         cl_tail = q->active[prio];
298         q->active[prio] = cl;
299 
300         if (cl_tail != NULL) {
301                 cl->next_alive = cl_tail->next_alive;
302                 cl_tail->next_alive = cl;
303         } else {
304                 cl->next_alive = cl;
305                 q->activemask |= (1<<prio);
306         }
307 }
308 
309 /*
310  * Unlink the class from the active chain.
311  * Note that the same unlinking is done directly in cbq_dequeue_prio()
312  * during the round-robin walk.
313  */
314 
315 static void cbq_deactivate_class(struct cbq_class *this)
316 {
317         struct cbq_sched_data *q = qdisc_priv(this->qdisc);
318         int prio = this->cpriority;
319         struct cbq_class *cl;
320         struct cbq_class *cl_prev = q->active[prio];
321 
322         do {
323                 cl = cl_prev->next_alive;
324                 if (cl == this) {
325                         cl_prev->next_alive = cl->next_alive;
326                         cl->next_alive = NULL;
327 
328                         if (cl == q->active[prio]) {
329                                 q->active[prio] = cl_prev;
330                                 if (cl == q->active[prio]) {
331                                         q->active[prio] = NULL;
332                                         q->activemask &= ~(1<<prio);
333                                         return;
334                                 }
335                         }
336                         return;
337                 }
338         } while ((cl_prev = cl) != q->active[prio]);
339 }
340 
341 static void
342 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
343 {
344         int toplevel = q->toplevel;
345 
346         if (toplevel > cl->level) {
347                 psched_time_t now = psched_get_time();
348 
349                 do {
350                         if (cl->undertime < now) {
351                                 q->toplevel = cl->level;
352                                 return;
353                         }
354                 } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
355         }
356 }
357 
358 static int
359 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
360             struct sk_buff **to_free)
361 {
362         struct cbq_sched_data *q = qdisc_priv(sch);
363         int ret;
364         struct cbq_class *cl = cbq_classify(skb, sch, &ret);
365 
366 #ifdef CONFIG_NET_CLS_ACT
367         q->rx_class = cl;
368 #endif
369         if (cl == NULL) {
370                 if (ret & __NET_XMIT_BYPASS)
371                         qdisc_qstats_drop(sch);
372                 __qdisc_drop(skb, to_free);
373                 return ret;
374         }
375 
376         ret = qdisc_enqueue(skb, cl->q, to_free);
377         if (ret == NET_XMIT_SUCCESS) {
378                 sch->q.qlen++;
379                 cbq_mark_toplevel(q, cl);
380                 if (!cl->next_alive)
381                         cbq_activate_class(cl);
382                 return ret;
383         }
384 
385         if (net_xmit_drop_count(ret)) {
386                 qdisc_qstats_drop(sch);
387                 cbq_mark_toplevel(q, cl);
388                 cl->qstats.drops++;
389         }
390         return ret;
391 }
392 
393 /* Overlimit action: penalize leaf class by adding offtime */
394 static void cbq_overlimit(struct cbq_class *cl)
395 {
396         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
397         psched_tdiff_t delay = cl->undertime - q->now;
398 
399         if (!cl->delayed) {
400                 delay += cl->offtime;
401 
402                 /*
403                  * The class goes to sleep, so it will have no
404                  * chance to work on avgidle. Let's forgive it 8)
405                  *
406                  * BTW cbq-2.0 has a bug in this
407                  * place; apparently they forgot to shift it by cl->ewma_log.
408                  */
409                 if (cl->avgidle < 0)
410                         delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
411                 if (cl->avgidle < cl->minidle)
412                         cl->avgidle = cl->minidle;
413                 if (delay <= 0)
414                         delay = 1;
415                 cl->undertime = q->now + delay;
416 
417                 cl->xstats.overactions++;
418                 cl->delayed = 1;
419         }
420         if (q->wd_expires == 0 || q->wd_expires > delay)
421                 q->wd_expires = delay;
422 
423         /* Dirty work! We must schedule wakeups based on
424          * real available rate, rather than leaf rate,
425          * which may be tiny (even zero).
426          */
427         if (q->toplevel == TC_CBQ_MAXLEVEL) {
428                 struct cbq_class *b;
429                 psched_tdiff_t base_delay = q->wd_expires;
430 
431                 for (b = cl->borrow; b; b = b->borrow) {
432                         delay = b->undertime - q->now;
433                         if (delay < base_delay) {
434                                 if (delay <= 0)
435                                         delay = 1;
436                                 base_delay = delay;
437                         }
438                 }
439 
440                 q->wd_expires = base_delay;
441         }
442 }
443 
444 static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
445                                        psched_time_t now)
446 {
447         struct cbq_class *cl;
448         struct cbq_class *cl_prev = q->active[prio];
449         psched_time_t sched = now;
450 
451         if (cl_prev == NULL)
452                 return 0;
453 
454         do {
455                 cl = cl_prev->next_alive;
456                 if (now - cl->penalized > 0) {
457                         cl_prev->next_alive = cl->next_alive;
458                         cl->next_alive = NULL;
459                         cl->cpriority = cl->priority;
460                         cl->delayed = 0;
461                         cbq_activate_class(cl);
462 
463                         if (cl == q->active[prio]) {
464                                 q->active[prio] = cl_prev;
465                                 if (cl == q->active[prio]) {
466                                         q->active[prio] = NULL;
467                                         return 0;
468                                 }
469                         }
470 
471                         cl = cl_prev->next_alive;
472                 } else if (sched - cl->penalized > 0)
473                         sched = cl->penalized;
474         } while ((cl_prev = cl) != q->active[prio]);
475 
476         return sched - now;
477 }
478 
479 static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
480 {
481         struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
482                                                 delay_timer);
483         struct Qdisc *sch = q->watchdog.qdisc;
484         psched_time_t now;
485         psched_tdiff_t delay = 0;
486         unsigned int pmask;
487 
488         now = psched_get_time();
489 
490         pmask = q->pmask;
491         q->pmask = 0;
492 
493         while (pmask) {
494                 int prio = ffz(~pmask);
495                 psched_tdiff_t tmp;
496 
497                 pmask &= ~(1<<prio);
498 
499                 tmp = cbq_undelay_prio(q, prio, now);
500                 if (tmp > 0) {
501                         q->pmask |= 1<<prio;
502                         if (tmp < delay || delay == 0)
503                                 delay = tmp;
504                 }
505         }
506 
507         if (delay) {
508                 ktime_t time;
509 
510                 time = 0;
511                 time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
512                 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
513         }
514 
515         __netif_schedule(qdisc_root(sch));
516         return HRTIMER_NORESTART;
517 }
518 
519 /*
520  * This is a mission-critical procedure.
521  *
522  * We "regenerate" the toplevel cutoff if the transmitting class
523  * has backlog and is not regulated. This is not part of the
524  * original CBQ description, but looks more reasonable.
525  * Probably, it is wrong. This question needs further investigation.
526  */
527 
528 static inline void
529 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
530                     struct cbq_class *borrowed)
531 {
532         if (cl && q->toplevel >= borrowed->level) {
533                 if (cl->q->q.qlen > 1) {
534                         do {
535                                 if (borrowed->undertime == PSCHED_PASTPERFECT) {
536                                         q->toplevel = borrowed->level;
537                                         return;
538                                 }
539                         } while ((borrowed = borrowed->borrow) != NULL);
540                 }
541 #if 0
542         /* It is not necessary now. Uncommenting it
543            will save CPU cycles, but decrease fairness.
544          */
545                 q->toplevel = TC_CBQ_MAXLEVEL;
546 #endif
547         }
548 }
549 
550 static void
551 cbq_update(struct cbq_sched_data *q)
552 {
553         struct cbq_class *this = q->tx_class;
554         struct cbq_class *cl = this;
555         int len = q->tx_len;
556         psched_time_t now;
557 
558         q->tx_class = NULL;
559         /* Time integrator. We calculate EOS time
560          * by adding expected packet transmission time.
561          */
562         now = q->now + L2T(&q->link, len);
563 
564         for ( ; cl; cl = cl->share) {
565                 long avgidle = cl->avgidle;
566                 long idle;
567 
568                 _bstats_update(&cl->bstats, len, 1);
569 
570                 /*
571                  * (now - last) is total time between packet right edges.
572                  * (last_pktlen/rate) is "virtual" busy time, so that
573                  *
574                  *      idle = (now - last) - last_pktlen/rate
575                  */
576 
577                 idle = now - cl->last;
578                 if ((unsigned long)idle > 128*1024*1024) {
579                         avgidle = cl->maxidle;
580                 } else {
581                         idle -= L2T(cl, len);
582 
583                 /* true_avgidle := (1-W)*true_avgidle + W*idle,
584                  * where W=2^{-ewma_log}. But cl->avgidle is scaled:
585                  * cl->avgidle == true_avgidle/W,
586                  * hence:
587                  */
588                         avgidle += idle - (avgidle>>cl->ewma_log);
589                 }
590 
591                 if (avgidle <= 0) {
592                         /* Overlimit or at-limit */
593 
594                         if (avgidle < cl->minidle)
595                                 avgidle = cl->minidle;
596 
597                         cl->avgidle = avgidle;
598 
599                         /* Calculate the expected time when this class
600                          * will be allowed to send.
601                          * It will occur when:
602                          * (1-W)*true_avgidle + W*delay = 0, i.e.
603                          * idle = (1/W - 1)*(-true_avgidle)
604                          * or
605                          * idle = (1 - W)*(-cl->avgidle);
606                          */
607                         idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
608 
609                         /*
610                          * That is not all.
611                          * To maintain the rate allocated to the class,
612                          * we add to undertime the virtual clock time
613                          * needed to complete the transmitted packet.
614                          * (len/phys_bandwidth has already elapsed by
615                          * the time cbq_update runs.)
616                          */
617 
618                         idle -= L2T(&q->link, len);
619                         idle += L2T(cl, len);
620 
621                         cl->undertime = now + idle;
622                 } else {
623                         /* Underlimit */
624 
625                         cl->undertime = PSCHED_PASTPERFECT;
626                         if (avgidle > cl->maxidle)
627                                 cl->avgidle = cl->maxidle;
628                         else
629                                 cl->avgidle = avgidle;
630                 }
631                 if ((s64)(now - cl->last) > 0)
632                         cl->last = now;
633         }
634 
635         cbq_update_toplevel(q, this, q->tx_borrowed);
636 }
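
The fixed-point update used in cbq_update() above, avgidle += idle -
(avgidle >> ewma_log), is the EWMA from the in-line comment with avgidle kept
scaled as true_avgidle/W, where W = 2^-ewma_log. The snippet below is a
standalone userspace check of that equivalence; the ewma_log value and the
idle samples are invented.

#include <stdio.h>

int main(void)
{
        const int ewma_log = 5;                    /* W = 1/32 */
        const double W = 1.0 / (1 << ewma_log);
        long scaled = 0;                           /* plays the role of cl->avgidle */
        double true_avg = 0.0;                     /* unscaled EWMA */
        long samples[] = { 120, -40, 300, 80, -10, 500 };

        for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                long idle = samples[i];

                /* kernel-style scaled integer update */
                scaled += idle - (scaled >> ewma_log);
                /* textbook EWMA: true_avg = (1-W)*true_avg + W*idle */
                true_avg = (1.0 - W) * true_avg + W * idle;
                printf("idle=%4ld  scaled=%5ld  true/W=%8.2f\n",
                       idle, scaled, true_avg / W);
        }
        return 0;
}
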
637 
638 static inline struct cbq_class *
639 cbq_under_limit(struct cbq_class *cl)
640 {
641         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
642         struct cbq_class *this_cl = cl;
643 
644         if (cl->tparent == NULL)
645                 return cl;
646 
647         if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
648                 cl->delayed = 0;
649                 return cl;
650         }
651 
652         do {
653                 /* This is a very suspicious place. Currently the overlimit
654                  * action is generated for non-bounded classes
655                  * only if the link is completely congested.
656                  * Though this agrees with the ancestor-only paradigm,
657                  * it looks very stupid. In particular,
658                  * it means that this chunk of code will either
659                  * never be called or result in strong amplification
660                  * of burstiness. Dangerous, silly, and yet
661                  * no other solution exists.
662                  */
663                 cl = cl->borrow;
664                 if (!cl) {
665                         this_cl->qstats.overlimits++;
666                         cbq_overlimit(this_cl);
667                         return NULL;
668                 }
669                 if (cl->level > q->toplevel)
670                         return NULL;
671         } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
672 
673         cl->delayed = 0;
674         return cl;
675 }
676 
677 static inline struct sk_buff *
678 cbq_dequeue_prio(struct Qdisc *sch, int prio)
679 {
680         struct cbq_sched_data *q = qdisc_priv(sch);
681         struct cbq_class *cl_tail, *cl_prev, *cl;
682         struct sk_buff *skb;
683         int deficit;
684 
685         cl_tail = cl_prev = q->active[prio];
686         cl = cl_prev->next_alive;
687 
688         do {
689                 deficit = 0;
690 
691                 /* Start round */
692                 do {
693                         struct cbq_class *borrow = cl;
694 
695                         if (cl->q->q.qlen &&
696                             (borrow = cbq_under_limit(cl)) == NULL)
697                                 goto skip_class;
698 
699                         if (cl->deficit <= 0) {
700                                 /* Class exhausted its allotment per
701                                  * this round. Switch to the next one.
702                                  */
703                                 deficit = 1;
704                                 cl->deficit += cl->quantum;
705                                 goto next_class;
706                         }
707 
708                         skb = cl->q->dequeue(cl->q);
709 
710                         /* The class did not give us any skb :-(
711                          * This can occur even if cl->q->q.qlen != 0,
712                          * e.g. if cl->q is a "tbf" qdisc.
713                          */
714                         if (skb == NULL)
715                                 goto skip_class;
716 
717                         cl->deficit -= qdisc_pkt_len(skb);
718                         q->tx_class = cl;
719                         q->tx_borrowed = borrow;
720                         if (borrow != cl) {
721 #ifndef CBQ_XSTATS_BORROWS_BYTES
722                                 borrow->xstats.borrows++;
723                                 cl->xstats.borrows++;
724 #else
725                                 borrow->xstats.borrows += qdisc_pkt_len(skb);
726                                 cl->xstats.borrows += qdisc_pkt_len(skb);
727 #endif
728                         }
729                         q->tx_len = qdisc_pkt_len(skb);
730 
731                         if (cl->deficit <= 0) {
732                                 q->active[prio] = cl;
733                                 cl = cl->next_alive;
734                                 cl->deficit += cl->quantum;
735                         }
736                         return skb;
737 
738 skip_class:
739                         if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
740                                 /* Class is empty or penalized.
741                                  * Unlink it from active chain.
742                                  */
743                                 cl_prev->next_alive = cl->next_alive;
744                                 cl->next_alive = NULL;
745 
746                                 /* Did cl_tail point to it? */
747                                 if (cl == cl_tail) {
748                                         /* Repair it! */
749                                         cl_tail = cl_prev;
750 
751                                         /* Was it the last class in this band? */
752                                         if (cl == cl_tail) {
753                                                 /* Kill the band! */
754                                                 q->active[prio] = NULL;
755                                                 q->activemask &= ~(1<<prio);
756                                                 if (cl->q->q.qlen)
757                                                         cbq_activate_class(cl);
758                                                 return NULL;
759                                         }
760 
761                                         q->active[prio] = cl_tail;
762                                 }
763                                 if (cl->q->q.qlen)
764                                         cbq_activate_class(cl);
765 
766                                 cl = cl_prev;
767                         }
768 
769 next_class:
770                         cl_prev = cl;
771                         cl = cl->next_alive;
772                 } while (cl_prev != cl_tail);
773         } while (deficit);
774 
775         q->active[prio] = cl_prev;
776 
777         return NULL;
778 }
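
The deficit bookkeeping in cbq_dequeue_prio() above is what lets quanta be
smaller than the MTU: a class keeps sending while its deficit is positive and
is topped up by its quantum once it runs out. Below is a minimal standalone
sketch of that round-robin idea, not the kernel code; the class names, quanta
and packet sizes are invented.

#include <stdio.h>

struct toy_class {
        const char *name;
        long quantum;   /* allotment added per WRR round */
        long deficit;   /* remaining allotment in the current round */
        int  backlog;   /* packets still queued */
        int  pkt_len;   /* fixed packet size for this class */
};

int main(void)
{
        struct toy_class cls[] = {
                { "A", 1500, 1500, 4, 1000 },
                { "B",  500,  500, 4, 1000 },  /* quantum < packet size is fine */
        };
        int remaining = 8;

        while (remaining) {
                for (int i = 0; i < 2; i++) {
                        struct toy_class *c = &cls[i];

                        if (!c->backlog)
                                continue;
                        if (c->deficit <= 0) {
                                c->deficit += c->quantum;  /* new round for this class */
                                continue;
                        }
                        c->deficit -= c->pkt_len;
                        c->backlog--;
                        remaining--;
                        printf("sent %4d bytes from %s (deficit now %ld)\n",
                               c->pkt_len, c->name, c->deficit);
                }
        }
        return 0;
}
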
779 
780 static inline struct sk_buff *
781 cbq_dequeue_1(struct Qdisc *sch)
782 {
783         struct cbq_sched_data *q = qdisc_priv(sch);
784         struct sk_buff *skb;
785         unsigned int activemask;
786 
787         activemask = q->activemask & 0xFF;
788         while (activemask) {
789                 int prio = ffz(~activemask);
790                 activemask &= ~(1<<prio);
791                 skb = cbq_dequeue_prio(sch, prio);
792                 if (skb)
793                         return skb;
794         }
795         return NULL;
796 }
797 
798 static struct sk_buff *
799 cbq_dequeue(struct Qdisc *sch)
800 {
801         struct sk_buff *skb;
802         struct cbq_sched_data *q = qdisc_priv(sch);
803         psched_time_t now;
804 
805         now = psched_get_time();
806 
807         if (q->tx_class)
808                 cbq_update(q);
809 
810         q->now = now;
811 
812         for (;;) {
813                 q->wd_expires = 0;
814 
815                 skb = cbq_dequeue_1(sch);
816                 if (skb) {
817                         qdisc_bstats_update(sch, skb);
818                         sch->q.qlen--;
819                         return skb;
820                 }
821 
822                 /* All the classes are overlimit.
823                  *
824                  * This is possible if:
825                  *
826                  * 1. The scheduler is empty.
827                  * 2. The toplevel cutoff inhibited borrowing.
828                  * 3. The root class is overlimit.
829                  *
830                  * Reset the 2nd and 3rd conditions and retry.
831                  *
832                  * Note that NS and cbq-2.0 are buggy here: picking
833                  * an arbitrary class is appropriate for ancestor-only
834                  * sharing, but not for the top-level algorithm.
835                  *
836                  * Our version is better but slower, because it requires
837                  * two passes; that is unavoidable with top-level sharing.
838                  */
839 
840                 if (q->toplevel == TC_CBQ_MAXLEVEL &&
841                     q->link.undertime == PSCHED_PASTPERFECT)
842                         break;
843 
844                 q->toplevel = TC_CBQ_MAXLEVEL;
845                 q->link.undertime = PSCHED_PASTPERFECT;
846         }
847 
848         /* No packets in the scheduler, or nobody wants to give them to us :-(
849          * Sigh... start the watchdog timer in the latter case.
850          */
851 
852         if (sch->q.qlen) {
853                 qdisc_qstats_overlimit(sch);
854                 if (q->wd_expires)
855                         qdisc_watchdog_schedule(&q->watchdog,
856                                                 now + q->wd_expires);
857         }
858         return NULL;
859 }
860 
861 /* CBQ class maintenance routines */
862 
863 static void cbq_adjust_levels(struct cbq_class *this)
864 {
865         if (this == NULL)
866                 return;
867 
868         do {
869                 int level = 0;
870                 struct cbq_class *cl;
871 
872                 cl = this->children;
873                 if (cl) {
874                         do {
875                                 if (cl->level > level)
876                                         level = cl->level;
877                         } while ((cl = cl->sibling) != this->children);
878                 }
879                 this->level = level + 1;
880         } while ((this = this->tparent) != NULL);
881 }
882 
883 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
884 {
885         struct cbq_class *cl;
886         unsigned int h;
887 
888         if (q->quanta[prio] == 0)
889                 return;
890 
891         for (h = 0; h < q->clhash.hashsize; h++) {
892                 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
893                         /* BUGGGG... Beware! This expression suffers from
894                          * arithmetic overflow!
895                          */
896                         if (cl->priority == prio) {
897                                 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
898                                         q->quanta[prio];
899                         }
900                         if (cl->quantum <= 0 ||
901                             cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
902                                 pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
903                                         cl->common.classid, cl->quantum);
904                                 cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
905                         }
906                 }
907         }
908 }
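
To see what cbq_normalize_quanta() computes, here is the same formula with
made-up numbers in a standalone userspace snippet: quantum_i = weight_i *
allot_i * nclasses / sum(weights). With weights in a 1:2:3 ratio and a common
allot, the quanta keep that ratio while their average stays near the allot.
As the comment above warns, the intermediate product can overflow when the
weights are very large (the root class's weight, for instance, is set to the
link rate in cbq_init()).

#include <stdio.h>

int main(void)
{
        long weight[] = { 1, 2, 3 };  /* relative weights, hypothetical */
        long allot    = 1514;         /* common allot, bytes */
        int  nclasses = 3;            /* classes at this priority */
        long sum_w    = 1 + 2 + 3;    /* q->quanta[prio] analogue */

        for (int i = 0; i < nclasses; i++) {
                long quantum = weight[i] * allot * nclasses / sum_w;

                /* prints 757, 1514, 2271: same 1:2:3 ratio, average == allot */
                printf("class %d: quantum=%ld bytes\n", i, quantum);
        }
        return 0;
}
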
909 
910 static void cbq_sync_defmap(struct cbq_class *cl)
911 {
912         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
913         struct cbq_class *split = cl->split;
914         unsigned int h;
915         int i;
916 
917         if (split == NULL)
918                 return;
919 
920         for (i = 0; i <= TC_PRIO_MAX; i++) {
921                 if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
922                         split->defaults[i] = NULL;
923         }
924 
925         for (i = 0; i <= TC_PRIO_MAX; i++) {
926                 int level = split->level;
927 
928                 if (split->defaults[i])
929                         continue;
930 
931                 for (h = 0; h < q->clhash.hashsize; h++) {
932                         struct cbq_class *c;
933 
934                         hlist_for_each_entry(c, &q->clhash.hash[h],
935                                              common.hnode) {
936                                 if (c->split == split && c->level < level &&
937                                     c->defmap & (1<<i)) {
938                                         split->defaults[i] = c;
939                                         level = c->level;
940                                 }
941                         }
942                 }
943         }
944 }
945 
946 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
947 {
948         struct cbq_class *split = NULL;
949 
950         if (splitid == 0) {
951                 split = cl->split;
952                 if (!split)
953                         return;
954                 splitid = split->common.classid;
955         }
956 
957         if (split == NULL || split->common.classid != splitid) {
958                 for (split = cl->tparent; split; split = split->tparent)
959                         if (split->common.classid == splitid)
960                                 break;
961         }
962 
963         if (split == NULL)
964                 return;
965 
966         if (cl->split != split) {
967                 cl->defmap = 0;
968                 cbq_sync_defmap(cl);
969                 cl->split = split;
970                 cl->defmap = def & mask;
971         } else
972                 cl->defmap = (cl->defmap & ~mask) | (def & mask);
973 
974         cbq_sync_defmap(cl);
975 }
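
cbq_change_defmap() above merges the new default map into the class under a
caller-supplied mask: cl->defmap = (cl->defmap & ~mask) | (def & mask), where
each bit corresponds to one TC_PRIO_* level (0..TC_PRIO_MAX). The standalone
snippet below shows that bit arithmetic with invented values.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t defmap = 0x0003;   /* class currently defaults prios 0 and 1 */
        uint32_t def    = 0x0004;   /* caller wants prio 2 ... */
        uint32_t mask   = 0x0006;   /* ... and asks to rewrite prios 1 and 2 */

        /* only the bits selected by 'mask' are rewritten */
        defmap = (defmap & ~mask) | (def & mask);
        printf("new defmap = 0x%04x\n", (unsigned)defmap);  /* 0x0005: prios 0 and 2 */
        return 0;
}
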
976 
977 static void cbq_unlink_class(struct cbq_class *this)
978 {
979         struct cbq_class *cl, **clp;
980         struct cbq_sched_data *q = qdisc_priv(this->qdisc);
981 
982         qdisc_class_hash_remove(&q->clhash, &this->common);
983 
984         if (this->tparent) {
985                 clp = &this->sibling;
986                 cl = *clp;
987                 do {
988                         if (cl == this) {
989                                 *clp = cl->sibling;
990                                 break;
991                         }
992                         clp = &cl->sibling;
993                 } while ((cl = *clp) != this->sibling);
994 
995                 if (this->tparent->children == this) {
996                         this->tparent->children = this->sibling;
997                         if (this->sibling == this)
998                                 this->tparent->children = NULL;
999                 }
1000         } else {
1001                 WARN_ON(this->sibling != this);
1002         }
1003 }
1004 
1005 static void cbq_link_class(struct cbq_class *this)
1006 {
1007         struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1008         struct cbq_class *parent = this->tparent;
1009 
1010         this->sibling = this;
1011         qdisc_class_hash_insert(&q->clhash, &this->common);
1012 
1013         if (parent == NULL)
1014                 return;
1015 
1016         if (parent->children == NULL) {
1017                 parent->children = this;
1018         } else {
1019                 this->sibling = parent->children->sibling;
1020                 parent->children->sibling = this;
1021         }
1022 }
1023 
1024 static void
1025 cbq_reset(struct Qdisc *sch)
1026 {
1027         struct cbq_sched_data *q = qdisc_priv(sch);
1028         struct cbq_class *cl;
1029         int prio;
1030         unsigned int h;
1031 
1032         q->activemask = 0;
1033         q->pmask = 0;
1034         q->tx_class = NULL;
1035         q->tx_borrowed = NULL;
1036         qdisc_watchdog_cancel(&q->watchdog);
1037         hrtimer_cancel(&q->delay_timer);
1038         q->toplevel = TC_CBQ_MAXLEVEL;
1039         q->now = psched_get_time();
1040 
1041         for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1042                 q->active[prio] = NULL;
1043 
1044         for (h = 0; h < q->clhash.hashsize; h++) {
1045                 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1046                         qdisc_reset(cl->q);
1047 
1048                         cl->next_alive = NULL;
1049                         cl->undertime = PSCHED_PASTPERFECT;
1050                         cl->avgidle = cl->maxidle;
1051                         cl->deficit = cl->quantum;
1052                         cl->cpriority = cl->priority;
1053                 }
1054         }
1055         sch->q.qlen = 0;
1056 }
1057 
1058 
1059 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1060 {
1061         if (lss->change & TCF_CBQ_LSS_FLAGS) {
1062                 cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1063                 cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1064         }
1065         if (lss->change & TCF_CBQ_LSS_EWMA)
1066                 cl->ewma_log = lss->ewma_log;
1067         if (lss->change & TCF_CBQ_LSS_AVPKT)
1068                 cl->avpkt = lss->avpkt;
1069         if (lss->change & TCF_CBQ_LSS_MINIDLE)
1070                 cl->minidle = -(long)lss->minidle;
1071         if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1072                 cl->maxidle = lss->maxidle;
1073                 cl->avgidle = lss->maxidle;
1074         }
1075         if (lss->change & TCF_CBQ_LSS_OFFTIME)
1076                 cl->offtime = lss->offtime;
1077         return 0;
1078 }
1079 
1080 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1081 {
1082         q->nclasses[cl->priority]--;
1083         q->quanta[cl->priority] -= cl->weight;
1084         cbq_normalize_quanta(q, cl->priority);
1085 }
1086 
1087 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1088 {
1089         q->nclasses[cl->priority]++;
1090         q->quanta[cl->priority] += cl->weight;
1091         cbq_normalize_quanta(q, cl->priority);
1092 }
1093 
1094 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1095 {
1096         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1097 
1098         if (wrr->allot)
1099                 cl->allot = wrr->allot;
1100         if (wrr->weight)
1101                 cl->weight = wrr->weight;
1102         if (wrr->priority) {
1103                 cl->priority = wrr->priority - 1;
1104                 cl->cpriority = cl->priority;
1105                 if (cl->priority >= cl->priority2)
1106                         cl->priority2 = TC_CBQ_MAXPRIO - 1;
1107         }
1108 
1109         cbq_addprio(q, cl);
1110         return 0;
1111 }
1112 
1113 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1114 {
1115         cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1116         return 0;
1117 }
1118 
1119 static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1120         [TCA_CBQ_LSSOPT]        = { .len = sizeof(struct tc_cbq_lssopt) },
1121         [TCA_CBQ_WRROPT]        = { .len = sizeof(struct tc_cbq_wrropt) },
1122         [TCA_CBQ_FOPT]          = { .len = sizeof(struct tc_cbq_fopt) },
1123         [TCA_CBQ_OVL_STRATEGY]  = { .len = sizeof(struct tc_cbq_ovl) },
1124         [TCA_CBQ_RATE]          = { .len = sizeof(struct tc_ratespec) },
1125         [TCA_CBQ_RTAB]          = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1126         [TCA_CBQ_POLICE]        = { .len = sizeof(struct tc_cbq_police) },
1127 };
1128 
1129 static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1],
1130                          struct nlattr *opt,
1131                          struct netlink_ext_ack *extack)
1132 {
1133         int err;
1134 
1135         if (!opt) {
1136                 NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
1137                 return -EINVAL;
1138         }
1139 
1140         err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt,
1141                                           cbq_policy, extack);
1142         if (err < 0)
1143                 return err;
1144 
1145         if (tb[TCA_CBQ_WRROPT]) {
1146                 const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]);
1147 
1148                 if (wrr->priority > TC_CBQ_MAXPRIO) {
1149                         NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO");
1150                         err = -EINVAL;
1151                 }
1152         }
1153         return err;
1154 }
1155 
1156 static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
1157                     struct netlink_ext_ack *extack)
1158 {
1159         struct cbq_sched_data *q = qdisc_priv(sch);
1160         struct nlattr *tb[TCA_CBQ_MAX + 1];
1161         struct tc_ratespec *r;
1162         int err;
1163 
1164         qdisc_watchdog_init(&q->watchdog, sch);
1165         hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1166         q->delay_timer.function = cbq_undelay;
1167 
1168         err = cbq_opt_parse(tb, opt, extack);
1169         if (err < 0)
1170                 return err;
1171 
1172         if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) {
1173                 NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete");
1174                 return -EINVAL;
1175         }
1176 
1177         r = nla_data(tb[TCA_CBQ_RATE]);
1178 
1179         q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack);
1180         if (!q->link.R_tab)
1181                 return -EINVAL;
1182 
1183         err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack);
1184         if (err)
1185                 goto put_rtab;
1186 
1187         err = qdisc_class_hash_init(&q->clhash);
1188         if (err < 0)
1189                 goto put_block;
1190 
1191         q->link.sibling = &q->link;
1192         q->link.common.classid = sch->handle;
1193         q->link.qdisc = sch;
1194         q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1195                                       sch->handle, NULL);
1196         if (!q->link.q)
1197                 q->link.q = &noop_qdisc;
1198         else
1199                 qdisc_hash_add(q->link.q, true);
1200 
1201         q->link.priority = TC_CBQ_MAXPRIO - 1;
1202         q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1203         q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1204         q->link.allot = psched_mtu(qdisc_dev(sch));
1205         q->link.quantum = q->link.allot;
1206         q->link.weight = q->link.R_tab->rate.rate;
1207 
1208         q->link.ewma_log = TC_CBQ_DEF_EWMA;
1209         q->link.avpkt = q->link.allot/2;
1210         q->link.minidle = -0x7FFFFFFF;
1211 
1212         q->toplevel = TC_CBQ_MAXLEVEL;
1213         q->now = psched_get_time();
1214 
1215         cbq_link_class(&q->link);
1216 
1217         if (tb[TCA_CBQ_LSSOPT])
1218                 cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1219 
1220         cbq_addprio(q, &q->link);
1221         return 0;
1222 
1223 put_block:
1224         tcf_block_put(q->link.block);
1225 
1226 put_rtab:
1227         qdisc_put_rtab(q->link.R_tab);
1228         return err;
1229 }
1230 
1231 static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1232 {
1233         unsigned char *b = skb_tail_pointer(skb);
1234 
1235         if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1236                 goto nla_put_failure;
1237         return skb->len;
1238 
1239 nla_put_failure:
1240         nlmsg_trim(skb, b);
1241         return -1;
1242 }
1243 
1244 static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1245 {
1246         unsigned char *b = skb_tail_pointer(skb);
1247         struct tc_cbq_lssopt opt;
1248 
1249         opt.flags = 0;
1250         if (cl->borrow == NULL)
1251                 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1252         if (cl->share == NULL)
1253                 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1254         opt.ewma_log = cl->ewma_log;
1255         opt.level = cl->level;
1256         opt.avpkt = cl->avpkt;
1257         opt.maxidle = cl->maxidle;
1258         opt.minidle = (u32)(-cl->minidle);
1259         opt.offtime = cl->offtime;
1260         opt.change = ~0;
1261         if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1262                 goto nla_put_failure;
1263         return skb->len;
1264 
1265 nla_put_failure:
1266         nlmsg_trim(skb, b);
1267         return -1;
1268 }
1269 
1270 static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1271 {
1272         unsigned char *b = skb_tail_pointer(skb);
1273         struct tc_cbq_wrropt opt;
1274 
1275         memset(&opt, 0, sizeof(opt));
1276         opt.flags = 0;
1277         opt.allot = cl->allot;
1278         opt.priority = cl->priority + 1;
1279         opt.cpriority = cl->cpriority + 1;
1280         opt.weight = cl->weight;
1281         if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1282                 goto nla_put_failure;
1283         return skb->len;
1284 
1285 nla_put_failure:
1286         nlmsg_trim(skb, b);
1287         return -1;
1288 }
1289 
1290 static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1291 {
1292         unsigned char *b = skb_tail_pointer(skb);
1293         struct tc_cbq_fopt opt;
1294 
1295         if (cl->split || cl->defmap) {
1296                 opt.split = cl->split ? cl->split->common.classid : 0;
1297                 opt.defmap = cl->defmap;
1298                 opt.defchange = ~0;
1299                 if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1300                         goto nla_put_failure;
1301         }
1302         return skb->len;
1303 
1304 nla_put_failure:
1305         nlmsg_trim(skb, b);
1306         return -1;
1307 }
1308 
1309 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1310 {
1311         if (cbq_dump_lss(skb, cl) < 0 ||
1312             cbq_dump_rate(skb, cl) < 0 ||
1313             cbq_dump_wrr(skb, cl) < 0 ||
1314             cbq_dump_fopt(skb, cl) < 0)
1315                 return -1;
1316         return 0;
1317 }
1318 
1319 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1320 {
1321         struct cbq_sched_data *q = qdisc_priv(sch);
1322         struct nlattr *nest;
1323 
1324         nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1325         if (nest == NULL)
1326                 goto nla_put_failure;
1327         if (cbq_dump_attr(skb, &q->link) < 0)
1328                 goto nla_put_failure;
1329         return nla_nest_end(skb, nest);
1330 
1331 nla_put_failure:
1332         nla_nest_cancel(skb, nest);
1333         return -1;
1334 }
1335 
1336 static int
1337 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1338 {
1339         struct cbq_sched_data *q = qdisc_priv(sch);
1340 
1341         q->link.xstats.avgidle = q->link.avgidle;
1342         return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1343 }
1344 
1345 static int
1346 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1347                struct sk_buff *skb, struct tcmsg *tcm)
1348 {
1349         struct cbq_class *cl = (struct cbq_class *)arg;
1350         struct nlattr *nest;
1351 
1352         if (cl->tparent)
1353                 tcm->tcm_parent = cl->tparent->common.classid;
1354         else
1355                 tcm->tcm_parent = TC_H_ROOT;
1356         tcm->tcm_handle = cl->common.classid;
1357         tcm->tcm_info = cl->q->handle;
1358 
1359         nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1360         if (nest == NULL)
1361                 goto nla_put_failure;
1362         if (cbq_dump_attr(skb, cl) < 0)
1363                 goto nla_put_failure;
1364         return nla_nest_end(skb, nest);
1365 
1366 nla_put_failure:
1367         nla_nest_cancel(skb, nest);
1368         return -1;
1369 }
1370 
1371 static int
1372 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1373         struct gnet_dump *d)
1374 {
1375         struct cbq_sched_data *q = qdisc_priv(sch);
1376         struct cbq_class *cl = (struct cbq_class *)arg;
1377         __u32 qlen;
1378 
1379         cl->xstats.avgidle = cl->avgidle;
1380         cl->xstats.undertime = 0;
1381         qdisc_qstats_qlen_backlog(cl->q, &qlen, &cl->qstats.backlog);
1382 
1383         if (cl->undertime != PSCHED_PASTPERFECT)
1384                 cl->xstats.undertime = cl->undertime - q->now;
1385 
1386         if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
1387             gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1388             gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
1389                 return -1;
1390 
1391         return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1392 }
1393 
1394 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1395                      struct Qdisc **old, struct netlink_ext_ack *extack)
1396 {
1397         struct cbq_class *cl = (struct cbq_class *)arg;
1398 
1399         if (new == NULL) {
1400                 new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1401                                         cl->common.classid, extack);
1402                 if (new == NULL)
1403                         return -ENOBUFS;
1404         }
1405 
1406         *old = qdisc_replace(sch, new, &cl->q);
1407         return 0;
1408 }
1409 
1410 static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1411 {
1412         struct cbq_class *cl = (struct cbq_class *)arg;
1413 
1414         return cl->q;
1415 }
1416 
1417 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1418 {
1419         struct cbq_class *cl = (struct cbq_class *)arg;
1420 
1421         cbq_deactivate_class(cl);
1422 }
1423 
1424 static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
1425 {
1426         struct cbq_sched_data *q = qdisc_priv(sch);
1427 
1428         return (unsigned long)cbq_class_lookup(q, classid);
1429 }
1430 
1431 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1432 {
1433         struct cbq_sched_data *q = qdisc_priv(sch);
1434 
1435         WARN_ON(cl->filters);
1436 
1437         tcf_block_put(cl->block);
1438         qdisc_put(cl->q);
1439         qdisc_put_rtab(cl->R_tab);
1440         gen_kill_estimator(&cl->rate_est);
1441         if (cl != &q->link)
1442                 kfree(cl);
1443 }
1444 
1445 static void cbq_destroy(struct Qdisc *sch)
1446 {
1447         struct cbq_sched_data *q = qdisc_priv(sch);
1448         struct hlist_node *next;
1449         struct cbq_class *cl;
1450         unsigned int h;
1451 
1452 #ifdef CONFIG_NET_CLS_ACT
1453         q->rx_class = NULL;
1454 #endif
1455         /*
1456          * Filters must be destroyed first because we don't destroy the
1457          * classes from root to leaves, which means that filters can still
1458          * be bound to classes which have already been destroyed. --TGR '04
1459          */
1460         for (h = 0; h < q->clhash.hashsize; h++) {
1461                 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1462                         tcf_block_put(cl->block);
1463                         cl->block = NULL;
1464                 }
1465         }
1466         for (h = 0; h < q->clhash.hashsize; h++) {
1467                 hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1468                                           common.hnode)
1469                         cbq_destroy_class(sch, cl);
1470         }
1471         qdisc_class_hash_destroy(&q->clhash);
1472 }
1473 
1474 static int
1475 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1476                  unsigned long *arg, struct netlink_ext_ack *extack)
1477 {
1478         int err;
1479         struct cbq_sched_data *q = qdisc_priv(sch);
1480         struct cbq_class *cl = (struct cbq_class *)*arg;
1481         struct nlattr *opt = tca[TCA_OPTIONS];
1482         struct nlattr *tb[TCA_CBQ_MAX + 1];
1483         struct cbq_class *parent;
1484         struct qdisc_rate_table *rtab = NULL;
1485 
1486         err = cbq_opt_parse(tb, opt, extack);
1487         if (err < 0)
1488                 return err;
1489 
1490         if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) {
1491                 NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params");
1492                 return -EOPNOTSUPP;
1493         }
1494 
1495         if (cl) {
1496                 /* Check parent */
1497                 if (parentid) {
1498                         if (cl->tparent &&
1499                             cl->tparent->common.classid != parentid) {
1500                                 NL_SET_ERR_MSG(extack, "Invalid parent id");
1501                                 return -EINVAL;
1502                         }
1503                         if (!cl->tparent && parentid != TC_H_ROOT) {
1504                                 NL_SET_ERR_MSG(extack, "Parent must be root");
1505                                 return -EINVAL;
1506                         }
1507                 }
1508 
1509                 if (tb[TCA_CBQ_RATE]) {
1510                         rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1511                                               tb[TCA_CBQ_RTAB], extack);
1512                         if (rtab == NULL)
1513                                 return -EINVAL;
1514                 }
1515 
1516                 if (tca[TCA_RATE]) {
1517                         err = gen_replace_estimator(&cl->bstats, NULL,
1518                                                     &cl->rate_est,
1519                                                     NULL,
1520                                                     true,
1521                                                     tca[TCA_RATE]);
1522                         if (err) {
1523                                 NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator");
1524                                 qdisc_put_rtab(rtab);
1525                                 return err;
1526                         }
1527                 }
1528 
1529                 /* Change class parameters */
1530                 sch_tree_lock(sch);
1531 
1532                 if (cl->next_alive != NULL)
1533                         cbq_deactivate_class(cl);
1534 
1535                 if (rtab) {
1536                         qdisc_put_rtab(cl->R_tab);
1537                         cl->R_tab = rtab;
1538                 }
1539 
1540                 if (tb[TCA_CBQ_LSSOPT])
1541                         cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1542 
1543                 if (tb[TCA_CBQ_WRROPT]) {
1544                         cbq_rmprio(q, cl);
1545                         cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1546                 }
1547 
1548                 if (tb[TCA_CBQ_FOPT])
1549                         cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1550 
1551                 if (cl->q->q.qlen)
1552                         cbq_activate_class(cl);
1553 
1554                 sch_tree_unlock(sch);
1555 
1556                 return 0;
1557         }
1558 
1559         if (parentid == TC_H_ROOT)
1560                 return -EINVAL;
1561 
1562         if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) {
1563                 NL_SET_ERR_MSG(extack, "All of the following attributes MUST be specified: WRR, rate and link sharing");
1564                 return -EINVAL;
1565         }
1566 
1567         rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB],
1568                               extack);
1569         if (rtab == NULL)
1570                 return -EINVAL;
1571 
1572         if (classid) {
1573                 err = -EINVAL;
1574                 if (TC_H_MAJ(classid ^ sch->handle) ||
1575                     cbq_class_lookup(q, classid)) {
1576                         NL_SET_ERR_MSG(extack, "Specified classid has the wrong major or is already in use");
1577                         goto failure;
1578                 }
1579         } else {
1580                 int i;
1581                 classid = TC_H_MAKE(sch->handle, 0x8000);
1582 
1583                 for (i = 0; i < 0x8000; i++) {
1584                         if (++q->hgenerator >= 0x8000)
1585                                 q->hgenerator = 1;
1586                         if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1587                                 break;
1588                 }
1589                 err = -ENOSR;
1590                 if (i >= 0x8000) {
1591                         NL_SET_ERR_MSG(extack, "Unable to generate classid");
1592                         goto failure;
1593                 }
1594                 classid = classid|q->hgenerator;
1595         }
1596 
1597         parent = &q->link;
1598         if (parentid) {
1599                 parent = cbq_class_lookup(q, parentid);
1600                 err = -EINVAL;
1601                 if (!parent) {
1602                         NL_SET_ERR_MSG(extack, "Failed to find parentid");
1603                         goto failure;
1604                 }
1605         }
1606 
1607         err = -ENOBUFS;
1608         cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1609         if (cl == NULL)
1610                 goto failure;
1611 
1612         gnet_stats_basic_sync_init(&cl->bstats);
1613         err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1614         if (err) {
1615                 kfree(cl);
1616                 goto failure;
1617         }
1618 
1619         if (tca[TCA_RATE]) {
1620                 err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1621                                         NULL, true, tca[TCA_RATE]);
1622                 if (err) {
1623                         NL_SET_ERR_MSG(extack, "Couldn't create new estimator");
1624                         tcf_block_put(cl->block);
1625                         kfree(cl);
1626                         goto failure;
1627                 }
1628         }
1629 
1630         cl->R_tab = rtab;
1631         rtab = NULL;
1632         cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
1633                                   NULL);
1634         if (!cl->q)
1635                 cl->q = &noop_qdisc;
1636         else
1637                 qdisc_hash_add(cl->q, true);
1638 
1639         cl->common.classid = classid;
1640         cl->tparent = parent;
1641         cl->qdisc = sch;
1642         cl->allot = parent->allot;
1643         cl->quantum = cl->allot;
1644         cl->weight = cl->R_tab->rate.rate;
1645 
1646         sch_tree_lock(sch);
1647         cbq_link_class(cl);
1648         cl->borrow = cl->tparent;
1649         if (cl->tparent != &q->link)
1650                 cl->share = cl->tparent;
1651         cbq_adjust_levels(parent);
1652         cl->minidle = -0x7FFFFFFF;
1653         cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1654         cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1655         if (cl->ewma_log == 0)
1656                 cl->ewma_log = q->link.ewma_log;
1657         if (cl->maxidle == 0)
1658                 cl->maxidle = q->link.maxidle;
1659         if (cl->avpkt == 0)
1660                 cl->avpkt = q->link.avpkt;
1661         if (tb[TCA_CBQ_FOPT])
1662                 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1663         sch_tree_unlock(sch);
1664 
1665         qdisc_class_hash_grow(sch, &q->clhash);
1666 
1667         *arg = (unsigned long)cl;
1668         return 0;
1669 
1670 failure:
1671         qdisc_put_rtab(rtab);
1672         return err;
1673 }
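/*
 * A configuration sketch (device, rates and class ids are illustrative):
 * a new class created with
 *
 *	# tc class add dev eth0 parent 1: classid 1:10 cbq \
 *		bandwidth 100mbit rate 10mbit allot 1514 prio 5 \
 *		avpkt 1000 maxburst 20 bounded
 *
 * reaches this function with, roughly, "rate" carried in TCA_CBQ_RATE,
 * allot/prio/weight in TCA_CBQ_WRROPT and avpkt/maxburst feeding
 * TCA_CBQ_LSSOPT; all three groups are required when creating a class,
 * as checked above.
 */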
1674 
1675 static int cbq_delete(struct Qdisc *sch, unsigned long arg,
1676                       struct netlink_ext_ack *extack)
1677 {
1678         struct cbq_sched_data *q = qdisc_priv(sch);
1679         struct cbq_class *cl = (struct cbq_class *)arg;
1680 
1681         if (cl->filters || cl->children || cl == &q->link)
1682                 return -EBUSY;
1683 
1684         sch_tree_lock(sch);
1685 
1686         qdisc_purge_queue(cl->q);
1687 
1688         if (cl->next_alive)
1689                 cbq_deactivate_class(cl);
1690 
1691         if (q->tx_borrowed == cl)
1692                 q->tx_borrowed = q->tx_class;
1693         if (q->tx_class == cl) {
1694                 q->tx_class = NULL;
1695                 q->tx_borrowed = NULL;
1696         }
1697 #ifdef CONFIG_NET_CLS_ACT
1698         if (q->rx_class == cl)
1699                 q->rx_class = NULL;
1700 #endif
1701 
1702         cbq_unlink_class(cl);
1703         cbq_adjust_levels(cl->tparent);
1704         cl->defmap = 0;
1705         cbq_sync_defmap(cl);
1706 
1707         cbq_rmprio(q, cl);
1708         sch_tree_unlock(sch);
1709 
1710         cbq_destroy_class(sch, cl);
1711         return 0;
1712 }
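/*
 * A teardown sketch (names illustrative): "tc class del dev eth0 classid
 * 1:10" reaches this function and succeeds only once no filter result
 * points at 1:10 and the class has no children; otherwise the -EBUSY
 * check above refuses, so filters and subclasses have to be removed
 * first.
 */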
1713 
1714 static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg,
1715                                        struct netlink_ext_ack *extack)
1716 {
1717         struct cbq_sched_data *q = qdisc_priv(sch);
1718         struct cbq_class *cl = (struct cbq_class *)arg;
1719 
1720         if (cl == NULL)
1721                 cl = &q->link;
1722 
1723         return cl->block;
1724 }
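/*
 * Filter-attachment note (names illustrative): "tc filter add dev eth0
 * parent 1: ..." grafts classifiers onto the block returned here for the
 * root (link) class, while "parent 1:10" selects the private block of
 * class 1:10; CBQ keeps a separate filter list per class.
 */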
1725 
1726 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1727                                      u32 classid)
1728 {
1729         struct cbq_sched_data *q = qdisc_priv(sch);
1730         struct cbq_class *p = (struct cbq_class *)parent;
1731         struct cbq_class *cl = cbq_class_lookup(q, classid);
1732 
1733         if (cl) {
1734                 if (p && p->level <= cl->level)
1735                         return 0;
1736                 cl->filters++;
1737                 return (unsigned long)cl;
1738         }
1739         return 0;
1740 }
1741 
1742 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1743 {
1744         struct cbq_class *cl = (struct cbq_class *)arg;
1745 
1746         cl->filters--;
1747 }
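/*
 * A binding sketch (device and ids illustrative): a classifier whose
 * result points at a class, e.g.
 *
 *	# tc filter add dev eth0 parent 1: protocol ip u32 \
 *		match ip dport 80 0xffff flowid 1:10
 *
 * takes a reference here via "flowid 1:10" (cl->filters++), which is
 * what makes cbq_delete() above return -EBUSY until the filter is gone;
 * the level check refuses targets that are not strictly deeper in the
 * tree than the class the filter is attached to.
 */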
1748 
1749 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1750 {
1751         struct cbq_sched_data *q = qdisc_priv(sch);
1752         struct cbq_class *cl;
1753         unsigned int h;
1754 
1755         if (arg->stop)
1756                 return;
1757 
1758         for (h = 0; h < q->clhash.hashsize; h++) {
1759                 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1760                         if (arg->count < arg->skip) {
1761                                 arg->count++;
1762                                 continue;
1763                         }
1764                         if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1765                                 arg->stop = 1;
1766                                 return;
1767                         }
1768                         arg->count++;
1769                 }
1770         }
1771 }
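/*
 * A walker sketch (the callback name is made up): the generic dump code
 * behind "tc class show" drives this hook with a struct qdisc_walker
 * whose ->fn emits one record per class; a minimal callback has the
 * shape
 *
 *	static int show_one(struct Qdisc *sch, unsigned long cl,
 *			    struct qdisc_walker *w)
 *	{
 *		return 0;	// a negative return stops the walk
 *	}
 *
 * cbq_walk() honours ->skip and ->count and sets ->stop on a negative
 * return, exactly as the loop above shows.
 */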
1772 
1773 static const struct Qdisc_class_ops cbq_class_ops = {
1774         .graft          =       cbq_graft,
1775         .leaf           =       cbq_leaf,
1776         .qlen_notify    =       cbq_qlen_notify,
1777         .find           =       cbq_find,
1778         .change         =       cbq_change_class,
1779         .delete         =       cbq_delete,
1780         .walk           =       cbq_walk,
1781         .tcf_block      =       cbq_tcf_block,
1782         .bind_tcf       =       cbq_bind_filter,
1783         .unbind_tcf     =       cbq_unbind_filter,
1784         .dump           =       cbq_dump_class,
1785         .dump_stats     =       cbq_dump_class_stats,
1786 };
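/*
 * Roughly, these hooks back the user-visible class and filter commands:
 * .change serves "tc class add/change/replace", .delete "tc class del",
 * .walk with .dump/.dump_stats "tc [-s] class show", .graft/.leaf attach
 * and look up per-class child qdiscs, and .tcf_block/.bind_tcf/
 * .unbind_tcf serve "tc filter" operations on this qdisc.
 */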
1787 
1788 static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1789         .next           =       NULL,
1790         .cl_ops         =       &cbq_class_ops,
1791         .id             =       "cbq",
1792         .priv_size      =       sizeof(struct cbq_sched_data),
1793         .enqueue        =       cbq_enqueue,
1794         .dequeue        =       cbq_dequeue,
1795         .peek           =       qdisc_peek_dequeued,
1796         .init           =       cbq_init,
1797         .reset          =       cbq_reset,
1798         .destroy        =       cbq_destroy,
1799         .change         =       NULL,
1800         .dump           =       cbq_dump,
1801         .dump_stats     =       cbq_dump_stats,
1802         .owner          =       THIS_MODULE,
1803 };
1804 
1805 static int __init cbq_module_init(void)
1806 {
1807         return register_qdisc(&cbq_qdisc_ops);
1808 }
1809 static void __exit cbq_module_exit(void)
1810 {
1811         unregister_qdisc(&cbq_qdisc_ops);
1812 }
1813 module_init(cbq_module_init)
1814 module_exit(cbq_module_exit)
1815 MODULE_LICENSE("GPL");
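/*
 * A module usage sketch (device and numbers illustrative): with this
 * module loaded, register_qdisc() publishes the "cbq" id, so a root
 * discipline can be created with something like
 *
 *	# tc qdisc add dev eth0 root handle 1: cbq bandwidth 100mbit avpkt 1000
 *
 * after which classes, child qdiscs and filters are managed through the
 * class operations shown earlier in this file.
 */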
1816 
