~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/net/sched/sch_api.c

Version: ~ [ linux-5.13-rc1 ] ~ [ linux-5.12.2 ] ~ [ linux-5.11.19 ] ~ [ linux-5.10.35 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.117 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.190 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.232 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.268 ] ~ [ linux-4.8.17 ] ~ [ linux-4.7.10 ] ~ [ linux-4.6.7 ] ~ [ linux-4.5.7 ] ~ [ linux-4.4.268 ] ~ [ linux-4.3.6 ] ~ [ linux-4.2.8 ] ~ [ linux-4.1.52 ] ~ [ linux-4.0.9 ] ~ [ linux-3.18.140 ] ~ [ linux-3.16.85 ] ~ [ linux-3.14.79 ] ~ [ linux-3.12.74 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.5 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 // SPDX-License-Identifier: GPL-2.0-or-later
  2 /*
  3  * net/sched/sch_api.c  Packet scheduler API.
  4  *
  5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  6  *
  7  * Fixes:
  8  *
  9  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 10  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 11  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 12  */
 13 
 14 #include <linux/module.h>
 15 #include <linux/types.h>
 16 #include <linux/kernel.h>
 17 #include <linux/string.h>
 18 #include <linux/errno.h>
 19 #include <linux/skbuff.h>
 20 #include <linux/init.h>
 21 #include <linux/proc_fs.h>
 22 #include <linux/seq_file.h>
 23 #include <linux/kmod.h>
 24 #include <linux/list.h>
 25 #include <linux/hrtimer.h>
 26 #include <linux/slab.h>
 27 #include <linux/hashtable.h>
 28 
 29 #include <net/net_namespace.h>
 30 #include <net/sock.h>
 31 #include <net/netlink.h>
 32 #include <net/pkt_sched.h>
 33 #include <net/pkt_cls.h>
 34 
 35 #include <trace/events/qdisc.h>
 36 
 37 /*
 38 
 39    Short review.
 40    -------------
 41 
 42    This file consists of two interrelated parts:
 43 
 44    1. queueing disciplines manager frontend.
 45    2. traffic classes manager frontend.
 46 
 47    Generally, queueing discipline ("qdisc") is a black box,
 48    which is able to enqueue packets and to dequeue them (when
 49    device is ready to send something) in order and at times
 50    determined by algorithm hidden in it.
 51 
 52    qdisc's are divided to two categories:
 53    - "queues", which have no internal structure visible from outside.
 54    - "schedulers", which split all the packets to "traffic classes",
 55      using "packet classifiers" (look at cls_api.c)
 56 
 57    In turn, classes may have child qdiscs (as rule, queues)
 58    attached to them etc. etc. etc.
 59 
 60    The goal of the routines in this file is to translate
 61    information supplied by user in the form of handles
 62    to more intelligible for kernel form, to make some sanity
 63    checks and part of work, which is common to all qdiscs
 64    and to provide rtnetlink notifications.
 65 
 66    All real intelligent work is done inside qdisc modules.
 67 
 68 
 69 
 70    Every discipline has two major routines: enqueue and dequeue.
 71 
 72    ---dequeue
 73 
 74    dequeue usually returns a skb to send. It is allowed to return NULL,
 75    but it does not mean that queue is empty, it just means that
 76    discipline does not want to send anything this time.
 77    Queue is really empty if q->q.qlen == 0.
 78    For complicated disciplines with multiple queues q->q is not
 79    real packet queue, but however q->q.qlen must be valid.
 80 
 81    ---enqueue
 82 
 83    enqueue returns 0, if packet was enqueued successfully.
 84    If packet (this one or another one) was dropped, it returns
 85    not zero error code.
 86    NET_XMIT_DROP        - this packet dropped
 87      Expected action: do not backoff, but wait until queue will clear.
 88    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
 89      Expected action: backoff or ignore
 90 
 91    Auxiliary routines:
 92 
 93    ---peek
 94 
 95    like dequeue but without removing a packet from the queue
 96 
 97    ---reset
 98 
 99    returns qdisc to initial state: purge all buffers, clear all
100    timers, counters (except for statistics) etc.
101 
102    ---init
103 
104    initializes newly created qdisc.
105 
106    ---destroy
107 
108    destroys resources allocated by init and during lifetime of qdisc.
109 
110    ---change
111 
112    changes qdisc parameters.
113  */
114 
115 /* Protects list of registered TC modules. It is pure SMP lock. */
116 static DEFINE_RWLOCK(qdisc_mod_lock);
117 
118 
119 /************************************************
120  *      Queueing disciplines manipulation.      *
121  ************************************************/
122 
123 
124 /* The list of all installed queueing disciplines. */
125 
126 static struct Qdisc_ops *qdisc_base;
127 
128 /* Register/unregister queueing discipline */
129 
130 int register_qdisc(struct Qdisc_ops *qops)
131 {
132         struct Qdisc_ops *q, **qp;
133         int rc = -EEXIST;
134 
135         write_lock(&qdisc_mod_lock);
136         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
137                 if (!strcmp(qops->id, q->id))
138                         goto out;
139 
140         if (qops->enqueue == NULL)
141                 qops->enqueue = noop_qdisc_ops.enqueue;
142         if (qops->peek == NULL) {
143                 if (qops->dequeue == NULL)
144                         qops->peek = noop_qdisc_ops.peek;
145                 else
146                         goto out_einval;
147         }
148         if (qops->dequeue == NULL)
149                 qops->dequeue = noop_qdisc_ops.dequeue;
150 
151         if (qops->cl_ops) {
152                 const struct Qdisc_class_ops *cops = qops->cl_ops;
153 
154                 if (!(cops->find && cops->walk && cops->leaf))
155                         goto out_einval;
156 
157                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
158                         goto out_einval;
159         }
160 
161         qops->next = NULL;
162         *qp = qops;
163         rc = 0;
164 out:
165         write_unlock(&qdisc_mod_lock);
166         return rc;
167 
168 out_einval:
169         rc = -EINVAL;
170         goto out;
171 }
172 EXPORT_SYMBOL(register_qdisc);
173 
174 int unregister_qdisc(struct Qdisc_ops *qops)
175 {
176         struct Qdisc_ops *q, **qp;
177         int err = -ENOENT;
178 
179         write_lock(&qdisc_mod_lock);
180         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
181                 if (q == qops)
182                         break;
183         if (q) {
184                 *qp = q->next;
185                 q->next = NULL;
186                 err = 0;
187         }
188         write_unlock(&qdisc_mod_lock);
189         return err;
190 }
191 EXPORT_SYMBOL(unregister_qdisc);
192 
193 /* Get default qdisc if not otherwise specified */
194 void qdisc_get_default(char *name, size_t len)
195 {
196         read_lock(&qdisc_mod_lock);
197         strlcpy(name, default_qdisc_ops->id, len);
198         read_unlock(&qdisc_mod_lock);
199 }
200 
201 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
202 {
203         struct Qdisc_ops *q = NULL;
204 
205         for (q = qdisc_base; q; q = q->next) {
206                 if (!strcmp(name, q->id)) {
207                         if (!try_module_get(q->owner))
208                                 q = NULL;
209                         break;
210                 }
211         }
212 
213         return q;
214 }
215 
216 /* Set new default qdisc to use */
217 int qdisc_set_default(const char *name)
218 {
219         const struct Qdisc_ops *ops;
220 
221         if (!capable(CAP_NET_ADMIN))
222                 return -EPERM;
223 
224         write_lock(&qdisc_mod_lock);
225         ops = qdisc_lookup_default(name);
226         if (!ops) {
227                 /* Not found, drop lock and try to load module */
228                 write_unlock(&qdisc_mod_lock);
229                 request_module("sch_%s", name);
230                 write_lock(&qdisc_mod_lock);
231 
232                 ops = qdisc_lookup_default(name);
233         }
234 
235         if (ops) {
236                 /* Set new default */
237                 module_put(default_qdisc_ops->owner);
238                 default_qdisc_ops = ops;
239         }
240         write_unlock(&qdisc_mod_lock);
241 
242         return ops ? 0 : -ENOENT;
243 }
244 
#ifdef CONFIG_NET_SCH_DEFAULT
/* Late initcall: install the qdisc named by CONFIG_DEFAULT_NET_SCH as the
 * default and propagate the result of qdisc_set_default().
 */
static int __init sch_default_qdisc(void)
{
	int err = qdisc_set_default(CONFIG_DEFAULT_NET_SCH);

	return err;
}
late_initcall(sch_default_qdisc);
#endif
253 
254 /* We know handle. Find qdisc among all qdisc's attached to device
255  * (root qdisc, all its children, children of children etc.)
256  * Note: caller either uses rtnl or rcu_read_lock()
257  */
258 
259 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
260 {
261         struct Qdisc *q;
262 
263         if (!qdisc_dev(root))
264                 return (root->handle == handle ? root : NULL);
265 
266         if (!(root->flags & TCQ_F_BUILTIN) &&
267             root->handle == handle)
268                 return root;
269 
270         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
271                                    lockdep_rtnl_is_held()) {
272                 if (q->handle == handle)
273                         return q;
274         }
275         return NULL;
276 }
277 
278 void qdisc_hash_add(struct Qdisc *q, bool invisible)
279 {
280         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
281                 ASSERT_RTNL();
282                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
283                 if (invisible)
284                         q->flags |= TCQ_F_INVISIBLE;
285         }
286 }
287 EXPORT_SYMBOL(qdisc_hash_add);
288 
289 void qdisc_hash_del(struct Qdisc *q)
290 {
291         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
292                 ASSERT_RTNL();
293                 hash_del_rcu(&q->hash);
294         }
295 }
296 EXPORT_SYMBOL(qdisc_hash_del);
297 
298 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
299 {
300         struct Qdisc *q;
301 
302         if (!handle)
303                 return NULL;
304         q = qdisc_match_from_root(dev->qdisc, handle);
305         if (q)
306                 goto out;
307 
308         if (dev_ingress_queue(dev))
309                 q = qdisc_match_from_root(
310                         dev_ingress_queue(dev)->qdisc_sleeping,
311                         handle);
312 out:
313         return q;
314 }
315 
316 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
317 {
318         struct netdev_queue *nq;
319         struct Qdisc *q;
320 
321         if (!handle)
322                 return NULL;
323         q = qdisc_match_from_root(dev->qdisc, handle);
324         if (q)
325                 goto out;
326 
327         nq = dev_ingress_queue_rcu(dev);
328         if (nq)
329                 q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
330 out:
331         return q;
332 }
333 
334 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
335 {
336         unsigned long cl;
337         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
338 
339         if (cops == NULL)
340                 return NULL;
341         cl = cops->find(p, classid);
342 
343         if (cl == 0)
344                 return NULL;
345         return cops->leaf(p, cl);
346 }
347 
348 /* Find queueing discipline by name */
349 
350 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
351 {
352         struct Qdisc_ops *q = NULL;
353 
354         if (kind) {
355                 read_lock(&qdisc_mod_lock);
356                 for (q = qdisc_base; q; q = q->next) {
357                         if (nla_strcmp(kind, q->id) == 0) {
358                                 if (!try_module_get(q->owner))
359                                         q = NULL;
360                                 break;
361                         }
362                 }
363                 read_unlock(&qdisc_mod_lock);
364         }
365         return q;
366 }
367 
368 /* The linklayer setting were not transferred from iproute2, in older
369  * versions, and the rate tables lookup systems have been dropped in
370  * the kernel. To keep backward compatible with older iproute2 tc
371  * utils, we detect the linklayer setting by detecting if the rate
372  * table were modified.
373  *
374  * For linklayer ATM table entries, the rate table will be aligned to
375  * 48 bytes, thus some table entries will contain the same value.  The
376  * mpu (min packet unit) is also encoded into the old rate table, thus
377  * starting from the mpu, we find low and high table entries for
378  * mapping this cell.  If these entries contain the same value, when
379  * the rate tables have been modified for linklayer ATM.
380  *
381  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
382  * and then roundup to the next cell, calc the table entry one below,
383  * and compare.
384  */
385 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
386 {
387         int low       = roundup(r->mpu, 48);
388         int high      = roundup(low+1, 48);
389         int cell_low  = low >> r->cell_log;
390         int cell_high = (high >> r->cell_log) - 1;
391 
392         /* rtab is too inaccurate at rates > 100Mbit/s */
393         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
394                 pr_debug("TC linklayer: Giving up ATM detection\n");
395                 return TC_LINKLAYER_ETHERNET;
396         }
397 
398         if ((cell_high > cell_low) && (cell_high < 256)
399             && (rtab[cell_low] == rtab[cell_high])) {
400                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
401                          cell_low, cell_high, rtab[cell_high]);
402                 return TC_LINKLAYER_ATM;
403         }
404         return TC_LINKLAYER_ETHERNET;
405 }
406 
407 static struct qdisc_rate_table *qdisc_rtab_list;
408 
409 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
410                                         struct nlattr *tab,
411                                         struct netlink_ext_ack *extack)
412 {
413         struct qdisc_rate_table *rtab;
414 
415         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
416             nla_len(tab) != TC_RTAB_SIZE) {
417                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
418                 return NULL;
419         }
420 
421         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
422                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
423                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
424                         rtab->refcnt++;
425                         return rtab;
426                 }
427         }
428 
429         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
430         if (rtab) {
431                 rtab->rate = *r;
432                 rtab->refcnt = 1;
433                 memcpy(rtab->data, nla_data(tab), 1024);
434                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
435                         r->linklayer = __detect_linklayer(r, rtab->data);
436                 rtab->next = qdisc_rtab_list;
437                 qdisc_rtab_list = rtab;
438         } else {
439                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
440         }
441         return rtab;
442 }
443 EXPORT_SYMBOL(qdisc_get_rtab);
444 
445 void qdisc_put_rtab(struct qdisc_rate_table *tab)
446 {
447         struct qdisc_rate_table *rtab, **rtabp;
448 
449         if (!tab || --tab->refcnt)
450                 return;
451 
452         for (rtabp = &qdisc_rtab_list;
453              (rtab = *rtabp) != NULL;
454              rtabp = &rtab->next) {
455                 if (rtab == tab) {
456                         *rtabp = rtab->next;
457                         kfree(rtab);
458                         return;
459                 }
460         }
461 }
462 EXPORT_SYMBOL(qdisc_put_rtab);
463 
464 static LIST_HEAD(qdisc_stab_list);
465 
/* Netlink attribute policy handed to nla_parse_nested_deprecated() in
 * qdisc_get_stab(): TCA_STAB_BASE declares a length of
 * sizeof(struct tc_sizespec), TCA_STAB_DATA is typed NLA_BINARY.
 */
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};
470 
471 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
472                                                struct netlink_ext_ack *extack)
473 {
474         struct nlattr *tb[TCA_STAB_MAX + 1];
475         struct qdisc_size_table *stab;
476         struct tc_sizespec *s;
477         unsigned int tsize = 0;
478         u16 *tab = NULL;
479         int err;
480 
481         err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
482                                           extack);
483         if (err < 0)
484                 return ERR_PTR(err);
485         if (!tb[TCA_STAB_BASE]) {
486                 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
487                 return ERR_PTR(-EINVAL);
488         }
489 
490         s = nla_data(tb[TCA_STAB_BASE]);
491 
492         if (s->tsize > 0) {
493                 if (!tb[TCA_STAB_DATA]) {
494                         NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
495                         return ERR_PTR(-EINVAL);
496                 }
497                 tab = nla_data(tb[TCA_STAB_DATA]);
498                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
499         }
500 
501         if (tsize != s->tsize || (!tab && tsize > 0)) {
502                 NL_SET_ERR_MSG(extack, "Invalid size of size table");
503                 return ERR_PTR(-EINVAL);
504         }
505 
506         list_for_each_entry(stab, &qdisc_stab_list, list) {
507                 if (memcmp(&stab->szopts, s, sizeof(*s)))
508                         continue;
509                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
510                         continue;
511                 stab->refcnt++;
512                 return stab;
513         }
514 
515         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
516         if (!stab)
517                 return ERR_PTR(-ENOMEM);
518 
519         stab->refcnt = 1;
520         stab->szopts = *s;
521         if (tsize > 0)
522                 memcpy(stab->data, tab, tsize * sizeof(u16));
523 
524         list_add_tail(&stab->list, &qdisc_stab_list);
525 
526         return stab;
527 }
528 
529 void qdisc_put_stab(struct qdisc_size_table *tab)
530 {
531         if (!tab)
532                 return;
533 
534         if (--tab->refcnt == 0) {
535                 list_del(&tab->list);
536                 kfree_rcu(tab, rcu);
537         }
538 }
539 EXPORT_SYMBOL(qdisc_put_stab);
540 
541 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
542 {
543         struct nlattr *nest;
544 
545         nest = nla_nest_start_noflag(skb, TCA_STAB);
546         if (nest == NULL)
547                 goto nla_put_failure;
548         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
549                 goto nla_put_failure;
550         nla_nest_end(skb, nest);
551 
552         return skb->len;
553 
554 nla_put_failure:
555         return -1;
556 }
557 
558 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
559                                const struct qdisc_size_table *stab)
560 {
561         int pkt_len, slot;
562 
563         pkt_len = skb->len + stab->szopts.overhead;
564         if (unlikely(!stab->szopts.tsize))
565                 goto out;
566 
567         slot = pkt_len + stab->szopts.cell_align;
568         if (unlikely(slot < 0))
569                 slot = 0;
570 
571         slot >>= stab->szopts.cell_log;
572         if (likely(slot < stab->szopts.tsize))
573                 pkt_len = stab->data[slot];
574         else
575                 pkt_len = stab->data[stab->szopts.tsize - 1] *
576                                 (slot / stab->szopts.tsize) +
577                                 stab->data[slot % stab->szopts.tsize];
578 
579         pkt_len <<= stab->szopts.size_log;
580 out:
581         if (unlikely(pkt_len < 1))
582                 pkt_len = 1;
583         qdisc_skb_cb(skb)->pkt_len = pkt_len;
584 }
585 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
586 
587 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
588 {
589         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
590                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
591                         txt, qdisc->ops->id, qdisc->handle >> 16);
592                 qdisc->flags |= TCQ_F_WARN_NONWC;
593         }
594 }
595 EXPORT_SYMBOL(qdisc_warn_nonwc);
596 
597 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
598 {
599         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
600                                                  timer);
601 
602         rcu_read_lock();
603         __netif_schedule(qdisc_root(wd->qdisc));
604         rcu_read_unlock();
605 
606         return HRTIMER_NORESTART;
607 }
608 
609 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
610                                  clockid_t clockid)
611 {
612         hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
613         wd->timer.function = qdisc_watchdog;
614         wd->qdisc = qdisc;
615 }
616 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
617 
618 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
619 {
620         qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
621 }
622 EXPORT_SYMBOL(qdisc_watchdog_init);
623 
624 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
625                                       u64 delta_ns)
626 {
627         if (test_bit(__QDISC_STATE_DEACTIVATED,
628                      &qdisc_root_sleeping(wd->qdisc)->state))
629                 return;
630 
631         if (hrtimer_is_queued(&wd->timer)) {
632                 /* If timer is already set in [expires, expires + delta_ns],
633                  * do not reprogram it.
634                  */
635                 if (wd->last_expires - expires <= delta_ns)
636                         return;
637         }
638 
639         wd->last_expires = expires;
640         hrtimer_start_range_ns(&wd->timer,
641                                ns_to_ktime(expires),
642                                delta_ns,
643                                HRTIMER_MODE_ABS_PINNED);
644 }
645 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
646 
647 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
648 {
649         hrtimer_cancel(&wd->timer);
650 }
651 EXPORT_SYMBOL(qdisc_watchdog_cancel);
652 
653 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
654 {
655         struct hlist_head *h;
656         unsigned int i;
657 
658         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
659 
660         if (h != NULL) {
661                 for (i = 0; i < n; i++)
662                         INIT_HLIST_HEAD(&h[i]);
663         }
664         return h;
665 }
666 
667 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
668 {
669         struct Qdisc_class_common *cl;
670         struct hlist_node *next;
671         struct hlist_head *nhash, *ohash;
672         unsigned int nsize, nmask, osize;
673         unsigned int i, h;
674 
675         /* Rehash when load factor exceeds 0.75 */
676         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
677                 return;
678         nsize = clhash->hashsize * 2;
679         nmask = nsize - 1;
680         nhash = qdisc_class_hash_alloc(nsize);
681         if (nhash == NULL)
682                 return;
683 
684         ohash = clhash->hash;
685         osize = clhash->hashsize;
686 
687         sch_tree_lock(sch);
688         for (i = 0; i < osize; i++) {
689                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
690                         h = qdisc_class_hash(cl->classid, nmask);
691                         hlist_add_head(&cl->hnode, &nhash[h]);
692                 }
693         }
694         clhash->hash     = nhash;
695         clhash->hashsize = nsize;
696         clhash->hashmask = nmask;
697         sch_tree_unlock(sch);
698 
699         kvfree(ohash);
700 }
701 EXPORT_SYMBOL(qdisc_class_hash_grow);
702 
703 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
704 {
705         unsigned int size = 4;
706 
707         clhash->hash = qdisc_class_hash_alloc(size);
708         if (!clhash->hash)
709                 return -ENOMEM;
710         clhash->hashsize  = size;
711         clhash->hashmask  = size - 1;
712         clhash->hashelems = 0;
713         return 0;
714 }
715 EXPORT_SYMBOL(qdisc_class_hash_init);
716 
717 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
718 {
719         kvfree(clhash->hash);
720 }
721 EXPORT_SYMBOL(qdisc_class_hash_destroy);
722 
723 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
724                              struct Qdisc_class_common *cl)
725 {
726         unsigned int h;
727 
728         INIT_HLIST_NODE(&cl->hnode);
729         h = qdisc_class_hash(cl->classid, clhash->hashmask);
730         hlist_add_head(&cl->hnode, &clhash->hash[h]);
731         clhash->hashelems++;
732 }
733 EXPORT_SYMBOL(qdisc_class_hash_insert);
734 
735 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
736                              struct Qdisc_class_common *cl)
737 {
738         hlist_del(&cl->hnode);
739         clhash->hashelems--;
740 }
741 EXPORT_SYMBOL(qdisc_class_hash_remove);
742 
743 /* Allocate an unique handle from space managed by kernel
744  * Possible range is [8000-FFFF]:0000 (0x8000 values)
745  */
746 static u32 qdisc_alloc_handle(struct net_device *dev)
747 {
748         int i = 0x8000;
749         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
750 
751         do {
752                 autohandle += TC_H_MAKE(0x10000U, 0);
753                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
754                         autohandle = TC_H_MAKE(0x80000000U, 0);
755                 if (!qdisc_lookup(dev, autohandle))
756                         return autohandle;
757                 cond_resched();
758         } while (--i > 0);
759 
760         return 0;
761 }
762 
763 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
764 {
765         bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
766         const struct Qdisc_class_ops *cops;
767         unsigned long cl;
768         u32 parentid;
769         bool notify;
770         int drops;
771 
772         if (n == 0 && len == 0)
773                 return;
774         drops = max_t(int, n, 0);
775         rcu_read_lock();
776         while ((parentid = sch->parent)) {
777                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
778                         break;
779 
780                 if (sch->flags & TCQ_F_NOPARENT)
781                         break;
782                 /* Notify parent qdisc only if child qdisc becomes empty.
783                  *
784                  * If child was empty even before update then backlog
785                  * counter is screwed and we skip notification because
786                  * parent class is already passive.
787                  *
788                  * If the original child was offloaded then it is allowed
789                  * to be seem as empty, so the parent is notified anyway.
790                  */
791                 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
792                                                        !qdisc_is_offloaded);
793                 /* TODO: perform the search on a per txq basis */
794                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
795                 if (sch == NULL) {
796                         WARN_ON_ONCE(parentid != TC_H_ROOT);
797                         break;
798                 }
799                 cops = sch->ops->cl_ops;
800                 if (notify && cops->qlen_notify) {
801                         cl = cops->find(sch, parentid);
802                         cops->qlen_notify(sch, cl);
803                 }
804                 sch->q.qlen -= n;
805                 sch->qstats.backlog -= len;
806                 __qdisc_qstats_drop(sch, drops);
807         }
808         rcu_read_unlock();
809 }
810 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
811 
812 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
813                               void *type_data)
814 {
815         struct net_device *dev = qdisc_dev(sch);
816         int err;
817 
818         sch->flags &= ~TCQ_F_OFFLOADED;
819         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
820                 return 0;
821 
822         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
823         if (err == -EOPNOTSUPP)
824                 return 0;
825 
826         if (!err)
827                 sch->flags |= TCQ_F_OFFLOADED;
828 
829         return err;
830 }
831 EXPORT_SYMBOL(qdisc_offload_dump_helper);
832 
833 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
834                                 struct Qdisc *new, struct Qdisc *old,
835                                 enum tc_setup_type type, void *type_data,
836                                 struct netlink_ext_ack *extack)
837 {
838         bool any_qdisc_is_offloaded;
839         int err;
840 
841         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
842                 return;
843 
844         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
845 
846         /* Don't report error if the graft is part of destroy operation. */
847         if (!err || !new || new == &noop_qdisc)
848                 return;
849 
850         /* Don't report error if the parent, the old child and the new
851          * one are not offloaded.
852          */
853         any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
854         any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
855         any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
856 
857         if (any_qdisc_is_offloaded)
858                 NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
859 }
860 EXPORT_SYMBOL(qdisc_offload_graft_helper);
861 
862 static void qdisc_offload_graft_root(struct net_device *dev,
863                                      struct Qdisc *new, struct Qdisc *old,
864                                      struct netlink_ext_ack *extack)
865 {
866         struct tc_root_qopt_offload graft_offload = {
867                 .command        = TC_ROOT_GRAFT,
868                 .handle         = new ? new->handle : 0,
869                 .ingress        = (new && new->flags & TCQ_F_INGRESS) ||
870                                   (old && old->flags & TCQ_F_INGRESS),
871         };
872 
873         qdisc_offload_graft_helper(dev, NULL, new, old,
874                                    TC_SETUP_ROOT_QDISC, &graft_offload, extack);
875 }
876 
/* Append one netlink message of type "event" describing qdisc "q" to "skb".
 *
 * The message holds a struct tcmsg header (ifindex, parent = clid, handle,
 * refcount in tcm_info) followed by the TCA_KIND string, optional block
 * indexes, the qdisc's own dump, the TCA_HW_OFFLOAD flag, the optional
 * size table and the statistics.
 *
 * Returns skb->len on success. On any failure everything appended since
 * entry is trimmed off "skb" again and -1 is returned.
 */
877 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
878                          u32 portid, u32 seq, u16 flags, int event)
879 {
880         struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
881         struct gnet_stats_queue __percpu *cpu_qstats = NULL;
882         struct tcmsg *tcm;
883         struct nlmsghdr  *nlh;
            /* b remembers the tail on entry; used for trimming and for nlmsg_len */
884         unsigned char *b = skb_tail_pointer(skb);
885         struct gnet_dump d;
886         struct qdisc_size_table *stab;
887         u32 block_index;
888         __u32 qlen;
889 
890         cond_resched();
891         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
892         if (!nlh)
893                 goto out_nlmsg_trim;
894         tcm = nlmsg_data(nlh);
895         tcm->tcm_family = AF_UNSPEC;
896         tcm->tcm__pad1 = 0;
897         tcm->tcm__pad2 = 0;
898         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
899         tcm->tcm_parent = clid;
900         tcm->tcm_handle = q->handle;
            /* tcm_info reports the qdisc's reference count at dump time */
901         tcm->tcm_info = refcount_read(&q->refcnt);
902         if (nla_put_string(skb, TCA_KIND, q->ops->id))
903                 goto nla_put_failure;
            /* Block indexes are emitted only when the qdisc reports a non-zero one */
904         if (q->ops->ingress_block_get) {
905                 block_index = q->ops->ingress_block_get(q);
906                 if (block_index &&
907                     nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
908                         goto nla_put_failure;
909         }
910         if (q->ops->egress_block_get) {
911                 block_index = q->ops->egress_block_get(q);
912                 if (block_index &&
913                     nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
914                         goto nla_put_failure;
915         }
916         if (q->ops->dump && q->ops->dump(q, skb) < 0)
917                 goto nla_put_failure;
918         if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
919                 goto nla_put_failure;
            /* Sampled here, handed to gnet_stats_copy_queue() further down */
920         qlen = qdisc_qlen_sum(q);
921 
922         stab = rtnl_dereference(q->stab);
923         if (stab && qdisc_dump_stab(skb, stab) < 0)
924                 goto nla_put_failure;
925 
926         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
927                                          NULL, &d, TCA_PAD) < 0)
928                 goto nla_put_failure;
929 
930         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
931                 goto nla_put_failure;
932 
            /* Per-CPU counters are passed on only for per-CPU-stats qdiscs;
             * otherwise the helpers below receive NULL for them.
             */
933         if (qdisc_is_percpu_stats(q)) {
934                 cpu_bstats = q->cpu_bstats;
935                 cpu_qstats = q->cpu_qstats;
936         }
937 
938         if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
939                                   &d, cpu_bstats, &q->bstats) < 0 ||
940             gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
941             gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
942                 goto nla_put_failure;
943 
944         if (gnet_stats_finish_copy(&d) < 0)
945                 goto nla_put_failure;
946 
            /* Message length is everything appended since entry */
947         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
948         return skb->len;
949 
            /* Both labels share one exit: undo the partial message */
950 out_nlmsg_trim:
951 nla_put_failure:
952         nlmsg_trim(skb, b);
953         return -1;
954 }
955 
956 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
957 {
958         if (q->flags & TCQ_F_BUILTIN)
959                 return true;
960         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
961                 return true;
962 
963         return false;
964 }
965 
966 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
967                         struct nlmsghdr *n, u32 clid,
968                         struct Qdisc *old, struct Qdisc *new)
969 {
970         struct sk_buff *skb;
971         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
972 
973         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
974         if (!skb)
975                 return -ENOBUFS;
976 
977         if (old && !tc_qdisc_dump_ignore(old, false)) {
978                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
979                                   0, RTM_DELQDISC) < 0)
980                         goto err_out;
981         }
982         if (new && !tc_qdisc_dump_ignore(new, false)) {
983                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
984                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
985                         goto err_out;
986         }
987 
988         if (skb->len)
989                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
990                                       n->nlmsg_flags & NLM_F_ECHO);
991 
992 err_out:
993         kfree_skb(skb);
994         return -EINVAL;
995 }
996 
997 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
998                                struct nlmsghdr *n, u32 clid,
999                                struct Qdisc *old, struct Qdisc *new)
1000 {
1001         if (new || old)
1002                 qdisc_notify(net, skb, n, clid, old, new);
1003 
1004         if (old)
1005                 qdisc_put(old);
1006 }
1007 
1008 static void qdisc_clear_nolock(struct Qdisc *sch)
1009 {
1010         sch->flags &= ~TCQ_F_NOLOCK;
1011         if (!(sch->flags & TCQ_F_CPUSTATS))
1012                 return;
1013 
1014         free_percpu(sch->cpu_bstats);
1015         free_percpu(sch->cpu_qstats);
1016         sch->cpu_bstats = NULL;
1017         sch->cpu_qstats = NULL;
1018         sch->flags &= ~TCQ_F_CPUSTATS;
1019 }
1020 
1021 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1022  * to device "dev".
1023  *
1024  * When appropriate send a netlink notification using 'skb'
1025  * and "n".
1026  *
1027  * On success, destroy old qdisc.
1028  */
1029 
1030 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1031                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1032                        struct Qdisc *new, struct Qdisc *old,
1033                        struct netlink_ext_ack *extack)
1034 {
1035         struct Qdisc *q = old;
1036         struct net *net = dev_net(dev);
1037 
1038         if (parent == NULL) {
1039                 unsigned int i, num_q, ingress;
1040 
1041                 ingress = 0;
1042                 num_q = dev->num_tx_queues;
1043                 if ((q && q->flags & TCQ_F_INGRESS) ||
1044                     (new && new->flags & TCQ_F_INGRESS)) {
1045                         num_q = 1;
1046                         ingress = 1;
1047                         if (!dev_ingress_queue(dev)) {
1048                                 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1049                                 return -ENOENT;
1050                         }
1051                 }
1052 
1053                 if (dev->flags & IFF_UP)
1054                         dev_deactivate(dev);
1055 
1056                 qdisc_offload_graft_root(dev, new, old, extack);
1057 
1058                 if (new && new->ops->attach)
1059                         goto skip;
1060 
1061                 for (i = 0; i < num_q; i++) {
1062                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
1063 
1064                         if (!ingress)
1065                                 dev_queue = netdev_get_tx_queue(dev, i);
1066 
1067                         old = dev_graft_qdisc(dev_queue, new);
1068                         if (new && i > 0)
1069                                 qdisc_refcount_inc(new);
1070 
1071                         if (!ingress)
1072                                 qdisc_put(old);
1073                 }
1074 
1075 skip:
1076                 if (!ingress) {
1077                         notify_and_destroy(net, skb, n, classid,
1078                                            dev->qdisc, new);
1079                         if (new && !new->ops->attach)
1080                                 qdisc_refcount_inc(new);
1081                         dev->qdisc = new ? : &noop_qdisc;
1082 
1083                         if (new && new->ops->attach)
1084                                 new->ops->attach(new);
1085                 } else {
1086                         notify_and_destroy(net, skb, n, classid, old, new);
1087                 }
1088 
1089                 if (dev->flags & IFF_UP)
1090                         dev_activate(dev);
1091         } else {
1092                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1093                 unsigned long cl;
1094                 int err;
1095 
1096                 /* Only support running class lockless if parent is lockless */
1097                 if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
1098                         qdisc_clear_nolock(new);
1099 
1100                 if (!cops || !cops->graft)
1101                         return -EOPNOTSUPP;
1102 
1103                 cl = cops->find(parent, classid);
1104                 if (!cl) {
1105                         NL_SET_ERR_MSG(extack, "Specified class not found");
1106                         return -ENOENT;
1107                 }
1108 
1109                 err = cops->graft(parent, cl, new, &old, extack);
1110                 if (err)
1111                         return err;
1112                 notify_and_destroy(net, skb, n, classid, old, new);
1113         }
1114         return 0;
1115 }
1116 
1117 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1118                                    struct netlink_ext_ack *extack)
1119 {
1120         u32 block_index;
1121 
1122         if (tca[TCA_INGRESS_BLOCK]) {
1123                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1124 
1125                 if (!block_index) {
1126                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1127                         return -EINVAL;
1128                 }
1129                 if (!sch->ops->ingress_block_set) {
1130                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1131                         return -EOPNOTSUPP;
1132                 }
1133                 sch->ops->ingress_block_set(sch, block_index);
1134         }
1135         if (tca[TCA_EGRESS_BLOCK]) {
1136                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1137 
1138                 if (!block_index) {
1139                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1140                         return -EINVAL;
1141                 }
1142                 if (!sch->ops->egress_block_set) {
1143                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1144                         return -EOPNOTSUPP;
1145                 }
1146                 sch->ops->egress_block_set(sch, block_index);
1147         }
1148         return 0;
1149 }
1150 
1151 /*
1152    Allocate and initialize new qdisc.
1153 
1154    Parameters are passed via the parsed netlink attributes in tca.

   The qdisc kind comes from tca[TCA_KIND]; tca[TCA_OPTIONS] is handed to
   the kind's init op, and tca[TCA_STAB] / tca[TCA_RATE] optionally attach
   a size table and a rate estimator. A handle of TC_H_INGRESS creates an
   ingress qdisc; a handle of 0 asks for an automatically allocated one.

   Returns the new qdisc on success. On failure returns NULL and stores
   the negative errno in *errp. -EAGAIN means a module was loaded with
   RTNL temporarily dropped and the caller has to replay the request.
1155  */
1156 
1157 static struct Qdisc *qdisc_create(struct net_device *dev,
1158                                   struct netdev_queue *dev_queue,
1159                                   struct Qdisc *p, u32 parent, u32 handle,
1160                                   struct nlattr **tca, int *errp,
1161                                   struct netlink_ext_ack *extack)
1162 {
1163         int err;
1164         struct nlattr *kind = tca[TCA_KIND];
1165         struct Qdisc *sch;
1166         struct Qdisc_ops *ops;
1167         struct qdisc_size_table *stab;
1168 
1169         ops = qdisc_lookup_ops(kind);
1170 #ifdef CONFIG_MODULES
1171         if (ops == NULL && kind != NULL) {
1172                 char name[IFNAMSIZ];
1173                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
1174                         /* We dropped the RTNL semaphore in order to
1175                          * perform the module load.  So, even if we
1176                          * succeeded in loading the module we have to
1177                          * tell the caller to replay the request.  We
1178                          * indicate this using -EAGAIN.
1179                          * We replay the request because the device may
1180                          * go away in the mean time.
1181                          */
1182                         rtnl_unlock();
1183                         request_module("sch_%s", name);
1184                         rtnl_lock();
1185                         ops = qdisc_lookup_ops(kind);
1186                         if (ops != NULL) {
1187                                 /* We will try again qdisc_lookup_ops,
1188                                  * so don't keep a reference.
1189                                  */
1190                                 module_put(ops->owner);
1191                                 err = -EAGAIN;
1192                                 goto err_out;
1193                         }
1194                 }
1195         }
1196 #endif
1197 
1198         err = -ENOENT;
1199         if (!ops) {
1200                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1201                 goto err_out;
1202         }
1203 
1204         sch = qdisc_alloc(dev_queue, ops, extack);
1205         if (IS_ERR(sch)) {
1206                 err = PTR_ERR(sch);
1207                 goto err_out2;
1208         }
1209 
1210         sch->parent = parent;
1211 
             /* A TC_H_INGRESS handle marks the qdisc as ingress and is
              * rewritten to the ingress major with minor 0; a zero handle
              * is replaced by an automatically allocated one.
              */
1212         if (handle == TC_H_INGRESS) {
1213                 sch->flags |= TCQ_F_INGRESS;
1214                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1215         } else {
1216                 if (handle == 0) {
1217                         handle = qdisc_alloc_handle(dev);
1218                         if (handle == 0) {
1219                                 NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1220                                 err = -ENOSPC;
1221                                 goto err_out3;
1222                         }
1223                 }
1224                 if (!netif_is_multiqueue(dev))
1225                         sch->flags |= TCQ_F_ONETXQUEUE;
1226         }
1227 
1228         sch->handle = handle;
1229 
1230         /* This exist to keep backward compatible with a userspace
1231          * loophole, what allowed userspace to get IFF_NO_QUEUE
1232          * facility on older kernels by setting tx_queue_len=0 (prior
1233          * to qdisc init), and then forgot to reinit tx_queue_len
1234          * before again attaching a qdisc.
1235          */
1236         if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1237                 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1238                 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1239         }
1240 
1241         err = qdisc_block_indexes_set(sch, tca, extack);
1242         if (err)
1243                 goto err_out3;
1244 
1245         if (ops->init) {
1246                 err = ops->init(sch, tca[TCA_OPTIONS], extack);
1247                 if (err != 0)
1248                         goto err_out5;
1249         }
1250 
             /* From here on failures go through err_out4, which also
              * releases the size table.
              */
1251         if (tca[TCA_STAB]) {
1252                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1253                 if (IS_ERR(stab)) {
1254                         err = PTR_ERR(stab);
1255                         goto err_out4;
1256                 }
1257                 rcu_assign_pointer(sch->stab, stab);
1258         }
1259         if (tca[TCA_RATE]) {
1260                 seqcount_t *running;
1261 
1262                 err = -EOPNOTSUPP;
1263                 if (sch->flags & TCQ_F_MQROOT) {
1264                         NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1265                         goto err_out4;
1266                 }
1267 
                     /* Pick the seqcount given to the estimator: the
                      * qdisc's own "running" for root and ingress qdiscs
                      * and for children of an MQROOT parent, otherwise
                      * the one from qdisc_root_sleeping_running().
                      */
1268                 if (sch->parent != TC_H_ROOT &&
1269                     !(sch->flags & TCQ_F_INGRESS) &&
1270                     (!p || !(p->flags & TCQ_F_MQROOT)))
1271                         running = qdisc_root_sleeping_running(sch);
1272                 else
1273                         running = &sch->running;
1274 
1275                 err = gen_new_estimator(&sch->bstats,
1276                                         sch->cpu_bstats,
1277                                         &sch->rate_est,
1278                                         NULL,
1279                                         running,
1280                                         tca[TCA_RATE]);
1281                 if (err) {
1282                         NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1283                         goto err_out4;
1284                 }
1285         }
1286 
1287         qdisc_hash_add(sch, false);
1288         trace_qdisc_create(ops, dev, parent);
1289 
1290         return sch;
1291 
             /* Error unwinding. Labels fall through top to bottom:
              *   err_out5 - ops->init() failed: call ->destroy()
              *   err_out3 - drop the device reference and free sch
              *              (NOTE(review): dev_put() presumably balances a
              *              hold taken in qdisc_alloc() - confirm)
              *   err_out2 - drop the module reference on ops
              *   err_out  - report err through *errp and return NULL
              * err_out4 (below the return) handles failures after a
              * successful init and then joins err_out3.
              */
1292 err_out5:
1293         /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1294         if (ops->destroy)
1295                 ops->destroy(sch);
1296 err_out3:
1297         dev_put(dev);
1298         qdisc_free(sch);
1299 err_out2:
1300         module_put(ops->owner);
1301 err_out:
1302         *errp = err;
1303         return NULL;
1304 
1305 err_out4:
1306         /*
1307          * Any broken qdiscs that would require a ops->reset() here?
1308          * The qdisc was never in action so it shouldn't be necessary.
1309          */
1310         qdisc_put_stab(rtnl_dereference(sch->stab));
1311         if (ops->destroy)
1312                 ops->destroy(sch);
1313         goto err_out3;
1314 }
1315 
1316 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1317                         struct netlink_ext_ack *extack)
1318 {
1319         struct qdisc_size_table *ostab, *stab = NULL;
1320         int err = 0;
1321 
1322         if (tca[TCA_OPTIONS]) {
1323                 if (!sch->ops->change) {
1324                         NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1325                         return -EINVAL;
1326                 }
1327                 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1328                         NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1329                         return -EOPNOTSUPP;
1330                 }
1331                 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1332                 if (err)
1333                         return err;
1334         }
1335 
1336         if (tca[TCA_STAB]) {
1337                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1338                 if (IS_ERR(stab))
1339                         return PTR_ERR(stab);
1340         }
1341 
1342         ostab = rtnl_dereference(sch->stab);
1343         rcu_assign_pointer(sch->stab, stab);
1344         qdisc_put_stab(ostab);
1345 
1346         if (tca[TCA_RATE]) {
1347                 /* NB: ignores errors from replace_estimator
1348                    because change can't be undone. */
1349                 if (sch->flags & TCQ_F_MQROOT)
1350                         goto out;
1351                 gen_replace_estimator(&sch->bstats,
1352                                       sch->cpu_bstats,
1353                                       &sch->rate_est,
1354                                       NULL,
1355                                       qdisc_root_sleeping_running(sch),
1356                                       tca[TCA_RATE]);
1357         }
1358 out:
1359         return 0;
1360 }
1361 
/* Walker state used by check_loop() / check_loop_fn().
 *
 * "w" must stay the first member: check_loop_fn() casts the
 * struct qdisc_walker pointer it receives straight back to
 * struct check_loop_arg.
 */
1362 struct check_loop_arg {
1363         struct qdisc_walker     w;
             /* qdisc that must not show up as a leaf during the walk */
1364         struct Qdisc            *p;
             /* current nesting depth; a loop is reported once it exceeds 7 */
1365         int                     depth;
1366 };
1367 
1368 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1369                          struct qdisc_walker *w);
1370 
1371 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1372 {
1373         struct check_loop_arg   arg;
1374 
1375         if (q->ops->cl_ops == NULL)
1376                 return 0;
1377 
1378         arg.w.stop = arg.w.skip = arg.w.count = 0;
1379         arg.w.fn = check_loop_fn;
1380         arg.depth = depth;
1381         arg.p = p;
1382         q->ops->cl_ops->walk(q, &arg.w);
1383         return arg.w.stop ? -ELOOP : 0;
1384 }
1385 
1386 static int
1387 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1388 {
1389         struct Qdisc *leaf;
1390         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1391         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1392 
1393         leaf = cops->leaf(q, cl);
1394         if (leaf) {
1395                 if (leaf == arg->p || arg->depth > 7)
1396                         return -ELOOP;
1397                 return check_loop(leaf, arg->p, arg->depth + 1);
1398         }
1399         return 0;
1400 }
1401 
/* Netlink attribute policy for the TCA_* attributes of tc requests.
 * In this file it is passed to nlmsg_parse_deprecated() by tc_get_qdisc()
 * and tc_modify_qdisc(). Attributes without an entry get the zeroed
 * default policy.
 */
1402 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1403         [TCA_KIND]              = { .type = NLA_STRING },
             /* binary payload, length limit taken from struct tc_estimator */
1404         [TCA_RATE]              = { .type = NLA_BINARY,
1405                                     .len = sizeof(struct tc_estimator) },
1406         [TCA_STAB]              = { .type = NLA_NESTED },
1407         [TCA_DUMP_INVISIBLE]    = { .type = NLA_FLAG },
1408         [TCA_CHAIN]             = { .type = NLA_U32 },
1409         [TCA_INGRESS_BLOCK]     = { .type = NLA_U32 },
1410         [TCA_EGRESS_BLOCK]      = { .type = NLA_U32 },
1411 };
1412 
1413 /*
1414  * Delete/get qdisc.
 *
 * Processes a qdisc get or delete request carried in message "n".
 *
 * Every message type other than RTM_GETQDISC requires CAP_NET_ADMIN in
 * the user namespace of the socket's network namespace. The target qdisc
 * is located through tcm_parent (device root, ingress queue, or the leaf
 * of a class of the parent qdisc) or, when tcm_parent is zero, through
 * tcm_handle alone. When TCA_KIND is given it must match the qdisc found.
 *
 * RTM_DELQDISC grafts NULL in place of the qdisc found; any other type
 * sends a notification describing it (the result of that send is not
 * checked).
 *
 * Returns 0 on success or a negative errno.
1415  */
1416 
1417 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1418                         struct netlink_ext_ack *extack)
1419 {
1420         struct net *net = sock_net(skb->sk);
1421         struct tcmsg *tcm = nlmsg_data(n);
1422         struct nlattr *tca[TCA_MAX + 1];
1423         struct net_device *dev;
1424         u32 clid;
1425         struct Qdisc *q = NULL;
1426         struct Qdisc *p = NULL;
1427         int err;
1428 
1429         if ((n->nlmsg_type != RTM_GETQDISC) &&
1430             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1431                 return -EPERM;
1432 
1433         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1434                                      rtm_tca_policy, extack);
1435         if (err < 0)
1436                 return err;
1437 
1438         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1439         if (!dev)
1440                 return -ENODEV;
1441 
             /* Locate q: by parent when tcm_parent is set, else by handle */
1442         clid = tcm->tcm_parent;
1443         if (clid) {
1444                 if (clid != TC_H_ROOT) {
1445                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1446                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1447                                 if (!p) {
1448                                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1449                                         return -ENOENT;
1450                                 }
1451                                 q = qdisc_leaf(p, clid);
1452                         } else if (dev_ingress_queue(dev)) {
1453                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1454                         }
1455                 } else {
1456                         q = dev->qdisc;
1457                 }
1458                 if (!q) {
1459                         NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1460                         return -ENOENT;
1461                 }
1462 
                     /* A handle, when supplied, must match the qdisc found */
1463                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1464                         NL_SET_ERR_MSG(extack, "Invalid handle");
1465                         return -EINVAL;
1466                 }
1467         } else {
1468                 q = qdisc_lookup(dev, tcm->tcm_handle);
1469                 if (!q) {
1470                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1471                         return -ENOENT;
1472                 }
1473         }
1474 
1475         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1476                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1477                 return -EINVAL;
1478         }
1479 
1480         if (n->nlmsg_type == RTM_DELQDISC) {
                     /* Deletion needs an explicit parent and a qdisc with
                      * a non-zero handle.
                      */
1481                 if (!clid) {
1482                         NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1483                         return -EINVAL;
1484                 }
1485                 if (q->handle == 0) {
1486                         NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1487                         return -ENOENT;
1488                 }
1489                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1490                 if (err != 0)
1491                         return err;
1492         } else {
1493                 qdisc_notify(net, skb, n, clid, NULL, q);
1494         }
1495         return 0;
1496 }
1497 
1498 /*
1499  * Create/change qdisc.
 *
 * Requires CAP_NET_ADMIN. Depending on what the request resolves to, one
 * of three things happens:
 *  - an existing qdisc is modified in place through qdisc_change() and a
 *    notification is sent;
 *  - an existing qdisc found by tcm_handle gets an extra reference and is
 *    grafted under the requested parent ("graft" label);
 *  - a new qdisc is created and then grafted ("create_n_graft" label),
 *    which requires NLM_F_CREATE.
 * The request is re-parsed and retried from the "replay" label when
 * qdisc_create() fails with -EAGAIN.
 *
 * Returns 0 on success or a negative errno.
1500  */
1501 
1502 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1503                            struct netlink_ext_ack *extack)
1504 {
1505         struct net *net = sock_net(skb->sk);
1506         struct tcmsg *tcm;
1507         struct nlattr *tca[TCA_MAX + 1];
1508         struct net_device *dev;
1509         u32 clid;
1510         struct Qdisc *q, *p;
1511         int err;
1512 
1513         if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1514                 return -EPERM;
1515 
1516 replay:
1517         /* Reinit, just in case something touches this. */
1518         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1519                                      rtm_tca_policy, extack);
1520         if (err < 0)
1521                 return err;
1522 
1523         tcm = nlmsg_data(n);
1524         clid = tcm->tcm_parent;
1525         q = p = NULL;
1526 
1527         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1528         if (!dev)
1529                 return -ENODEV;
1530 
1531 
             /* With a parent given, q becomes the qdisc currently attached
              * there (root, ingress queue, or class leaf of p).
              */
1532         if (clid) {
1533                 if (clid != TC_H_ROOT) {
1534                         if (clid != TC_H_INGRESS) {
1535                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1536                                 if (!p) {
1537                                         NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1538                                         return -ENOENT;
1539                                 }
1540                                 q = qdisc_leaf(p, clid);
1541                         } else if (dev_ingress_queue_create(dev)) {
1542                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1543                         }
1544                 } else {
1545                         q = dev->qdisc;
1546                 }
1547 
1548                 /* It may be default qdisc, ignore it */
1549                 if (q && q->handle == 0)
1550                         q = NULL;
1551 
                     /* Nothing attached, no handle requested, or a handle
                      * different from the attached qdisc: decide between
                      * grafting, creating, or falling through to change.
                      */
1552                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1553                         if (tcm->tcm_handle) {
1554                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1555                                         NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1556                                         return -EEXIST;
1557                                 }
1558                                 if (TC_H_MIN(tcm->tcm_handle)) {
1559                                         NL_SET_ERR_MSG(extack, "Invalid minor handle");
1560                                         return -EINVAL;
1561                                 }
1562                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1563                                 if (!q)
1564                                         goto create_n_graft;
1565                                 if (n->nlmsg_flags & NLM_F_EXCL) {
1566                                         NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1567                                         return -EEXIST;
1568                                 }
1569                                 if (tca[TCA_KIND] &&
1570                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1571                                         NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1572                                         return -EINVAL;
1573                                 }
1574                                 if (q == p ||
1575                                     (p && check_loop(q, p, 0))) {
1576                                         NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1577                                         return -ELOOP;
1578                                 }
                                     /* Existing qdisc is grafted under a new
                                      * parent: take a reference for it.
                                      */
1579                                 qdisc_refcount_inc(q);
1580                                 goto graft;
1581                         } else {
1582                                 if (!q)
1583                                         goto create_n_graft;
1584 
1585                                 /* This magic test requires explanation.
1586                                  *
1587                                  *   We know, that some child q is already
1588                                  *   attached to this parent and have choice:
1589                                  *   either to change it or to create/graft new one.
1590                                  *
1591                                  *   1. We are allowed to create/graft only
1592                                  *   if CREATE and REPLACE flags are set.
1593                                  *
1594                                  *   2. If EXCL is set, requestor wanted to say,
1595                                  *   that qdisc tcm_handle is not expected
1596                                  *   to exist, so that we choose create/graft too.
1597                                  *
1598                                  *   3. The last case is when no flags are set.
1599                                  *   Alas, it is sort of hole in API, we
1600                                  *   cannot decide what to do unambiguously.
1601                                  *   For now we select create/graft, if
1602                                  *   user gave KIND, which does not match existing.
1603                                  */
1604                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1605                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1606                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1607                                      (tca[TCA_KIND] &&
1608                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1609                                         goto create_n_graft;
1610                         }
1611                 }
1612         } else {
                     /* No parent given: the qdisc is addressed by handle only */
1613                 if (!tcm->tcm_handle) {
1614                         NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1615                         return -EINVAL;
1616                 }
1617                 q = qdisc_lookup(dev, tcm->tcm_handle);
1618         }
1619 
1620         /* Change qdisc parameters */
1621         if (!q) {
1622                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1623                 return -ENOENT;
1624         }
1625         if (n->nlmsg_flags & NLM_F_EXCL) {
1626                 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1627                 return -EEXIST;
1628         }
1629         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1630                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1631                 return -EINVAL;
1632         }
1633         err = qdisc_change(q, tca, extack);
1634         if (err == 0)
1635                 qdisc_notify(net, skb, n, clid, NULL, q);
1636         return err;
1637 
1638 create_n_graft:
1639         if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1640                 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1641                 return -ENOENT;
1642         }
             /* Ingress qdiscs are created on the device's ingress queue and
              * get tcm_parent as their handle; all others get a TX queue
              * chosen by the parent, or TX queue 0 when there is no parent.
              */
1643         if (clid == TC_H_INGRESS) {
1644                 if (dev_ingress_queue(dev)) {
1645                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1646                                          tcm->tcm_parent, tcm->tcm_parent,
1647                                          tca, &err, extack);
1648                 } else {
1649                         NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1650                         err = -ENOENT;
1651                 }
1652         } else {
1653                 struct netdev_queue *dev_queue;
1654 
1655                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1656                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1657                 else if (p)
1658                         dev_queue = p->dev_queue;
1659                 else
1660                         dev_queue = netdev_get_tx_queue(dev, 0);
1661 
1662                 q = qdisc_create(dev, dev_queue, p,
1663                                  tcm->tcm_parent, tcm->tcm_handle,
1664                                  tca, &err, extack);
1665         }
1666         if (q == NULL) {
                     /* -EAGAIN: start over from the top and re-parse */
1667                 if (err == -EAGAIN)
1668                         goto replay;
1669                 return err;
1670         }
1671 
1672 graft:
             /* On failure q is released again with qdisc_put() */
1673         err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1674         if (err) {
1675                 if (q)
1676                         qdisc_put(q);
1677                 return err;
1678         }
1679 
1680         return 0;
1681 }
1682 
1683 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1684                               struct netlink_callback *cb,
1685                               int *q_idx_p, int s_q_idx, bool recur,
1686                               bool dump_invisible)
1687 {
1688         int ret = 0, q_idx = *q_idx_p;
1689         struct Qdisc *q;
1690         int b;
1691 
1692         if (!root)
1693                 return 0;
1694 
1695         q = root;
1696         if (q_idx < s_q_idx) {
1697                 q_idx++;
1698         } else {
1699                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1700                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1701                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1702                                   RTM_NEWQDISC) <= 0)
1703                         goto done;
1704                 q_idx++;
1705         }
1706 
1707         /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1708          * itself has already been dumped.
1709          *
1710          * If we've already dumped the top-level (ingress) qdisc above and the global
1711          * qdisc hashtable, we don't want to hit it again
1712          */
1713         if (!qdisc_dev(root) || !recur)
1714                 goto out;
1715 
1716         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1717                 if (q_idx < s_q_idx) {
1718                         q_idx++;
1719                         continue;
1720                 }
1721                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1722                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1723                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1724                                   RTM_NEWQDISC) <= 0)
1725                         goto done;
1726                 q_idx++;
1727         }
1728 
1729 out:
1730         *q_idx_p = q_idx;
1731         return ret;
1732 done:
1733         ret = -1;
1734         goto out;
1735 }
1736 
1737 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1738 {
1739         struct net *net = sock_net(skb->sk);
1740         int idx, q_idx;
1741         int s_idx, s_q_idx;
1742         struct net_device *dev;
1743         const struct nlmsghdr *nlh = cb->nlh;
1744         struct nlattr *tca[TCA_MAX + 1];
1745         int err;
1746 
1747         s_idx = cb->args[0];
1748         s_q_idx = q_idx = cb->args[1];
1749 
1750         idx = 0;
1751         ASSERT_RTNL();
1752 
1753         err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1754                                      rtm_tca_policy, cb->extack);
1755         if (err < 0)
1756                 return err;
1757 
1758         for_each_netdev(net, dev) {
1759                 struct netdev_queue *dev_queue;
1760 
1761                 if (idx < s_idx)
1762                         goto cont;
1763                 if (idx > s_idx)
1764                         s_q_idx = 0;
1765                 q_idx = 0;
1766 
1767                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
1768                                        true, tca[TCA_DUMP_INVISIBLE]) < 0)
1769                         goto done;
1770 
1771                 dev_queue = dev_ingress_queue(dev);
1772                 if (dev_queue &&
1773                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1774                                        &q_idx, s_q_idx, false,
1775                                        tca[TCA_DUMP_INVISIBLE]) < 0)
1776                         goto done;
1777 
1778 cont:
1779                 idx++;
1780         }
1781 
1782 done:
1783         cb->args[0] = idx;
1784         cb->args[1] = q_idx;
1785 
1786         return skb->len;
1787 }
1788 
1789 
1790 
1791 /************************************************
1792  *      Traffic classes manipulation.           *
1793  ************************************************/
1794 
1795 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1796                           unsigned long cl,
1797                           u32 portid, u32 seq, u16 flags, int event)
1798 {
1799         struct tcmsg *tcm;
1800         struct nlmsghdr  *nlh;
1801         unsigned char *b = skb_tail_pointer(skb);
1802         struct gnet_dump d;
1803         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1804 
1805         cond_resched();
1806         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1807         if (!nlh)
1808                 goto out_nlmsg_trim;
1809         tcm = nlmsg_data(nlh);
1810         tcm->tcm_family = AF_UNSPEC;
1811         tcm->tcm__pad1 = 0;
1812         tcm->tcm__pad2 = 0;
1813         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1814         tcm->tcm_parent = q->handle;
1815         tcm->tcm_handle = q->handle;
1816         tcm->tcm_info = 0;
1817         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1818                 goto nla_put_failure;
1819         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1820                 goto nla_put_failure;
1821 
1822         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1823                                          NULL, &d, TCA_PAD) < 0)
1824                 goto nla_put_failure;
1825 
1826         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1827                 goto nla_put_failure;
1828 
1829         if (gnet_stats_finish_copy(&d) < 0)
1830                 goto nla_put_failure;
1831 
1832         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1833         return skb->len;
1834 
1835 out_nlmsg_trim:
1836 nla_put_failure:
1837         nlmsg_trim(skb, b);
1838         return -1;
1839 }
1840 
1841 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1842                          struct nlmsghdr *n, struct Qdisc *q,
1843                          unsigned long cl, int event)
1844 {
1845         struct sk_buff *skb;
1846         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1847         int err = 0;
1848 
1849         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1850         if (!skb)
1851                 return -ENOBUFS;
1852 
1853         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1854                 kfree_skb(skb);
1855                 return -EINVAL;
1856         }
1857 
1858         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1859                              n->nlmsg_flags & NLM_F_ECHO);
1860         if (err > 0)
1861                 err = 0;
1862         return err;
1863 }
1864 
1865 static int tclass_del_notify(struct net *net,
1866                              const struct Qdisc_class_ops *cops,
1867                              struct sk_buff *oskb, struct nlmsghdr *n,
1868                              struct Qdisc *q, unsigned long cl)
1869 {
1870         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1871         struct sk_buff *skb;
1872         int err = 0;
1873 
1874         if (!cops->delete)
1875                 return -EOPNOTSUPP;
1876 
1877         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1878         if (!skb)
1879                 return -ENOBUFS;
1880 
1881         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1882                            RTM_DELTCLASS) < 0) {
1883                 kfree_skb(skb);
1884                 return -EINVAL;
1885         }
1886 
1887         err = cops->delete(q, cl);
1888         if (err) {
1889                 kfree_skb(skb);
1890                 return err;
1891         }
1892 
1893         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1894                              n->nlmsg_flags & NLM_F_ECHO);
1895         if (err > 0)
1896                 err = 0;
1897         return err;
1898 }
1899 
1900 #ifdef CONFIG_NET_CLS
1901 
1902 struct tcf_bind_args {
1903         struct tcf_walker w;
1904         unsigned long base;
1905         unsigned long cl;
1906         u32 classid;
1907 };
1908 
1909 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1910 {
1911         struct tcf_bind_args *a = (void *)arg;
1912 
1913         if (tp->ops->bind_class) {
1914                 struct Qdisc *q = tcf_block_q(tp->chain->block);
1915 
1916                 sch_tree_lock(q);
1917                 tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1918                 sch_tree_unlock(q);
1919         }
1920         return 0;
1921 }
1922 
1923 struct tc_bind_class_args {
1924         struct qdisc_walker w;
1925         unsigned long new_cl;
1926         u32 portid;
1927         u32 clid;
1928 };
1929 
1930 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
1931                                 struct qdisc_walker *w)
1932 {
1933         struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
1934         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1935         struct tcf_block *block;
1936         struct tcf_chain *chain;
1937 
1938         block = cops->tcf_block(q, cl, NULL);
1939         if (!block)
1940                 return 0;
1941         for (chain = tcf_get_next_chain(block, NULL);
1942              chain;
1943              chain = tcf_get_next_chain(block, chain)) {
1944                 struct tcf_proto *tp;
1945 
1946                 for (tp = tcf_get_next_proto(chain, NULL, true);
1947                      tp; tp = tcf_get_next_proto(chain, tp, true)) {
1948                         struct tcf_bind_args arg = {};
1949 
1950                         arg.w.fn = tcf_node_bind;
1951                         arg.classid = a->clid;
1952                         arg.base = cl;
1953                         arg.cl = a->new_cl;
1954                         tp->ops->walk(tp, &arg.w, true);
1955                 }
1956         }
1957 
1958         return 0;
1959 }
1960 
1961 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1962                            unsigned long new_cl)
1963 {
1964         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1965         struct tc_bind_class_args args = {};
1966 
1967         if (!cops->tcf_block)
1968                 return;
1969         args.portid = portid;
1970         args.clid = clid;
1971         args.new_cl = new_cl;
1972         args.w.fn = tc_bind_class_walker;
1973         q->ops->cl_ops->walk(q, &args.w);
1974 }
1975 
1976 #else
1977 
1978 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1979                            unsigned long new_cl)
1980 {
1981 }
1982 
1983 #endif
1984 
1985 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1986                          struct netlink_ext_ack *extack)
1987 {
1988         struct net *net = sock_net(skb->sk);
1989         struct tcmsg *tcm = nlmsg_data(n);
1990         struct nlattr *tca[TCA_MAX + 1];
1991         struct net_device *dev;
1992         struct Qdisc *q = NULL;
1993         const struct Qdisc_class_ops *cops;
1994         unsigned long cl = 0;
1995         unsigned long new_cl;
1996         u32 portid;
1997         u32 clid;
1998         u32 qid;
1999         int err;
2000 
2001         if ((n->nlmsg_type != RTM_GETTCLASS) &&
2002             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
2003                 return -EPERM;
2004 
2005         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
2006                                      rtm_tca_policy, extack);
2007         if (err < 0)
2008                 return err;
2009 
2010         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
2011         if (!dev)
2012                 return -ENODEV;
2013 
2014         /*
2015            parent == TC_H_UNSPEC - unspecified parent.
2016            parent == TC_H_ROOT   - class is root, which has no parent.
2017            parent == X:0         - parent is root class.
2018            parent == X:Y         - parent is a node in hierarchy.
2019            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
2020 
2021            handle == 0:0         - generate handle from kernel pool.
2022            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
2023            handle == X:Y         - clear.
2024            handle == X:0         - root class.
2025          */
2026 
2027         /* Step 1. Determine qdisc handle X:0 */
2028 
2029         portid = tcm->tcm_parent;
2030         clid = tcm->tcm_handle;
2031         qid = TC_H_MAJ(clid);
2032 
2033         if (portid != TC_H_ROOT) {
2034                 u32 qid1 = TC_H_MAJ(portid);
2035 
2036                 if (qid && qid1) {
2037                         /* If both majors are known, they must be identical. */
2038                         if (qid != qid1)
2039                                 return -EINVAL;
2040                 } else if (qid1) {
2041                         qid = qid1;
2042                 } else if (qid == 0)
2043                         qid = dev->qdisc->handle;
2044 
2045                 /* Now qid is genuine qdisc handle consistent
2046                  * both with parent and child.
2047                  *
2048                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
2049                  */
2050                 if (portid)
2051                         portid = TC_H_MAKE(qid, portid);
2052         } else {
2053                 if (qid == 0)
2054                         qid = dev->qdisc->handle;
2055         }
2056 
2057         /* OK. Locate qdisc */
2058         q = qdisc_lookup(dev, qid);
2059         if (!q)
2060                 return -ENOENT;
2061 
2062         /* An check that it supports classes */
2063         cops = q->ops->cl_ops;
2064         if (cops == NULL)
2065                 return -EINVAL;
2066 
2067         /* Now try to get class */
2068         if (clid == 0) {
2069                 if (portid == TC_H_ROOT)
2070                         clid = qid;
2071         } else
2072                 clid = TC_H_MAKE(qid, clid);
2073 
2074         if (clid)
2075                 cl = cops->find(q, clid);
2076 
2077         if (cl == 0) {
2078                 err = -ENOENT;
2079                 if (n->nlmsg_type != RTM_NEWTCLASS ||
2080                     !(n->nlmsg_flags & NLM_F_CREATE))
2081                         goto out;
2082         } else {
2083                 switch (n->nlmsg_type) {
2084                 case RTM_NEWTCLASS:
2085                         err = -EEXIST;
2086                         if (n->nlmsg_flags & NLM_F_EXCL)
2087                                 goto out;
2088                         break;
2089                 case RTM_DELTCLASS:
2090                         err = tclass_del_notify(net, cops, skb, n, q, cl);
2091                         /* Unbind the class with flilters with 0 */
2092                         tc_bind_tclass(q, portid, clid, 0);
2093                         goto out;
2094                 case RTM_GETTCLASS:
2095                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
2096                         goto out;
2097                 default:
2098                         err = -EINVAL;
2099                         goto out;
2100                 }
2101         }
2102 
2103         if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2104                 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2105                 return -EOPNOTSUPP;
2106         }
2107 
2108         new_cl = cl;
2109         err = -EOPNOTSUPP;
2110         if (cops->change)
2111                 err = cops->change(q, clid, portid, tca, &new_cl, extack);
2112         if (err == 0) {
2113                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
2114                 /* We just create a new class, need to do reverse binding. */
2115                 if (cl != new_cl)
2116                         tc_bind_tclass(q, portid, clid, new_cl);
2117         }
2118 out:
2119         return err;
2120 }
2121 
2122 struct qdisc_dump_args {
2123         struct qdisc_walker     w;
2124         struct sk_buff          *skb;
2125         struct netlink_callback *cb;
2126 };
2127 
2128 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2129                             struct qdisc_walker *arg)
2130 {
2131         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2132 
2133         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2134                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2135                               RTM_NEWTCLASS);
2136 }
2137 
2138 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2139                                 struct tcmsg *tcm, struct netlink_callback *cb,
2140                                 int *t_p, int s_t)
2141 {
2142         struct qdisc_dump_args arg;
2143 
2144         if (tc_qdisc_dump_ignore(q, false) ||
2145             *t_p < s_t || !q->ops->cl_ops ||
2146             (tcm->tcm_parent &&
2147              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2148                 (*t_p)++;
2149                 return 0;
2150         }
2151         if (*t_p > s_t)
2152                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2153         arg.w.fn = qdisc_class_dump;
2154         arg.skb = skb;
2155         arg.cb = cb;
2156         arg.w.stop  = 0;
2157         arg.w.skip = cb->args[1];
2158         arg.w.count = 0;
2159         q->ops->cl_ops->walk(q, &arg.w);
2160         cb->args[1] = arg.w.count;
2161         if (arg.w.stop)
2162                 return -1;
2163         (*t_p)++;
2164         return 0;
2165 }
2166 
2167 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2168                                struct tcmsg *tcm, struct netlink_callback *cb,
2169                                int *t_p, int s_t)
2170 {
2171         struct Qdisc *q;
2172         int b;
2173 
2174         if (!root)
2175                 return 0;
2176 
2177         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2178                 return -1;
2179 
2180         if (!qdisc_dev(root))
2181                 return 0;
2182 
2183         if (tcm->tcm_parent) {
2184                 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2185                 if (q && q != root &&
2186                     tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2187                         return -1;
2188                 return 0;
2189         }
2190         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2191                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2192                         return -1;
2193         }
2194 
2195         return 0;
2196 }
2197 
2198 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2199 {
2200         struct tcmsg *tcm = nlmsg_data(cb->nlh);
2201         struct net *net = sock_net(skb->sk);
2202         struct netdev_queue *dev_queue;
2203         struct net_device *dev;
2204         int t, s_t;
2205 
2206         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2207                 return 0;
2208         dev = dev_get_by_index(net, tcm->tcm_ifindex);
2209         if (!dev)
2210                 return 0;
2211 
2212         s_t = cb->args[0];
2213         t = 0;
2214 
2215         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
2216                 goto done;
2217 
2218         dev_queue = dev_ingress_queue(dev);
2219         if (dev_queue &&
2220             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2221                                 &t, s_t) < 0)
2222                 goto done;
2223 
2224 done:
2225         cb->args[0] = t;
2226 
2227         dev_put(dev);
2228         return skb->len;
2229 }
2230 
2231 #ifdef CONFIG_PROC_FS
2232 static int psched_show(struct seq_file *seq, void *v)
2233 {
2234         seq_printf(seq, "%08x %08x %08x %08x\n",
2235                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2236                    1000000,
2237                    (u32)NSEC_PER_SEC / hrtimer_resolution);
2238 
2239         return 0;
2240 }
2241 
2242 static int __net_init psched_net_init(struct net *net)
2243 {
2244         struct proc_dir_entry *e;
2245 
2246         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2247         if (e == NULL)
2248                 return -ENOMEM;
2249 
2250         return 0;
2251 }
2252 
2253 static void __net_exit psched_net_exit(struct net *net)
2254 {
2255         remove_proc_entry("psched", net->proc_net);
2256 }
2257 #else
2258 static int __net_init psched_net_init(struct net *net)
2259 {
2260         return 0;
2261 }
2262 
2263 static void __net_exit psched_net_exit(struct net *net)
2264 {
2265 }
2266 #endif
2267 
2268 static struct pernet_operations psched_net_ops = {
2269         .init = psched_net_init,
2270         .exit = psched_net_exit,
2271 };
2272 
2273 static int __init pktsched_init(void)
2274 {
2275         int err;
2276 
2277         err = register_pernet_subsys(&psched_net_ops);
2278         if (err) {
2279                 pr_err("pktsched_init: "
2280                        "cannot initialize per netns operations\n");
2281                 return err;
2282         }
2283 
2284         register_qdisc(&pfifo_fast_ops);
2285         register_qdisc(&pfifo_qdisc_ops);
2286         register_qdisc(&bfifo_qdisc_ops);
2287         register_qdisc(&pfifo_head_drop_qdisc_ops);
2288         register_qdisc(&mq_qdisc_ops);
2289         register_qdisc(&noqueue_qdisc_ops);
2290 
2291         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2292         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2293         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2294                       0);
2295         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2296         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2297         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2298                       0);
2299 
2300         return 0;
2301 }
2302 
2303 subsys_initcall(pktsched_init);
2304 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | Wiki (Japanese) | Wiki (English) | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

osdn.jp