~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/net/sched/sch_api.c

Version: ~ [ linux-6.4-rc3 ] ~ [ linux-6.3.4 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.30 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.113 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.180 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.243 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.283 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.315 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.9 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /*
  2  * net/sched/sch_api.c  Packet scheduler API.
  3  *
  4  *              This program is free software; you can redistribute it and/or
  5  *              modify it under the terms of the GNU General Public License
  6  *              as published by the Free Software Foundation; either version
  7  *              2 of the License, or (at your option) any later version.
  8  *
  9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 10  *
 11  * Fixes:
 12  *
 13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 16  */
 17 
 18 #include <linux/config.h>
 19 #include <linux/module.h>
 20 #include <linux/types.h>
 21 #include <linux/kernel.h>
 22 #include <linux/sched.h>
 23 #include <linux/string.h>
 24 #include <linux/mm.h>
 25 #include <linux/socket.h>
 26 #include <linux/sockios.h>
 27 #include <linux/in.h>
 28 #include <linux/errno.h>
 29 #include <linux/interrupt.h>
 30 #include <linux/netdevice.h>
 31 #include <linux/skbuff.h>
 32 #include <linux/rtnetlink.h>
 33 #include <linux/init.h>
 34 #include <linux/proc_fs.h>
 35 #include <linux/seq_file.h>
 36 #include <linux/kmod.h>
 37 
 38 #include <net/sock.h>
 39 #include <net/pkt_sched.h>
 40 
 41 #include <asm/processor.h>
 42 #include <asm/uaccess.h>
 43 #include <asm/system.h>
 44 #include <asm/bitops.h>
 45 
 46 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
 47                         struct Qdisc *old, struct Qdisc *new);
 48 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
 49                          struct Qdisc *q, unsigned long cl, int event);
 50 
 51 /*
 52 
 53    Short review.
 54    -------------
 55 
 56    This file consists of two interrelated parts:
 57 
 58    1. queueing disciplines manager frontend.
 59    2. traffic classes manager frontend.
 60 
 61    Generally, queueing discipline ("qdisc") is a black box,
 62    which is able to enqueue packets and to dequeue them (when
 63    device is ready to send something) in order and at times
 64    determined by algorithm hidden in it.
 65 
 66    Qdiscs are divided into two categories:
 67    - "queues", which have no internal structure visible from outside.
 68    - "schedulers", which split all the packets to "traffic classes",
 69      using "packet classifiers" (look at cls_api.c)
 70 
 71    In turn, classes may have child qdiscs (as rule, queues)
 72    attached to them etc. etc. etc.
 73 
 74    The goal of the routines in this file is to translate
 75    information supplied by user in the form of handles
 76    to more intelligible for kernel form, to make some sanity
 77    checks and part of work, which is common to all qdiscs
 78    and to provide rtnetlink notifications.
 79 
 80    All real intelligent work is done inside qdisc modules.
 81 
 82 
 83 
 84    Every discipline has two major routines: enqueue and dequeue.
 85 
 86    ---dequeue
 87 
 88    dequeue usually returns a skb to send. It is allowed to return NULL,
 89    but it does not mean that queue is empty, it just means that
 90    discipline does not want to send anything this time.
 91    Queue is really empty if q->q.qlen == 0.
 92    For complicated disciplines with multiple queues q->q is not
 93    real packet queue, but however q->q.qlen must be valid.
 94 
 95    ---enqueue
 96 
 97    enqueue returns 0, if packet was enqueued successfully.
 98    If packet (this one or another one) was dropped, it returns
 99    not zero error code.
100    NET_XMIT_DROP        - this packet dropped
101      Expected action: do not backoff, but wait until queue will clear.
102    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
103      Expected action: backoff or ignore
104    NET_XMIT_POLICED     - dropped by police.
105      Expected action: backoff or error to real-time apps.
106 
107    Auxiliary routines:
108 
109    ---requeue
110 
111    requeues once dequeued packet. It is used for non-standard or
112    just buggy devices, which can defer output even if dev->tbusy=0.
113 
114    ---reset
115 
116    returns qdisc to initial state: purge all buffers, clear all
117    timers, counters (except for statistics) etc.
118 
119    ---init
120 
121    initializes newly created qdisc.
122 
123    ---destroy
124 
125    destroys resources allocated by init and during lifetime of qdisc.
126 
127    ---change
128 
129    changes qdisc parameters.
130  */
131 
132 /* Protects list of registered TC modules. It is pure SMP lock. */
133 static rwlock_t qdisc_mod_lock = RW_LOCK_UNLOCKED;
134 
135 
136 /************************************************
137  *      Queueing disciplines manipulation.      *
138  ************************************************/
139 
140 
141 /* The list of all installed queueing disciplines. */
142 
143 static struct Qdisc_ops *qdisc_base;
144 
145 /* Register/unregister queueing discipline */
146 
147 int register_qdisc(struct Qdisc_ops *qops)
148 {
149         struct Qdisc_ops *q, **qp;
150         int rc = -EEXIST;
151 
152         write_lock(&qdisc_mod_lock);
153         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
154                 if (!strcmp(qops->id, q->id))
155                         goto out;
156 
157         if (qops->enqueue == NULL)
158                 qops->enqueue = noop_qdisc_ops.enqueue;
159         if (qops->requeue == NULL)
160                 qops->requeue = noop_qdisc_ops.requeue;
161         if (qops->dequeue == NULL)
162                 qops->dequeue = noop_qdisc_ops.dequeue;
163 
164         qops->next = NULL;
165         *qp = qops;
166         rc = 0;
167 out:
168         write_unlock(&qdisc_mod_lock);
169         return rc;
170 }
171 
172 int unregister_qdisc(struct Qdisc_ops *qops)
173 {
174         struct Qdisc_ops *q, **qp;
175         int err = -ENOENT;
176 
177         write_lock(&qdisc_mod_lock);
178         for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
179                 if (q == qops)
180                         break;
181         if (q) {
182                 *qp = q->next;
183                 q->next = NULL;
184                 err = 0;
185         }
186         write_unlock(&qdisc_mod_lock);
187         return err;
188 }
189 
190 /* We know handle. Find qdisc among all qdisc's attached to device
191    (root qdisc, all its children, children of children etc.)
192  */
193 
194 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
195 {
196         struct Qdisc *q;
197 
198         for (q = dev->qdisc_list; q; q = q->next) {
199                 if (q->handle == handle)
200                         return q;
201         }
202         return NULL;
203 }
204 
205 struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
206 {
207         unsigned long cl;
208         struct Qdisc *leaf;
209         struct Qdisc_class_ops *cops = p->ops->cl_ops;
210 
211         if (cops == NULL)
212                 return NULL;
213         cl = cops->get(p, classid);
214 
215         if (cl == 0)
216                 return NULL;
217         leaf = cops->leaf(p, cl);
218         cops->put(p, cl);
219         return leaf;
220 }
221 
222 /* Find queueing discipline by name */
223 
224 struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
225 {
226         struct Qdisc_ops *q = NULL;
227 
228         if (kind) {
229                 read_lock(&qdisc_mod_lock);
230                 for (q = qdisc_base; q; q = q->next) {
231                         if (rtattr_strcmp(kind, q->id) == 0)
232                                 break;
233                 }
234                 read_unlock(&qdisc_mod_lock);
235         }
236         return q;
237 }
238 
239 static struct qdisc_rate_table *qdisc_rtab_list;
240 
241 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
242 {
243         struct qdisc_rate_table *rtab;
244 
245         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
246                 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
247                         rtab->refcnt++;
248                         return rtab;
249                 }
250         }
251 
252         if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
253                 return NULL;
254 
255         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
256         if (rtab) {
257                 rtab->rate = *r;
258                 rtab->refcnt = 1;
259                 memcpy(rtab->data, RTA_DATA(tab), 1024);
260                 rtab->next = qdisc_rtab_list;
261                 qdisc_rtab_list = rtab;
262         }
263         return rtab;
264 }
265 
266 void qdisc_put_rtab(struct qdisc_rate_table *tab)
267 {
268         struct qdisc_rate_table *rtab, **rtabp;
269 
270         if (!tab || --tab->refcnt)
271                 return;
272 
273         for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
274                 if (rtab == tab) {
275                         *rtabp = rtab->next;
276                         kfree(rtab);
277                         return;
278                 }
279         }
280 }
281 
282 
283 /* Allocate an unique handle from space managed by kernel */
284 
285 u32 qdisc_alloc_handle(struct net_device *dev)
286 {
287         int i = 0x10000;
288         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
289 
290         do {
291                 autohandle += TC_H_MAKE(0x10000U, 0);
292                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
293                         autohandle = TC_H_MAKE(0x80000000U, 0);
294         } while (qdisc_lookup(dev, autohandle) && --i > 0);
295 
296         return i>0 ? autohandle : 0;
297 }
298 
299 /* Attach toplevel qdisc to device dev */
300 
301 static struct Qdisc *
302 dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
303 {
304         struct Qdisc *oqdisc;
305 
306         if (dev->flags & IFF_UP)
307                 dev_deactivate(dev);
308 
309         write_lock(&qdisc_tree_lock);
310         spin_lock_bh(&dev->queue_lock);
311         if (qdisc && qdisc->flags&TCQ_F_INGRES) {
312                 oqdisc = dev->qdisc_ingress;
313                 /* Prune old scheduler */
314                 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
315                         /* delete */
316                         qdisc_reset(oqdisc);
317                         dev->qdisc_ingress = NULL;
318                 } else {  /* new */
319                         dev->qdisc_ingress = qdisc;
320                 }
321 
322         } else {
323 
324                 oqdisc = dev->qdisc_sleeping;
325 
326                 /* Prune old scheduler */
327                 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
328                         qdisc_reset(oqdisc);
329 
330                 /* ... and graft new one */
331                 if (qdisc == NULL)
332                         qdisc = &noop_qdisc;
333                 dev->qdisc_sleeping = qdisc;
334                 dev->qdisc = &noop_qdisc;
335         }
336 
337         spin_unlock_bh(&dev->queue_lock);
338         write_unlock(&qdisc_tree_lock);
339 
340         if (dev->flags & IFF_UP)
341                 dev_activate(dev);
342 
343         return oqdisc;
344 }
345 
346 
347 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
348    to device "dev".
349 
350    Old qdisc is not destroyed but returned in *old.
351  */
352 
353 int qdisc_graft(struct net_device *dev, struct Qdisc *parent, u32 classid,
354                 struct Qdisc *new, struct Qdisc **old)
355 {
356         int err = 0;
357         struct Qdisc *q = *old;
358 
359 
360         if (parent == NULL) { 
361                 if (q && q->flags&TCQ_F_INGRES) {
362                         *old = dev_graft_qdisc(dev, q);
363                 } else {
364                         *old = dev_graft_qdisc(dev, new);
365                 }
366         } else {
367                 struct Qdisc_class_ops *cops = parent->ops->cl_ops;
368 
369                 err = -EINVAL;
370 
371                 if (cops) {
372                         unsigned long cl = cops->get(parent, classid);
373                         if (cl) {
374                                 err = cops->graft(parent, cl, new, old);
375                                 cops->put(parent, cl);
376                         }
377                 }
378         }
379         return err;
380 }
381 
382 /*
383    Allocate and initialize new qdisc.
384 
385    Parameters are passed via opt.
386  */
387 
388 static struct Qdisc *
389 qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
390 {
391         int err;
392         struct rtattr *kind = tca[TCA_KIND-1];
393         struct Qdisc *sch = NULL;
394         struct Qdisc_ops *ops;
395         int size;
396 
397         ops = qdisc_lookup_ops(kind);
398 #ifdef CONFIG_KMOD
399         if (ops==NULL && tca[TCA_KIND-1] != NULL) {
400                 if (RTA_PAYLOAD(kind) <= IFNAMSIZ) {
401                         request_module("sch_%s", (char*)RTA_DATA(kind));
402                         ops = qdisc_lookup_ops(kind);
403                 }
404         }
405 #endif
406 
407         err = -EINVAL;
408         if (ops == NULL)
409                 goto err_out;
410 
411         size = sizeof(*sch) + ops->priv_size;
412 
413         sch = kmalloc(size, GFP_KERNEL);
414         err = -ENOBUFS;
415         if (!sch)
416                 goto err_out;
417 
418         /* Grrr... Resolve race condition with module unload */
419 
420         err = -EINVAL;
421         if (ops != qdisc_lookup_ops(kind))
422                 goto err_out;
423 
424         memset(sch, 0, size);
425 
426         skb_queue_head_init(&sch->q);
427 
428         if (handle == TC_H_INGRESS)
429                 sch->flags |= TCQ_F_INGRES;
430 
431         sch->ops = ops;
432         sch->enqueue = ops->enqueue;
433         sch->dequeue = ops->dequeue;
434         sch->dev = dev;
435         atomic_set(&sch->refcnt, 1);
436         sch->stats.lock = &dev->queue_lock;
437         if (handle == 0) {
438                 handle = qdisc_alloc_handle(dev);
439                 err = -ENOMEM;
440                 if (handle == 0)
441                         goto err_out;
442         }
443 
444         if (handle == TC_H_INGRESS)
445                 sch->handle =TC_H_MAKE(TC_H_INGRESS, 0);
446         else
447                 sch->handle = handle;
448 
449         err = -EBUSY;
450         if (!try_module_get(ops->owner))
451                 goto err_out;
452 
453         if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
454                 write_lock(&qdisc_tree_lock);
455                 sch->next = dev->qdisc_list;
456                 dev->qdisc_list = sch;
457                 write_unlock(&qdisc_tree_lock);
458 #ifdef CONFIG_NET_ESTIMATOR
459                 if (tca[TCA_RATE-1])
460                         qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]);
461 #endif
462                 return sch;
463         }
464         module_put(ops->owner);
465 
466 err_out:
467         *errp = err;
468         if (sch)
469                 kfree(sch);
470         return NULL;
471 }
472 
473 static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
474 {
475         if (tca[TCA_OPTIONS-1]) {
476                 int err;
477 
478                 if (sch->ops->change == NULL)
479                         return -EINVAL;
480                 err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
481                 if (err)
482                         return err;
483         }
484 #ifdef CONFIG_NET_ESTIMATOR
485         if (tca[TCA_RATE-1]) {
486                 qdisc_kill_estimator(&sch->stats);
487                 qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]);
488         }
489 #endif
490         return 0;
491 }
492 
493 struct check_loop_arg
494 {
495         struct qdisc_walker     w;
496         struct Qdisc            *p;
497         int                     depth;
498 };
499 
500 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
501 
502 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
503 {
504         struct check_loop_arg   arg;
505 
506         if (q->ops->cl_ops == NULL)
507                 return 0;
508 
509         arg.w.stop = arg.w.skip = arg.w.count = 0;
510         arg.w.fn = check_loop_fn;
511         arg.depth = depth;
512         arg.p = p;
513         q->ops->cl_ops->walk(q, &arg.w);
514         return arg.w.stop ? -ELOOP : 0;
515 }
516 
517 static int
518 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
519 {
520         struct Qdisc *leaf;
521         struct Qdisc_class_ops *cops = q->ops->cl_ops;
522         struct check_loop_arg *arg = (struct check_loop_arg *)w;
523 
524         leaf = cops->leaf(q, cl);
525         if (leaf) {
526                 if (leaf == arg->p || arg->depth > 7)
527                         return -ELOOP;
528                 return check_loop(leaf, arg->p, arg->depth + 1);
529         }
530         return 0;
531 }
532 
533 /*
534  * Delete/get qdisc.
535  */
536 
537 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
538 {
539         struct tcmsg *tcm = NLMSG_DATA(n);
540         struct rtattr **tca = arg;
541         struct net_device *dev;
542         u32 clid = tcm->tcm_parent;
543         struct Qdisc *q = NULL;
544         struct Qdisc *p = NULL;
545         int err;
546 
547         if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
548                 return -ENODEV;
549 
550         if (clid) {
551                 if (clid != TC_H_ROOT) {
552                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
553                                 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
554                                         return -ENOENT;
555                                 q = qdisc_leaf(p, clid);
556                         } else { /* ingress */
557                                 q = dev->qdisc_ingress;
558                         }
559                 } else {
560                         q = dev->qdisc_sleeping;
561                 }
562                 if (!q)
563                         return -ENOENT;
564 
565                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
566                         return -EINVAL;
567         } else {
568                 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
569                         return -ENOENT;
570         }
571 
572         if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
573                 return -EINVAL;
574 
575         if (n->nlmsg_type == RTM_DELQDISC) {
576                 if (!clid)
577                         return -EINVAL;
578                 if (q->handle == 0)
579                         return -ENOENT;
580                 if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
581                         return err;
582                 if (q) {
583                         qdisc_notify(skb, n, clid, q, NULL);
584                         spin_lock_bh(&dev->queue_lock);
585                         qdisc_destroy(q);
586                         spin_unlock_bh(&dev->queue_lock);
587                 }
588         } else {
589                 qdisc_notify(skb, n, clid, NULL, q);
590         }
591         return 0;
592 }
593 
594 /*
595    Create/change qdisc.
596  */
597 
598 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
599 {
600         struct tcmsg *tcm = NLMSG_DATA(n);
601         struct rtattr **tca = arg;
602         struct net_device *dev;
603         u32 clid = tcm->tcm_parent;
604         struct Qdisc *q = NULL;
605         struct Qdisc *p = NULL;
606         int err;
607 
608         if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
609                 return -ENODEV;
610 
611         if (clid) {
612                 if (clid != TC_H_ROOT) {
613                         if (clid != TC_H_INGRESS) {
614                                 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
615                                         return -ENOENT;
616                                 q = qdisc_leaf(p, clid);
617                         } else { /*ingress */
618                                 q = dev->qdisc_ingress;
619                         }
620                 } else {
621                         q = dev->qdisc_sleeping;
622                 }
623 
624                 /* It may be default qdisc, ignore it */
625                 if (q && q->handle == 0)
626                         q = NULL;
627 
628                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
629                         if (tcm->tcm_handle) {
630                                 if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
631                                         return -EEXIST;
632                                 if (TC_H_MIN(tcm->tcm_handle))
633                                         return -EINVAL;
634                                 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
635                                         goto create_n_graft;
636                                 if (n->nlmsg_flags&NLM_F_EXCL)
637                                         return -EEXIST;
638                                 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
639                                         return -EINVAL;
640                                 if (q == p ||
641                                     (p && check_loop(q, p, 0)))
642                                         return -ELOOP;
643                                 atomic_inc(&q->refcnt);
644                                 goto graft;
645                         } else {
646                                 if (q == NULL)
647                                         goto create_n_graft;
648 
649                                 /* This magic test requires explanation.
650                                  *
651                                  *   We know, that some child q is already
652                                  *   attached to this parent and have choice:
653                                  *   either to change it or to create/graft new one.
654                                  *
655                                  *   1. We are allowed to create/graft only
656                                  *   if CREATE and REPLACE flags are set.
657                                  *
658                                  *   2. If EXCL is set, requestor wanted to say,
659                                  *   that qdisc tcm_handle is not expected
660                                  *   to exist, so that we choose create/graft too.
661                                  *
662                                  *   3. The last case is when no flags are set.
663                                  *   Alas, it is sort of hole in API, we
664                                  *   cannot decide what to do unambiguously.
665                                  *   For now we select create/graft, if
666                                  *   user gave KIND, which does not match existing.
667                                  */
668                                 if ((n->nlmsg_flags&NLM_F_CREATE) &&
669                                     (n->nlmsg_flags&NLM_F_REPLACE) &&
670                                     ((n->nlmsg_flags&NLM_F_EXCL) ||
671                                      (tca[TCA_KIND-1] &&
672                                       rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
673                                         goto create_n_graft;
674                         }
675                 }
676         } else {
677                 if (!tcm->tcm_handle)
678                         return -EINVAL;
679                 q = qdisc_lookup(dev, tcm->tcm_handle);
680         }
681 
682         /* Change qdisc parameters */
683         if (q == NULL)
684                 return -ENOENT;
685         if (n->nlmsg_flags&NLM_F_EXCL)
686                 return -EEXIST;
687         if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
688                 return -EINVAL;
689         err = qdisc_change(q, tca);
690         if (err == 0)
691                 qdisc_notify(skb, n, clid, NULL, q);
692         return err;
693 
694 create_n_graft:
695         if (!(n->nlmsg_flags&NLM_F_CREATE))
696                 return -ENOENT;
697         if (clid == TC_H_INGRESS)
698                 q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
699         else
700                 q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
701         if (q == NULL)
702                 return err;
703 
704 graft:
705         if (1) {
706                 struct Qdisc *old_q = NULL;
707                 err = qdisc_graft(dev, p, clid, q, &old_q);
708                 if (err) {
709                         if (q) {
710                                 spin_lock_bh(&dev->queue_lock);
711                                 qdisc_destroy(q);
712                                 spin_unlock_bh(&dev->queue_lock);
713                         }
714                         return err;
715                 }
716                 qdisc_notify(skb, n, clid, old_q, q);
717                 if (old_q) {
718                         spin_lock_bh(&dev->queue_lock);
719                         qdisc_destroy(old_q);
720                         spin_unlock_bh(&dev->queue_lock);
721                 }
722         }
723         return 0;
724 }
725 
726 int qdisc_copy_stats(struct sk_buff *skb, struct tc_stats *st)
727 {
728         spin_lock_bh(st->lock);
729         RTA_PUT(skb, TCA_STATS, (char*)&st->lock - (char*)st, st);
730         spin_unlock_bh(st->lock);
731         return 0;
732 
733 rtattr_failure:
734         spin_unlock_bh(st->lock);
735         return -1;
736 }
737 
738 
739 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
740                          u32 pid, u32 seq, unsigned flags, int event)
741 {
742         struct tcmsg *tcm;
743         struct nlmsghdr  *nlh;
744         unsigned char    *b = skb->tail;
745 
746         nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
747         nlh->nlmsg_flags = flags;
748         tcm = NLMSG_DATA(nlh);
749         tcm->tcm_family = AF_UNSPEC;
750         tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0;
751         tcm->tcm_parent = clid;
752         tcm->tcm_handle = q->handle;
753         tcm->tcm_info = atomic_read(&q->refcnt);
754         RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
755         if (q->ops->dump && q->ops->dump(q, skb) < 0)
756                 goto rtattr_failure;
757         q->stats.qlen = q->q.qlen;
758         if (qdisc_copy_stats(skb, &q->stats))
759                 goto rtattr_failure;
760         nlh->nlmsg_len = skb->tail - b;
761         return skb->len;
762 
763 nlmsg_failure:
764 rtattr_failure:
765         skb_trim(skb, b - skb->data);
766         return -1;
767 }
768 
769 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
770                         u32 clid, struct Qdisc *old, struct Qdisc *new)
771 {
772         struct sk_buff *skb;
773         u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
774 
775         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
776         if (!skb)
777                 return -ENOBUFS;
778 
779         if (old && old->handle) {
780                 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
781                         goto err_out;
782         }
783         if (new) {
784                 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
785                         goto err_out;
786         }
787 
788         if (skb->len)
789                 return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
790 
791 err_out:
792         kfree_skb(skb);
793         return -EINVAL;
794 }
795 
796 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
797 {
798         int idx, q_idx;
799         int s_idx, s_q_idx;
800         struct net_device *dev;
801         struct Qdisc *q;
802 
803         s_idx = cb->args[0];
804         s_q_idx = q_idx = cb->args[1];
805         read_lock(&dev_base_lock);
806         for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
807                 if (idx < s_idx)
808                         continue;
809                 if (idx > s_idx)
810                         s_q_idx = 0;
811                 read_lock(&qdisc_tree_lock);
812                 for (q = dev->qdisc_list, q_idx = 0; q;
813                      q = q->next, q_idx++) {
814                         if (q_idx < s_q_idx)
815                                 continue;
816                         if (tc_fill_qdisc(skb, q, 0, NETLINK_CB(cb->skb).pid,
817                                           cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
818                                 read_unlock(&qdisc_tree_lock);
819                                 goto done;
820                         }
821                 }
822                 read_unlock(&qdisc_tree_lock);
823         }
824 
825 done:
826         read_unlock(&dev_base_lock);
827 
828         cb->args[0] = idx;
829         cb->args[1] = q_idx;
830 
831         return skb->len;
832 }
833 
834 
835 
836 /************************************************
837  *      Traffic classes manipulation.           *
838  ************************************************/
839 
840 
841 
842 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
843 {
844         struct tcmsg *tcm = NLMSG_DATA(n);
845         struct rtattr **tca = arg;
846         struct net_device *dev;
847         struct Qdisc *q = NULL;
848         struct Qdisc_class_ops *cops;
849         unsigned long cl = 0;
850         unsigned long new_cl;
851         u32 pid = tcm->tcm_parent;
852         u32 clid = tcm->tcm_handle;
853         u32 qid = TC_H_MAJ(clid);
854         int err;
855 
856         if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
857                 return -ENODEV;
858 
859         /*
860            parent == TC_H_UNSPEC - unspecified parent.
861            parent == TC_H_ROOT   - class is root, which has no parent.
862            parent == X:0         - parent is root class.
863            parent == X:Y         - parent is a node in hierarchy.
864            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
865 
866            handle == 0:0         - generate handle from kernel pool.
867            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
868            handle == X:Y         - clear.
869            handle == X:0         - root class.
870          */
871 
872         /* Step 1. Determine qdisc handle X:0 */
873 
874         if (pid != TC_H_ROOT) {
875                 u32 qid1 = TC_H_MAJ(pid);
876 
877                 if (qid && qid1) {
878                         /* If both majors are known, they must be identical. */
879                         if (qid != qid1)
880                                 return -EINVAL;
881                 } else if (qid1) {
882                         qid = qid1;
883                 } else if (qid == 0)
884                         qid = dev->qdisc_sleeping->handle;
885 
886                 /* Now qid is genuine qdisc handle consistent
887                    both with parent and child.
888 
889                    TC_H_MAJ(pid) still may be unspecified, complete it now.
890                  */
891                 if (pid)
892                         pid = TC_H_MAKE(qid, pid);
893         } else {
894                 if (qid == 0)
895                         qid = dev->qdisc_sleeping->handle;
896         }
897 
898         /* OK. Locate qdisc */
899         if ((q = qdisc_lookup(dev, qid)) == NULL) 
900                 return -ENOENT;
901 
902         /* An check that it supports classes */
903         cops = q->ops->cl_ops;
904         if (cops == NULL)
905                 return -EINVAL;
906 
907         /* Now try to get class */
908         if (clid == 0) {
909                 if (pid == TC_H_ROOT)
910                         clid = qid;
911         } else
912                 clid = TC_H_MAKE(qid, clid);
913 
914         if (clid)
915                 cl = cops->get(q, clid);
916 
917         if (cl == 0) {
918                 err = -ENOENT;
919                 if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
920                         goto out;
921         } else {
922                 switch (n->nlmsg_type) {
923                 case RTM_NEWTCLASS:     
924                         err = -EEXIST;
925                         if (n->nlmsg_flags&NLM_F_EXCL)
926                                 goto out;
927                         break;
928                 case RTM_DELTCLASS:
929                         err = cops->delete(q, cl);
930                         if (err == 0)
931                                 tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
932                         goto out;
933                 case RTM_GETTCLASS:
934                         err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
935                         goto out;
936                 default:
937                         err = -EINVAL;
938                         goto out;
939                 }
940         }
941 
942         new_cl = cl;
943         err = cops->change(q, clid, pid, tca, &new_cl);
944         if (err == 0)
945                 tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
946 
947 out:
948         if (cl)
949                 cops->put(q, cl);
950 
951         return err;
952 }
953 
954 
955 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
956                           unsigned long cl,
957                           u32 pid, u32 seq, unsigned flags, int event)
958 {
959         struct tcmsg *tcm;
960         struct nlmsghdr  *nlh;
961         unsigned char    *b = skb->tail;
962 
963         nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
964         nlh->nlmsg_flags = flags;
965         tcm = NLMSG_DATA(nlh);
966         tcm->tcm_family = AF_UNSPEC;
967         tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0;
968         tcm->tcm_parent = q->handle;
969         tcm->tcm_handle = q->handle;
970         tcm->tcm_info = 0;
971         RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
972         if (q->ops->cl_ops->dump && q->ops->cl_ops->dump(q, cl, skb, tcm) < 0)
973                 goto rtattr_failure;
974         nlh->nlmsg_len = skb->tail - b;
975         return skb->len;
976 
977 nlmsg_failure:
978 rtattr_failure:
979         skb_trim(skb, b - skb->data);
980         return -1;
981 }
982 
983 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
984                           struct Qdisc *q, unsigned long cl, int event)
985 {
986         struct sk_buff *skb;
987         u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
988 
989         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
990         if (!skb)
991                 return -ENOBUFS;
992 
993         if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
994                 kfree_skb(skb);
995                 return -EINVAL;
996         }
997 
998         return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
999 }
1000 
1001 struct qdisc_dump_args
1002 {
1003         struct qdisc_walker w;
1004         struct sk_buff *skb;
1005         struct netlink_callback *cb;
1006 };
1007 
1008 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1009 {
1010         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1011 
1012         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1013                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1014 }
1015 
1016 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1017 {
1018         int t;
1019         int s_t;
1020         struct net_device *dev;
1021         struct Qdisc *q;
1022         struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1023         struct qdisc_dump_args arg;
1024 
1025         if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1026                 return 0;
1027         if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
1028                 return 0;
1029 
1030         s_t = cb->args[0];
1031 
1032         read_lock(&qdisc_tree_lock);
1033         for (q=dev->qdisc_list, t=0; q; q = q->next, t++) {
1034                 if (t < s_t) continue;
1035                 if (!q->ops->cl_ops) continue;
1036                 if (tcm->tcm_parent && TC_H_MAJ(tcm->tcm_parent) != q->handle)
1037                         continue;
1038                 if (t > s_t)
1039                         memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1040                 arg.w.fn = qdisc_class_dump;
1041                 arg.skb = skb;
1042                 arg.cb = cb;
1043                 arg.w.stop  = 0;
1044                 arg.w.skip = cb->args[1];
1045                 arg.w.count = 0;
1046                 q->ops->cl_ops->walk(q, &arg.w);
1047                 cb->args[1] = arg.w.count;
1048                 if (arg.w.stop)
1049                         break;
1050         }
1051         read_unlock(&qdisc_tree_lock);
1052 
1053         cb->args[0] = t;
1054 
1055         dev_put(dev);
1056         return skb->len;
1057 }
1058 
/*
 * Scheduler clock conversion factors.  Both start at 1 and are overwritten
 * during initialisation for the CPU and jiffies clock sources; psched_show()
 * prints them.
 */
int psched_tick_per_us = 1;
int psched_us_per_tick = 1;
1061 
1062 #ifdef CONFIG_PROC_FS
1063 static int psched_show(struct seq_file *seq, void *v)
1064 {
1065         seq_printf(seq, "%08x %08x %08x %08x\n",
1066                       psched_tick_per_us, psched_us_per_tick,
1067                       1000000, HZ);
1068 
1069         return 0;
1070 }
1071 
1072 static int psched_open(struct inode *inode, struct file *file)
1073 {
1074         return single_open(file, psched_show, PDE(inode)->data);
1075 }
1076 
1077 static struct file_operations psched_fops = {
1078         .owner = THIS_MODULE,
1079         .open = psched_open,
1080         .read  = seq_read,
1081         .llseek = seq_lseek,
1082         .release = single_release,
1083 };      
1084 #endif
1085 
#if PSCHED_CLOCK_SOURCE == PSCHED_GETTIMEOFDAY
/*
 * Convert a difference of whole seconds into microseconds, limited to
 * at most bound.
 *
 * delta_sec: difference in seconds (may be negative).
 * bound:     upper limit for the result, in microseconds.
 *
 * Returns bound when bound is not larger than one second's worth of
 * microseconds or when delta_sec is too large to be expressed in
 * microseconds; otherwise delta_sec * 1000000, capped at bound.  A
 * delta_sec too negative to be multiplied safely is saturated at the most
 * negative value handled here, so the multiplication can never overflow.
 */
int psched_tod_diff(int delta_sec, int bound)
{
        /* Largest magnitude of seconds whose microsecond count fits in a
           signed 32-bit value, keeping one second of headroom. */
        const int max_sec = (0x7FFFFFFF/1000000)-1;
        int delta;

        if (bound <= 1000000 || delta_sec > max_sec)
                return bound;

        /* Mirror the guard above for large negative differences: without it
           the multiplication below would overflow a signed int. */
        if (delta_sec < -max_sec)
                delta_sec = -max_sec;

        delta = delta_sec * 1000000;
        if (delta > bound)
                delta = bound;
        return delta;
}
#endif
1099 
1100 psched_time_t psched_time_base;
1101 
1102 #if PSCHED_CLOCK_SOURCE == PSCHED_CPU
1103 psched_tdiff_t psched_clock_per_hz;
1104 int psched_clock_scale;
1105 #endif
1106 
#ifdef PSCHED_WATCHER
/* Jiffies value recorded by the last psched_tick() run (non-CPU source). */
PSCHED_WATCHER psched_time_mark;

static void psched_tick(unsigned long);

static struct timer_list psched_timer = TIMER_INITIALIZER(psched_tick, 0, 0);

/*
 * Periodic timer handler that keeps the scheduler clock up to date and
 * re-arms itself.  The argument is not used.
 *
 * CPU clock source: samples the clock once and runs again in one second.
 * Otherwise: adds the jiffies elapsed since the previous run, shifted left
 * by PSCHED_JSCALE, to psched_time_base and runs again in one hour.
 */
static void psched_tick(unsigned long dummy)
{
#if PSCHED_CLOCK_SOURCE == PSCHED_CPU
        psched_time_t dummy_stamp;

        /* The sampled value itself is discarded.
           NOTE(review): presumably PSCHED_GET_TIME refreshes internal
           state as a side effect - confirm against its definition. */
        PSCHED_GET_TIME(dummy_stamp);
        /* It is OK up to 4GHz cpu */
        psched_timer.expires = jiffies + 1*HZ;
#else
        unsigned long now = jiffies;
        u64 elapsed = now - psched_time_mark;

        psched_time_base += elapsed << PSCHED_JSCALE;
        psched_time_mark = now;
        psched_timer.expires = now + 60*60*HZ;
#endif
        add_timer(&psched_timer);
}
#endif
1130 
1131 #if PSCHED_CLOCK_SOURCE == PSCHED_CPU
1132 int __init psched_calibrate_clock(void)
1133 {
1134         psched_time_t stamp, stamp1;
1135         struct timeval tv, tv1;
1136         psched_tdiff_t delay;
1137         long rdelay;
1138         unsigned long stop;
1139 
1140 #ifdef PSCHED_WATCHER
1141         psched_tick(0);
1142 #endif
1143         stop = jiffies + HZ/10;
1144         PSCHED_GET_TIME(stamp);
1145         do_gettimeofday(&tv);
1146         while (time_before(jiffies, stop)) {
1147                 barrier();
1148                 cpu_relax();
1149         }
1150         PSCHED_GET_TIME(stamp1);
1151         do_gettimeofday(&tv1);
1152 
1153         delay = PSCHED_TDIFF(stamp1, stamp);
1154         rdelay = tv1.tv_usec - tv.tv_usec;
1155         rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
1156         if (rdelay > delay)
1157                 return -1;
1158         delay /= rdelay;
1159         psched_tick_per_us = delay;
1160         while ((delay>>=1) != 0)
1161                 psched_clock_scale++;
1162         psched_us_per_tick = 1<<psched_clock_scale;
1163         psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
1164         return 0;
1165 }
1166 #endif
1167 
1168 int __init pktsched_init(void)
1169 {
1170         struct rtnetlink_link *link_p;
1171 
1172 #if PSCHED_CLOCK_SOURCE == PSCHED_CPU
1173         if (psched_calibrate_clock() < 0)
1174                 return -1;
1175 #elif PSCHED_CLOCK_SOURCE == PSCHED_JIFFIES
1176         psched_tick_per_us = HZ<<PSCHED_JSCALE;
1177         psched_us_per_tick = 1000000;
1178 #ifdef PSCHED_WATCHER
1179         psched_tick(0);
1180 #endif
1181 #endif
1182 
1183         link_p = rtnetlink_links[PF_UNSPEC];
1184 
1185         /* Setup rtnetlink links. It is made here to avoid
1186            exporting large number of public symbols.
1187          */
1188 
1189         if (link_p) {
1190                 link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
1191                 link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
1192                 link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
1193                 link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
1194                 link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1195                 link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1196                 link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1197                 link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
1198         }
1199 
1200 #define INIT_QDISC(name) { \
1201           extern struct Qdisc_ops name##_qdisc_ops; \
1202           register_qdisc(& name##_qdisc_ops);       \
1203         }
1204 
1205         INIT_QDISC(pfifo);
1206         INIT_QDISC(bfifo);
1207 
1208 #ifdef CONFIG_NET_SCH_CBQ
1209         INIT_QDISC(cbq);
1210 #endif
1211 #ifdef CONFIG_NET_SCH_HTB
1212         INIT_QDISC(htb);
1213 #endif
1214 #ifdef CONFIG_NET_SCH_CSZ
1215         INIT_QDISC(csz);
1216 #endif
1217 #ifdef CONFIG_NET_SCH_HPFQ
1218         INIT_QDISC(hpfq);
1219 #endif
1220 #ifdef CONFIG_NET_SCH_HFSC
1221         INIT_QDISC(hfsc);
1222 #endif
1223 #ifdef CONFIG_NET_SCH_RED
1224         INIT_QDISC(red);
1225 #endif
1226 #ifdef CONFIG_NET_SCH_GRED
1227        INIT_QDISC(gred);
1228 #endif
1229 #ifdef CONFIG_NET_SCH_INGRESS
1230        INIT_QDISC(ingress);
1231 #endif
1232 #ifdef CONFIG_NET_SCH_DSMARK
1233        INIT_QDISC(dsmark);
1234 #endif
1235 #ifdef CONFIG_NET_SCH_SFQ
1236         INIT_QDISC(sfq);
1237 #endif
1238 #ifdef CONFIG_NET_SCH_TBF
1239         INIT_QDISC(tbf);
1240 #endif
1241 #ifdef CONFIG_NET_SCH_TEQL
1242         teql_init();
1243 #endif
1244 #ifdef CONFIG_NET_SCH_PRIO
1245         INIT_QDISC(prio);
1246 #endif
1247 #ifdef CONFIG_NET_SCH_ATM
1248         INIT_QDISC(atm);
1249 #endif
1250 #ifdef CONFIG_NET_CLS
1251         tc_filter_init();
1252 #endif
1253 
1254         proc_net_fops_create("psched", 0, &psched_fops);
1255 
1256         return 0;
1257 }
1258 
/* Symbols exported from this file. */
EXPORT_SYMBOL(register_qdisc);
EXPORT_SYMBOL(unregister_qdisc);
EXPORT_SYMBOL(qdisc_get_rtab);
EXPORT_SYMBOL(qdisc_put_rtab);
EXPORT_SYMBOL(qdisc_copy_stats);
/* NOTE(review): PSCHED_EXPORTLIST is defined outside this file; presumably
   it expands to the clock-source specific exports - confirm. */
PSCHED_EXPORTLIST;
1265 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | Wiki (Japanese) | Wiki (English) | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

osdn.jp