/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>

#include <net/sock.h>
#include <net/pkt_sched.h>

#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event);

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. queueing disciplines manager frontend.
   2. traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box,
   which is able to enqueue packets and to dequeue them (when the
   device is ready to send something) in order and at times
   determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to make some sanity
   checks and the part of the work that is common to all qdiscs,
   and to provide rtnetlink notifications.

   All the real intelligent work is done inside qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but it does not mean that the queue is empty, it just means that the
   discipline does not want to send anything this time.
   The queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not a
   real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
   NET_XMIT_DROP	- this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN		- probably this packet was enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED	- dropped by police.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---requeue

   requeues a previously dequeued packet. It is used by non-standard
   or just buggy devices, which can defer output even if dev->tbusy=0.

   ---reset

   returns the qdisc to its initial state: purge all buffers, clear all
   timers, counters (except for statistics) etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
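/* For illustration only (not compiled here): a minimal sketch of how a
 * classless "queue" discipline could fill in struct Qdisc_ops against the
 * interface described above. The "example_" names are hypothetical; a real
 * module would also supply requeue/reset and keep its stats up to date.
 * register_qdisc() below fills any missing enqueue/requeue/dequeue with the
 * noop defaults.
 *
 *	static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 *	{
 *		__skb_queue_tail(&sch->q, skb);	// q->q.qlen now reflects occupancy
 *		sch->stats.packets++;
 *		sch->stats.bytes += skb->len;
 *		return 0;			// 0 == enqueued successfully
 *	}
 *
 *	static struct sk_buff *example_dequeue(struct Qdisc *sch)
 *	{
 *		return __skb_dequeue(&sch->q);	// NULL == nothing to send this time
 *	}
 *
 *	static struct Qdisc_ops example_qdisc_ops = {
 *		.id		= "example",
 *		.priv_size	= 0,
 *		.enqueue	= example_enqueue,
 *		.dequeue	= example_dequeue,
 *		.owner		= THIS_MODULE,
 *	};
 */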
/* Protects the list of registered TC modules. It is a pure SMP lock. */
static rwlock_t qdisc_mod_lock = RW_LOCK_UNLOCKED;


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->requeue == NULL)
		qops->requeue = noop_qdisc_ops.requeue;
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;
}

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
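/* A sketch of the usual calling convention (assuming the hypothetical
 * example_qdisc_ops above): a scheduler module registers its ops on load
 * and unregisters them on unload.
 *
 *	static int __init example_module_init(void)
 *	{
 *		return register_qdisc(&example_qdisc_ops);	// -EEXIST if the id is taken
 *	}
 *
 *	static void __exit example_module_exit(void)
 *	{
 *		unregister_qdisc(&example_qdisc_ops);	// -ENOENT only if never registered
 *	}
 *
 *	module_init(example_module_init);
 *	module_exit(example_module_exit);
 */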
/* We know the handle. Find the qdisc among all qdiscs attached to the
   device (the root qdisc, all its children, children of children etc.)
 */

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	for (q = dev->qdisc_list; q; q = q->next) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}

/* Find queueing discipline by name */

struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (rtattr_strcmp(kind, q->id) == 0)
				break;
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	/* The table must hold exactly 256 u32 transmission times. */
	if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, RTA_DATA(tab), 1024);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
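/* Typical use (a sketch; the attribute name depends on the qdisc): a shaper's
 * init/change routine receives a struct tc_ratespec plus the 1024-byte table
 * computed by userspace tc, and converts packet sizes to transmission times:
 *
 *	struct qdisc_rate_table *rtab;
 *
 *	rtab = qdisc_get_rtab(rate_spec, tb[TCA_TBF_RTAB-1]);
 *	if (rtab == NULL)
 *		return -EINVAL;
 *	// Transmission time (in scheduler ticks) of a packet of length len:
 *	//	rtab->data[len >> rtab->rate.cell_log]
 *	...
 *	qdisc_put_rtab(rtab);	// drop the reference when done
 */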
/* Allocate a unique handle from space managed by the kernel */

u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while (qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}
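/* A worked example of the handle encoding used above. A 32-bit handle is
 * "major:minor"; TC_H_MAJ()/TC_H_MIN() mask the respective 16-bit halves and
 * TC_H_MAKE() combines them:
 *
 *	u32 h = TC_H_MAKE(0x80010000U, 0);	// "8001:" in tc(8) notation
 *
 *	TC_H_MAJ(h);			// 0x80010000 - identifies the qdisc
 *	TC_H_MIN(h);			// 0x0000     - minor 0 == the qdisc itself
 *	h += TC_H_MAKE(0x10000U, 0);	// 0x80020000 - next handle in the pool
 *
 * So the loop above steps through majors 0x8001..0xfffe and resets when it
 * would reach 0xffff0000, the major of TC_H_ROOT.
 */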
/* Attach toplevel qdisc to device dev */

static struct Qdisc *
dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc;

	if (dev->flags & IFF_UP)
		dev_deactivate(dev);

	write_lock(&qdisc_tree_lock);
	spin_lock_bh(&dev->queue_lock);
	if (qdisc && qdisc->flags&TCQ_F_INGRES) {
		oqdisc = dev->qdisc_ingress;
		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
			/* delete */
			qdisc_reset(oqdisc);
			dev->qdisc_ingress = NULL;
		} else {	/* new */
			dev->qdisc_ingress = qdisc;
		}

	} else {

		oqdisc = dev->qdisc_sleeping;

		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
			qdisc_reset(oqdisc);

		/* ... and graft new one */
		if (qdisc == NULL)
			qdisc = &noop_qdisc;
		dev->qdisc_sleeping = qdisc;
		dev->qdisc = &noop_qdisc;
	}

	spin_unlock_bh(&dev->queue_lock);
	write_unlock(&qdisc_tree_lock);

	if (dev->flags & IFF_UP)
		dev_activate(dev);

	return oqdisc;
}


/* Graft qdisc "new" to class "classid" of qdisc "parent" or
   to device "dev".

   The old qdisc is not destroyed but returned in *old.
 */

int qdisc_graft(struct net_device *dev, struct Qdisc *parent, u32 classid,
		struct Qdisc *new, struct Qdisc **old)
{
	int err = 0;
	struct Qdisc *q = *old;


	if (parent == NULL) {
		if (q && q->flags&TCQ_F_INGRES) {
			*old = dev_graft_qdisc(dev, q);
		} else {
			*old = dev_graft_qdisc(dev, new);
		}
	} else {
		struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EINVAL;

		if (cops) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, old);
				cops->put(parent, cl);
			}
		}
	}
	return err;
}

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
{
	int err;
	struct rtattr *kind = tca[TCA_KIND-1];
	struct Qdisc *sch = NULL;
	struct Qdisc_ops *ops;
	int size;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_KMOD
	if (ops == NULL && tca[TCA_KIND-1] != NULL) {
		if (RTA_PAYLOAD(kind) <= IFNAMSIZ) {
			request_module("sch_%s", (char*)RTA_DATA(kind));
			ops = qdisc_lookup_ops(kind);
		}
	}
#endif

	err = -EINVAL;
	if (ops == NULL)
		goto err_out;

	size = sizeof(*sch) + ops->priv_size;

	sch = kmalloc(size, GFP_KERNEL);
	err = -ENOBUFS;
	if (!sch)
		goto err_out;

	/* Grrr... Resolve race condition with module unload */

	err = -EINVAL;
	if (ops != qdisc_lookup_ops(kind))
		goto err_out;

	memset(sch, 0, size);

	skb_queue_head_init(&sch->q);

	if (handle == TC_H_INGRESS)
		sch->flags |= TCQ_F_INGRES;

	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev = dev;
	atomic_set(&sch->refcnt, 1);
	sch->stats.lock = &dev->queue_lock;
	if (handle == 0) {
		handle = qdisc_alloc_handle(dev);
		err = -ENOMEM;
		if (handle == 0)
			goto err_out;
	}

	if (handle == TC_H_INGRESS)
		sch->handle = TC_H_MAKE(TC_H_INGRESS, 0);
	else
		sch->handle = handle;

	err = -EBUSY;
	if (!try_module_get(ops->owner))
		goto err_out;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
		write_lock(&qdisc_tree_lock);
		sch->next = dev->qdisc_list;
		dev->qdisc_list = sch;
		write_unlock(&qdisc_tree_lock);
#ifdef CONFIG_NET_ESTIMATOR
		if (tca[TCA_RATE-1])
			qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]);
#endif
		return sch;
	}
	module_put(ops->owner);

err_out:
	*errp = err;
	if (sch)
		kfree(sch);
	return NULL;
}

static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
{
	if (tca[TCA_OPTIONS-1]) {
		int err;

		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
		if (err)
			return err;
	}
#ifdef CONFIG_NET_ESTIMATOR
	if (tca[TCA_RATE-1]) {
		qdisc_kill_estimator(&sch->stats);
		qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]);
	}
#endif
	return 0;
}

struct check_loop_arg
{
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}
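/* The walker protocol used by check_loop() above (and by the class dump code
 * further below): cl_ops->walk() invokes w->fn once per class, skipping the
 * first w->skip classes, counting visits in w->count, and setting w->stop
 * when fn returns a negative value. A sketch of a conforming fn, with
 * hypothetical names:
 *
 *	static int example_walk_fn(struct Qdisc *q, unsigned long cl,
 *				   struct qdisc_walker *w)
 *	{
 *		// inspect class 'cl' of qdisc 'q' here ...
 *		return 0;	// continue; return < 0 to abort the walk
 *	}
 */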
/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
			return err;
		if (q) {
			qdisc_notify(skb, n, clid, q, NULL);
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(q);
			spin_unlock_bh(&dev->queue_lock);
		}
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}

/*
   Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}

		/* It may be the default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 * We know that some child q is already
				 * attached to this parent and have a choice:
				 * either to change it or to create/graft a new one.
				 *
				 * 1. We are allowed to create/graft only
				 *    if both the CREATE and REPLACE flags are set.
				 *
				 * 2. If EXCL is set, the requestor wanted to say
				 *    that a qdisc with handle tcm_handle is not
				 *    expected to exist, so we choose create/graft too.
				 *
				 * 3. The last case is when no flags are set.
				 *    Alas, it is sort of a hole in the API; we
				 *    cannot decide what to do unambiguously.
				 *    For now we select create/graft if the
				 *    user gave a KIND which does not match the
				 *    existing one.
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND-1] &&
				      rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
	else
		q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
	if (q == NULL)
		return err;

graft:
	if (1) {
		struct Qdisc *old_q = NULL;
		err = qdisc_graft(dev, p, clid, q, &old_q);
		if (err) {
			if (q) {
				spin_lock_bh(&dev->queue_lock);
				qdisc_destroy(q);
				spin_unlock_bh(&dev->queue_lock);
			}
			return err;
		}
		qdisc_notify(skb, n, clid, old_q, q);
		if (old_q) {
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(old_q);
			spin_unlock_bh(&dev->queue_lock);
		}
	}
	return 0;
}
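/* For orientation, the wire layout of the RTM_NEWQDISC request parsed above
 * (a sketch; rtnetlink has already split the attributes into tca[] before
 * this handler runs):
 *
 *	struct nlmsghdr		nlmsg_type  = RTM_NEWQDISC,
 *				nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE|...
 *	struct tcmsg		tcm_ifindex, tcm_parent (clid),
 *				tcm_handle ("major:0", or 0 to autoallocate)
 *	struct rtattr		TCA_KIND:    "pfifo", "tbf", ...
 *	struct rtattr		TCA_OPTIONS: qdisc-specific parameters, passed
 *				uninterpreted to ops->init/ops->change
 *	struct rtattr		TCA_RATE:    optional rate estimator config
 */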
int qdisc_copy_stats(struct sk_buff *skb, struct tc_stats *st)
{
	spin_lock_bh(st->lock);
	/* Dump everything up to (but not including) the lock pointer,
	   which is the last, kernel-private member of struct tc_stats. */
	RTA_PUT(skb, TCA_STATS, (char*)&st->lock - (char*)st, st);
	spin_unlock_bh(st->lock);
	return 0;

rtattr_failure:
	spin_unlock_bh(st->lock);
	return -1;
}


static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, unsigned flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb->tail;

	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
	nlh->nlmsg_flags = flags;
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto rtattr_failure;
	q->stats.qlen = q->q.qlen;
	if (qdisc_copy_stats(skb, &q->stats))
		goto rtattr_failure;
	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
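/* The length expression in qdisc_copy_stats() is just offsetof() spelled by
 * hand; an equivalent (illustrative) formulation, assuming <linux/stddef.h>:
 *
 *	RTA_PUT(skb, TCA_STATS, offsetof(struct tc_stats, lock), st);
 *
 * Only the user-visible counters that precede the kernel-private lock
 * pointer are copied out to userspace.
 */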
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			u32 clid, struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && old->handle) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	struct Qdisc *q;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);
	for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) {
		if (idx < s_idx)
			continue;
		if (idx > s_idx)
			s_q_idx = 0;
		read_lock(&qdisc_tree_lock);
		for (q = dev->qdisc_list, q_idx = 0; q;
		     q = q->next, q_idx++) {
			if (q_idx < s_q_idx)
				continue;
			if (tc_fill_qdisc(skb, q, 0, NETLINK_CB(cb->skb).pid,
					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
				read_unlock(&qdisc_tree_lock);
				goto done;
			}
		}
		read_unlock(&qdisc_tree_lock);
	}

done:
	read_unlock(&dev_base_lock);

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/



static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	struct Qdisc *q = NULL;
	struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - both halves are explicitly specified.
	   handle == X:0	 - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc_sleeping->handle;

		/* Now qid is a genuine qdisc handle consistent with
		   both parent and child.

		   TC_H_MAJ(pid) may still be unspecified; complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev->qdisc_sleeping->handle;
	}
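	/* A worked example of Step 1 (illustrative values): a request with
	 * tcm_parent = 1:1 and a minor-only tcm_handle of :2 arrives as
	 * pid == 0x00010001, clid == 0x00000002. Then:
	 *
	 *	qid  = TC_H_MAJ(clid)      = 0		(child major unknown)
	 *	qid1 = TC_H_MAJ(pid)       = 0x10000	(parent major known)
	 *	qid  = qid1                = 0x10000
	 *	pid  = TC_H_MAKE(qid, pid) = 0x00010001	(already complete)
	 *
	 * and the clid completion below yields TC_H_MAKE(qid, clid) == 1:2.
	 */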
	/* OK. Locate qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}


static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, unsigned flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb->tail;

	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
	nlh->nlmsg_flags = flags;
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (q->ops->cl_ops->dump && q->ops->cl_ops->dump(q, cl, skb, tcm) < 0)
		goto rtattr_failure;
	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}

struct qdisc_dump_args
{
	struct qdisc_walker	w;
	struct sk_buff		*skb;
	struct netlink_callback	*cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	int t;
	int s_t;
	struct net_device *dev;
	struct Qdisc *q;
	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
	struct qdisc_dump_args arg;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];

	read_lock(&qdisc_tree_lock);
	for (q = dev->qdisc_list, t = 0; q; q = q->next, t++) {
		if (t < s_t)
			continue;
		if (!q->ops->cl_ops)
			continue;
		if (tcm->tcm_parent && TC_H_MAJ(tcm->tcm_parent) != q->handle)
			continue;
		if (t > s_t)
			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
		arg.w.fn = qdisc_class_dump;
		arg.skb = skb;
		arg.cb = cb;
		arg.w.stop = 0;
		arg.w.skip = cb->args[1];
		arg.w.count = 0;
		q->ops->cl_ops->walk(q, &arg.w);
		cb->args[1] = arg.w.count;
		if (arg.w.stop)
			break;
	}
	read_unlock(&qdisc_tree_lock);

	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}
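/* How the two dump loops above resume across netlink reads (a sketch):
 * netlink invokes the dumpit callback repeatedly until it stops producing
 * data, and per-dump state lives in cb->args[]. For tc_dump_qdisc,
 * args[0]/args[1] are the device and qdisc indices already emitted; for
 * tc_dump_tclass, args[0] is the qdisc index and args[1] the class count
 * reached by the walker. When a tc_fill_*() call fails for lack of skb
 * space, the loop stops, the partial skb is returned, and the next
 * invocation skips straight to the saved position.
 */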
int psched_us_per_tick = 1;
int psched_tick_per_us = 1;

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   psched_tick_per_us, psched_us_per_tick,
		   1000000, HZ);

	return 0;
}

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}

static struct file_operations psched_fops = {
	.owner	 = THIS_MODULE,
	.open	 = psched_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
#endif

#if PSCHED_CLOCK_SOURCE == PSCHED_GETTIMEOFDAY
int psched_tod_diff(int delta_sec, int bound)
{
	int delta;

	if (bound <= 1000000 || delta_sec > (0x7FFFFFFF/1000000)-1)
		return bound;
	delta = delta_sec * 1000000;
	if (delta > bound)
		delta = bound;
	return delta;
}
#endif

psched_time_t psched_time_base;

#if PSCHED_CLOCK_SOURCE == PSCHED_CPU
psched_tdiff_t psched_clock_per_hz;
int psched_clock_scale;
#endif

#ifdef PSCHED_WATCHER
PSCHED_WATCHER psched_time_mark;

static void psched_tick(unsigned long);

static struct timer_list psched_timer = TIMER_INITIALIZER(psched_tick, 0, 0);

static void psched_tick(unsigned long dummy)
{
#if PSCHED_CLOCK_SOURCE == PSCHED_CPU
	psched_time_t dummy_stamp;
	PSCHED_GET_TIME(dummy_stamp);
	/* It is OK up to a 4GHz cpu */
	psched_timer.expires = jiffies + 1*HZ;
#else
	unsigned long now = jiffies;
	psched_time_base += ((u64)(now - psched_time_mark))<<PSCHED_JSCALE;
	psched_time_mark = now;
	psched_timer.expires = now + 60*60*HZ;
#endif
	add_timer(&psched_timer);
}
#endif

#if PSCHED_CLOCK_SOURCE == PSCHED_CPU
int __init psched_calibrate_clock(void)
{
	psched_time_t stamp, stamp1;
	struct timeval tv, tv1;
	psched_tdiff_t delay;
	long rdelay;
	unsigned long stop;

#ifdef PSCHED_WATCHER
	psched_tick(0);
#endif
	stop = jiffies + HZ/10;
	PSCHED_GET_TIME(stamp);
	do_gettimeofday(&tv);
	while (time_before(jiffies, stop)) {
		barrier();
		cpu_relax();
	}
	PSCHED_GET_TIME(stamp1);
	do_gettimeofday(&tv1);

	delay = PSCHED_TDIFF(stamp1, stamp);
	rdelay = tv1.tv_usec - tv.tv_usec;
	rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
	if (rdelay > delay)
		return -1;
	delay /= rdelay;
	psched_tick_per_us = delay;
	while ((delay >>= 1) != 0)
		psched_clock_scale++;
	psched_us_per_tick = 1<<psched_clock_scale;
	psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
	return 0;
}
#endif
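/* A worked calibration example (hypothetical numbers): on a 500 MHz CPU the
 * busy-wait above measures roughly delay = 50,000,000 CPU ticks against
 * rdelay = 100,000 usec, so:
 *
 *	psched_tick_per_us  = 50000000 / 100000 = 500	(ticks per usec)
 *	psched_clock_scale  = 8				(highest set bit of 500)
 *	psched_us_per_tick  = 1 << 8 = 256
 *	psched_clock_per_hz = (500 * (1000000/HZ)) >> 8
 *
 * i.e. later conversions between CPU ticks and microseconds reduce to a
 * multiply and a shift instead of a 64-bit division on the fast path.
 */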
int __init pktsched_init(void)
{
	struct rtnetlink_link *link_p;

#if PSCHED_CLOCK_SOURCE == PSCHED_CPU
	if (psched_calibrate_clock() < 0)
		return -1;
#elif PSCHED_CLOCK_SOURCE == PSCHED_JIFFIES
	psched_tick_per_us = HZ<<PSCHED_JSCALE;
	psched_us_per_tick = 1000000;
#ifdef PSCHED_WATCHER
	psched_tick(0);
#endif
#endif

	link_p = rtnetlink_links[PF_UNSPEC];

	/* Set up the rtnetlink links. It is done here to avoid
	   exporting a large number of public symbols.
	 */

	if (link_p) {
		link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
		link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
		link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
	}

#define INIT_QDISC(name) { \
	extern struct Qdisc_ops name##_qdisc_ops; \
	register_qdisc(& name##_qdisc_ops); \
}

	INIT_QDISC(pfifo);
	INIT_QDISC(bfifo);

#ifdef CONFIG_NET_SCH_CBQ
	INIT_QDISC(cbq);
#endif
#ifdef CONFIG_NET_SCH_HTB
	INIT_QDISC(htb);
#endif
#ifdef CONFIG_NET_SCH_CSZ
	INIT_QDISC(csz);
#endif
#ifdef CONFIG_NET_SCH_HPFQ
	INIT_QDISC(hpfq);
#endif
#ifdef CONFIG_NET_SCH_HFSC
	INIT_QDISC(hfsc);
#endif
#ifdef CONFIG_NET_SCH_RED
	INIT_QDISC(red);
#endif
#ifdef CONFIG_NET_SCH_GRED
	INIT_QDISC(gred);
#endif
#ifdef CONFIG_NET_SCH_INGRESS
	INIT_QDISC(ingress);
#endif
#ifdef CONFIG_NET_SCH_DSMARK
	INIT_QDISC(dsmark);
#endif
#ifdef CONFIG_NET_SCH_SFQ
	INIT_QDISC(sfq);
#endif
#ifdef CONFIG_NET_SCH_TBF
	INIT_QDISC(tbf);
#endif
#ifdef CONFIG_NET_SCH_TEQL
	teql_init();
#endif
#ifdef CONFIG_NET_SCH_PRIO
	INIT_QDISC(prio);
#endif
#ifdef CONFIG_NET_SCH_ATM
	INIT_QDISC(atm);
#endif
#ifdef CONFIG_NET_CLS
	tc_filter_init();
#endif

	proc_net_fops_create("psched", 0, &psched_fops);

	return 0;
}

EXPORT_SYMBOL(qdisc_copy_stats);
EXPORT_SYMBOL(qdisc_get_rtab);
EXPORT_SYMBOL(qdisc_put_rtab);
EXPORT_SYMBOL(register_qdisc);
EXPORT_SYMBOL(unregister_qdisc);
PSCHED_EXPORTLIST;