// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/rtnh.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict);
static size_t rt6_nlmsg_size(struct fib6_info *f6i);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = blackhole_netdev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}
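/* dst_ops hook: resolve the neighbour entry for this dst, preferring the
 * route's gateway and falling back to the packet's destination address.
 */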
static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any),
				dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= REFCOUNT_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
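/* With CONFIG_IPV6_MULTIPLE_TABLES, policy rules can return prohibit and
 * blackhole results, so matching dst templates are needed whose input and
 * output handlers produce the corresponding error behaviour.
 */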
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	from = xchg((__force struct fib6_info **)&rt->from, NULL);
	fib6_info_release(from);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
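/* Multipath selection: pick the sibling route whose nexthop hash upper
 * bound covers fl6->mp_hash, so all packets of one flow stick to the
 * same nexthop.
 */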
void fib6_select_path(const struct net *net, struct fib6_result *res,
		      struct flowi6 *fl6, int oif, bool have_oif_match,
		      const struct sk_buff *skb, int strict)
{
	struct fib6_info *sibling, *next_sibling;
	struct fib6_info *match = res->f6i;

	if ((!match->fib6_nsiblings && !match->nh) || have_oif_match)
		goto out;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash &&
	    (!match->nh || nexthop_is_multipath(match->nh)))
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (unlikely(match->nh)) {
		nexthop_path_fib6_result(res, fl6->mp_hash);
		return;
	}

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
		goto out;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		const struct fib6_nh *nh = sibling->fib6_nh;
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

out:
	res->f6i = match;
	res->nh = match->fib6_nh;
}

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
			       const struct in6_addr *saddr, int oif, int flags)
{
	const struct net_device *dev;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		return false;

	dev = nh->fib_nh_dev;
	if (oif) {
		if (dev->ifindex == oif)
			return true;
	} else {
		if (ipv6_chk_addr(net, saddr, dev,
				  flags & RT6_LOOKUP_F_IFACE))
			return true;
	}

	return false;
}

struct fib6_nh_dm_arg {
	struct net		*net;
	const struct in6_addr	*saddr;
	int			oif;
	int			flags;
	struct fib6_nh		*nh;
};

static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_dm_arg *arg = _arg;

	arg->nh = nh;
	return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif,
				  arg->flags);
}

/* returns fib6_nh from nexthop or NULL */
static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh,
					struct fib6_result *res,
					const struct in6_addr *saddr,
					int oif, int flags)
{
	struct fib6_nh_dm_arg arg = {
		.net   = net,
		.saddr = saddr,
		.oif   = oif,
		.flags = flags,
	};

	if (nexthop_is_blackhole(nh))
		return NULL;

	if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg))
		return arg.nh;

	return NULL;
}

static void rt6_device_match(struct net *net, struct fib6_result *res,
			     const struct in6_addr *saddr, int oif, int flags)
{
	struct fib6_info *f6i = res->f6i;
	struct fib6_info *spf6i;
	struct fib6_nh *nh;

	if (!oif && ipv6_addr_any(saddr)) {
		if (unlikely(f6i->nh)) {
			nh = nexthop_fib6_nh(f6i->nh);
			if (nexthop_is_blackhole(f6i->nh))
				goto out_blackhole;
		} else {
			nh = f6i->fib6_nh;
		}
		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
			goto out;
	}

	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
		bool matched = false;

		if (unlikely(spf6i->nh)) {
			nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr,
					      oif, flags);
			if (nh)
				matched = true;
		} else {
			nh = spf6i->fib6_nh;
			if (__rt6_device_match(net, nh, saddr, oif, flags))
				matched = true;
		}
		if (matched) {
			res->f6i = spf6i;
			goto out;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = res->f6i->fib6_nh;
		goto out;
	}

	if (unlikely(f6i->nh)) {
		nh = nexthop_fib6_nh(f6i->nh);
		if (nexthop_is_blackhole(f6i->nh))
			goto out_blackhole;
	} else {
		nh = f6i->fib6_nh;
	}

	if (nh->fib_nh_flags & RTNH_F_DEAD) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = res->f6i->fib6_nh;
	}
out:
	res->nh = nh;
	res->fib6_type = res->f6i->fib6_type;
	res->fib6_flags = res->f6i->fib6_flags;
	return;

out_blackhole:
	res->fib6_flags |= RTF_REJECT;
	res->fib6_type = RTN_BLACKHOLE;
	res->nh = nh;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
static void rt6_probe(struct fib6_nh *fib6_nh)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	unsigned long last_probe;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!fib6_nh->fib_nh_gw_family)
		return;

	nh_gw = &fib6_nh->fib_nh_gw6;
	dev = fib6_nh->fib_nh_dev;
	rcu_read_lock_bh();
	last_probe = READ_ONCE(fib6_nh->last_probe);
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, last_probe +
			      idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (!work || cmpxchg(&fib6_nh->last_probe,
			     last_probe, jiffies) != last_probe) {
		kfree(work);
	} else {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
					  &fib6_nh->fib_nh_gw6);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
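/* Score a nexthop for router selection: matching the requested outgoing
 * interface and a higher RA preference raise the score; a negative value
 * reports a neighbour (NUD) failure back to the caller.
 */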
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict)
{
	int m = 0;

	if (!oif || nh->fib_nh_dev->ifindex == oif)
		m = 2;

	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif
	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
		int n = rt6_check_neigh(nh);
		if (n < 0)
			return n;
	}
	return m;
}

static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
		       int oif, int strict, int *mpri, bool *do_rr)
{
	bool match_do_rr = false;
	bool rc = false;
	int m;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		goto out;

	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	m = rt6_score_route(nh, fib6_flags, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(nh);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		rc = true;
	}
out:
	return rc;
}

struct fib6_nh_frl_arg {
	u32	flags;
	int	oif;
	int	strict;
	int	*mpri;
	bool	*do_rr;
	struct fib6_nh *nh;
};

static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_frl_arg *arg = _arg;

	arg->nh = nh;
	return find_match(nh, arg->flags, arg->oif, arg->strict,
			  arg->mpri, arg->do_rr);
}

static void __find_rr_leaf(struct fib6_info *f6i_start,
			   struct fib6_info *nomatch, u32 metric,
			   struct fib6_result *res, struct fib6_info **cont,
			   int oif, int strict, bool *do_rr, int *mpri)
{
	struct fib6_info *f6i;

	for (f6i = f6i_start;
	     f6i && f6i != nomatch;
	     f6i = rcu_dereference(f6i->fib6_next)) {
		bool matched = false;
		struct fib6_nh *nh;

		if (cont && f6i->fib6_metric != metric) {
			*cont = f6i;
			return;
		}

		if (fib6_check_expired(f6i))
			continue;

		if (unlikely(f6i->nh)) {
			struct fib6_nh_frl_arg arg = {
				.flags	= f6i->fib6_flags,
				.oif	= oif,
				.strict	= strict,
				.mpri	= mpri,
				.do_rr	= do_rr
			};

			if (nexthop_is_blackhole(f6i->nh)) {
				res->fib6_flags = RTF_REJECT;
				res->fib6_type = RTN_BLACKHOLE;
				res->f6i = f6i;
				res->nh = nexthop_fib6_nh(f6i->nh);
				return;
			}
			if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match,
						     &arg)) {
				matched = true;
				nh = arg.nh;
			}
		} else {
			nh = f6i->fib6_nh;
			if (find_match(nh, f6i->fib6_flags, oif, strict,
				       mpri, do_rr))
				matched = true;
		}
		if (matched) {
			res->f6i = f6i;
			res->nh = nh;
			res->fib6_flags = f6i->fib6_flags;
			res->fib6_type = f6i->fib6_type;
		}
	}
}
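/* Round-robin leaf walk: scan routes of the current metric starting at
 * rr_head, wrap around from the head of the leaf list, and only then
 * consider routes with a worse metric (collected in @cont).
 */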
static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
			 struct fib6_info *rr_head, int oif, int strict,
			 bool *do_rr, struct fib6_result *res)
{
	u32 metric = rr_head->fib6_metric;
	struct fib6_info *cont = NULL;
	int mpri = -1;

	__find_rr_leaf(rr_head, NULL, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	__find_rr_leaf(leaf, rr_head, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	if (res->f6i || !cont)
		return;

	__find_rr_leaf(cont, NULL, metric, res, NULL,
		       oif, strict, do_rr, &mpri);
}

static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
		       struct fib6_result *res, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *rt0;
	bool do_rr = false;
	int key_plen;

	/* make sure this function or its helpers sets f6i */
	res->f6i = NULL;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		goto out;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		goto out;

	find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

out:
	if (!res->f6i) {
		res->f6i = net->ipv6.fib6_null_entry;
		res->nh = res->f6i->fib6_nh;
		res->fib6_flags = res->f6i->fib6_flags;
		res->fib6_type = res->f6i->fib6_type;
	}
}

static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
{
	return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
	       res->nh->fib_nh_gw_family;
}
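/* Handle a Route Information option received in a Router Advertisement
 * (RFC 4191): validate the option, then add, refresh or remove the
 * corresponding RTF_ROUTEINFO route.
 */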
#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif

/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;

	if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&res->f6i->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
{
	rt->dst.error = ip6_rt_type_to_error(fib6_type);

	switch (fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;

	if (res->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, res->fib6_type);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (res->nh->fib_nh_lws) {
		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}
/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}

/* Caller must already hold reference to f6i in result */
static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	const struct net_device *dev = nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;

	ip6_rt_init_dst(rt, res);

	rt->rt6i_dst = f6i->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_flags = res->fib6_flags;
	if (nh->fib_nh_gw_family) {
		rt->rt6i_gateway = nh->fib_nh_gw6;
		rt->rt6i_flags |= RTF_GATEWAY;
	}
	rt6_set_from(rt, f6i);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = f6i->fib6_src;
#endif
}

static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (net) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}
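/* On failure to clone (or for a reject route) the lookup helpers below
 * fall back to the null entry, so callers always get a valid dst back.
 */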
/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;
	unsigned short flags;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(f6i))
		goto fallback;

	flags = fib6_info_dst_flags(f6i);
	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(f6i);
		goto fallback;
	}

	ip6_rt_copy_init(nrt, res);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_result res = {};
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	res.f6i = rcu_dereference(fn->leaf);
	if (!res.f6i)
		res.f6i = net->ipv6.fib6_null_entry;
	else
		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
				 flags);

	if (res.f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;

		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
		goto out;
	} else if (res.fib6_flags & RTF_REJECT) {
		goto do_create;
	}

	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
			 fl6->flowi6_oif != 0, skb, flags);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else {
do_create:
		rt = ip6_create_rt_rcu(&res);
	}

out:
	trace_fib6_table_lookup(net, &res, table, fl6);

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
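/* RTF_CACHE clones: per-destination (/128) copies of a route that carry
 * destination-specific state such as a PMTU exception or a redirect.
 */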
static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct fib6_info *f6i = res->f6i;
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	dev = ip6_rt_get_dev_rcu(res);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(f6i);
		return NULL;
	}

	ip6_rt_copy_init(rt, res);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(res)) {
		if (f6i->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;
	unsigned short flags = fib6_info_dst_flags(f6i);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(res);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(f6i);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, res);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt;

	pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(res);
	if (!pcpu_rt)
		return NULL;

	p = this_cpu_ptr(res->nh->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	if (res->f6i->fib6_destroying) {
		struct fib6_info *from;

		from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
		fib6_info_release(from);
	}

	return pcpu_rt;
}

/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}
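/* The (daddr, saddr) pair is hashed into one of
 * 1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT chains; the source address only
 * contributes when subtree (source-based) routing is configured.
 */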
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static unsigned int fib6_mtu(const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	unsigned int mtu;

	if (res->f6i->fib6_pmtu) {
		mtu = res->f6i->fib6_pmtu;
	} else {
		struct net_device *dev = nh->fib_nh_dev;
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}
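/* The lowest bit of nh->rt6i_exception_bucket doubles as a "flushed"
 * flag; once set, rt6_insert_exception() refuses to add entries for a
 * nexthop that is being torn down.
 */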
#define FIB6_EXCEPTION_BUCKET_FLUSHED  0x1UL

/* used when the flushed bit is not relevant, only access to the bucket
 * (ie., all bucket users except rt6_insert_exception);
 *
 * called under rcu lock; sometimes called with rt6_exception_lock held
 */
static
struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh,
						       spinlock_t *lock)
{
	struct rt6_exception_bucket *bucket;

	if (lock)
		bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
						   lockdep_is_held(lock));
	else
		bucket = rcu_dereference(nh->rt6i_exception_bucket);

	/* remove bucket flushed bit if set */
	if (bucket) {
		unsigned long p = (unsigned long)bucket;

		p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED;
		bucket = (struct rt6_exception_bucket *)p;
	}

	return bucket;
}

static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket)
{
	unsigned long p = (unsigned long)bucket;

	return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED);
}

/* called with rt6_exception_lock held */
static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh,
					      spinlock_t *lock)
{
	struct rt6_exception_bucket *bucket;
	unsigned long p;

	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
					   lockdep_is_held(lock));

	p = (unsigned long)bucket;
	p |= FIB6_EXCEPTION_BUCKET_FLUSHED;
	bucket = (struct rt6_exception_bucket *)p;
	rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
}
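/* Insert an RTF_CACHE route as an exception for its nexthop, then bump
 * the table sernum so stale per-socket dst caches revalidate on next use.
 */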
static int rt6_insert_exception(struct rt6_info *nrt,
				const struct fib6_result *res)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct fib6_info *f6i = res->f6i;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_nh *nh = res->nh;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
	} else if (fib6_nh_excptn_bucket_flushed(bucket)) {
		err = -EINVAL;
		goto out;
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (f6i->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on f6i.
	 * Only insert this exception route if its mtu
	 * is less than f6i's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&f6i->fib6_table->tb6_lock);
		fib6_update_sernum(net, f6i);
		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);

	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (!bucket)
		goto out;

	/* Prevent rt6_insert_exception() from recreating the bucket list */
	if (!from)
		fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock);

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) {
			if (!from ||
			    rcu_access_pointer(rt6_ex->rt6i->from) == from)
				rt6_remove_exception(bucket, rt6_ex);
		}
		WARN_ON_ONCE(!from && bucket->depth);
		bucket++;
	}
out:
	spin_unlock_bh(&rt6_exception_lock);
}

static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg)
{
	struct fib6_info *f6i = arg;

	fib6_nh_flush_exceptions(nh, f6i);

	return 0;
}

void rt6_flush_exceptions(struct fib6_info *f6i)
{
	if (f6i->nh)
		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions,
					 f6i);
	else
		fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
}
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct rt6_info *ret = NULL;

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * However, the src addr used to create the hash
	 * might not be exactly the passed in saddr which
	 * is a /128 addr from the flow.
	 * So we need to use f6i->fib6_src to redo lookup
	 * if the passed in saddr does not find anything.
	 * (See the logic in ip6_rt_cache_alloc() on how
	 * rt->rt6i_src is updated.)
	 */
	if (res->f6i->fib6_src.plen)
		src_key = saddr;
find_ex:
#endif
	bucket = fib6_nh_get_excptn_bucket(res->nh, NULL);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		ret = rt6_ex->rt6i;

#ifdef CONFIG_IPV6_SUBTREES
	/* Use fib6_src as src_key and redo lookup */
	if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
		src_key = &res->f6i->fib6_src.addr;
		goto find_ex;
	}
#endif

	return ret;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen,
				    const struct rt6_info *rt)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int err;

	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

struct fib6_nh_excptn_arg {
	struct rt6_info	*rt;
	int		plen;
};

static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_excptn_arg *arg = _arg;
	int err;

	err = fib6_nh_remove_exception(nh, arg->plen, arg->rt);
	if (err == 0)
		return 1;

	return 0;
}

static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (from->nh) {
		struct fib6_nh_excptn_arg arg = {
			.rt = rt,
			.plen = from->fib6_src.plen
		};
		int rc;

		/* rc = 1 means an entry was found */
		rc = nexthop_for_each_fib6_nh(from->nh,
					      rt6_nh_remove_exception_rt,
					      &arg);
		return rc ? 0 : -ENOENT;
	}

	return fib6_nh_remove_exception(from->fib6_nh,
					from->fib6_src.plen, rt);
}
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen,
				     const struct rt6_info *rt)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;

	bucket = fib6_nh_get_excptn_bucket(nh, NULL);
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;
}

struct fib6_nh_match_arg {
	const struct net_device *dev;
	const struct in6_addr	*gw;
	struct fib6_nh		*match;
};

/* determine if fib6_nh has given device and gateway */
static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_match_arg *arg = _arg;

	if (arg->dev != nh->fib_nh_dev ||
	    (arg->gw && !nh->fib_nh_gw_family) ||
	    (!arg->gw && nh->fib_nh_gw_family) ||
	    (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6)))
		return 0;

	arg->match = nh;

	/* found a match, break the loop */
	return 1;
}

static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct fib6_info *from;
	struct fib6_nh *fib6_nh;

	rcu_read_lock();

	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	if (from->nh) {
		struct fib6_nh_match_arg arg = {
			.dev = rt->dst.dev,
			.gw = &rt->rt6i_gateway,
		};

		nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg);

		if (!arg.match)
			goto unlock;
		fib6_nh = arg.match;
	} else {
		fib6_nh = from->fib6_nh;
	}
	fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt);
unlock:
	rcu_read_unlock();
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}
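/* Propagate a device MTU change to every exception route hanging off
 * @nh, subject to the rules in rt6_mtu_change_route_allowed().
 */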
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       const struct fib6_nh *nh, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh,
					    const struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others still hold references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

static void fib6_nh_age_exceptions(const struct fib6_nh *nh,
				   struct fib6_gc_args *gc_args,
				   unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
struct fib6_nh_age_excptn_arg *arg = _arg; 2133 2134 fib6_nh_age_exceptions(nh, arg->gc_args, arg->now); 2135 return 0; 2136 } 2137 2138 void rt6_age_exceptions(struct fib6_info *f6i, 2139 struct fib6_gc_args *gc_args, 2140 unsigned long now) 2141 { 2142 if (f6i->nh) { 2143 struct fib6_nh_age_excptn_arg arg = { 2144 .gc_args = gc_args, 2145 .now = now 2146 }; 2147 2148 nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions, 2149 &arg); 2150 } else { 2151 fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now); 2152 } 2153 } 2154 2155 /* must be called with rcu lock held */ 2156 int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, 2157 struct flowi6 *fl6, struct fib6_result *res, int strict) 2158 { 2159 struct fib6_node *fn, *saved_fn; 2160 2161 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2162 saved_fn = fn; 2163 2164 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) 2165 oif = 0; 2166 2167 redo_rt6_select: 2168 rt6_select(net, fn, oif, res, strict); 2169 if (res->f6i == net->ipv6.fib6_null_entry) { 2170 fn = fib6_backtrack(fn, &fl6->saddr); 2171 if (fn) 2172 goto redo_rt6_select; 2173 else if (strict & RT6_LOOKUP_F_REACHABLE) { 2174 /* also consider unreachable routes */ 2175 strict &= ~RT6_LOOKUP_F_REACHABLE; 2176 fn = saved_fn; 2177 goto redo_rt6_select; 2178 } 2179 } 2180 2181 trace_fib6_table_lookup(net, res, table, fl6); 2182 2183 return 0; 2184 } 2185 2186 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, 2187 int oif, struct flowi6 *fl6, 2188 const struct sk_buff *skb, int flags) 2189 { 2190 struct fib6_result res = {}; 2191 struct rt6_info *rt = NULL; 2192 int strict = 0; 2193 2194 WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) && 2195 !rcu_read_lock_held()); 2196 2197 strict |= flags & RT6_LOOKUP_F_IFACE; 2198 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; 2199 if (net->ipv6.devconf_all->forwarding == 0) 2200 strict |= RT6_LOOKUP_F_REACHABLE; 2201 2202 rcu_read_lock(); 2203 2204 fib6_table_lookup(net, table, oif, fl6, &res, strict); 2205 if (res.f6i == net->ipv6.fib6_null_entry) 2206 goto out; 2207 2208 fib6_select_path(net, &res, fl6, oif, false, skb, strict); 2209 2210 /* Search through the exception table */ 2211 rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); 2212 if (rt) { 2213 goto out; 2214 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && 2215 !res.nh->fib_nh_gw_family)) { 2216 /* Create an RTF_CACHE clone which will not be 2217 * owned by the fib6 tree. It is for the special case where 2218 * the daddr in the skb during the neighbor look-up is different 2219 * from the fl6->daddr used to look up the route here. 2220 */ 2221 rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL); 2222 2223 if (rt) { 2224 /* 1 refcnt is taken during ip6_rt_cache_alloc(). 2225 * As rt6_uncached_list_add() does not consume refcnt, 2226 * this refcnt is always returned to the caller even 2227 * if the caller sets the RT6_LOOKUP_F_DST_NOREF flag.
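 * The caller is thus expected to release that reference with
 * dst_release() when done.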
2228 */ 2229 rt6_uncached_list_add(rt); 2230 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2231 rcu_read_unlock(); 2232 2233 return rt; 2234 } 2235 } else { 2236 /* Get a percpu copy */ 2237 local_bh_disable(); 2238 rt = rt6_get_pcpu_route(&res); 2239 2240 if (!rt) 2241 rt = rt6_make_pcpu_route(net, &res); 2242 2243 local_bh_enable(); 2244 } 2245 out: 2246 if (!rt) 2247 rt = net->ipv6.ip6_null_entry; 2248 if (!(flags & RT6_LOOKUP_F_DST_NOREF)) 2249 ip6_hold_safe(net, &rt); 2250 rcu_read_unlock(); 2251 2252 return rt; 2253 } 2254 EXPORT_SYMBOL_GPL(ip6_pol_route); 2255 2256 static struct rt6_info *ip6_pol_route_input(struct net *net, 2257 struct fib6_table *table, 2258 struct flowi6 *fl6, 2259 const struct sk_buff *skb, 2260 int flags) 2261 { 2262 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags); 2263 } 2264 2265 struct dst_entry *ip6_route_input_lookup(struct net *net, 2266 struct net_device *dev, 2267 struct flowi6 *fl6, 2268 const struct sk_buff *skb, 2269 int flags) 2270 { 2271 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) 2272 flags |= RT6_LOOKUP_F_IFACE; 2273 2274 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input); 2275 } 2276 EXPORT_SYMBOL_GPL(ip6_route_input_lookup); 2277 2278 static void ip6_multipath_l3_keys(const struct sk_buff *skb, 2279 struct flow_keys *keys, 2280 struct flow_keys *flkeys) 2281 { 2282 const struct ipv6hdr *outer_iph = ipv6_hdr(skb); 2283 const struct ipv6hdr *key_iph = outer_iph; 2284 struct flow_keys *_flkeys = flkeys; 2285 const struct ipv6hdr *inner_iph; 2286 const struct icmp6hdr *icmph; 2287 struct ipv6hdr _inner_iph; 2288 struct icmp6hdr _icmph; 2289 2290 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6)) 2291 goto out; 2292 2293 icmph = skb_header_pointer(skb, skb_transport_offset(skb), 2294 sizeof(_icmph), &_icmph); 2295 if (!icmph) 2296 goto out; 2297 2298 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH && 2299 icmph->icmp6_type != ICMPV6_PKT_TOOBIG && 2300 icmph->icmp6_type != ICMPV6_TIME_EXCEED && 2301 icmph->icmp6_type != ICMPV6_PARAMPROB) 2302 goto out; 2303 2304 inner_iph = skb_header_pointer(skb, 2305 skb_transport_offset(skb) + sizeof(*icmph), 2306 sizeof(_inner_iph), &_inner_iph); 2307 if (!inner_iph) 2308 goto out; 2309 2310 key_iph = inner_iph; 2311 _flkeys = NULL; 2312 out: 2313 if (_flkeys) { 2314 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src; 2315 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst; 2316 keys->tags.flow_label = _flkeys->tags.flow_label; 2317 keys->basic.ip_proto = _flkeys->basic.ip_proto; 2318 } else { 2319 keys->addrs.v6addrs.src = key_iph->saddr; 2320 keys->addrs.v6addrs.dst = key_iph->daddr; 2321 keys->tags.flow_label = ip6_flowlabel(key_iph); 2322 keys->basic.ip_proto = key_iph->nexthdr; 2323 } 2324 } 2325 2326 /* if skb is set it will be used and fl6 can be NULL */ 2327 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, 2328 const struct sk_buff *skb, struct flow_keys *flkeys) 2329 { 2330 struct flow_keys hash_keys; 2331 u32 mhash; 2332 2333 switch (ip6_multipath_hash_policy(net)) { 2334 case 0: 2335 memset(&hash_keys, 0, sizeof(hash_keys)); 2336 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2337 if (skb) { 2338 ip6_multipath_l3_keys(skb, &hash_keys, flkeys); 2339 } else { 2340 hash_keys.addrs.v6addrs.src = fl6->saddr; 2341 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2342 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); 2343 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2344 } 2345 break; 2346 case 1: 2347 
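/* L4 policy: hash on the five-tuple (addresses, ports, protocol) */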
if (skb) { 2348 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; 2349 struct flow_keys keys; 2350 2351 /* short-circuit if we already have L4 hash present */ 2352 if (skb->l4_hash) 2353 return skb_get_hash_raw(skb) >> 1; 2354 2355 memset(&hash_keys, 0, sizeof(hash_keys)); 2356 2357 if (!flkeys) { 2358 skb_flow_dissect_flow_keys(skb, &keys, flag); 2359 flkeys = &keys; 2360 } 2361 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2362 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; 2363 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; 2364 hash_keys.ports.src = flkeys->ports.src; 2365 hash_keys.ports.dst = flkeys->ports.dst; 2366 hash_keys.basic.ip_proto = flkeys->basic.ip_proto; 2367 } else { 2368 memset(&hash_keys, 0, sizeof(hash_keys)); 2369 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2370 hash_keys.addrs.v6addrs.src = fl6->saddr; 2371 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2372 hash_keys.ports.src = fl6->fl6_sport; 2373 hash_keys.ports.dst = fl6->fl6_dport; 2374 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2375 } 2376 break; 2377 case 2: 2378 memset(&hash_keys, 0, sizeof(hash_keys)); 2379 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2380 if (skb) { 2381 struct flow_keys keys; 2382 2383 if (!flkeys) { 2384 skb_flow_dissect_flow_keys(skb, &keys, 0); 2385 flkeys = &keys; 2386 } 2387 2388 /* Inner can be v4 or v6 */ 2389 if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { 2390 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; 2391 hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src; 2392 hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst; 2393 } else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { 2394 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2395 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; 2396 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; 2397 hash_keys.tags.flow_label = flkeys->tags.flow_label; 2398 hash_keys.basic.ip_proto = flkeys->basic.ip_proto; 2399 } else { 2400 /* Same as case 0 */ 2401 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2402 ip6_multipath_l3_keys(skb, &hash_keys, flkeys); 2403 } 2404 } else { 2405 /* Same as case 0 */ 2406 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2407 hash_keys.addrs.v6addrs.src = fl6->saddr; 2408 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2409 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); 2410 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2411 } 2412 break; 2413 } 2414 mhash = flow_hash_from_keys(&hash_keys); 2415 2416 return mhash >> 1; 2417 } 2418 2419 /* Called with rcu held */ 2420 void ip6_route_input(struct sk_buff *skb) 2421 { 2422 const struct ipv6hdr *iph = ipv6_hdr(skb); 2423 struct net *net = dev_net(skb->dev); 2424 int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF; 2425 struct ip_tunnel_info *tun_info; 2426 struct flowi6 fl6 = { 2427 .flowi6_iif = skb->dev->ifindex, 2428 .daddr = iph->daddr, 2429 .saddr = iph->saddr, 2430 .flowlabel = ip6_flowinfo(iph), 2431 .flowi6_mark = skb->mark, 2432 .flowi6_proto = iph->nexthdr, 2433 }; 2434 struct flow_keys *flkeys = NULL, _flkeys; 2435 2436 tun_info = skb_tunnel_info(skb); 2437 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) 2438 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; 2439 2440 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys)) 2441 flkeys = &_flkeys; 2442 2443 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) 2444 fl6.mp_hash = 
rt6_multipath_hash(net, &fl6, skb, flkeys); 2445 skb_dst_drop(skb); 2446 skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev, 2447 &fl6, skb, flags)); 2448 } 2449 2450 static struct rt6_info *ip6_pol_route_output(struct net *net, 2451 struct fib6_table *table, 2452 struct flowi6 *fl6, 2453 const struct sk_buff *skb, 2454 int flags) 2455 { 2456 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); 2457 } 2458 2459 struct dst_entry *ip6_route_output_flags_noref(struct net *net, 2460 const struct sock *sk, 2461 struct flowi6 *fl6, int flags) 2462 { 2463 bool any_src; 2464 2465 if (ipv6_addr_type(&fl6->daddr) & 2466 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) { 2467 struct dst_entry *dst; 2468 2469 /* This function does not take refcnt on the dst */ 2470 dst = l3mdev_link_scope_lookup(net, fl6); 2471 if (dst) 2472 return dst; 2473 } 2474 2475 fl6->flowi6_iif = LOOPBACK_IFINDEX; 2476 2477 flags |= RT6_LOOKUP_F_DST_NOREF; 2478 any_src = ipv6_addr_any(&fl6->saddr); 2479 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || 2480 (fl6->flowi6_oif && any_src)) 2481 flags |= RT6_LOOKUP_F_IFACE; 2482 2483 if (!any_src) 2484 flags |= RT6_LOOKUP_F_HAS_SADDR; 2485 else if (sk) 2486 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); 2487 2488 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); 2489 } 2490 EXPORT_SYMBOL_GPL(ip6_route_output_flags_noref); 2491 2492 struct dst_entry *ip6_route_output_flags(struct net *net, 2493 const struct sock *sk, 2494 struct flowi6 *fl6, 2495 int flags) 2496 { 2497 struct dst_entry *dst; 2498 struct rt6_info *rt6; 2499 2500 rcu_read_lock(); 2501 dst = ip6_route_output_flags_noref(net, sk, fl6, flags); 2502 rt6 = (struct rt6_info *)dst; 2503 /* For dst cached in uncached_list, refcnt is already taken. */ 2504 if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) { 2505 dst = &net->ipv6.ip6_null_entry->dst; 2506 dst_hold(dst); 2507 } 2508 rcu_read_unlock(); 2509 2510 return dst; 2511 } 2512 EXPORT_SYMBOL_GPL(ip6_route_output_flags); 2513 2514 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2515 { 2516 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 2517 struct net_device *loopback_dev = net->loopback_dev; 2518 struct dst_entry *new = NULL; 2519 2520 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 2521 DST_OBSOLETE_DEAD, 0); 2522 if (rt) { 2523 rt6_info_init(rt); 2524 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); 2525 2526 new = &rt->dst; 2527 new->__use = 1; 2528 new->input = dst_discard; 2529 new->output = dst_discard_out; 2530 2531 dst_copy_metrics(new, &ort->dst); 2532 2533 rt->rt6i_idev = in6_dev_get(loopback_dev); 2534 rt->rt6i_gateway = ort->rt6i_gateway; 2535 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 2536 2537 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 2538 #ifdef CONFIG_IPV6_SUBTREES 2539 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 2540 #endif 2541 } 2542 2543 dst_release(dst_orig); 2544 return new ? 
new : ERR_PTR(-ENOMEM); 2545 } 2546 2547 /* 2548 * Destination cache support functions 2549 */ 2550 2551 static bool fib6_check(struct fib6_info *f6i, u32 cookie) 2552 { 2553 u32 rt_cookie = 0; 2554 2555 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) 2556 return false; 2557 2558 if (fib6_check_expired(f6i)) 2559 return false; 2560 2561 return true; 2562 } 2563 2564 static struct dst_entry *rt6_check(struct rt6_info *rt, 2565 struct fib6_info *from, 2566 u32 cookie) 2567 { 2568 u32 rt_cookie = 0; 2569 2570 if (!from || !fib6_get_cookie_safe(from, &rt_cookie) || 2571 rt_cookie != cookie) 2572 return NULL; 2573 2574 if (rt6_check_expired(rt)) 2575 return NULL; 2576 2577 return &rt->dst; 2578 } 2579 2580 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, 2581 struct fib6_info *from, 2582 u32 cookie) 2583 { 2584 if (!__rt6_check_expired(rt) && 2585 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && 2586 fib6_check(from, cookie)) 2587 return &rt->dst; 2588 else 2589 return NULL; 2590 } 2591 2592 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) 2593 { 2594 struct dst_entry *dst_ret; 2595 struct fib6_info *from; 2596 struct rt6_info *rt; 2597 2598 rt = container_of(dst, struct rt6_info, dst); 2599 2600 rcu_read_lock(); 2601 2602 /* All IPV6 dsts are created with ->obsolete set to the value 2603 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 2604 * into this function always. 2605 */ 2606 2607 from = rcu_dereference(rt->from); 2608 2609 if (from && (rt->rt6i_flags & RTF_PCPU || 2610 unlikely(!list_empty(&rt->rt6i_uncached)))) 2611 dst_ret = rt6_dst_from_check(rt, from, cookie); 2612 else 2613 dst_ret = rt6_check(rt, from, cookie); 2614 2615 rcu_read_unlock(); 2616 2617 return dst_ret; 2618 } 2619 2620 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2621 { 2622 struct rt6_info *rt = (struct rt6_info *) dst; 2623 2624 if (rt) { 2625 if (rt->rt6i_flags & RTF_CACHE) { 2626 rcu_read_lock(); 2627 if (rt6_check_expired(rt)) { 2628 rt6_remove_exception_rt(rt); 2629 dst = NULL; 2630 } 2631 rcu_read_unlock(); 2632 } else { 2633 dst_release(dst); 2634 dst = NULL; 2635 } 2636 } 2637 return dst; 2638 } 2639 2640 static void ip6_link_failure(struct sk_buff *skb) 2641 { 2642 struct rt6_info *rt; 2643 2644 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 2645 2646 rt = (struct rt6_info *) skb_dst(skb); 2647 if (rt) { 2648 rcu_read_lock(); 2649 if (rt->rt6i_flags & RTF_CACHE) { 2650 rt6_remove_exception_rt(rt); 2651 } else { 2652 struct fib6_info *from; 2653 struct fib6_node *fn; 2654 2655 from = rcu_dereference(rt->from); 2656 if (from) { 2657 fn = rcu_dereference(from->fib6_node); 2658 if (fn && (rt->rt6i_flags & RTF_DEFAULT)) 2659 fn->fn_sernum = -1; 2660 } 2661 } 2662 rcu_read_unlock(); 2663 } 2664 } 2665 2666 static void rt6_update_expires(struct rt6_info *rt0, int timeout) 2667 { 2668 if (!(rt0->rt6i_flags & RTF_EXPIRES)) { 2669 struct fib6_info *from; 2670 2671 rcu_read_lock(); 2672 from = rcu_dereference(rt0->from); 2673 if (from) 2674 rt0->dst.expires = from->expires; 2675 rcu_read_unlock(); 2676 } 2677 2678 dst_set_expires(&rt0->dst, timeout); 2679 rt0->rt6i_flags |= RTF_EXPIRES; 2680 } 2681 2682 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) 2683 { 2684 struct net *net = dev_net(rt->dst.dev); 2685 2686 dst_metric_set(&rt->dst, RTAX_MTU, mtu); 2687 rt->rt6i_flags |= RTF_MODIFIED; 2688 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); 2689 } 2690 2691 static bool 
rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) 2692 { 2693 return !(rt->rt6i_flags & RTF_CACHE) && 2694 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from)); 2695 } 2696 2697 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 2698 const struct ipv6hdr *iph, u32 mtu) 2699 { 2700 const struct in6_addr *daddr, *saddr; 2701 struct rt6_info *rt6 = (struct rt6_info *)dst; 2702 2703 if (dst_metric_locked(dst, RTAX_MTU)) 2704 return; 2705 2706 if (iph) { 2707 daddr = &iph->daddr; 2708 saddr = &iph->saddr; 2709 } else if (sk) { 2710 daddr = &sk->sk_v6_daddr; 2711 saddr = &inet6_sk(sk)->saddr; 2712 } else { 2713 daddr = NULL; 2714 saddr = NULL; 2715 } 2716 dst_confirm_neigh(dst, daddr); 2717 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 2718 if (mtu >= dst_mtu(dst)) 2719 return; 2720 2721 if (!rt6_cache_allowed_for_pmtu(rt6)) { 2722 rt6_do_update_pmtu(rt6, mtu); 2723 /* update rt6_ex->stamp for cache */ 2724 if (rt6->rt6i_flags & RTF_CACHE) 2725 rt6_update_exception_stamp_rt(rt6); 2726 } else if (daddr) { 2727 struct fib6_result res = {}; 2728 struct rt6_info *nrt6; 2729 2730 rcu_read_lock(); 2731 res.f6i = rcu_dereference(rt6->from); 2732 if (!res.f6i) { 2733 rcu_read_unlock(); 2734 return; 2735 } 2736 res.fib6_flags = res.f6i->fib6_flags; 2737 res.fib6_type = res.f6i->fib6_type; 2738 2739 if (res.f6i->nh) { 2740 struct fib6_nh_match_arg arg = { 2741 .dev = dst->dev, 2742 .gw = &rt6->rt6i_gateway, 2743 }; 2744 2745 nexthop_for_each_fib6_nh(res.f6i->nh, 2746 fib6_nh_find_match, &arg); 2747 2748 /* fib6_info uses a nexthop that does not have fib6_nh 2749 * using the dst->dev + gw. Should be impossible. 2750 */ 2751 if (!arg.match) { 2752 rcu_read_unlock(); 2753 return; 2754 } 2755 2756 res.nh = arg.match; 2757 } else { 2758 res.nh = res.f6i->fib6_nh; 2759 } 2760 2761 nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr); 2762 if (nrt6) { 2763 rt6_do_update_pmtu(nrt6, mtu); 2764 if (rt6_insert_exception(nrt6, &res)) 2765 dst_release_immediate(&nrt6->dst); 2766 } 2767 rcu_read_unlock(); 2768 } 2769 } 2770 2771 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 2772 struct sk_buff *skb, u32 mtu) 2773 { 2774 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); 2775 } 2776 2777 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 2778 int oif, u32 mark, kuid_t uid) 2779 { 2780 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2781 struct dst_entry *dst; 2782 struct flowi6 fl6 = { 2783 .flowi6_oif = oif, 2784 .flowi6_mark = mark ? 
mark : IP6_REPLY_MARK(net, skb->mark), 2785 .daddr = iph->daddr, 2786 .saddr = iph->saddr, 2787 .flowlabel = ip6_flowinfo(iph), 2788 .flowi6_uid = uid, 2789 }; 2790 2791 dst = ip6_route_output(net, NULL, &fl6); 2792 if (!dst->error) 2793 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2794 dst_release(dst); 2795 } 2796 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2797 2798 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2799 { 2800 int oif = sk->sk_bound_dev_if; 2801 struct dst_entry *dst; 2802 2803 if (!oif && skb->dev) 2804 oif = l3mdev_master_ifindex(skb->dev); 2805 2806 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid); 2807 2808 dst = __sk_dst_get(sk); 2809 if (!dst || !dst->obsolete || 2810 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2811 return; 2812 2813 bh_lock_sock(sk); 2814 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2815 ip6_datagram_dst_update(sk, false); 2816 bh_unlock_sock(sk); 2817 } 2818 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2819 2820 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, 2821 const struct flowi6 *fl6) 2822 { 2823 #ifdef CONFIG_IPV6_SUBTREES 2824 struct ipv6_pinfo *np = inet6_sk(sk); 2825 #endif 2826 2827 ip6_dst_store(sk, dst, 2828 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 2829 &sk->sk_v6_daddr : NULL, 2830 #ifdef CONFIG_IPV6_SUBTREES 2831 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 2832 &np->saddr : 2833 #endif 2834 NULL); 2835 } 2836 2837 static bool ip6_redirect_nh_match(const struct fib6_result *res, 2838 struct flowi6 *fl6, 2839 const struct in6_addr *gw, 2840 struct rt6_info **ret) 2841 { 2842 const struct fib6_nh *nh = res->nh; 2843 2844 if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family || 2845 fl6->flowi6_oif != nh->fib_nh_dev->ifindex) 2846 return false; 2847 2848 /* rt_cache's gateway might be different from its 'parent' 2849 * in the case of an ip redirect. 2850 * So we keep searching in the exception table if the gateway 2851 * is different. 
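 * (A received redirect installs an RTF_CACHE clone whose gateway is
 * the redirecting router rather than the parent route's nexthop.)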
2852 */ 2853 if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) { 2854 struct rt6_info *rt_cache; 2855 2856 rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr); 2857 if (rt_cache && 2858 ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) { 2859 *ret = rt_cache; 2860 return true; 2861 } 2862 return false; 2863 } 2864 return true; 2865 } 2866 2867 struct fib6_nh_rd_arg { 2868 struct fib6_result *res; 2869 struct flowi6 *fl6; 2870 const struct in6_addr *gw; 2871 struct rt6_info **ret; 2872 }; 2873 2874 static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg) 2875 { 2876 struct fib6_nh_rd_arg *arg = _arg; 2877 2878 arg->res->nh = nh; 2879 return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret); 2880 } 2881 2882 /* Handle redirects */ 2883 struct ip6rd_flowi { 2884 struct flowi6 fl6; 2885 struct in6_addr gateway; 2886 }; 2887 2888 static struct rt6_info *__ip6_route_redirect(struct net *net, 2889 struct fib6_table *table, 2890 struct flowi6 *fl6, 2891 const struct sk_buff *skb, 2892 int flags) 2893 { 2894 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2895 struct rt6_info *ret = NULL; 2896 struct fib6_result res = {}; 2897 struct fib6_nh_rd_arg arg = { 2898 .res = &res, 2899 .fl6 = fl6, 2900 .gw = &rdfl->gateway, 2901 .ret = &ret 2902 }; 2903 struct fib6_info *rt; 2904 struct fib6_node *fn; 2905 2906 /* l3mdev_update_flow overrides oif if the device is enslaved; in 2907 * this case we must match on the real ingress device, so reset it 2908 */ 2909 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) 2910 fl6->flowi6_oif = skb->dev->ifindex; 2911 2912 /* Get the "current" route for this destination and 2913 * check if the redirect has come from the appropriate router. 2914 * 2915 * RFC 4861 specifies that redirects should only be 2916 * accepted if they come from the nexthop to the target. 2917 * Due to the way the routes are chosen, this notion 2918 * is a bit fuzzy and one might need to check all possible 2919 * routes.
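 * Hence the walk over every route of the matching node below.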
2920 */ 2921 2922 rcu_read_lock(); 2923 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2924 restart: 2925 for_each_fib6_node_rt_rcu(fn) { 2926 res.f6i = rt; 2927 if (fib6_check_expired(rt)) 2928 continue; 2929 if (rt->fib6_flags & RTF_REJECT) 2930 break; 2931 if (unlikely(rt->nh)) { 2932 if (nexthop_is_blackhole(rt->nh)) 2933 continue; 2934 /* on match, res->nh is filled in and potentially ret */ 2935 if (nexthop_for_each_fib6_nh(rt->nh, 2936 fib6_nh_redirect_match, 2937 &arg)) 2938 goto out; 2939 } else { 2940 res.nh = rt->fib6_nh; 2941 if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, 2942 &ret)) 2943 goto out; 2944 } 2945 } 2946 2947 if (!rt) 2948 rt = net->ipv6.fib6_null_entry; 2949 else if (rt->fib6_flags & RTF_REJECT) { 2950 ret = net->ipv6.ip6_null_entry; 2951 goto out; 2952 } 2953 2954 if (rt == net->ipv6.fib6_null_entry) { 2955 fn = fib6_backtrack(fn, &fl6->saddr); 2956 if (fn) 2957 goto restart; 2958 } 2959 2960 res.f6i = rt; 2961 res.nh = rt->fib6_nh; 2962 out: 2963 if (ret) { 2964 ip6_hold_safe(net, &ret); 2965 } else { 2966 res.fib6_flags = res.f6i->fib6_flags; 2967 res.fib6_type = res.f6i->fib6_type; 2968 ret = ip6_create_rt_rcu(&res); 2969 } 2970 2971 rcu_read_unlock(); 2972 2973 trace_fib6_table_lookup(net, &res, table, fl6); 2974 return ret; 2975 }; 2976 2977 static struct dst_entry *ip6_route_redirect(struct net *net, 2978 const struct flowi6 *fl6, 2979 const struct sk_buff *skb, 2980 const struct in6_addr *gateway) 2981 { 2982 int flags = RT6_LOOKUP_F_HAS_SADDR; 2983 struct ip6rd_flowi rdfl; 2984 2985 rdfl.fl6 = *fl6; 2986 rdfl.gateway = *gateway; 2987 2988 return fib6_rule_lookup(net, &rdfl.fl6, skb, 2989 flags, __ip6_route_redirect); 2990 } 2991 2992 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2993 kuid_t uid) 2994 { 2995 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2996 struct dst_entry *dst; 2997 struct flowi6 fl6 = { 2998 .flowi6_iif = LOOPBACK_IFINDEX, 2999 .flowi6_oif = oif, 3000 .flowi6_mark = mark, 3001 .daddr = iph->daddr, 3002 .saddr = iph->saddr, 3003 .flowlabel = ip6_flowinfo(iph), 3004 .flowi6_uid = uid, 3005 }; 3006 3007 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); 3008 rt6_do_redirect(dst, NULL, skb); 3009 dst_release(dst); 3010 } 3011 EXPORT_SYMBOL_GPL(ip6_redirect); 3012 3013 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) 3014 { 3015 const struct ipv6hdr *iph = ipv6_hdr(skb); 3016 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 3017 struct dst_entry *dst; 3018 struct flowi6 fl6 = { 3019 .flowi6_iif = LOOPBACK_IFINDEX, 3020 .flowi6_oif = oif, 3021 .daddr = msg->dest, 3022 .saddr = iph->daddr, 3023 .flowi6_uid = sock_net_uid(net, NULL), 3024 }; 3025 3026 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); 3027 rt6_do_redirect(dst, NULL, skb); 3028 dst_release(dst); 3029 } 3030 3031 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 3032 { 3033 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 3034 sk->sk_uid); 3035 } 3036 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 3037 3038 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 3039 { 3040 struct net_device *dev = dst->dev; 3041 unsigned int mtu = dst_mtu(dst); 3042 struct net *net = dev_net(dev); 3043 3044 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 3045 3046 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 3047 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 3048 3049 /* 3050 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 3051 * 
corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 3052 * IPV6_MAXPLEN is also valid and means: "any MSS, 3053 * rely only on pmtu discovery" 3054 */ 3055 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 3056 mtu = IPV6_MAXPLEN; 3057 return mtu; 3058 } 3059 3060 static unsigned int ip6_mtu(const struct dst_entry *dst) 3061 { 3062 struct inet6_dev *idev; 3063 unsigned int mtu; 3064 3065 mtu = dst_metric_raw(dst, RTAX_MTU); 3066 if (mtu) 3067 goto out; 3068 3069 mtu = IPV6_MIN_MTU; 3070 3071 rcu_read_lock(); 3072 idev = __in6_dev_get(dst->dev); 3073 if (idev) 3074 mtu = idev->cnf.mtu6; 3075 rcu_read_unlock(); 3076 3077 out: 3078 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 3079 3080 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 3081 } 3082 3083 /* MTU selection: 3084 * 1. mtu on route is locked - use it 3085 * 2. mtu from nexthop exception 3086 * 3. mtu from egress device 3087 * 3088 * based on ip6_dst_mtu_forward and exception logic of 3089 * rt6_find_cached_rt; called with rcu_read_lock 3090 */ 3091 u32 ip6_mtu_from_fib6(const struct fib6_result *res, 3092 const struct in6_addr *daddr, 3093 const struct in6_addr *saddr) 3094 { 3095 const struct fib6_nh *nh = res->nh; 3096 struct fib6_info *f6i = res->f6i; 3097 struct inet6_dev *idev; 3098 struct rt6_info *rt; 3099 u32 mtu = 0; 3100 3101 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { 3102 mtu = f6i->fib6_pmtu; 3103 if (mtu) 3104 goto out; 3105 } 3106 3107 rt = rt6_find_cached_rt(res, daddr, saddr); 3108 if (unlikely(rt)) { 3109 mtu = dst_metric_raw(&rt->dst, RTAX_MTU); 3110 } else { 3111 struct net_device *dev = nh->fib_nh_dev; 3112 3113 mtu = IPV6_MIN_MTU; 3114 idev = __in6_dev_get(dev); 3115 if (idev && idev->cnf.mtu6 > mtu) 3116 mtu = idev->cnf.mtu6; 3117 } 3118 3119 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 3120 out: 3121 return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); 3122 } 3123 3124 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 3125 struct flowi6 *fl6) 3126 { 3127 struct dst_entry *dst; 3128 struct rt6_info *rt; 3129 struct inet6_dev *idev = in6_dev_get(dev); 3130 struct net *net = dev_net(dev); 3131 3132 if (unlikely(!idev)) 3133 return ERR_PTR(-ENODEV); 3134 3135 rt = ip6_dst_alloc(net, dev, 0); 3136 if (unlikely(!rt)) { 3137 in6_dev_put(idev); 3138 dst = ERR_PTR(-ENOMEM); 3139 goto out; 3140 } 3141 3142 rt->dst.flags |= DST_HOST; 3143 rt->dst.input = ip6_input; 3144 rt->dst.output = ip6_output; 3145 rt->rt6i_gateway = fl6->daddr; 3146 rt->rt6i_dst.addr = fl6->daddr; 3147 rt->rt6i_dst.plen = 128; 3148 rt->rt6i_idev = idev; 3149 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 3150 3151 /* Add this dst into uncached_list so that rt6_disable_ip() can 3152 * do proper release of the net_device 3153 */ 3154 rt6_uncached_list_add(rt); 3155 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 3156 3157 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 3158 3159 out: 3160 return dst; 3161 } 3162 3163 static int ip6_dst_gc(struct dst_ops *ops) 3164 { 3165 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 3166 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 3167 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 3168 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 3169 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 3170 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; 3171 int entries; 3172 3173 entries = dst_entries_get_fast(ops); 3174 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 3175 entries <= rt_max_size) 3176 goto out; 3177 3178 
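/* Grow the adaptive expire value before the GC pass; the decay at
 * the out label below shrinks it again once the pressure is gone.
 */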
net->ipv6.ip6_rt_gc_expire++; 3179 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 3180 entries = dst_entries_get_slow(ops); 3181 if (entries < ops->gc_thresh) 3182 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 3183 out: 3184 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 3185 return entries > rt_max_size; 3186 } 3187 3188 static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, 3189 const struct in6_addr *gw_addr, u32 tbid, 3190 int flags, struct fib6_result *res) 3191 { 3192 struct flowi6 fl6 = { 3193 .flowi6_oif = cfg->fc_ifindex, 3194 .daddr = *gw_addr, 3195 .saddr = cfg->fc_prefsrc, 3196 }; 3197 struct fib6_table *table; 3198 int err; 3199 3200 table = fib6_get_table(net, tbid); 3201 if (!table) 3202 return -EINVAL; 3203 3204 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 3205 flags |= RT6_LOOKUP_F_HAS_SADDR; 3206 3207 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; 3208 3209 err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags); 3210 if (!err && res->f6i != net->ipv6.fib6_null_entry) 3211 fib6_select_path(net, res, &fl6, cfg->fc_ifindex, 3212 cfg->fc_ifindex != 0, NULL, flags); 3213 3214 return err; 3215 } 3216 3217 static int ip6_route_check_nh_onlink(struct net *net, 3218 struct fib6_config *cfg, 3219 const struct net_device *dev, 3220 struct netlink_ext_ack *extack) 3221 { 3222 u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; 3223 const struct in6_addr *gw_addr = &cfg->fc_gateway; 3224 struct fib6_result res = {}; 3225 int err; 3226 3227 err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res); 3228 if (!err && !(res.fib6_flags & RTF_REJECT) && 3229 /* ignore match if it is the default route */ 3230 !ipv6_addr_any(&res.f6i->fib6_dst.addr) && 3231 (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) { 3232 NL_SET_ERR_MSG(extack, 3233 "Nexthop has invalid gateway or device mismatch"); 3234 err = -EINVAL; 3235 } 3236 3237 return err; 3238 } 3239 3240 static int ip6_route_check_nh(struct net *net, 3241 struct fib6_config *cfg, 3242 struct net_device **_dev, 3243 struct inet6_dev **idev) 3244 { 3245 const struct in6_addr *gw_addr = &cfg->fc_gateway; 3246 struct net_device *dev = _dev ? *_dev : NULL; 3247 int flags = RT6_LOOKUP_F_IFACE; 3248 struct fib6_result res = {}; 3249 int err = -EHOSTUNREACH; 3250 3251 if (cfg->fc_table) { 3252 err = ip6_nh_lookup_table(net, cfg, gw_addr, 3253 cfg->fc_table, flags, &res); 3254 /* gw_addr can not require a gateway or resolve to a reject 3255 * route. If a device is given, it must match the result. 
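 * (In other words, the gateway must be reachable via a connected
 * route; otherwise we fall back to fib6_lookup() below.)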
3256 */ 3257 if (err || res.fib6_flags & RTF_REJECT || 3258 res.nh->fib_nh_gw_family || 3259 (dev && dev != res.nh->fib_nh_dev)) 3260 err = -EHOSTUNREACH; 3261 } 3262 3263 if (err < 0) { 3264 struct flowi6 fl6 = { 3265 .flowi6_oif = cfg->fc_ifindex, 3266 .daddr = *gw_addr, 3267 }; 3268 3269 err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags); 3270 if (err || res.fib6_flags & RTF_REJECT || 3271 res.nh->fib_nh_gw_family) 3272 err = -EHOSTUNREACH; 3273 3274 if (err) 3275 return err; 3276 3277 fib6_select_path(net, &res, &fl6, cfg->fc_ifindex, 3278 cfg->fc_ifindex != 0, NULL, flags); 3279 } 3280 3281 err = 0; 3282 if (dev) { 3283 if (dev != res.nh->fib_nh_dev) 3284 err = -EHOSTUNREACH; 3285 } else { 3286 *_dev = dev = res.nh->fib_nh_dev; 3287 dev_hold(dev); 3288 *idev = in6_dev_get(dev); 3289 } 3290 3291 return err; 3292 } 3293 3294 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, 3295 struct net_device **_dev, struct inet6_dev **idev, 3296 struct netlink_ext_ack *extack) 3297 { 3298 const struct in6_addr *gw_addr = &cfg->fc_gateway; 3299 int gwa_type = ipv6_addr_type(gw_addr); 3300 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; 3301 const struct net_device *dev = *_dev; 3302 bool need_addr_check = !dev; 3303 int err = -EINVAL; 3304 3305 /* if gw_addr is local we will fail to detect this in case the 3306 * address is still TENTATIVE (DAD in progress). rt6_lookup() 3307 * will return the already-added prefix route via the interface 3308 * that the prefix route was assigned to, which might be non-loopback. 3309 */ 3310 if (dev && 3311 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 3312 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 3313 goto out; 3314 } 3315 3316 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { 3317 /* IPv6 strictly prohibits using non-link-local 3318 * addresses as nexthop addresses. 3319 * Otherwise, the router will not be able to send redirects. 3320 * It is very good, but in some (rare!) circumstances 3321 * (SIT, PtP, NBMA NOARP links) it is handy to allow 3322 * some exceptions. --ANK 3323 * We allow IPv4-mapped nexthops to support RFC4798-type 3324 * addressing 3325 */ 3326 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { 3327 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 3328 goto out; 3329 } 3330 3331 rcu_read_lock(); 3332 3333 if (cfg->fc_flags & RTNH_F_ONLINK) 3334 err = ip6_route_check_nh_onlink(net, cfg, dev, extack); 3335 else 3336 err = ip6_route_check_nh(net, cfg, _dev, idev); 3337 3338 rcu_read_unlock(); 3339 3340 if (err) 3341 goto out; 3342 } 3343 3344 /* reload in case device was changed */ 3345 dev = *_dev; 3346 3347 err = -EINVAL; 3348 if (!dev) { 3349 NL_SET_ERR_MSG(extack, "Egress device not specified"); 3350 goto out; 3351 } else if (dev->flags & IFF_LOOPBACK) { 3352 NL_SET_ERR_MSG(extack, 3353 "Egress device can not be loopback device for this route"); 3354 goto out; 3355 } 3356 3357 /* if we did not check gw_addr above, do so now that the 3358 * egress device has been resolved.
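 * (need_addr_check is only true when the caller supplied no device.)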
3359 */ 3360 if (need_addr_check && 3361 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { 3362 NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); 3363 goto out; 3364 } 3365 3366 err = 0; 3367 out: 3368 return err; 3369 } 3370 3371 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type) 3372 { 3373 if ((flags & RTF_REJECT) || 3374 (dev && (dev->flags & IFF_LOOPBACK) && 3375 !(addr_type & IPV6_ADDR_LOOPBACK) && 3376 !(flags & RTF_LOCAL))) 3377 return true; 3378 3379 return false; 3380 } 3381 3382 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, 3383 struct fib6_config *cfg, gfp_t gfp_flags, 3384 struct netlink_ext_ack *extack) 3385 { 3386 struct net_device *dev = NULL; 3387 struct inet6_dev *idev = NULL; 3388 int addr_type; 3389 int err; 3390 3391 fib6_nh->fib_nh_family = AF_INET6; 3392 #ifdef CONFIG_IPV6_ROUTER_PREF 3393 fib6_nh->last_probe = jiffies; 3394 #endif 3395 3396 err = -ENODEV; 3397 if (cfg->fc_ifindex) { 3398 dev = dev_get_by_index(net, cfg->fc_ifindex); 3399 if (!dev) 3400 goto out; 3401 idev = in6_dev_get(dev); 3402 if (!idev) 3403 goto out; 3404 } 3405 3406 if (cfg->fc_flags & RTNH_F_ONLINK) { 3407 if (!dev) { 3408 NL_SET_ERR_MSG(extack, 3409 "Nexthop device required for onlink"); 3410 goto out; 3411 } 3412 3413 if (!(dev->flags & IFF_UP)) { 3414 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3415 err = -ENETDOWN; 3416 goto out; 3417 } 3418 3419 fib6_nh->fib_nh_flags |= RTNH_F_ONLINK; 3420 } 3421 3422 fib6_nh->fib_nh_weight = 1; 3423 3424 /* We cannot add true routes via loopback here, 3425 * they would result in kernel looping; promote them to reject routes 3426 */ 3427 addr_type = ipv6_addr_type(&cfg->fc_dst); 3428 if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) { 3429 /* hold loopback dev/idev if we haven't done so. 
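 * Reject routes are always bound to the loopback device below.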
*/ 3430 if (dev != net->loopback_dev) { 3431 if (dev) { 3432 dev_put(dev); 3433 in6_dev_put(idev); 3434 } 3435 dev = net->loopback_dev; 3436 dev_hold(dev); 3437 idev = in6_dev_get(dev); 3438 if (!idev) { 3439 err = -ENODEV; 3440 goto out; 3441 } 3442 } 3443 goto pcpu_alloc; 3444 } 3445 3446 if (cfg->fc_flags & RTF_GATEWAY) { 3447 err = ip6_validate_gw(net, cfg, &dev, &idev, extack); 3448 if (err) 3449 goto out; 3450 3451 fib6_nh->fib_nh_gw6 = cfg->fc_gateway; 3452 fib6_nh->fib_nh_gw_family = AF_INET6; 3453 } 3454 3455 err = -ENODEV; 3456 if (!dev) 3457 goto out; 3458 3459 if (idev->cnf.disable_ipv6) { 3460 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); 3461 err = -EACCES; 3462 goto out; 3463 } 3464 3465 if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) { 3466 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 3467 err = -ENETDOWN; 3468 goto out; 3469 } 3470 3471 if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) && 3472 !netif_carrier_ok(dev)) 3473 fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; 3474 3475 err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap, 3476 cfg->fc_encap_type, cfg, gfp_flags, extack); 3477 if (err) 3478 goto out; 3479 3480 pcpu_alloc: 3481 fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); 3482 if (!fib6_nh->rt6i_pcpu) { 3483 err = -ENOMEM; 3484 goto out; 3485 } 3486 3487 fib6_nh->fib_nh_dev = dev; 3488 fib6_nh->fib_nh_oif = dev->ifindex; 3489 err = 0; 3490 out: 3491 if (idev) 3492 in6_dev_put(idev); 3493 3494 if (err) { 3495 lwtstate_put(fib6_nh->fib_nh_lws); 3496 fib6_nh->fib_nh_lws = NULL; 3497 if (dev) 3498 dev_put(dev); 3499 } 3500 3501 return err; 3502 } 3503 3504 void fib6_nh_release(struct fib6_nh *fib6_nh) 3505 { 3506 struct rt6_exception_bucket *bucket; 3507 3508 rcu_read_lock(); 3509 3510 fib6_nh_flush_exceptions(fib6_nh, NULL); 3511 bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL); 3512 if (bucket) { 3513 rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL); 3514 kfree(bucket); 3515 } 3516 3517 rcu_read_unlock(); 3518 3519 if (fib6_nh->rt6i_pcpu) { 3520 int cpu; 3521 3522 for_each_possible_cpu(cpu) { 3523 struct rt6_info **ppcpu_rt; 3524 struct rt6_info *pcpu_rt; 3525 3526 ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); 3527 pcpu_rt = *ppcpu_rt; 3528 if (pcpu_rt) { 3529 dst_dev_put(&pcpu_rt->dst); 3530 dst_release(&pcpu_rt->dst); 3531 *ppcpu_rt = NULL; 3532 } 3533 } 3534 3535 free_percpu(fib6_nh->rt6i_pcpu); 3536 } 3537 3538 fib_nh_common_release(&fib6_nh->nh_common); 3539 } 3540 3541 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, 3542 gfp_t gfp_flags, 3543 struct netlink_ext_ack *extack) 3544 { 3545 struct net *net = cfg->fc_nlinfo.nl_net; 3546 struct fib6_info *rt = NULL; 3547 struct nexthop *nh = NULL; 3548 struct fib6_table *table; 3549 struct fib6_nh *fib6_nh; 3550 int err = -EINVAL; 3551 int addr_type; 3552 3553 /* RTF_PCPU is an internal flag; can not be set by userspace */ 3554 if (cfg->fc_flags & RTF_PCPU) { 3555 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 3556 goto out; 3557 } 3558 3559 /* RTF_CACHE is an internal flag; can not be set by userspace */ 3560 if (cfg->fc_flags & RTF_CACHE) { 3561 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); 3562 goto out; 3563 } 3564 3565 if (cfg->fc_type > RTN_MAX) { 3566 NL_SET_ERR_MSG(extack, "Invalid route type"); 3567 goto out; 3568 } 3569 3570 if (cfg->fc_dst_len > 128) { 3571 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 3572 goto out; 3573 } 3574 if (cfg->fc_src_len > 128) { 3575 NL_SET_ERR_MSG(extack, 
"Invalid source address length"); 3576 goto out; 3577 } 3578 #ifndef CONFIG_IPV6_SUBTREES 3579 if (cfg->fc_src_len) { 3580 NL_SET_ERR_MSG(extack, 3581 "Specifying source address requires IPV6_SUBTREES to be enabled"); 3582 goto out; 3583 } 3584 #endif 3585 if (cfg->fc_nh_id) { 3586 nh = nexthop_find_by_id(net, cfg->fc_nh_id); 3587 if (!nh) { 3588 NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); 3589 goto out; 3590 } 3591 err = fib6_check_nexthop(nh, cfg, extack); 3592 if (err) 3593 goto out; 3594 } 3595 3596 err = -ENOBUFS; 3597 if (cfg->fc_nlinfo.nlh && 3598 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 3599 table = fib6_get_table(net, cfg->fc_table); 3600 if (!table) { 3601 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 3602 table = fib6_new_table(net, cfg->fc_table); 3603 } 3604 } else { 3605 table = fib6_new_table(net, cfg->fc_table); 3606 } 3607 3608 if (!table) 3609 goto out; 3610 3611 err = -ENOMEM; 3612 rt = fib6_info_alloc(gfp_flags, !nh); 3613 if (!rt) 3614 goto out; 3615 3616 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len, 3617 extack); 3618 if (IS_ERR(rt->fib6_metrics)) { 3619 err = PTR_ERR(rt->fib6_metrics); 3620 /* Do not leave garbage there. */ 3621 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; 3622 goto out; 3623 } 3624 3625 if (cfg->fc_flags & RTF_ADDRCONF) 3626 rt->dst_nocount = true; 3627 3628 if (cfg->fc_flags & RTF_EXPIRES) 3629 fib6_set_expires(rt, jiffies + 3630 clock_t_to_jiffies(cfg->fc_expires)); 3631 else 3632 fib6_clean_expires(rt); 3633 3634 if (cfg->fc_protocol == RTPROT_UNSPEC) 3635 cfg->fc_protocol = RTPROT_BOOT; 3636 rt->fib6_protocol = cfg->fc_protocol; 3637 3638 rt->fib6_table = table; 3639 rt->fib6_metric = cfg->fc_metric; 3640 rt->fib6_type = cfg->fc_type ? 
: RTN_UNICAST; 3641 rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY; 3642 3643 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 3644 rt->fib6_dst.plen = cfg->fc_dst_len; 3645 if (rt->fib6_dst.plen == 128) 3646 rt->dst_host = true; 3647 3648 #ifdef CONFIG_IPV6_SUBTREES 3649 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); 3650 rt->fib6_src.plen = cfg->fc_src_len; 3651 #endif 3652 if (nh) { 3653 if (!nexthop_get(nh)) { 3654 NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); 3655 goto out; 3656 } 3657 if (rt->fib6_src.plen) { 3658 NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); 3659 goto out; 3660 } 3661 rt->nh = nh; 3662 fib6_nh = nexthop_fib6_nh(rt->nh); 3663 } else { 3664 err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack); 3665 if (err) 3666 goto out; 3667 3668 fib6_nh = rt->fib6_nh; 3669 3670 /* We cannot add true routes via loopback here, they would 3671 * result in kernel looping; promote them to reject routes 3672 */ 3673 addr_type = ipv6_addr_type(&cfg->fc_dst); 3674 if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev, 3675 addr_type)) 3676 rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; 3677 } 3678 3679 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 3680 struct net_device *dev = fib6_nh->fib_nh_dev; 3681 3682 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 3683 NL_SET_ERR_MSG(extack, "Invalid source address"); 3684 err = -EINVAL; 3685 goto out; 3686 } 3687 rt->fib6_prefsrc.addr = cfg->fc_prefsrc; 3688 rt->fib6_prefsrc.plen = 128; 3689 } else 3690 rt->fib6_prefsrc.plen = 0; 3691 3692 return rt; 3693 out: 3694 fib6_info_release(rt); 3695 return ERR_PTR(err); 3696 } 3697 3698 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, 3699 struct netlink_ext_ack *extack) 3700 { 3701 struct fib6_info *rt; 3702 int err; 3703 3704 rt = ip6_route_info_create(cfg, gfp_flags, extack); 3705 if (IS_ERR(rt)) 3706 return PTR_ERR(rt); 3707 3708 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); 3709 fib6_info_release(rt); 3710 3711 return err; 3712 } 3713 3714 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) 3715 { 3716 struct net *net = info->nl_net; 3717 struct fib6_table *table; 3718 int err; 3719 3720 if (rt == net->ipv6.fib6_null_entry) { 3721 err = -ENOENT; 3722 goto out; 3723 } 3724 3725 table = rt->fib6_table; 3726 spin_lock_bh(&table->tb6_lock); 3727 err = fib6_del(rt, info); 3728 spin_unlock_bh(&table->tb6_lock); 3729 3730 out: 3731 fib6_info_release(rt); 3732 return err; 3733 } 3734 3735 int ip6_del_rt(struct net *net, struct fib6_info *rt) 3736 { 3737 struct nl_info info = { .nl_net = net }; 3738 3739 return __ip6_del_rt(rt, &info); 3740 } 3741 3742 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) 3743 { 3744 struct nl_info *info = &cfg->fc_nlinfo; 3745 struct net *net = info->nl_net; 3746 struct sk_buff *skb = NULL; 3747 struct fib6_table *table; 3748 int err = -ENOENT; 3749 3750 if (rt == net->ipv6.fib6_null_entry) 3751 goto out_put; 3752 table = rt->fib6_table; 3753 spin_lock_bh(&table->tb6_lock); 3754 3755 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { 3756 struct fib6_info *sibling, *next_sibling; 3757 3758 /* prefer to send a single notification with all hops */ 3759 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3760 if (skb) { 3761 u32 seq = info->nlh ? 
info->nlh->nlmsg_seq : 0; 3762 3763 if (rt6_fill_node(net, skb, rt, NULL, 3764 NULL, NULL, 0, RTM_DELROUTE, 3765 info->portid, seq, 0) < 0) { 3766 kfree_skb(skb); 3767 skb = NULL; 3768 } else 3769 info->skip_notify = 1; 3770 } 3771 3772 info->skip_notify_kernel = 1; 3773 call_fib6_multipath_entry_notifiers(net, 3774 FIB_EVENT_ENTRY_DEL, 3775 rt, 3776 rt->fib6_nsiblings, 3777 NULL); 3778 list_for_each_entry_safe(sibling, next_sibling, 3779 &rt->fib6_siblings, 3780 fib6_siblings) { 3781 err = fib6_del(sibling, info); 3782 if (err) 3783 goto out_unlock; 3784 } 3785 } 3786 3787 err = fib6_del(rt, info); 3788 out_unlock: 3789 spin_unlock_bh(&table->tb6_lock); 3790 out_put: 3791 fib6_info_release(rt); 3792 3793 if (skb) { 3794 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3795 info->nlh, gfp_any()); 3796 } 3797 return err; 3798 } 3799 3800 static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) 3801 { 3802 int rc = -ESRCH; 3803 3804 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) 3805 goto out; 3806 3807 if (cfg->fc_flags & RTF_GATEWAY && 3808 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 3809 goto out; 3810 3811 rc = rt6_remove_exception_rt(rt); 3812 out: 3813 return rc; 3814 } 3815 3816 static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt, 3817 struct fib6_nh *nh) 3818 { 3819 struct fib6_result res = { 3820 .f6i = rt, 3821 .nh = nh, 3822 }; 3823 struct rt6_info *rt_cache; 3824 3825 rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src); 3826 if (rt_cache) 3827 return __ip6_del_cached_rt(rt_cache, cfg); 3828 3829 return 0; 3830 } 3831 3832 struct fib6_nh_del_cached_rt_arg { 3833 struct fib6_config *cfg; 3834 struct fib6_info *f6i; 3835 }; 3836 3837 static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg) 3838 { 3839 struct fib6_nh_del_cached_rt_arg *arg = _arg; 3840 int rc; 3841 3842 rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh); 3843 return rc != -ESRCH ? 
rc : 0; 3844 } 3845 3846 static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i) 3847 { 3848 struct fib6_nh_del_cached_rt_arg arg = { 3849 .cfg = cfg, 3850 .f6i = f6i 3851 }; 3852 3853 return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg); 3854 } 3855 3856 static int ip6_route_del(struct fib6_config *cfg, 3857 struct netlink_ext_ack *extack) 3858 { 3859 struct fib6_table *table; 3860 struct fib6_info *rt; 3861 struct fib6_node *fn; 3862 int err = -ESRCH; 3863 3864 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 3865 if (!table) { 3866 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 3867 return err; 3868 } 3869 3870 rcu_read_lock(); 3871 3872 fn = fib6_locate(&table->tb6_root, 3873 &cfg->fc_dst, cfg->fc_dst_len, 3874 &cfg->fc_src, cfg->fc_src_len, 3875 !(cfg->fc_flags & RTF_CACHE)); 3876 3877 if (fn) { 3878 for_each_fib6_node_rt_rcu(fn) { 3879 struct fib6_nh *nh; 3880 3881 if (rt->nh && cfg->fc_nh_id && 3882 rt->nh->id != cfg->fc_nh_id) 3883 continue; 3884 3885 if (cfg->fc_flags & RTF_CACHE) { 3886 int rc = 0; 3887 3888 if (rt->nh) { 3889 rc = ip6_del_cached_rt_nh(cfg, rt); 3890 } else if (cfg->fc_nh_id) { 3891 continue; 3892 } else { 3893 nh = rt->fib6_nh; 3894 rc = ip6_del_cached_rt(cfg, rt, nh); 3895 } 3896 if (rc != -ESRCH) { 3897 rcu_read_unlock(); 3898 return rc; 3899 } 3900 continue; 3901 } 3902 3903 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) 3904 continue; 3905 if (cfg->fc_protocol && 3906 cfg->fc_protocol != rt->fib6_protocol) 3907 continue; 3908 3909 if (rt->nh) { 3910 if (!fib6_info_hold_safe(rt)) 3911 continue; 3912 rcu_read_unlock(); 3913 3914 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 3915 } 3916 if (cfg->fc_nh_id) 3917 continue; 3918 3919 nh = rt->fib6_nh; 3920 if (cfg->fc_ifindex && 3921 (!nh->fib_nh_dev || 3922 nh->fib_nh_dev->ifindex != cfg->fc_ifindex)) 3923 continue; 3924 if (cfg->fc_flags & RTF_GATEWAY && 3925 !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6)) 3926 continue; 3927 if (!fib6_info_hold_safe(rt)) 3928 continue; 3929 rcu_read_unlock(); 3930 3931 /* if gateway was specified only delete the one hop */ 3932 if (cfg->fc_flags & RTF_GATEWAY) 3933 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 3934 3935 return __ip6_del_rt_siblings(rt, cfg); 3936 } 3937 } 3938 rcu_read_unlock(); 3939 3940 return err; 3941 } 3942 3943 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 3944 { 3945 struct netevent_redirect netevent; 3946 struct rt6_info *rt, *nrt = NULL; 3947 struct fib6_result res = {}; 3948 struct ndisc_options ndopts; 3949 struct inet6_dev *in6_dev; 3950 struct neighbour *neigh; 3951 struct rd_msg *msg; 3952 int optlen, on_link; 3953 u8 *lladdr; 3954 3955 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 3956 optlen -= sizeof(*msg); 3957 3958 if (optlen < 0) { 3959 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 3960 return; 3961 } 3962 3963 msg = (struct rd_msg *)icmp6_hdr(skb); 3964 3965 if (ipv6_addr_is_multicast(&msg->dest)) { 3966 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 3967 return; 3968 } 3969 3970 on_link = 0; 3971 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 3972 on_link = 1; 3973 } else if (ipv6_addr_type(&msg->target) != 3974 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 3975 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 3976 return; 3977 } 3978 3979 in6_dev = __in6_dev_get(skb->dev); 3980 if (!in6_dev) 3981 return; 3982 if (in6_dev->cnf.forwarding || 
!in6_dev->cnf.accept_redirects) 3983 return; 3984 3985 /* RFC2461 8.1: 3986 * The IP source address of the Redirect MUST be the same as the current 3987 * first-hop router for the specified ICMP Destination Address. 3988 */ 3989 3990 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 3991 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 3992 return; 3993 } 3994 3995 lladdr = NULL; 3996 if (ndopts.nd_opts_tgt_lladdr) { 3997 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 3998 skb->dev); 3999 if (!lladdr) { 4000 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 4001 return; 4002 } 4003 } 4004 4005 rt = (struct rt6_info *) dst; 4006 if (rt->rt6i_flags & RTF_REJECT) { 4007 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 4008 return; 4009 } 4010 4011 /* Redirect received -> path was valid. 4012 * Look, redirects are sent only in response to data packets, 4013 * so that this nexthop apparently is reachable. --ANK 4014 */ 4015 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 4016 4017 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 4018 if (!neigh) 4019 return; 4020 4021 /* 4022 * We have finally decided to accept it. 4023 */ 4024 4025 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 4026 NEIGH_UPDATE_F_WEAK_OVERRIDE| 4027 NEIGH_UPDATE_F_OVERRIDE| 4028 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 4029 NEIGH_UPDATE_F_ISROUTER)), 4030 NDISC_REDIRECT, &ndopts); 4031 4032 rcu_read_lock(); 4033 res.f6i = rcu_dereference(rt->from); 4034 if (!res.f6i) 4035 goto out; 4036 4037 if (res.f6i->nh) { 4038 struct fib6_nh_match_arg arg = { 4039 .dev = dst->dev, 4040 .gw = &rt->rt6i_gateway, 4041 }; 4042 4043 nexthop_for_each_fib6_nh(res.f6i->nh, 4044 fib6_nh_find_match, &arg); 4045 4046 /* fib6_info uses a nexthop that does not have fib6_nh 4047 * using the dst->dev. Should be impossible 4048 */ 4049 if (!arg.match) 4050 goto out; 4051 res.nh = arg.match; 4052 } else { 4053 res.nh = res.f6i->fib6_nh; 4054 } 4055 4056 res.fib6_flags = res.f6i->fib6_flags; 4057 res.fib6_type = res.f6i->fib6_type; 4058 nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL); 4059 if (!nrt) 4060 goto out; 4061 4062 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 4063 if (on_link) 4064 nrt->rt6i_flags &= ~RTF_GATEWAY; 4065 4066 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 4067 4068 /* rt6_insert_exception() will take care of duplicated exceptions */ 4069 if (rt6_insert_exception(nrt, &res)) { 4070 dst_release_immediate(&nrt->dst); 4071 goto out; 4072 } 4073 4074 netevent.old = &rt->dst; 4075 netevent.new = &nrt->dst; 4076 netevent.daddr = &msg->dest; 4077 netevent.neigh = neigh; 4078 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 4079 4080 out: 4081 rcu_read_unlock(); 4082 neigh_release(neigh); 4083 } 4084 4085 #ifdef CONFIG_IPV6_ROUTE_INFO 4086 static struct fib6_info *rt6_get_route_info(struct net *net, 4087 const struct in6_addr *prefix, int prefixlen, 4088 const struct in6_addr *gwaddr, 4089 struct net_device *dev) 4090 { 4091 u32 tb_id = l3mdev_fib_table(dev) ? 
: RT6_TABLE_INFO; 4092 int ifindex = dev->ifindex; 4093 struct fib6_node *fn; 4094 struct fib6_info *rt = NULL; 4095 struct fib6_table *table; 4096 4097 table = fib6_get_table(net, tb_id); 4098 if (!table) 4099 return NULL; 4100 4101 rcu_read_lock(); 4102 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 4103 if (!fn) 4104 goto out; 4105 4106 for_each_fib6_node_rt_rcu(fn) { 4107 /* these routes do not use nexthops */ 4108 if (rt->nh) 4109 continue; 4110 if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex) 4111 continue; 4112 if (!(rt->fib6_flags & RTF_ROUTEINFO) || 4113 !rt->fib6_nh->fib_nh_gw_family) 4114 continue; 4115 if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr)) 4116 continue; 4117 if (!fib6_info_hold_safe(rt)) 4118 continue; 4119 break; 4120 } 4121 out: 4122 rcu_read_unlock(); 4123 return rt; 4124 } 4125 4126 static struct fib6_info *rt6_add_route_info(struct net *net, 4127 const struct in6_addr *prefix, int prefixlen, 4128 const struct in6_addr *gwaddr, 4129 struct net_device *dev, 4130 unsigned int pref) 4131 { 4132 struct fib6_config cfg = { 4133 .fc_metric = IP6_RT_PRIO_USER, 4134 .fc_ifindex = dev->ifindex, 4135 .fc_dst_len = prefixlen, 4136 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 4137 RTF_UP | RTF_PREF(pref), 4138 .fc_protocol = RTPROT_RA, 4139 .fc_type = RTN_UNICAST, 4140 .fc_nlinfo.portid = 0, 4141 .fc_nlinfo.nlh = NULL, 4142 .fc_nlinfo.nl_net = net, 4143 }; 4144 4145 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO, 4146 cfg.fc_dst = *prefix; 4147 cfg.fc_gateway = *gwaddr; 4148 4149 /* We should treat it as a default route if prefix length is 0. */ 4150 if (!prefixlen) 4151 cfg.fc_flags |= RTF_DEFAULT; 4152 4153 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 4154 4155 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 4156 } 4157 #endif 4158 4159 struct fib6_info *rt6_get_dflt_router(struct net *net, 4160 const struct in6_addr *addr, 4161 struct net_device *dev) 4162 { 4163 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; 4164 struct fib6_info *rt; 4165 struct fib6_table *table; 4166 4167 table = fib6_get_table(net, tb_id); 4168 if (!table) 4169 return NULL; 4170 4171 rcu_read_lock(); 4172 for_each_fib6_node_rt_rcu(&table->tb6_root) { 4173 struct fib6_nh *nh; 4174 4175 /* RA routes do not use nexthops */ 4176 if (rt->nh) 4177 continue; 4178 4179 nh = rt->fib6_nh; 4180 if (dev == nh->fib_nh_dev && 4181 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 4182 ipv6_addr_equal(&nh->fib_nh_gw6, addr)) 4183 break; 4184 } 4185 if (rt && !fib6_info_hold_safe(rt)) 4186 rt = NULL; 4187 rcu_read_unlock(); 4188 return rt; 4189 } 4190 4191 struct fib6_info *rt6_add_dflt_router(struct net *net, 4192 const struct in6_addr *gwaddr, 4193 struct net_device *dev, 4194 unsigned int pref) 4195 { 4196 struct fib6_config cfg = { 4197 .fc_table = l3mdev_fib_table(dev) ? 
: RT6_TABLE_DFLT, 4198 .fc_metric = IP6_RT_PRIO_USER, 4199 .fc_ifindex = dev->ifindex, 4200 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 4201 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 4202 .fc_protocol = RTPROT_RA, 4203 .fc_type = RTN_UNICAST, 4204 .fc_nlinfo.portid = 0, 4205 .fc_nlinfo.nlh = NULL, 4206 .fc_nlinfo.nl_net = net, 4207 }; 4208 4209 cfg.fc_gateway = *gwaddr; 4210 4211 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { 4212 struct fib6_table *table; 4213 4214 table = fib6_get_table(dev_net(dev), cfg.fc_table); 4215 if (table) 4216 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 4217 } 4218 4219 return rt6_get_dflt_router(net, gwaddr, dev); 4220 } 4221 4222 static void __rt6_purge_dflt_routers(struct net *net, 4223 struct fib6_table *table) 4224 { 4225 struct fib6_info *rt; 4226 4227 restart: 4228 rcu_read_lock(); 4229 for_each_fib6_node_rt_rcu(&table->tb6_root) { 4230 struct net_device *dev = fib6_info_nh_dev(rt); 4231 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 4232 4233 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 4234 (!idev || idev->cnf.accept_ra != 2) && 4235 fib6_info_hold_safe(rt)) { 4236 rcu_read_unlock(); 4237 ip6_del_rt(net, rt); 4238 goto restart; 4239 } 4240 } 4241 rcu_read_unlock(); 4242 4243 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 4244 } 4245 4246 void rt6_purge_dflt_routers(struct net *net) 4247 { 4248 struct fib6_table *table; 4249 struct hlist_head *head; 4250 unsigned int h; 4251 4252 rcu_read_lock(); 4253 4254 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 4255 head = &net->ipv6.fib_table_hash[h]; 4256 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 4257 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 4258 __rt6_purge_dflt_routers(net, table); 4259 } 4260 } 4261 4262 rcu_read_unlock(); 4263 } 4264 4265 static void rtmsg_to_fib6_config(struct net *net, 4266 struct in6_rtmsg *rtmsg, 4267 struct fib6_config *cfg) 4268 { 4269 *cfg = (struct fib6_config){ 4270 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 4271 : RT6_TABLE_MAIN, 4272 .fc_ifindex = rtmsg->rtmsg_ifindex, 4273 .fc_metric = rtmsg->rtmsg_metric ? 
: IP6_RT_PRIO_USER, 4274 .fc_expires = rtmsg->rtmsg_info, 4275 .fc_dst_len = rtmsg->rtmsg_dst_len, 4276 .fc_src_len = rtmsg->rtmsg_src_len, 4277 .fc_flags = rtmsg->rtmsg_flags, 4278 .fc_type = rtmsg->rtmsg_type, 4279 4280 .fc_nlinfo.nl_net = net, 4281 4282 .fc_dst = rtmsg->rtmsg_dst, 4283 .fc_src = rtmsg->rtmsg_src, 4284 .fc_gateway = rtmsg->rtmsg_gateway, 4285 }; 4286 } 4287 4288 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 4289 { 4290 struct fib6_config cfg; 4291 struct in6_rtmsg rtmsg; 4292 int err; 4293 4294 switch (cmd) { 4295 case SIOCADDRT: /* Add a route */ 4296 case SIOCDELRT: /* Delete a route */ 4297 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 4298 return -EPERM; 4299 err = copy_from_user(&rtmsg, arg, 4300 sizeof(struct in6_rtmsg)); 4301 if (err) 4302 return -EFAULT; 4303 4304 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 4305 4306 rtnl_lock(); 4307 switch (cmd) { 4308 case SIOCADDRT: 4309 err = ip6_route_add(&cfg, GFP_KERNEL, NULL); 4310 break; 4311 case SIOCDELRT: 4312 err = ip6_route_del(&cfg, NULL); 4313 break; 4314 default: 4315 err = -EINVAL; 4316 } 4317 rtnl_unlock(); 4318 4319 return err; 4320 } 4321 4322 return -EINVAL; 4323 } 4324 4325 /* 4326 * Drop the packet on the floor 4327 */ 4328 4329 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 4330 { 4331 struct dst_entry *dst = skb_dst(skb); 4332 struct net *net = dev_net(dst->dev); 4333 struct inet6_dev *idev; 4334 int type; 4335 4336 if (netif_is_l3_master(skb->dev) && 4337 dst->dev == net->loopback_dev) 4338 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); 4339 else 4340 idev = ip6_dst_idev(dst); 4341 4342 switch (ipstats_mib_noroutes) { 4343 case IPSTATS_MIB_INNOROUTES: 4344 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 4345 if (type == IPV6_ADDR_ANY) { 4346 IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); 4347 break; 4348 } 4349 /* FALLTHROUGH */ 4350 case IPSTATS_MIB_OUTNOROUTES: 4351 IP6_INC_STATS(net, idev, ipstats_mib_noroutes); 4352 break; 4353 } 4354 4355 /* Start over by dropping the dst for l3mdev case */ 4356 if (netif_is_l3_master(skb->dev)) 4357 skb_dst_drop(skb); 4358 4359 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 4360 kfree_skb(skb); 4361 return 0; 4362 } 4363 4364 static int ip6_pkt_discard(struct sk_buff *skb) 4365 { 4366 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 4367 } 4368 4369 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 4370 { 4371 skb->dev = skb_dst(skb)->dev; 4372 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 4373 } 4374 4375 static int ip6_pkt_prohibit(struct sk_buff *skb) 4376 { 4377 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 4378 } 4379 4380 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 4381 { 4382 skb->dev = skb_dst(skb)->dev; 4383 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 4384 } 4385 4386 /* 4387 * Allocate a dst for local (unicast / anycast) address. 4388 */ 4389 4390 struct fib6_info *addrconf_f6i_alloc(struct net *net, 4391 struct inet6_dev *idev, 4392 const struct in6_addr *addr, 4393 bool anycast, gfp_t gfp_flags) 4394 { 4395 struct fib6_config cfg = { 4396 .fc_table = l3mdev_fib_table(idev->dev) ? 
: RT6_TABLE_LOCAL, 4397 .fc_ifindex = idev->dev->ifindex, 4398 .fc_flags = RTF_UP | RTF_NONEXTHOP, 4399 .fc_dst = *addr, 4400 .fc_dst_len = 128, 4401 .fc_protocol = RTPROT_KERNEL, 4402 .fc_nlinfo.nl_net = net, 4403 .fc_ignore_dev_down = true, 4404 }; 4405 struct fib6_info *f6i; 4406 4407 if (anycast) { 4408 cfg.fc_type = RTN_ANYCAST; 4409 cfg.fc_flags |= RTF_ANYCAST; 4410 } else { 4411 cfg.fc_type = RTN_LOCAL; 4412 cfg.fc_flags |= RTF_LOCAL; 4413 } 4414 4415 f6i = ip6_route_info_create(&cfg, gfp_flags, NULL); 4416 if (!IS_ERR(f6i)) 4417 f6i->dst_nocount = true; 4418 return f6i; 4419 } 4420 4421 /* remove deleted ip from prefsrc entries */ 4422 struct arg_dev_net_ip { 4423 struct net_device *dev; 4424 struct net *net; 4425 struct in6_addr *addr; 4426 }; 4427 4428 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) 4429 { 4430 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 4431 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 4432 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 4433 4434 if (!rt->nh && 4435 ((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) && 4436 rt != net->ipv6.fib6_null_entry && 4437 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { 4438 spin_lock_bh(&rt6_exception_lock); 4439 /* remove prefsrc entry */ 4440 rt->fib6_prefsrc.plen = 0; 4441 spin_unlock_bh(&rt6_exception_lock); 4442 } 4443 return 0; 4444 } 4445 4446 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 4447 { 4448 struct net *net = dev_net(ifp->idev->dev); 4449 struct arg_dev_net_ip adni = { 4450 .dev = ifp->idev->dev, 4451 .net = net, 4452 .addr = &ifp->addr, 4453 }; 4454 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 4455 } 4456 4457 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT) 4458 4459 /* Remove routers and update dst entries when a gateway turns into a host. */ 4460 static int fib6_clean_tohost(struct fib6_info *rt, void *arg) 4461 { 4462 struct in6_addr *gateway = (struct in6_addr *)arg; 4463 struct fib6_nh *nh; 4464 4465 /* RA routes do not use nexthops */ 4466 if (rt->nh) 4467 return 0; 4468 4469 nh = rt->fib6_nh; 4470 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 4471 nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6)) 4472 return -1; 4473 4474 /* Further clean up cached routes in exception table. 4475 * This is needed because a cached route may have a different 4476 * gateway than its 'parent' in the case of an ip redirect.
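* Such exceptions would otherwise keep steering traffic through
* the stale gateway even after the covering route has stopped
* pointing at it.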
4477 */ 4478 fib6_nh_exceptions_clean_tohost(nh, gateway); 4479 4480 return 0; 4481 } 4482 4483 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 4484 { 4485 fib6_clean_all(net, fib6_clean_tohost, gateway); 4486 } 4487 4488 struct arg_netdev_event { 4489 const struct net_device *dev; 4490 union { 4491 unsigned char nh_flags; 4492 unsigned long event; 4493 }; 4494 }; 4495 4496 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) 4497 { 4498 struct fib6_info *iter; 4499 struct fib6_node *fn; 4500 4501 fn = rcu_dereference_protected(rt->fib6_node, 4502 lockdep_is_held(&rt->fib6_table->tb6_lock)); 4503 iter = rcu_dereference_protected(fn->leaf, 4504 lockdep_is_held(&rt->fib6_table->tb6_lock)); 4505 while (iter) { 4506 if (iter->fib6_metric == rt->fib6_metric && 4507 rt6_qualify_for_ecmp(iter)) 4508 return iter; 4509 iter = rcu_dereference_protected(iter->fib6_next, 4510 lockdep_is_held(&rt->fib6_table->tb6_lock)); 4511 } 4512 4513 return NULL; 4514 } 4515 4516 /* only called for fib entries with builtin fib6_nh */ 4517 static bool rt6_is_dead(const struct fib6_info *rt) 4518 { 4519 if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD || 4520 (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN && 4521 ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev))) 4522 return true; 4523 4524 return false; 4525 } 4526 4527 static int rt6_multipath_total_weight(const struct fib6_info *rt) 4528 { 4529 struct fib6_info *iter; 4530 int total = 0; 4531 4532 if (!rt6_is_dead(rt)) 4533 total += rt->fib6_nh->fib_nh_weight; 4534 4535 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 4536 if (!rt6_is_dead(iter)) 4537 total += iter->fib6_nh->fib_nh_weight; 4538 } 4539 4540 return total; 4541 } 4542 4543 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) 4544 { 4545 int upper_bound = -1; 4546 4547 if (!rt6_is_dead(rt)) { 4548 *weight += rt->fib6_nh->fib_nh_weight; 4549 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, 4550 total) - 1; 4551 } 4552 atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound); 4553 } 4554 4555 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) 4556 { 4557 struct fib6_info *iter; 4558 int weight = 0; 4559 4560 rt6_upper_bound_set(rt, &weight, total); 4561 4562 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4563 rt6_upper_bound_set(iter, &weight, total); 4564 } 4565 4566 void rt6_multipath_rebalance(struct fib6_info *rt) 4567 { 4568 struct fib6_info *first; 4569 int total; 4570 4571 /* In case the entire multipath route was marked for flushing, 4572 * then there is no need to rebalance upon the removal of every 4573 * sibling route. 4574 */ 4575 if (!rt->fib6_nsiblings || rt->should_flush) 4576 return; 4577 4578 /* During lookup routes are evaluated in order, so we need to 4579 * make sure upper bounds are assigned from the first sibling 4580 * onwards. 
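* For example, two siblings with weights 1 and 3 receive cumulative
* upper bounds of (1 << 31) / 4 - 1 and (1 << 31) - 1 from
* rt6_upper_bound_set() above, so the first nexthop covers roughly a
* quarter of the 31-bit hash space and the second covers the rest.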
4581 */ 4582 first = rt6_multipath_first_sibling(rt); 4583 if (WARN_ON_ONCE(!first)) 4584 return; 4585 4586 total = rt6_multipath_total_weight(first); 4587 rt6_multipath_upper_bound_set(first, total); 4588 } 4589 4590 static int fib6_ifup(struct fib6_info *rt, void *p_arg) 4591 { 4592 const struct arg_netdev_event *arg = p_arg; 4593 struct net *net = dev_net(arg->dev); 4594 4595 if (rt != net->ipv6.fib6_null_entry && !rt->nh && 4596 rt->fib6_nh->fib_nh_dev == arg->dev) { 4597 rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags; 4598 fib6_update_sernum_upto_root(net, rt); 4599 rt6_multipath_rebalance(rt); 4600 } 4601 4602 return 0; 4603 } 4604 4605 void rt6_sync_up(struct net_device *dev, unsigned char nh_flags) 4606 { 4607 struct arg_netdev_event arg = { 4608 .dev = dev, 4609 { 4610 .nh_flags = nh_flags, 4611 }, 4612 }; 4613 4614 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) 4615 arg.nh_flags |= RTNH_F_LINKDOWN; 4616 4617 fib6_clean_all(dev_net(dev), fib6_ifup, &arg); 4618 } 4619 4620 /* only called for fib entries with inline fib6_nh */ 4621 static bool rt6_multipath_uses_dev(const struct fib6_info *rt, 4622 const struct net_device *dev) 4623 { 4624 struct fib6_info *iter; 4625 4626 if (rt->fib6_nh->fib_nh_dev == dev) 4627 return true; 4628 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4629 if (iter->fib6_nh->fib_nh_dev == dev) 4630 return true; 4631 4632 return false; 4633 } 4634 4635 static void rt6_multipath_flush(struct fib6_info *rt) 4636 { 4637 struct fib6_info *iter; 4638 4639 rt->should_flush = 1; 4640 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4641 iter->should_flush = 1; 4642 } 4643 4644 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, 4645 const struct net_device *down_dev) 4646 { 4647 struct fib6_info *iter; 4648 unsigned int dead = 0; 4649 4650 if (rt->fib6_nh->fib_nh_dev == down_dev || 4651 rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD) 4652 dead++; 4653 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4654 if (iter->fib6_nh->fib_nh_dev == down_dev || 4655 iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD) 4656 dead++; 4657 4658 return dead; 4659 } 4660 4661 static void rt6_multipath_nh_flags_set(struct fib6_info *rt, 4662 const struct net_device *dev, 4663 unsigned char nh_flags) 4664 { 4665 struct fib6_info *iter; 4666 4667 if (rt->fib6_nh->fib_nh_dev == dev) 4668 rt->fib6_nh->fib_nh_flags |= nh_flags; 4669 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) 4670 if (iter->fib6_nh->fib_nh_dev == dev) 4671 iter->fib6_nh->fib_nh_flags |= nh_flags; 4672 } 4673 4674 /* called with write lock held for table with rt */ 4675 static int fib6_ifdown(struct fib6_info *rt, void *p_arg) 4676 { 4677 const struct arg_netdev_event *arg = p_arg; 4678 const struct net_device *dev = arg->dev; 4679 struct net *net = dev_net(dev); 4680 4681 if (rt == net->ipv6.fib6_null_entry || rt->nh) 4682 return 0; 4683 4684 switch (arg->event) { 4685 case NETDEV_UNREGISTER: 4686 return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; 4687 case NETDEV_DOWN: 4688 if (rt->should_flush) 4689 return -1; 4690 if (!rt->fib6_nsiblings) 4691 return rt->fib6_nh->fib_nh_dev == dev ? 
-1 : 0; 4692 if (rt6_multipath_uses_dev(rt, dev)) { 4693 unsigned int count; 4694 4695 count = rt6_multipath_dead_count(rt, dev); 4696 if (rt->fib6_nsiblings + 1 == count) { 4697 rt6_multipath_flush(rt); 4698 return -1; 4699 } 4700 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | 4701 RTNH_F_LINKDOWN); 4702 fib6_update_sernum(net, rt); 4703 rt6_multipath_rebalance(rt); 4704 } 4705 return -2; 4706 case NETDEV_CHANGE: 4707 if (rt->fib6_nh->fib_nh_dev != dev || 4708 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) 4709 break; 4710 rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; 4711 rt6_multipath_rebalance(rt); 4712 break; 4713 } 4714 4715 return 0; 4716 } 4717 4718 void rt6_sync_down_dev(struct net_device *dev, unsigned long event) 4719 { 4720 struct arg_netdev_event arg = { 4721 .dev = dev, 4722 { 4723 .event = event, 4724 }, 4725 }; 4726 struct net *net = dev_net(dev); 4727 4728 if (net->ipv6.sysctl.skip_notify_on_dev_down) 4729 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg); 4730 else 4731 fib6_clean_all(net, fib6_ifdown, &arg); 4732 } 4733 4734 void rt6_disable_ip(struct net_device *dev, unsigned long event) 4735 { 4736 rt6_sync_down_dev(dev, event); 4737 rt6_uncached_list_flush_dev(dev_net(dev), dev); 4738 neigh_ifdown(&nd_tbl, dev); 4739 } 4740 4741 struct rt6_mtu_change_arg { 4742 struct net_device *dev; 4743 unsigned int mtu; 4744 struct fib6_info *f6i; 4745 }; 4746 4747 static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg) 4748 { 4749 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg; 4750 struct fib6_info *f6i = arg->f6i; 4751 4752 /* For an administrative MTU increase, there is no way to discover 4753 * an IPv6 PMTU increase, so the PMTU should be updated here. 4754 * Since RFC 1981 doesn't cover administrative MTU increases, 4755 * updating the PMTU on increase is a MUST (e.g. for jumbo frames). 4756 */ 4757 if (nh->fib_nh_dev == arg->dev) { 4758 struct inet6_dev *idev = __in6_dev_get(arg->dev); 4759 u32 mtu = f6i->fib6_pmtu; 4760 4761 if (mtu >= arg->mtu || 4762 (mtu < arg->mtu && mtu == idev->cnf.mtu6)) 4763 fib6_metric_set(f6i, RTAX_MTU, arg->mtu); 4764 4765 spin_lock_bh(&rt6_exception_lock); 4766 rt6_exceptions_update_pmtu(idev, nh, arg->mtu); 4767 spin_unlock_bh(&rt6_exception_lock); 4768 } 4769 4770 return 0; 4771 } 4772 4773 static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg) 4774 { 4775 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 4776 struct inet6_dev *idev; 4777 4778 /* In IPv6 pmtu discovery is not optional, 4779 so the RTAX_MTU lock cannot disable it. 4780 We still use this lock to block changes 4781 caused by addrconf/ndisc.
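That is, a locked RTAX_MTU only makes this handler leave the
route alone on device MTU changes; PMTU updates learned from
ICMPv6 Packet Too Big messages are still applied.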
4782 */ 4783 4784 idev = __in6_dev_get(arg->dev); 4785 if (!idev) 4786 return 0; 4787 4788 if (fib6_metric_locked(f6i, RTAX_MTU)) 4789 return 0; 4790 4791 arg->f6i = f6i; 4792 if (f6i->nh) { 4793 /* fib6_nh_mtu_change only returns 0, so this is safe */ 4794 return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change, 4795 arg); 4796 } 4797 4798 return fib6_nh_mtu_change(f6i->fib6_nh, arg); 4799 } 4800 4801 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 4802 { 4803 struct rt6_mtu_change_arg arg = { 4804 .dev = dev, 4805 .mtu = mtu, 4806 }; 4807 4808 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 4809 } 4810 4811 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 4812 [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, 4813 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 4814 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, 4815 [RTA_OIF] = { .type = NLA_U32 }, 4816 [RTA_IIF] = { .type = NLA_U32 }, 4817 [RTA_PRIORITY] = { .type = NLA_U32 }, 4818 [RTA_METRICS] = { .type = NLA_NESTED }, 4819 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 4820 [RTA_PREF] = { .type = NLA_U8 }, 4821 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 4822 [RTA_ENCAP] = { .type = NLA_NESTED }, 4823 [RTA_EXPIRES] = { .type = NLA_U32 }, 4824 [RTA_UID] = { .type = NLA_U32 }, 4825 [RTA_MARK] = { .type = NLA_U32 }, 4826 [RTA_TABLE] = { .type = NLA_U32 }, 4827 [RTA_IP_PROTO] = { .type = NLA_U8 }, 4828 [RTA_SPORT] = { .type = NLA_U16 }, 4829 [RTA_DPORT] = { .type = NLA_U16 }, 4830 [RTA_NH_ID] = { .type = NLA_U32 }, 4831 }; 4832 4833 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 4834 struct fib6_config *cfg, 4835 struct netlink_ext_ack *extack) 4836 { 4837 struct rtmsg *rtm; 4838 struct nlattr *tb[RTA_MAX+1]; 4839 unsigned int pref; 4840 int err; 4841 4842 err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, 4843 rtm_ipv6_policy, extack); 4844 if (err < 0) 4845 goto errout; 4846 4847 err = -EINVAL; 4848 rtm = nlmsg_data(nlh); 4849 4850 *cfg = (struct fib6_config){ 4851 .fc_table = rtm->rtm_table, 4852 .fc_dst_len = rtm->rtm_dst_len, 4853 .fc_src_len = rtm->rtm_src_len, 4854 .fc_flags = RTF_UP, 4855 .fc_protocol = rtm->rtm_protocol, 4856 .fc_type = rtm->rtm_type, 4857 4858 .fc_nlinfo.portid = NETLINK_CB(skb).portid, 4859 .fc_nlinfo.nlh = nlh, 4860 .fc_nlinfo.nl_net = sock_net(skb->sk), 4861 }; 4862 4863 if (rtm->rtm_type == RTN_UNREACHABLE || 4864 rtm->rtm_type == RTN_BLACKHOLE || 4865 rtm->rtm_type == RTN_PROHIBIT || 4866 rtm->rtm_type == RTN_THROW) 4867 cfg->fc_flags |= RTF_REJECT; 4868 4869 if (rtm->rtm_type == RTN_LOCAL) 4870 cfg->fc_flags |= RTF_LOCAL; 4871 4872 if (rtm->rtm_flags & RTM_F_CLONED) 4873 cfg->fc_flags |= RTF_CACHE; 4874 4875 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); 4876 4877 if (tb[RTA_NH_ID]) { 4878 if (tb[RTA_GATEWAY] || tb[RTA_OIF] || 4879 tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) { 4880 NL_SET_ERR_MSG(extack, 4881 "Nexthop specification and nexthop id are mutually exclusive"); 4882 goto errout; 4883 } 4884 cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]); 4885 } 4886 4887 if (tb[RTA_GATEWAY]) { 4888 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 4889 cfg->fc_flags |= RTF_GATEWAY; 4890 } 4891 if (tb[RTA_VIA]) { 4892 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute"); 4893 goto errout; 4894 } 4895 4896 if (tb[RTA_DST]) { 4897 int plen = (rtm->rtm_dst_len + 7) >> 3; 4898 4899 if (nla_len(tb[RTA_DST]) < plen) 4900 goto errout; 4901 4902 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 4903 } 4904 4905 if 
(tb[RTA_SRC]) { 4906 int plen = (rtm->rtm_src_len + 7) >> 3; 4907 4908 if (nla_len(tb[RTA_SRC]) < plen) 4909 goto errout; 4910 4911 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 4912 } 4913 4914 if (tb[RTA_PREFSRC]) 4915 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 4916 4917 if (tb[RTA_OIF]) 4918 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 4919 4920 if (tb[RTA_PRIORITY]) 4921 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 4922 4923 if (tb[RTA_METRICS]) { 4924 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 4925 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 4926 } 4927 4928 if (tb[RTA_TABLE]) 4929 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 4930 4931 if (tb[RTA_MULTIPATH]) { 4932 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 4933 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 4934 4935 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 4936 cfg->fc_mp_len, extack); 4937 if (err < 0) 4938 goto errout; 4939 } 4940 4941 if (tb[RTA_PREF]) { 4942 pref = nla_get_u8(tb[RTA_PREF]); 4943 if (pref != ICMPV6_ROUTER_PREF_LOW && 4944 pref != ICMPV6_ROUTER_PREF_HIGH) 4945 pref = ICMPV6_ROUTER_PREF_MEDIUM; 4946 cfg->fc_flags |= RTF_PREF(pref); 4947 } 4948 4949 if (tb[RTA_ENCAP]) 4950 cfg->fc_encap = tb[RTA_ENCAP]; 4951 4952 if (tb[RTA_ENCAP_TYPE]) { 4953 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 4954 4955 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 4956 if (err < 0) 4957 goto errout; 4958 } 4959 4960 if (tb[RTA_EXPIRES]) { 4961 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 4962 4963 if (addrconf_finite_timeout(timeout)) { 4964 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 4965 cfg->fc_flags |= RTF_EXPIRES; 4966 } 4967 } 4968 4969 err = 0; 4970 errout: 4971 return err; 4972 } 4973 4974 struct rt6_nh { 4975 struct fib6_info *fib6_info; 4976 struct fib6_config r_cfg; 4977 struct list_head next; 4978 }; 4979 4980 static int ip6_route_info_append(struct net *net, 4981 struct list_head *rt6_nh_list, 4982 struct fib6_info *rt, 4983 struct fib6_config *r_cfg) 4984 { 4985 struct rt6_nh *nh; 4986 int err = -EEXIST; 4987 4988 list_for_each_entry(nh, rt6_nh_list, next) { 4989 /* check if fib6_info already exists */ 4990 if (rt6_duplicate_nexthop(nh->fib6_info, rt)) 4991 return err; 4992 } 4993 4994 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 4995 if (!nh) 4996 return -ENOMEM; 4997 nh->fib6_info = rt; 4998 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 4999 list_add_tail(&nh->next, rt6_nh_list); 5000 5001 return 0; 5002 } 5003 5004 static void ip6_route_mpath_notify(struct fib6_info *rt, 5005 struct fib6_info *rt_last, 5006 struct nl_info *info, 5007 __u16 nlflags) 5008 { 5009 /* if this is an APPEND route, then rt points to the first route 5010 * inserted and rt_last points to last route inserted. Userspace 5011 * wants a consistent dump of the route which starts at the first 5012 * nexthop. 
Since sibling routes are always added at the end of 5013 * the list, find the first sibling of the last route appended 5014 */ 5015 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { 5016 rt = list_first_entry(&rt_last->fib6_siblings, 5017 struct fib6_info, 5018 fib6_siblings); 5019 } 5020 5021 if (rt) 5022 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 5023 } 5024 5025 static int ip6_route_multipath_add(struct fib6_config *cfg, 5026 struct netlink_ext_ack *extack) 5027 { 5028 struct fib6_info *rt_notif = NULL, *rt_last = NULL; 5029 struct nl_info *info = &cfg->fc_nlinfo; 5030 enum fib_event_type event_type; 5031 struct fib6_config r_cfg; 5032 struct rtnexthop *rtnh; 5033 struct fib6_info *rt; 5034 struct rt6_nh *err_nh; 5035 struct rt6_nh *nh, *nh_safe; 5036 __u16 nlflags; 5037 int remaining; 5038 int attrlen; 5039 int err = 1; 5040 int nhn = 0; 5041 int replace = (cfg->fc_nlinfo.nlh && 5042 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 5043 LIST_HEAD(rt6_nh_list); 5044 5045 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE; 5046 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 5047 nlflags |= NLM_F_APPEND; 5048 5049 remaining = cfg->fc_mp_len; 5050 rtnh = (struct rtnexthop *)cfg->fc_mp; 5051 5052 /* Parse a Multipath Entry and build a list (rt6_nh_list) of 5053 * fib6_info structs per nexthop 5054 */ 5055 while (rtnh_ok(rtnh, remaining)) { 5056 memcpy(&r_cfg, cfg, sizeof(*cfg)); 5057 if (rtnh->rtnh_ifindex) 5058 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 5059 5060 attrlen = rtnh_attrlen(rtnh); 5061 if (attrlen > 0) { 5062 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 5063 5064 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 5065 if (nla) { 5066 r_cfg.fc_gateway = nla_get_in6_addr(nla); 5067 r_cfg.fc_flags |= RTF_GATEWAY; 5068 } 5069 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 5070 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 5071 if (nla) 5072 r_cfg.fc_encap_type = nla_get_u16(nla); 5073 } 5074 5075 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); 5076 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); 5077 if (IS_ERR(rt)) { 5078 err = PTR_ERR(rt); 5079 rt = NULL; 5080 goto cleanup; 5081 } 5082 if (!rt6_qualify_for_ecmp(rt)) { 5083 err = -EINVAL; 5084 NL_SET_ERR_MSG(extack, 5085 "Device only routes can not be added for IPv6 using the multipath API."); 5086 fib6_info_release(rt); 5087 goto cleanup; 5088 } 5089 5090 rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; 5091 5092 err = ip6_route_info_append(info->nl_net, &rt6_nh_list, 5093 rt, &r_cfg); 5094 if (err) { 5095 fib6_info_release(rt); 5096 goto cleanup; 5097 } 5098 5099 rtnh = rtnh_next(rtnh, &remaining); 5100 } 5101 5102 if (list_empty(&rt6_nh_list)) { 5103 NL_SET_ERR_MSG(extack, 5104 "Invalid nexthop configuration - no valid nexthops"); 5105 return -EINVAL; 5106 } 5107 5108 /* for add and replace send one notification with all nexthops. 5109 * Skip the notification in fib6_add_rt2node and send one with 5110 * the full route when done 5111 */ 5112 info->skip_notify = 1; 5113 5114 /* For add and replace, send one notification with all nexthops. For 5115 * append, send one notification with all appended nexthops. 
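* The single in-kernel notification is sent further below via
* call_fib6_multipath_entry_notifiers(), once all nexthops have
* been inserted.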
5116 */ 5117 info->skip_notify_kernel = 1; 5118 5119 err_nh = NULL; 5120 list_for_each_entry(nh, &rt6_nh_list, next) { 5121 err = __ip6_ins_rt(nh->fib6_info, info, extack); 5122 fib6_info_release(nh->fib6_info); 5123 5124 if (!err) { 5125 /* save reference to last route successfully inserted */ 5126 rt_last = nh->fib6_info; 5127 5128 /* save reference to first route for notification */ 5129 if (!rt_notif) 5130 rt_notif = nh->fib6_info; 5131 } 5132 5133 /* nh->fib6_info is used or freed at this point, reset to NULL*/ 5134 nh->fib6_info = NULL; 5135 if (err) { 5136 if (replace && nhn) 5137 NL_SET_ERR_MSG_MOD(extack, 5138 "multipath route replace failed (check consistency of installed routes)"); 5139 err_nh = nh; 5140 goto add_errout; 5141 } 5142 5143 /* Because each route is added like a single route we remove 5144 * these flags after the first nexthop: if there is a collision, 5145 * we have already failed to add the first nexthop: 5146 * fib6_add_rt2node() has rejected it; when replacing, old 5147 * nexthops have been replaced by first new, the rest should 5148 * be added to it. 5149 */ 5150 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 5151 NLM_F_REPLACE); 5152 nhn++; 5153 } 5154 5155 event_type = replace ? FIB_EVENT_ENTRY_REPLACE : FIB_EVENT_ENTRY_ADD; 5156 err = call_fib6_multipath_entry_notifiers(info->nl_net, event_type, 5157 rt_notif, nhn - 1, extack); 5158 if (err) { 5159 /* Delete all the siblings that were just added */ 5160 err_nh = NULL; 5161 goto add_errout; 5162 } 5163 5164 /* success ... tell user about new route */ 5165 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 5166 goto cleanup; 5167 5168 add_errout: 5169 /* send notification for routes that were added so that 5170 * the delete notifications sent by ip6_route_del are 5171 * coherent 5172 */ 5173 if (rt_notif) 5174 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 5175 5176 /* Delete routes that were already added */ 5177 list_for_each_entry(nh, &rt6_nh_list, next) { 5178 if (err_nh == nh) 5179 break; 5180 ip6_route_del(&nh->r_cfg, extack); 5181 } 5182 5183 cleanup: 5184 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 5185 if (nh->fib6_info) 5186 fib6_info_release(nh->fib6_info); 5187 list_del(&nh->next); 5188 kfree(nh); 5189 } 5190 5191 return err; 5192 } 5193 5194 static int ip6_route_multipath_del(struct fib6_config *cfg, 5195 struct netlink_ext_ack *extack) 5196 { 5197 struct fib6_config r_cfg; 5198 struct rtnexthop *rtnh; 5199 int remaining; 5200 int attrlen; 5201 int err = 1, last_err = 0; 5202 5203 remaining = cfg->fc_mp_len; 5204 rtnh = (struct rtnexthop *)cfg->fc_mp; 5205 5206 /* Parse a Multipath Entry */ 5207 while (rtnh_ok(rtnh, remaining)) { 5208 memcpy(&r_cfg, cfg, sizeof(*cfg)); 5209 if (rtnh->rtnh_ifindex) 5210 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 5211 5212 attrlen = rtnh_attrlen(rtnh); 5213 if (attrlen > 0) { 5214 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 5215 5216 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 5217 if (nla) { 5218 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 5219 r_cfg.fc_flags |= RTF_GATEWAY; 5220 } 5221 } 5222 err = ip6_route_del(&r_cfg, extack); 5223 if (err) 5224 last_err = err; 5225 5226 rtnh = rtnh_next(rtnh, &remaining); 5227 } 5228 5229 return last_err; 5230 } 5231 5232 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 5233 struct netlink_ext_ack *extack) 5234 { 5235 struct fib6_config cfg; 5236 int err; 5237 5238 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 5239 if (err < 0) 5240 return err; 5241 5242 if 
(cfg.fc_nh_id && 5243 !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) { 5244 NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); 5245 return -EINVAL; 5246 } 5247 5248 if (cfg.fc_mp) 5249 return ip6_route_multipath_del(&cfg, extack); 5250 else { 5251 cfg.fc_delete_all_nh = 1; 5252 return ip6_route_del(&cfg, extack); 5253 } 5254 } 5255 5256 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 5257 struct netlink_ext_ack *extack) 5258 { 5259 struct fib6_config cfg; 5260 int err; 5261 5262 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 5263 if (err < 0) 5264 return err; 5265 5266 if (cfg.fc_metric == 0) 5267 cfg.fc_metric = IP6_RT_PRIO_USER; 5268 5269 if (cfg.fc_mp) 5270 return ip6_route_multipath_add(&cfg, extack); 5271 else 5272 return ip6_route_add(&cfg, GFP_KERNEL, extack); 5273 } 5274 5275 /* add the overhead of this fib6_nh to nexthop_len */ 5276 static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg) 5277 { 5278 int *nexthop_len = arg; 5279 5280 *nexthop_len += nla_total_size(0) /* RTA_MULTIPATH */ 5281 + NLA_ALIGN(sizeof(struct rtnexthop)) 5282 + nla_total_size(16); /* RTA_GATEWAY */ 5283 5284 if (nh->fib_nh_lws) { 5285 /* RTA_ENCAP */ 5286 *nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); 5287 /* RTA_ENCAP_TYPE */ 5288 *nexthop_len += nla_total_size(2); 5289 } 5290 5291 return 0; 5292 } 5293 5294 static size_t rt6_nlmsg_size(struct fib6_info *f6i) 5295 { 5296 int nexthop_len; 5297 5298 if (f6i->nh) { 5299 nexthop_len = nla_total_size(4); /* RTA_NH_ID */ 5300 nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size, 5301 &nexthop_len); 5302 } else { 5303 struct fib6_nh *nh = f6i->fib6_nh; 5304 5305 nexthop_len = 0; 5306 if (f6i->fib6_nsiblings) { 5307 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 5308 + NLA_ALIGN(sizeof(struct rtnexthop)) 5309 + nla_total_size(16) /* RTA_GATEWAY */ 5310 + lwtunnel_get_encap_size(nh->fib_nh_lws); 5311 5312 nexthop_len *= f6i->fib6_nsiblings; 5313 } 5314 nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); 5315 } 5316 5317 return NLMSG_ALIGN(sizeof(struct rtmsg)) 5318 + nla_total_size(16) /* RTA_SRC */ 5319 + nla_total_size(16) /* RTA_DST */ 5320 + nla_total_size(16) /* RTA_GATEWAY */ 5321 + nla_total_size(16) /* RTA_PREFSRC */ 5322 + nla_total_size(4) /* RTA_TABLE */ 5323 + nla_total_size(4) /* RTA_IIF */ 5324 + nla_total_size(4) /* RTA_OIF */ 5325 + nla_total_size(4) /* RTA_PRIORITY */ 5326 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 5327 + nla_total_size(sizeof(struct rta_cacheinfo)) 5328 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 5329 + nla_total_size(1) /* RTA_PREF */ 5330 + nexthop_len; 5331 } 5332 5333 static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh, 5334 unsigned char *flags) 5335 { 5336 if (nexthop_is_multipath(nh)) { 5337 struct nlattr *mp; 5338 5339 mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); 5340 if (!mp) 5341 goto nla_put_failure; 5342 5343 if (nexthop_mpath_fill_node(skb, nh, AF_INET6)) 5344 goto nla_put_failure; 5345 5346 nla_nest_end(skb, mp); 5347 } else { 5348 struct fib6_nh *fib6_nh; 5349 5350 fib6_nh = nexthop_fib6_nh(nh); 5351 if (fib_nexthop_info(skb, &fib6_nh->nh_common, AF_INET6, 5352 flags, false) < 0) 5353 goto nla_put_failure; 5354 } 5355 5356 return 0; 5357 5358 nla_put_failure: 5359 return -EMSGSIZE; 5360 } 5361 5362 static int rt6_fill_node(struct net *net, struct sk_buff *skb, 5363 struct fib6_info *rt, struct dst_entry *dst, 5364 struct in6_addr *dest, struct in6_addr *src, 5365 int iif, int type, u32 portid, u32 seq, 5366 unsigned int
flags) 5367 { 5368 struct rt6_info *rt6 = (struct rt6_info *)dst; 5369 struct rt6key *rt6_dst, *rt6_src; 5370 u32 *pmetrics, table, rt6_flags; 5371 unsigned char nh_flags = 0; 5372 struct nlmsghdr *nlh; 5373 struct rtmsg *rtm; 5374 long expires = 0; 5375 5376 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 5377 if (!nlh) 5378 return -EMSGSIZE; 5379 5380 if (rt6) { 5381 rt6_dst = &rt6->rt6i_dst; 5382 rt6_src = &rt6->rt6i_src; 5383 rt6_flags = rt6->rt6i_flags; 5384 } else { 5385 rt6_dst = &rt->fib6_dst; 5386 rt6_src = &rt->fib6_src; 5387 rt6_flags = rt->fib6_flags; 5388 } 5389 5390 rtm = nlmsg_data(nlh); 5391 rtm->rtm_family = AF_INET6; 5392 rtm->rtm_dst_len = rt6_dst->plen; 5393 rtm->rtm_src_len = rt6_src->plen; 5394 rtm->rtm_tos = 0; 5395 if (rt->fib6_table) 5396 table = rt->fib6_table->tb6_id; 5397 else 5398 table = RT6_TABLE_UNSPEC; 5399 rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT; 5400 if (nla_put_u32(skb, RTA_TABLE, table)) 5401 goto nla_put_failure; 5402 5403 rtm->rtm_type = rt->fib6_type; 5404 rtm->rtm_flags = 0; 5405 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 5406 rtm->rtm_protocol = rt->fib6_protocol; 5407 5408 if (rt6_flags & RTF_CACHE) 5409 rtm->rtm_flags |= RTM_F_CLONED; 5410 5411 if (dest) { 5412 if (nla_put_in6_addr(skb, RTA_DST, dest)) 5413 goto nla_put_failure; 5414 rtm->rtm_dst_len = 128; 5415 } else if (rtm->rtm_dst_len) 5416 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr)) 5417 goto nla_put_failure; 5418 #ifdef CONFIG_IPV6_SUBTREES 5419 if (src) { 5420 if (nla_put_in6_addr(skb, RTA_SRC, src)) 5421 goto nla_put_failure; 5422 rtm->rtm_src_len = 128; 5423 } else if (rtm->rtm_src_len && 5424 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr)) 5425 goto nla_put_failure; 5426 #endif 5427 if (iif) { 5428 #ifdef CONFIG_IPV6_MROUTE 5429 if (ipv6_addr_is_multicast(&rt6_dst->addr)) { 5430 int err = ip6mr_get_route(net, skb, rtm, portid); 5431 5432 if (err == 0) 5433 return 0; 5434 if (err < 0) 5435 goto nla_put_failure; 5436 } else 5437 #endif 5438 if (nla_put_u32(skb, RTA_IIF, iif)) 5439 goto nla_put_failure; 5440 } else if (dest) { 5441 struct in6_addr saddr_buf; 5442 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && 5443 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 5444 goto nla_put_failure; 5445 } 5446 5447 if (rt->fib6_prefsrc.plen) { 5448 struct in6_addr saddr_buf; 5449 saddr_buf = rt->fib6_prefsrc.addr; 5450 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 5451 goto nla_put_failure; 5452 } 5453 5454 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; 5455 if (rtnetlink_put_metrics(skb, pmetrics) < 0) 5456 goto nla_put_failure; 5457 5458 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) 5459 goto nla_put_failure; 5460 5461 /* For multipath routes, walk the siblings list and add 5462 * each as a nexthop within RTA_MULTIPATH. 
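* rt itself supplies the first nexthop; rt->fib6_siblings links
* only the remaining ones, which is why fib_add_nexthop() is
* called once before the siblings loop.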
5463 */ 5464 if (rt6) { 5465 if (rt6_flags & RTF_GATEWAY && 5466 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway)) 5467 goto nla_put_failure; 5468 5469 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex)) 5470 goto nla_put_failure; 5471 } else if (rt->fib6_nsiblings) { 5472 struct fib6_info *sibling, *next_sibling; 5473 struct nlattr *mp; 5474 5475 mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); 5476 if (!mp) 5477 goto nla_put_failure; 5478 5479 if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common, 5480 rt->fib6_nh->fib_nh_weight, AF_INET6) < 0) 5481 goto nla_put_failure; 5482 5483 list_for_each_entry_safe(sibling, next_sibling, 5484 &rt->fib6_siblings, fib6_siblings) { 5485 if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common, 5486 sibling->fib6_nh->fib_nh_weight, 5487 AF_INET6) < 0) 5488 goto nla_put_failure; 5489 } 5490 5491 nla_nest_end(skb, mp); 5492 } else if (rt->nh) { 5493 if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id)) 5494 goto nla_put_failure; 5495 5496 if (nexthop_is_blackhole(rt->nh)) 5497 rtm->rtm_type = RTN_BLACKHOLE; 5498 5499 if (rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0) 5500 goto nla_put_failure; 5501 5502 rtm->rtm_flags |= nh_flags; 5503 } else { 5504 if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, AF_INET6, 5505 &nh_flags, false) < 0) 5506 goto nla_put_failure; 5507 5508 rtm->rtm_flags |= nh_flags; 5509 } 5510 5511 if (rt6_flags & RTF_EXPIRES) { 5512 expires = dst ? dst->expires : rt->expires; 5513 expires -= jiffies; 5514 } 5515 5516 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) 5517 goto nla_put_failure; 5518 5519 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags))) 5520 goto nla_put_failure; 5521 5522 5523 nlmsg_end(skb, nlh); 5524 return 0; 5525 5526 nla_put_failure: 5527 nlmsg_cancel(skb, nlh); 5528 return -EMSGSIZE; 5529 } 5530 5531 static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg) 5532 { 5533 const struct net_device *dev = arg; 5534 5535 if (nh->fib_nh_dev == dev) 5536 return 1; 5537 5538 return 0; 5539 } 5540 5541 static bool fib6_info_uses_dev(const struct fib6_info *f6i, 5542 const struct net_device *dev) 5543 { 5544 if (f6i->nh) { 5545 struct net_device *_dev = (struct net_device *)dev; 5546 5547 return !!nexthop_for_each_fib6_nh(f6i->nh, 5548 fib6_info_nh_uses_dev, 5549 _dev); 5550 } 5551 5552 if (f6i->fib6_nh->fib_nh_dev == dev) 5553 return true; 5554 5555 if (f6i->fib6_nsiblings) { 5556 struct fib6_info *sibling, *next_sibling; 5557 5558 list_for_each_entry_safe(sibling, next_sibling, 5559 &f6i->fib6_siblings, fib6_siblings) { 5560 if (sibling->fib6_nh->fib_nh_dev == dev) 5561 return true; 5562 } 5563 } 5564 5565 return false; 5566 } 5567 5568 struct fib6_nh_exception_dump_walker { 5569 struct rt6_rtnl_dump_arg *dump; 5570 struct fib6_info *rt; 5571 unsigned int flags; 5572 unsigned int skip; 5573 unsigned int count; 5574 }; 5575 5576 static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg) 5577 { 5578 struct fib6_nh_exception_dump_walker *w = arg; 5579 struct rt6_rtnl_dump_arg *dump = w->dump; 5580 struct rt6_exception_bucket *bucket; 5581 struct rt6_exception *rt6_ex; 5582 int i, err; 5583 5584 bucket = fib6_nh_get_excptn_bucket(nh, NULL); 5585 if (!bucket) 5586 return 0; 5587 5588 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { 5589 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { 5590 if (w->skip) { 5591 w->skip--; 5592 continue; 5593 } 5594 5595 /* Expiration of entries doesn't bump sernum, insertion 5596 * does. 
Removal is triggered by insertion, so we can 5597 * rely on the fact that if entries change between two 5598 * partial dumps, this node is scanned again completely, 5599 * see rt6_insert_exception() and fib6_dump_table(). 5600 * 5601 * Count expired entries we go through as handled 5602 * entries that we'll skip next time, in case of partial 5603 * node dump. Otherwise, if entries expire meanwhile, 5604 * we'll skip the wrong amount. 5605 */ 5606 if (rt6_check_expired(rt6_ex->rt6i)) { 5607 w->count++; 5608 continue; 5609 } 5610 5611 err = rt6_fill_node(dump->net, dump->skb, w->rt, 5612 &rt6_ex->rt6i->dst, NULL, NULL, 0, 5613 RTM_NEWROUTE, 5614 NETLINK_CB(dump->cb->skb).portid, 5615 dump->cb->nlh->nlmsg_seq, w->flags); 5616 if (err) 5617 return err; 5618 5619 w->count++; 5620 } 5621 bucket++; 5622 } 5623 5624 return 0; 5625 } 5626 5627 /* Return -1 if done with node, number of handled routes on partial dump */ 5628 int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip) 5629 { 5630 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 5631 struct fib_dump_filter *filter = &arg->filter; 5632 unsigned int flags = NLM_F_MULTI; 5633 struct net *net = arg->net; 5634 int count = 0; 5635 5636 if (rt == net->ipv6.fib6_null_entry) 5637 return -1; 5638 5639 if ((filter->flags & RTM_F_PREFIX) && 5640 !(rt->fib6_flags & RTF_PREFIX_RT)) { 5641 /* success since this is not a prefix route */ 5642 return -1; 5643 } 5644 if (filter->filter_set && 5645 ((filter->rt_type && rt->fib6_type != filter->rt_type) || 5646 (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || 5647 (filter->protocol && rt->fib6_protocol != filter->protocol))) { 5648 return -1; 5649 } 5650 5651 if (filter->filter_set || 5652 !filter->dump_routes || !filter->dump_exceptions) { 5653 flags |= NLM_F_DUMP_FILTERED; 5654 } 5655 5656 if (filter->dump_routes) { 5657 if (skip) { 5658 skip--; 5659 } else { 5660 if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 5661 0, RTM_NEWROUTE, 5662 NETLINK_CB(arg->cb->skb).portid, 5663 arg->cb->nlh->nlmsg_seq, flags)) { 5664 return 0; 5665 } 5666 count++; 5667 } 5668 } 5669 5670 if (filter->dump_exceptions) { 5671 struct fib6_nh_exception_dump_walker w = { .dump = arg, 5672 .rt = rt, 5673 .flags = flags, 5674 .skip = skip, 5675 .count = 0 }; 5676 int err; 5677 5678 rcu_read_lock(); 5679 if (rt->nh) { 5680 err = nexthop_for_each_fib6_nh(rt->nh, 5681 rt6_nh_dump_exceptions, 5682 &w); 5683 } else { 5684 err = rt6_nh_dump_exceptions(rt->fib6_nh, &w); 5685 } 5686 rcu_read_unlock(); 5687 5688 if (err) 5689 return count += w.count; 5690 } 5691 5692 return -1; 5693 } 5694 5695 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, 5696 const struct nlmsghdr *nlh, 5697 struct nlattr **tb, 5698 struct netlink_ext_ack *extack) 5699 { 5700 struct rtmsg *rtm; 5701 int i, err; 5702 5703 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { 5704 NL_SET_ERR_MSG_MOD(extack, 5705 "Invalid header for get route request"); 5706 return -EINVAL; 5707 } 5708 5709 if (!netlink_strict_get_check(skb)) 5710 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, 5711 rtm_ipv6_policy, extack); 5712 5713 rtm = nlmsg_data(nlh); 5714 if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || 5715 (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || 5716 rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || 5717 rtm->rtm_type) { 5718 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request"); 5719 return -EINVAL; 5720 } 5721 if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) { 5722 
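/* RTM_F_FIB_MATCH is the only flag accepted in a strict get request */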
NL_SET_ERR_MSG_MOD(extack, 5723 "Invalid flags for get route request"); 5724 return -EINVAL; 5725 } 5726 5727 err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, 5728 rtm_ipv6_policy, extack); 5729 if (err) 5730 return err; 5731 5732 if ((tb[RTA_SRC] && !rtm->rtm_src_len) || 5733 (tb[RTA_DST] && !rtm->rtm_dst_len)) { 5734 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6"); 5735 return -EINVAL; 5736 } 5737 5738 for (i = 0; i <= RTA_MAX; i++) { 5739 if (!tb[i]) 5740 continue; 5741 5742 switch (i) { 5743 case RTA_SRC: 5744 case RTA_DST: 5745 case RTA_IIF: 5746 case RTA_OIF: 5747 case RTA_MARK: 5748 case RTA_UID: 5749 case RTA_SPORT: 5750 case RTA_DPORT: 5751 case RTA_IP_PROTO: 5752 break; 5753 default: 5754 NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request"); 5755 return -EINVAL; 5756 } 5757 } 5758 5759 return 0; 5760 } 5761 5762 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 5763 struct netlink_ext_ack *extack) 5764 { 5765 struct net *net = sock_net(in_skb->sk); 5766 struct nlattr *tb[RTA_MAX+1]; 5767 int err, iif = 0, oif = 0; 5768 struct fib6_info *from; 5769 struct dst_entry *dst; 5770 struct rt6_info *rt; 5771 struct sk_buff *skb; 5772 struct rtmsg *rtm; 5773 struct flowi6 fl6 = {}; 5774 bool fibmatch; 5775 5776 err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack); 5777 if (err < 0) 5778 goto errout; 5779 5780 err = -EINVAL; 5781 rtm = nlmsg_data(nlh); 5782 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 5783 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 5784 5785 if (tb[RTA_SRC]) { 5786 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 5787 goto errout; 5788 5789 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 5790 } 5791 5792 if (tb[RTA_DST]) { 5793 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 5794 goto errout; 5795 5796 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 5797 } 5798 5799 if (tb[RTA_IIF]) 5800 iif = nla_get_u32(tb[RTA_IIF]); 5801 5802 if (tb[RTA_OIF]) 5803 oif = nla_get_u32(tb[RTA_OIF]); 5804 5805 if (tb[RTA_MARK]) 5806 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 5807 5808 if (tb[RTA_UID]) 5809 fl6.flowi6_uid = make_kuid(current_user_ns(), 5810 nla_get_u32(tb[RTA_UID])); 5811 else 5812 fl6.flowi6_uid = iif ? 
INVALID_UID : current_uid(); 5813 5814 if (tb[RTA_SPORT]) 5815 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]); 5816 5817 if (tb[RTA_DPORT]) 5818 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]); 5819 5820 if (tb[RTA_IP_PROTO]) { 5821 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], 5822 &fl6.flowi6_proto, AF_INET6, 5823 extack); 5824 if (err) 5825 goto errout; 5826 } 5827 5828 if (iif) { 5829 struct net_device *dev; 5830 int flags = 0; 5831 5832 rcu_read_lock(); 5833 5834 dev = dev_get_by_index_rcu(net, iif); 5835 if (!dev) { 5836 rcu_read_unlock(); 5837 err = -ENODEV; 5838 goto errout; 5839 } 5840 5841 fl6.flowi6_iif = iif; 5842 5843 if (!ipv6_addr_any(&fl6.saddr)) 5844 flags |= RT6_LOOKUP_F_HAS_SADDR; 5845 5846 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); 5847 5848 rcu_read_unlock(); 5849 } else { 5850 fl6.flowi6_oif = oif; 5851 5852 dst = ip6_route_output(net, NULL, &fl6); 5853 } 5854 5855 5856 rt = container_of(dst, struct rt6_info, dst); 5857 if (rt->dst.error) { 5858 err = rt->dst.error; 5859 ip6_rt_put(rt); 5860 goto errout; 5861 } 5862 5863 if (rt == net->ipv6.ip6_null_entry) { 5864 err = rt->dst.error; 5865 ip6_rt_put(rt); 5866 goto errout; 5867 } 5868 5869 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 5870 if (!skb) { 5871 ip6_rt_put(rt); 5872 err = -ENOBUFS; 5873 goto errout; 5874 } 5875 5876 skb_dst_set(skb, &rt->dst); 5877 5878 rcu_read_lock(); 5879 from = rcu_dereference(rt->from); 5880 if (from) { 5881 if (fibmatch) 5882 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, 5883 iif, RTM_NEWROUTE, 5884 NETLINK_CB(in_skb).portid, 5885 nlh->nlmsg_seq, 0); 5886 else 5887 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, 5888 &fl6.saddr, iif, RTM_NEWROUTE, 5889 NETLINK_CB(in_skb).portid, 5890 nlh->nlmsg_seq, 0); 5891 } else { 5892 err = -ENETUNREACH; 5893 } 5894 rcu_read_unlock(); 5895 5896 if (err < 0) { 5897 kfree_skb(skb); 5898 goto errout; 5899 } 5900 5901 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 5902 errout: 5903 return err; 5904 } 5905 5906 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, 5907 unsigned int nlm_flags) 5908 { 5909 struct sk_buff *skb; 5910 struct net *net = info->nl_net; 5911 u32 seq; 5912 int err; 5913