TOMOYO Linux Cross Reference
Linux/net/ipv6/route.c

  1 /*
  2  *      Linux INET6 implementation
  3  *      FIB front-end.
  4  *
  5  *      Authors:
  6  *      Pedro Roque             <roque@di.fc.ul.pt>
  7  *
  8  *      This program is free software; you can redistribute it and/or
  9  *      modify it under the terms of the GNU General Public License
 10  *      as published by the Free Software Foundation; either version
 11  *      2 of the License, or (at your option) any later version.
 12  */
 13 
 14 /*      Changes:
 15  *
 16  *      YOSHIFUJI Hideaki @USAGI
 17  *              reworked default router selection.
 18  *              - respect outgoing interface
 19  *              - select from (probably) reachable routers (i.e.
 20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
 21  *              - always select the same router if it is (probably)
 22  *              reachable.  otherwise, round-robin the list.
 23  *      Ville Nuorvala
 24  *              Fixed routing subtrees.
 25  */
 26 
 27 #define pr_fmt(fmt) "IPv6: " fmt
 28 
 29 #include <linux/capability.h>
 30 #include <linux/errno.h>
 31 #include <linux/export.h>
 32 #include <linux/types.h>
 33 #include <linux/times.h>
 34 #include <linux/socket.h>
 35 #include <linux/sockios.h>
 36 #include <linux/net.h>
 37 #include <linux/route.h>
 38 #include <linux/netdevice.h>
 39 #include <linux/in6.h>
 40 #include <linux/mroute6.h>
 41 #include <linux/init.h>
 42 #include <linux/if_arp.h>
 43 #include <linux/proc_fs.h>
 44 #include <linux/seq_file.h>
 45 #include <linux/nsproxy.h>
 46 #include <linux/slab.h>
 47 #include <net/net_namespace.h>
 48 #include <net/snmp.h>
 49 #include <net/ipv6.h>
 50 #include <net/ip6_fib.h>
 51 #include <net/ip6_route.h>
 52 #include <net/ndisc.h>
 53 #include <net/addrconf.h>
 54 #include <net/tcp.h>
 55 #include <linux/rtnetlink.h>
 56 #include <net/dst.h>
 57 #include <net/dst_metadata.h>
 58 #include <net/xfrm.h>
 59 #include <net/netevent.h>
 60 #include <net/netlink.h>
 61 #include <net/nexthop.h>
 62 #include <net/lwtunnel.h>
 63 #include <net/ip_tunnels.h>
 64 #include <net/l3mdev.h>
 65 #include <trace/events/fib6.h>
 66 
 67 #include <linux/uaccess.h>
 68 
 69 #ifdef CONFIG_SYSCTL
 70 #include <linux/sysctl.h>
 71 #endif
 72 
 73 enum rt6_nud_state {
 74         RT6_NUD_FAIL_HARD = -3,
 75         RT6_NUD_FAIL_PROBE = -2,
 76         RT6_NUD_FAIL_DO_RR = -1,
 77         RT6_NUD_SUCCEED = 1
 78 };
 79 
 80 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
 81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
 82 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
 83 static unsigned int      ip6_mtu(const struct dst_entry *dst);
 84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
 85 static void             ip6_dst_destroy(struct dst_entry *);
 86 static void             ip6_dst_ifdown(struct dst_entry *,
 87                                        struct net_device *dev, int how);
 88 static int               ip6_dst_gc(struct dst_ops *ops);
 89 
 90 static int              ip6_pkt_discard(struct sk_buff *skb);
 91 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
 92 static int              ip6_pkt_prohibit(struct sk_buff *skb);
 93 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
 94 static void             ip6_link_failure(struct sk_buff *skb);
 95 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 96                                            struct sk_buff *skb, u32 mtu);
 97 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
 98                                         struct sk_buff *skb);
 99 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
100 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
101 static size_t rt6_nlmsg_size(struct rt6_info *rt);
102 static int rt6_fill_node(struct net *net,
103                          struct sk_buff *skb, struct rt6_info *rt,
104                          struct in6_addr *dst, struct in6_addr *src,
105                          int iif, int type, u32 portid, u32 seq,
106                          unsigned int flags);
107 
108 #ifdef CONFIG_IPV6_ROUTE_INFO
109 static struct rt6_info *rt6_add_route_info(struct net *net,
110                                            const struct in6_addr *prefix, int prefixlen,
111                                            const struct in6_addr *gwaddr,
112                                            struct net_device *dev,
113                                            unsigned int pref);
114 static struct rt6_info *rt6_get_route_info(struct net *net,
115                                            const struct in6_addr *prefix, int prefixlen,
116                                            const struct in6_addr *gwaddr,
117                                            struct net_device *dev);
118 #endif
119 
120 struct uncached_list {
121         spinlock_t              lock;
122         struct list_head        head;
123 };
124 
125 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
126 
127 static void rt6_uncached_list_add(struct rt6_info *rt)
128 {
129         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
130 
131         rt->dst.flags |= DST_NOCACHE;
132         rt->rt6i_uncached_list = ul;
133 
134         spin_lock_bh(&ul->lock);
135         list_add_tail(&rt->rt6i_uncached, &ul->head);
136         spin_unlock_bh(&ul->lock);
137 }
138 
139 static void rt6_uncached_list_del(struct rt6_info *rt)
140 {
141         if (!list_empty(&rt->rt6i_uncached)) {
142                 struct uncached_list *ul = rt->rt6i_uncached_list;
143 
144                 spin_lock_bh(&ul->lock);
145                 list_del(&rt->rt6i_uncached);
146                 spin_unlock_bh(&ul->lock);
147         }
148 }
149 
150 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
151 {
152         struct net_device *loopback_dev = net->loopback_dev;
153         int cpu;
154 
155         if (dev == loopback_dev)
156                 return;
157 
158         for_each_possible_cpu(cpu) {
159                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
160                 struct rt6_info *rt;
161 
162                 spin_lock_bh(&ul->lock);
163                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
164                         struct inet6_dev *rt_idev = rt->rt6i_idev;
165                         struct net_device *rt_dev = rt->dst.dev;
166 
167                         if (rt_idev->dev == dev) {
168                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
169                                 in6_dev_put(rt_idev);
170                         }
171 
172                         if (rt_dev == dev) {
173                                 rt->dst.dev = loopback_dev;
174                                 dev_hold(rt->dst.dev);
175                                 dev_put(rt_dev);
176                         }
177                 }
178                 spin_unlock_bh(&ul->lock);
179         }
180 }
181 
182 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
183 {
184         return dst_metrics_write_ptr(rt->dst.from);
185 }
186 
187 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
188 {
189         struct rt6_info *rt = (struct rt6_info *)dst;
190 
191         if (rt->rt6i_flags & RTF_PCPU)
192                 return rt6_pcpu_cow_metrics(rt);
193         else if (rt->rt6i_flags & RTF_CACHE)
194                 return NULL;
195         else
196                 return dst_cow_metrics_generic(dst, old);
197 }
198 
199 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
200                                              struct sk_buff *skb,
201                                              const void *daddr)
202 {
203         struct in6_addr *p = &rt->rt6i_gateway;
204 
205         if (!ipv6_addr_any(p))
206                 return (const void *) p;
207         else if (skb)
208                 return &ipv6_hdr(skb)->daddr;
209         return daddr;
210 }
211 
212 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
213                                           struct sk_buff *skb,
214                                           const void *daddr)
215 {
216         struct rt6_info *rt = (struct rt6_info *) dst;
217         struct neighbour *n;
218 
219         daddr = choose_neigh_daddr(rt, skb, daddr);
220         n = __ipv6_neigh_lookup(dst->dev, daddr);
221         if (n)
222                 return n;
223         return neigh_create(&nd_tbl, daddr, dst->dev);
224 }
225 
226 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
227 {
228         struct net_device *dev = dst->dev;
229         struct rt6_info *rt = (struct rt6_info *)dst;
230 
231         daddr = choose_neigh_daddr(rt, NULL, daddr);
232         if (!daddr)
233                 return;
234         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
235                 return;
236         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
237                 return;
238         __ipv6_confirm_neigh(dev, daddr);
239 }
240 
241 static struct dst_ops ip6_dst_ops_template = {
242         .family                 =       AF_INET6,
243         .gc                     =       ip6_dst_gc,
244         .gc_thresh              =       1024,
245         .check                  =       ip6_dst_check,
246         .default_advmss         =       ip6_default_advmss,
247         .mtu                    =       ip6_mtu,
248         .cow_metrics            =       ipv6_cow_metrics,
249         .destroy                =       ip6_dst_destroy,
250         .ifdown                 =       ip6_dst_ifdown,
251         .negative_advice        =       ip6_negative_advice,
252         .link_failure           =       ip6_link_failure,
253         .update_pmtu            =       ip6_rt_update_pmtu,
254         .redirect               =       rt6_do_redirect,
255         .local_out              =       __ip6_local_out,
256         .neigh_lookup           =       ip6_neigh_lookup,
257         .confirm_neigh          =       ip6_confirm_neigh,
258 };
259 
260 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
261 {
262         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
263 
264         return mtu ? : dst->dev->mtu;
265 }
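
/*
 * The "mtu ? : dst->dev->mtu" above uses the GNU C conditional with an
 * omitted middle operand: "a ?: b" evaluates a once and yields it when it
 * is non-zero, otherwise b.  A minimal standalone illustration of the
 * idiom (gcc/clang extension, not standard C; toy names, not kernel code):
 */
#include <stdio.h>

static unsigned int pick_mtu(unsigned int metric_mtu, unsigned int dev_mtu)
{
	return metric_mtu ? : dev_mtu;	/* fall back when no metric is set */
}

int main(void)
{
	printf("%u %u\n", pick_mtu(0, 1500), pick_mtu(1280, 1500)); /* 1500 1280 */
	return 0;
}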
266 
267 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
268                                          struct sk_buff *skb, u32 mtu)
269 {
270 }
271 
272 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
273                                       struct sk_buff *skb)
274 {
275 }
276 
277 static struct dst_ops ip6_dst_blackhole_ops = {
278         .family                 =       AF_INET6,
279         .destroy                =       ip6_dst_destroy,
280         .check                  =       ip6_dst_check,
281         .mtu                    =       ip6_blackhole_mtu,
282         .default_advmss         =       ip6_default_advmss,
283         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
284         .redirect               =       ip6_rt_blackhole_redirect,
285         .cow_metrics            =       dst_cow_metrics_generic,
286         .neigh_lookup           =       ip6_neigh_lookup,
287 };
288 
289 static const u32 ip6_template_metrics[RTAX_MAX] = {
290         [RTAX_HOPLIMIT - 1] = 0,
291 };
292 
293 static const struct rt6_info ip6_null_entry_template = {
294         .dst = {
295                 .__refcnt       = ATOMIC_INIT(1),
296                 .__use          = 1,
297                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
298                 .error          = -ENETUNREACH,
299                 .input          = ip6_pkt_discard,
300                 .output         = ip6_pkt_discard_out,
301         },
302         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
303         .rt6i_protocol  = RTPROT_KERNEL,
304         .rt6i_metric    = ~(u32) 0,
305         .rt6i_ref       = ATOMIC_INIT(1),
306 };
307 
308 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
309 
310 static const struct rt6_info ip6_prohibit_entry_template = {
311         .dst = {
312                 .__refcnt       = ATOMIC_INIT(1),
313                 .__use          = 1,
314                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
315                 .error          = -EACCES,
316                 .input          = ip6_pkt_prohibit,
317                 .output         = ip6_pkt_prohibit_out,
318         },
319         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
320         .rt6i_protocol  = RTPROT_KERNEL,
321         .rt6i_metric    = ~(u32) 0,
322         .rt6i_ref       = ATOMIC_INIT(1),
323 };
324 
325 static const struct rt6_info ip6_blk_hole_entry_template = {
326         .dst = {
327                 .__refcnt       = ATOMIC_INIT(1),
328                 .__use          = 1,
329                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
330                 .error          = -EINVAL,
331                 .input          = dst_discard,
332                 .output         = dst_discard_out,
333         },
334         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
335         .rt6i_protocol  = RTPROT_KERNEL,
336         .rt6i_metric    = ~(u32) 0,
337         .rt6i_ref       = ATOMIC_INIT(1),
338 };
339 
340 #endif
341 
342 static void rt6_info_init(struct rt6_info *rt)
343 {
344         struct dst_entry *dst = &rt->dst;
345 
346         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
347         INIT_LIST_HEAD(&rt->rt6i_siblings);
348         INIT_LIST_HEAD(&rt->rt6i_uncached);
349 }
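
/*
 * The memset() above zeroes only the tail of rt6_info: "dst + 1" points
 * just past the embedded dst_entry (already set up by dst_alloc()), and
 * sizeof(*rt) - sizeof(*dst) covers exactly the remaining rt6-specific
 * fields because dst is the first member.  A standalone sketch of the
 * same trick with toy types (not kernel code):
 */
#include <stdio.h>
#include <string.h>

struct toy_base {
	int refcnt;			/* pretend this was initialised elsewhere */
};

struct toy_route {
	struct toy_base base;		/* must be the first member */
	int metric;
	int flags;
};

static void toy_route_init_tail(struct toy_route *rt)
{
	struct toy_base *base = &rt->base;

	/* zero everything after the embedded base, leaving base untouched */
	memset(base + 1, 0, sizeof(*rt) - sizeof(*base));
}

int main(void)
{
	struct toy_route rt = { .base = { .refcnt = 1 }, .metric = 42, .flags = 7 };

	toy_route_init_tail(&rt);
	printf("refcnt=%d metric=%d flags=%d\n",
	       rt.base.refcnt, rt.metric, rt.flags);	/* 1 0 0 */
	return 0;
}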
350 
351 /* allocate dst with ip6_dst_ops */
352 static struct rt6_info *__ip6_dst_alloc(struct net *net,
353                                         struct net_device *dev,
354                                         int flags)
355 {
356         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
357                                         0, DST_OBSOLETE_FORCE_CHK, flags);
358 
359         if (rt)
360                 rt6_info_init(rt);
361 
362         return rt;
363 }
364 
365 struct rt6_info *ip6_dst_alloc(struct net *net,
366                                struct net_device *dev,
367                                int flags)
368 {
369         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
370 
371         if (rt) {
372                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
373                 if (rt->rt6i_pcpu) {
374                         int cpu;
375 
376                         for_each_possible_cpu(cpu) {
377                                 struct rt6_info **p;
378 
379                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
380                                 /* no one shares rt */
381                                 *p =  NULL;
382                         }
383                 } else {
384                         dst_destroy((struct dst_entry *)rt);
385                         return NULL;
386                 }
387         }
388 
389         return rt;
390 }
391 EXPORT_SYMBOL(ip6_dst_alloc);
392 
393 static void ip6_dst_destroy(struct dst_entry *dst)
394 {
395         struct rt6_info *rt = (struct rt6_info *)dst;
396         struct dst_entry *from = dst->from;
397         struct inet6_dev *idev;
398 
399         dst_destroy_metrics_generic(dst);
400         free_percpu(rt->rt6i_pcpu);
401         rt6_uncached_list_del(rt);
402 
403         idev = rt->rt6i_idev;
404         if (idev) {
405                 rt->rt6i_idev = NULL;
406                 in6_dev_put(idev);
407         }
408 
409         dst->from = NULL;
410         dst_release(from);
411 }
412 
413 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
414                            int how)
415 {
416         struct rt6_info *rt = (struct rt6_info *)dst;
417         struct inet6_dev *idev = rt->rt6i_idev;
418         struct net_device *loopback_dev =
419                 dev_net(dev)->loopback_dev;
420 
421         if (dev != loopback_dev) {
422                 if (idev && idev->dev == dev) {
423                         struct inet6_dev *loopback_idev =
424                                 in6_dev_get(loopback_dev);
425                         if (loopback_idev) {
426                                 rt->rt6i_idev = loopback_idev;
427                                 in6_dev_put(idev);
428                         }
429                 }
430         }
431 }
432 
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435         if (rt->rt6i_flags & RTF_EXPIRES)
436                 return time_after(jiffies, rt->dst.expires);
437         else
438                 return false;
439 }
440 
441 static bool rt6_check_expired(const struct rt6_info *rt)
442 {
443         if (rt->rt6i_flags & RTF_EXPIRES) {
444                 if (time_after(jiffies, rt->dst.expires))
445                         return true;
446         } else if (rt->dst.from) {
447                 return rt6_check_expired((struct rt6_info *) rt->dst.from);
448         }
449         return false;
450 }
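
/*
 * The expiry tests above rely on time_after(), which stays correct when
 * jiffies wraps around because it compares the signed difference rather
 * than the raw counter values.  A minimal standalone sketch of the same
 * comparison (toy names, not kernel code):
 */
#include <stdbool.h>
#include <stdio.h>

/* same trick as the kernel's time_after(): a is "after" b when the
 * signed difference b - a has gone negative */
#define toy_time_after(a, b)	((long)((b) - (a)) < 0)

struct toy_route {
	bool		has_expiry;
	unsigned long	expires;	/* in toy "jiffies" */
};

static bool toy_route_expired(const struct toy_route *rt, unsigned long now)
{
	return rt->has_expiry && toy_time_after(now, rt->expires);
}

int main(void)
{
	/* expiry set just before the counter wraps */
	struct toy_route rt = { .has_expiry = true, .expires = (unsigned long)-5 };

	printf("before wrap: %d\n", toy_route_expired(&rt, (unsigned long)-10)); /* 0 */
	printf("after wrap:  %d\n", toy_route_expired(&rt, 3));                  /* 1 */
	return 0;
}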
451 
452 /* Multipath route selection:
453  *   Hash based function using packet header and flowlabel.
454  * Adapted from fib_info_hashfn()
455  */
456 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
457                                const struct flowi6 *fl6)
458 {
459         return get_hash_from_flowi6(fl6) % candidate_count;
460 }
461 
462 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
463                                              struct flowi6 *fl6, int oif,
464                                              int strict)
465 {
466         struct rt6_info *sibling, *next_sibling;
467         int route_choosen;
468 
469         route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
 470         /* Don't change the route if route_choosen == 0
 471          * (the sibling list does not include this route itself)
 472          */
473         if (route_choosen)
474                 list_for_each_entry_safe(sibling, next_sibling,
475                                 &match->rt6i_siblings, rt6i_siblings) {
476                         route_choosen--;
477                         if (route_choosen == 0) {
478                                 if (rt6_score_route(sibling, oif, strict) < 0)
479                                         break;
480                                 match = sibling;
481                                 break;
482                         }
483                 }
484         return match;
485 }
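
/*
 * Sketch of the multipath pick above: hash the flow, reduce it modulo the
 * number of candidates (the matched route plus its siblings), and walk
 * that many steps down the sibling list; index 0 keeps the route already
 * matched.  Standalone, with a toy hash and a plain array standing in for
 * the flowi6/sibling structures (not kernel code):
 */
#include <stdio.h>

/* toy stand-in for get_hash_from_flowi6() */
static unsigned int toy_flow_hash(unsigned int saddr, unsigned int daddr,
				  unsigned int flowlabel)
{
	unsigned int h = saddr * 2654435761u;

	h ^= daddr * 2246822519u;
	h ^= flowlabel * 3266489917u;
	return h;
}

static int toy_multipath_select(unsigned int hash, int nsiblings)
{
	/* index 0 means "keep the route we already matched" */
	return hash % (nsiblings + 1);
}

int main(void)
{
	const char *nexthops[] = { "fe80::1", "fe80::2", "fe80::3" };
	unsigned int hash = toy_flow_hash(0x20010db8, 0x20010db9, 0x12345);
	int idx = toy_multipath_select(hash, 2);	/* 2 siblings + the match */

	printf("flow hashes to nexthop %s\n", nexthops[idx]);
	return 0;
}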
486 
487 /*
488  *      Route lookup. Any table->tb6_lock is implied.
489  */
490 
491 static inline struct rt6_info *rt6_device_match(struct net *net,
492                                                     struct rt6_info *rt,
493                                                     const struct in6_addr *saddr,
494                                                     int oif,
495                                                     int flags)
496 {
497         struct rt6_info *local = NULL;
498         struct rt6_info *sprt;
499 
500         if (!oif && ipv6_addr_any(saddr))
501                 goto out;
502 
503         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
504                 struct net_device *dev = sprt->dst.dev;
505 
506                 if (oif) {
507                         if (dev->ifindex == oif)
508                                 return sprt;
509                         if (dev->flags & IFF_LOOPBACK) {
510                                 if (!sprt->rt6i_idev ||
511                                     sprt->rt6i_idev->dev->ifindex != oif) {
512                                         if (flags & RT6_LOOKUP_F_IFACE)
513                                                 continue;
514                                         if (local &&
515                                             local->rt6i_idev->dev->ifindex == oif)
516                                                 continue;
517                                 }
518                                 local = sprt;
519                         }
520                 } else {
521                         if (ipv6_chk_addr(net, saddr, dev,
522                                           flags & RT6_LOOKUP_F_IFACE))
523                                 return sprt;
524                 }
525         }
526 
527         if (oif) {
528                 if (local)
529                         return local;
530 
531                 if (flags & RT6_LOOKUP_F_IFACE)
532                         return net->ipv6.ip6_null_entry;
533         }
534 out:
535         return rt;
536 }
537 
538 #ifdef CONFIG_IPV6_ROUTER_PREF
539 struct __rt6_probe_work {
540         struct work_struct work;
541         struct in6_addr target;
542         struct net_device *dev;
543 };
544 
545 static void rt6_probe_deferred(struct work_struct *w)
546 {
547         struct in6_addr mcaddr;
548         struct __rt6_probe_work *work =
549                 container_of(w, struct __rt6_probe_work, work);
550 
551         addrconf_addr_solict_mult(&work->target, &mcaddr);
552         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
553         dev_put(work->dev);
554         kfree(work);
555 }
556 
557 static void rt6_probe(struct rt6_info *rt)
558 {
559         struct __rt6_probe_work *work;
560         struct neighbour *neigh;
561         /*
562          * Okay, this does not seem to be appropriate
563          * for now, however, we need to check if it
564          * is really so; aka Router Reachability Probing.
565          *
566          * Router Reachability Probe MUST be rate-limited
567          * to no more than one per minute.
568          */
569         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
570                 return;
571         rcu_read_lock_bh();
572         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
573         if (neigh) {
574                 if (neigh->nud_state & NUD_VALID)
575                         goto out;
576 
577                 work = NULL;
578                 write_lock(&neigh->lock);
579                 if (!(neigh->nud_state & NUD_VALID) &&
580                     time_after(jiffies,
581                                neigh->updated +
582                                rt->rt6i_idev->cnf.rtr_probe_interval)) {
583                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
584                         if (work)
585                                 __neigh_set_probe_once(neigh);
586                 }
587                 write_unlock(&neigh->lock);
588         } else {
589                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
590         }
591 
592         if (work) {
593                 INIT_WORK(&work->work, rt6_probe_deferred);
594                 work->target = rt->rt6i_gateway;
595                 dev_hold(rt->dst.dev);
596                 work->dev = rt->dst.dev;
597                 schedule_work(&work->work);
598         }
599 
600 out:
601         rcu_read_unlock_bh();
602 }
603 #else
604 static inline void rt6_probe(struct rt6_info *rt)
605 {
606 }
607 #endif
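
/*
 * The probe above is rate-limited: a neighbour solicitation is only
 * scheduled when the gateway's neighbour entry is not VALID and at least
 * rtr_probe_interval has passed since its last update, and the actual
 * send is deferred to a work item so the lookup path never sleeps.
 * Minimal standalone sketch of the rate-limit part (toy names, not
 * kernel code):
 */
#include <stdbool.h>
#include <stdio.h>

#define toy_time_after(a, b)	((long)((b) - (a)) < 0)

struct toy_neigh {
	bool		valid;
	unsigned long	updated;	/* toy jiffies of last state change */
};

/* returns true when a probe should be queued */
static bool toy_should_probe(struct toy_neigh *n, unsigned long now,
			     unsigned long probe_interval)
{
	if (n->valid)
		return false;
	if (!toy_time_after(now, n->updated + probe_interval))
		return false;
	n->updated = now;	/* record the attempt so later calls stay limited */
	return true;
}

int main(void)
{
	struct toy_neigh gw = { .valid = false, .updated = 1000 };

	printf("t=1500: probe? %d\n", toy_should_probe(&gw, 1500, 6000)); /* 0 */
	printf("t=8000: probe? %d\n", toy_should_probe(&gw, 8000, 6000)); /* 1 */
	printf("t=9000: probe? %d\n", toy_should_probe(&gw, 9000, 6000)); /* 0 */
	return 0;
}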
608 
609 /*
610  * Default Router Selection (RFC 2461 6.3.6)
611  */
612 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
613 {
614         struct net_device *dev = rt->dst.dev;
615         if (!oif || dev->ifindex == oif)
616                 return 2;
617         if ((dev->flags & IFF_LOOPBACK) &&
618             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
619                 return 1;
620         return 0;
621 }
622 
623 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
624 {
625         struct neighbour *neigh;
626         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
627 
628         if (rt->rt6i_flags & RTF_NONEXTHOP ||
629             !(rt->rt6i_flags & RTF_GATEWAY))
630                 return RT6_NUD_SUCCEED;
631 
632         rcu_read_lock_bh();
633         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
634         if (neigh) {
635                 read_lock(&neigh->lock);
636                 if (neigh->nud_state & NUD_VALID)
637                         ret = RT6_NUD_SUCCEED;
638 #ifdef CONFIG_IPV6_ROUTER_PREF
639                 else if (!(neigh->nud_state & NUD_FAILED))
640                         ret = RT6_NUD_SUCCEED;
641                 else
642                         ret = RT6_NUD_FAIL_PROBE;
643 #endif
644                 read_unlock(&neigh->lock);
645         } else {
646                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
647                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
648         }
649         rcu_read_unlock_bh();
650 
651         return ret;
652 }
653 
654 static int rt6_score_route(struct rt6_info *rt, int oif,
655                            int strict)
656 {
657         int m;
658 
659         m = rt6_check_dev(rt, oif);
660         if (!m && (strict & RT6_LOOKUP_F_IFACE))
661                 return RT6_NUD_FAIL_HARD;
662 #ifdef CONFIG_IPV6_ROUTER_PREF
663         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
664 #endif
665         if (strict & RT6_LOOKUP_F_REACHABLE) {
666                 int n = rt6_check_neigh(rt);
667                 if (n < 0)
668                         return n;
669         }
670         return m;
671 }
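
/*
 * The score above packs two criteria into one integer: bits 0-1 hold the
 * device match from rt6_check_dev() (2 = exact oif, 1 = loopback bound to
 * oif, 0 = none) and the decoded router preference is shifted above them,
 * so preference dominates and the device match only breaks ties.
 * Standalone sketch of the packing, with illustrative preference values
 * where higher means better (toy names, not kernel code):
 */
#include <stdio.h>

enum toy_pref { TOY_PREF_LOW = 1, TOY_PREF_MEDIUM = 2, TOY_PREF_HIGH = 3 };

static int toy_score(int dev_match /* 0..2 */, enum toy_pref pref)
{
	return dev_match | (pref << 2);
}

int main(void)
{
	/* a high-preference router on another interface still outranks a
	 * medium-preference router on the requested one */
	printf("exact dev, medium pref: %d\n", toy_score(2, TOY_PREF_MEDIUM)); /* 10 */
	printf("other dev, high   pref: %d\n", toy_score(0, TOY_PREF_HIGH));   /* 12 */
	return 0;
}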
672 
673 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
674                                    int *mpri, struct rt6_info *match,
675                                    bool *do_rr)
676 {
677         int m;
678         bool match_do_rr = false;
679         struct inet6_dev *idev = rt->rt6i_idev;
680         struct net_device *dev = rt->dst.dev;
681 
682         if (dev && !netif_carrier_ok(dev) &&
683             idev->cnf.ignore_routes_with_linkdown &&
684             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
685                 goto out;
686 
687         if (rt6_check_expired(rt))
688                 goto out;
689 
690         m = rt6_score_route(rt, oif, strict);
691         if (m == RT6_NUD_FAIL_DO_RR) {
692                 match_do_rr = true;
693                 m = 0; /* lowest valid score */
694         } else if (m == RT6_NUD_FAIL_HARD) {
695                 goto out;
696         }
697 
698         if (strict & RT6_LOOKUP_F_REACHABLE)
699                 rt6_probe(rt);
700 
701         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
702         if (m > *mpri) {
703                 *do_rr = match_do_rr;
704                 *mpri = m;
705                 match = rt;
706         }
707 out:
708         return match;
709 }
710 
711 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
712                                      struct rt6_info *rr_head,
713                                      u32 metric, int oif, int strict,
714                                      bool *do_rr)
715 {
716         struct rt6_info *rt, *match, *cont;
717         int mpri = -1;
718 
719         match = NULL;
720         cont = NULL;
721         for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
722                 if (rt->rt6i_metric != metric) {
723                         cont = rt;
724                         break;
725                 }
726 
727                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
728         }
729 
730         for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
731                 if (rt->rt6i_metric != metric) {
732                         cont = rt;
733                         break;
734                 }
735 
736                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
737         }
738 
739         if (match || !cont)
740                 return match;
741 
742         for (rt = cont; rt; rt = rt->dst.rt6_next)
743                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
744 
745         return match;
746 }
747 
748 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
749 {
750         struct rt6_info *match, *rt0;
751         struct net *net;
752         bool do_rr = false;
753 
754         rt0 = fn->rr_ptr;
755         if (!rt0)
756                 fn->rr_ptr = rt0 = fn->leaf;
757 
758         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
759                              &do_rr);
760 
761         if (do_rr) {
762                 struct rt6_info *next = rt0->dst.rt6_next;
763 
764                 /* no entries matched; do round-robin */
765                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
766                         next = fn->leaf;
767 
768                 if (next != rt0)
769                         fn->rr_ptr = next;
770         }
771 
772         net = dev_net(rt0->dst.dev);
773         return match ? match : net->ipv6.ip6_null_entry;
774 }
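
/*
 * Sketch of the round-robin above: when scoring asks for it, fn->rr_ptr
 * is advanced to the next route of the same metric, wrapping back to the
 * head of the list, so subsequent lookups start from a different router.
 * The real selection still scores every candidate; this standalone sketch
 * shows only the rotation (toy names, not kernel code):
 */
#include <stdio.h>

struct toy_fn {
	const char	**routes;	/* equal-metric default routers */
	int		nroutes;
	int		rr;		/* index playing the role of fn->rr_ptr */
};

static const char *toy_select(struct toy_fn *fn, int do_rr)
{
	const char *match = fn->routes[fn->rr];

	if (do_rr)	/* rotate for the next lookup */
		fn->rr = (fn->rr + 1) % fn->nroutes;
	return match;
}

int main(void)
{
	const char *routers[] = { "fe80::a", "fe80::b", "fe80::c" };
	struct toy_fn fn = { .routes = routers, .nroutes = 3, .rr = 0 };
	int i;

	for (i = 0; i < 4; i++)
		printf("lookup %d -> %s\n", i, toy_select(&fn, 1));
	return 0;
}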
775 
776 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
777 {
778         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
779 }
780 
781 #ifdef CONFIG_IPV6_ROUTE_INFO
782 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
783                   const struct in6_addr *gwaddr)
784 {
785         struct net *net = dev_net(dev);
786         struct route_info *rinfo = (struct route_info *) opt;
787         struct in6_addr prefix_buf, *prefix;
788         unsigned int pref;
789         unsigned long lifetime;
790         struct rt6_info *rt;
791 
792         if (len < sizeof(struct route_info)) {
793                 return -EINVAL;
794         }
795 
796         /* Sanity check for prefix_len and length */
797         if (rinfo->length > 3) {
798                 return -EINVAL;
799         } else if (rinfo->prefix_len > 128) {
800                 return -EINVAL;
801         } else if (rinfo->prefix_len > 64) {
802                 if (rinfo->length < 2) {
803                         return -EINVAL;
804                 }
805         } else if (rinfo->prefix_len > 0) {
806                 if (rinfo->length < 1) {
807                         return -EINVAL;
808                 }
809         }
810 
811         pref = rinfo->route_pref;
812         if (pref == ICMPV6_ROUTER_PREF_INVALID)
813                 return -EINVAL;
814 
815         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
816 
817         if (rinfo->length == 3)
818                 prefix = (struct in6_addr *)rinfo->prefix;
819         else {
820                 /* this function is safe */
821                 ipv6_addr_prefix(&prefix_buf,
822                                  (struct in6_addr *)rinfo->prefix,
823                                  rinfo->prefix_len);
824                 prefix = &prefix_buf;
825         }
826 
827         if (rinfo->prefix_len == 0)
828                 rt = rt6_get_dflt_router(gwaddr, dev);
829         else
830                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
831                                         gwaddr, dev);
832 
833         if (rt && !lifetime) {
834                 ip6_del_rt(rt);
835                 rt = NULL;
836         }
837 
838         if (!rt && lifetime)
839                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
840                                         dev, pref);
841         else if (rt)
842                 rt->rt6i_flags = RTF_ROUTEINFO |
843                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
844 
845         if (rt) {
846                 if (!addrconf_finite_timeout(lifetime))
847                         rt6_clean_expires(rt);
848                 else
849                         rt6_set_expires(rt, jiffies + HZ * lifetime);
850 
851                 ip6_rt_put(rt);
852         }
853         return 0;
854 }
855 #endif
856 
857 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
858                                         struct in6_addr *saddr)
859 {
860         struct fib6_node *pn;
861         while (1) {
862                 if (fn->fn_flags & RTN_TL_ROOT)
863                         return NULL;
864                 pn = fn->parent;
865                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
866                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
867                 else
868                         fn = pn;
869                 if (fn->fn_flags & RTN_RTINFO)
870                         return fn;
871         }
872 }
873 
874 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
875                                              struct fib6_table *table,
876                                              struct flowi6 *fl6, int flags)
877 {
878         struct fib6_node *fn;
879         struct rt6_info *rt;
880 
881         read_lock_bh(&table->tb6_lock);
882         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
883 restart:
884         rt = fn->leaf;
885         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
886         if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
887                 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
888         if (rt == net->ipv6.ip6_null_entry) {
889                 fn = fib6_backtrack(fn, &fl6->saddr);
890                 if (fn)
891                         goto restart;
892         }
893         dst_use(&rt->dst, jiffies);
894         read_unlock_bh(&table->tb6_lock);
895 
896         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
897 
898         return rt;
899 
900 }
901 
902 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
903                                     int flags)
904 {
905         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
906 }
907 EXPORT_SYMBOL_GPL(ip6_route_lookup);
908 
909 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
910                             const struct in6_addr *saddr, int oif, int strict)
911 {
912         struct flowi6 fl6 = {
913                 .flowi6_oif = oif,
914                 .daddr = *daddr,
915         };
916         struct dst_entry *dst;
917         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
918 
919         if (saddr) {
920                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
921                 flags |= RT6_LOOKUP_F_HAS_SADDR;
922         }
923 
924         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
925         if (dst->error == 0)
926                 return (struct rt6_info *) dst;
927 
928         dst_release(dst);
929 
930         return NULL;
931 }
932 EXPORT_SYMBOL(rt6_lookup);
933 
 934 /* ip6_ins_rt is called with table->tb6_lock NOT held.
 935    It takes a new route entry; if the addition fails for any reason,
 936    the route is freed. In any case, if the caller does not hold a
 937    reference, it may be destroyed.
 938  */
939 
940 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
941                         struct mx6_config *mxc)
942 {
943         int err;
944         struct fib6_table *table;
945 
946         table = rt->rt6i_table;
947         write_lock_bh(&table->tb6_lock);
948         err = fib6_add(&table->tb6_root, rt, info, mxc);
949         write_unlock_bh(&table->tb6_lock);
950 
951         return err;
952 }
953 
954 int ip6_ins_rt(struct rt6_info *rt)
955 {
956         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
957         struct mx6_config mxc = { .mx = NULL, };
958 
959         return __ip6_ins_rt(rt, &info, &mxc);
960 }
961 
962 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
963                                            const struct in6_addr *daddr,
964                                            const struct in6_addr *saddr)
965 {
966         struct rt6_info *rt;
967 
968         /*
969          *      Clone the route.
970          */
971 
972         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
973                 ort = (struct rt6_info *)ort->dst.from;
974 
975         rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
976 
977         if (!rt)
978                 return NULL;
979 
980         ip6_rt_copy_init(rt, ort);
981         rt->rt6i_flags |= RTF_CACHE;
982         rt->rt6i_metric = 0;
983         rt->dst.flags |= DST_HOST;
984         rt->rt6i_dst.addr = *daddr;
985         rt->rt6i_dst.plen = 128;
986 
987         if (!rt6_is_gw_or_nonexthop(ort)) {
988                 if (ort->rt6i_dst.plen != 128 &&
989                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
990                         rt->rt6i_flags |= RTF_ANYCAST;
991 #ifdef CONFIG_IPV6_SUBTREES
992                 if (rt->rt6i_src.plen && saddr) {
993                         rt->rt6i_src.addr = *saddr;
994                         rt->rt6i_src.plen = 128;
995                 }
996 #endif
997         }
998 
999         return rt;
1000 }
1001 
1002 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1003 {
1004         struct rt6_info *pcpu_rt;
1005 
1006         pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
1007                                   rt->dst.dev, rt->dst.flags);
1008 
1009         if (!pcpu_rt)
1010                 return NULL;
1011         ip6_rt_copy_init(pcpu_rt, rt);
1012         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1013         pcpu_rt->rt6i_flags |= RTF_PCPU;
1014         return pcpu_rt;
1015 }
1016 
1017 /* It should be called with read_lock_bh(&tb6_lock) acquired */
1018 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1019 {
1020         struct rt6_info *pcpu_rt, **p;
1021 
1022         p = this_cpu_ptr(rt->rt6i_pcpu);
1023         pcpu_rt = *p;
1024 
1025         if (pcpu_rt) {
1026                 dst_hold(&pcpu_rt->dst);
1027                 rt6_dst_from_metrics_check(pcpu_rt);
1028         }
1029         return pcpu_rt;
1030 }
1031 
1032 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1033 {
1034         struct fib6_table *table = rt->rt6i_table;
1035         struct rt6_info *pcpu_rt, *prev, **p;
1036 
1037         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1038         if (!pcpu_rt) {
1039                 struct net *net = dev_net(rt->dst.dev);
1040 
1041                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1042                 return net->ipv6.ip6_null_entry;
1043         }
1044 
1045         read_lock_bh(&table->tb6_lock);
1046         if (rt->rt6i_pcpu) {
1047                 p = this_cpu_ptr(rt->rt6i_pcpu);
1048                 prev = cmpxchg(p, NULL, pcpu_rt);
1049                 if (prev) {
1050                         /* If someone did it before us, return prev instead */
1051                         dst_destroy(&pcpu_rt->dst);
1052                         pcpu_rt = prev;
1053                 }
1054         } else {
1055                 /* rt has been removed from the fib6 tree
1056                  * before we have a chance to acquire the read_lock.
1057                  * In this case, don't bother to create a pcpu rt
1058                  * since rt is going away anyway.  The next
1059                  * dst_check() will trigger a re-lookup.
1060                  */
1061                 dst_destroy(&pcpu_rt->dst);
1062                 pcpu_rt = rt;
1063         }
1064         dst_hold(&pcpu_rt->dst);
1065         rt6_dst_from_metrics_check(pcpu_rt);
1066         read_unlock_bh(&table->tb6_lock);
1067         return pcpu_rt;
1068 }
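
/*
 * The cmpxchg() above is an "install once" pattern: the freshly allocated
 * per-cpu clone is only published if the slot is still NULL; if another
 * context won the race, the local copy is destroyed and the winner is
 * used instead.  Standalone C11 sketch of the same pattern (toy names,
 * not kernel code):
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_rt {
	int id;
};

static struct toy_rt *get_or_install(_Atomic(struct toy_rt *) *slot, int id)
{
	struct toy_rt *expected = NULL;
	struct toy_rt *mine = malloc(sizeof(*mine));

	if (!mine)
		return atomic_load(slot);	/* fall back to whatever is there */
	mine->id = id;
	if (atomic_compare_exchange_strong(slot, &expected, mine))
		return mine;			/* we published our copy */

	free(mine);				/* somebody beat us to it... */
	return expected;			/* ...so use theirs */
}

int main(void)
{
	_Atomic(struct toy_rt *) slot;
	struct toy_rt *a, *b;

	atomic_init(&slot, NULL);
	a = get_or_install(&slot, 1);
	b = get_or_install(&slot, 2);

	printf("first install id=%d, second caller sees id=%d\n", a->id, b->id);
	free(a);
	return 0;
}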
1069 
1070 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1071                                int oif, struct flowi6 *fl6, int flags)
1072 {
1073         struct fib6_node *fn, *saved_fn;
1074         struct rt6_info *rt;
1075         int strict = 0;
1076 
1077         strict |= flags & RT6_LOOKUP_F_IFACE;
1078         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1079         if (net->ipv6.devconf_all->forwarding == 0)
1080                 strict |= RT6_LOOKUP_F_REACHABLE;
1081 
1082         read_lock_bh(&table->tb6_lock);
1083 
1084         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1085         saved_fn = fn;
1086 
1087         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1088                 oif = 0;
1089 
1090 redo_rt6_select:
1091         rt = rt6_select(fn, oif, strict);
1092         if (rt->rt6i_nsiblings)
1093                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1094         if (rt == net->ipv6.ip6_null_entry) {
1095                 fn = fib6_backtrack(fn, &fl6->saddr);
1096                 if (fn)
1097                         goto redo_rt6_select;
1098                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1099                         /* also consider unreachable route */
1100                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1101                         fn = saved_fn;
1102                         goto redo_rt6_select;
1103                 }
1104         }
1105 
1106 
1107         if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1108                 dst_use(&rt->dst, jiffies);
1109                 read_unlock_bh(&table->tb6_lock);
1110 
1111                 rt6_dst_from_metrics_check(rt);
1112 
1113                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1114                 return rt;
1115         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1116                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1117                 /* Create a RTF_CACHE clone which will not be
1118                  * owned by the fib6 tree.  It is for the special case where
1119                  * the daddr in the skb during the neighbor look-up is different
1120                  * from the fl6->daddr used to look-up route here.
1121                  */
1122 
1123                 struct rt6_info *uncached_rt;
1124 
1125                 dst_use(&rt->dst, jiffies);
1126                 read_unlock_bh(&table->tb6_lock);
1127 
1128                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1129                 dst_release(&rt->dst);
1130 
1131                 if (uncached_rt)
1132                         rt6_uncached_list_add(uncached_rt);
1133                 else
1134                         uncached_rt = net->ipv6.ip6_null_entry;
1135 
1136                 dst_hold(&uncached_rt->dst);
1137 
1138                 trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1139                 return uncached_rt;
1140 
1141         } else {
1142                 /* Get a percpu copy */
1143 
1144                 struct rt6_info *pcpu_rt;
1145 
1146                 rt->dst.lastuse = jiffies;
1147                 rt->dst.__use++;
1148                 pcpu_rt = rt6_get_pcpu_route(rt);
1149 
1150                 if (pcpu_rt) {
1151                         read_unlock_bh(&table->tb6_lock);
1152                 } else {
1153                         /* We have to do the read_unlock first
1154                          * because rt6_make_pcpu_route() may trigger
1155                          * ip6_dst_gc() which will take the write_lock.
1156                          */
1157                         dst_hold(&rt->dst);
1158                         read_unlock_bh(&table->tb6_lock);
1159                         pcpu_rt = rt6_make_pcpu_route(rt);
1160                         dst_release(&rt->dst);
1161                 }
1162 
1163                 trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1164                 return pcpu_rt;
1165 
1166         }
1167 }
1168 EXPORT_SYMBOL_GPL(ip6_pol_route);
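
/*
 * The unlock-before-call above avoids a classic reader/writer pitfall:
 * taking the write lock while the same context still holds the read lock
 * can deadlock, since rwlocks are not upgradable.  The route is therefore
 * pinned with dst_hold() so it cannot disappear while the lock is dropped.
 * Standalone sketch of the pattern with a pthread rwlock (toy names, not
 * kernel code; build with -pthread):
 */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t toy_table_lock = PTHREAD_RWLOCK_INITIALIZER;

struct toy_rt {
	int refcnt;
};

static void toy_gc(void)		/* stands in for ip6_dst_gc() */
{
	pthread_rwlock_wrlock(&toy_table_lock);
	/* ... prune the table ... */
	pthread_rwlock_unlock(&toy_table_lock);
}

static void toy_make_pcpu_route(struct toy_rt *rt)
{
	(void)rt;
	toy_gc();	/* may need the write lock, so no read lock may be held */
}

int main(void)
{
	struct toy_rt rt = { .refcnt = 1 };

	pthread_rwlock_rdlock(&toy_table_lock);
	rt.refcnt++;				/* pin rt (think dst_hold())... */
	pthread_rwlock_unlock(&toy_table_lock);	/* ...so the read lock can drop */

	toy_make_pcpu_route(&rt);		/* safe: no read lock held here */
	rt.refcnt--;				/* think dst_release() */

	printf("refcnt back to %d\n", rt.refcnt);
	return 0;
}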
1169 
1170 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1171                                             struct flowi6 *fl6, int flags)
1172 {
1173         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1174 }
1175 
1176 struct dst_entry *ip6_route_input_lookup(struct net *net,
1177                                          struct net_device *dev,
1178                                          struct flowi6 *fl6, int flags)
1179 {
1180         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1181                 flags |= RT6_LOOKUP_F_IFACE;
1182 
1183         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1184 }
1185 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1186 
1187 void ip6_route_input(struct sk_buff *skb)
1188 {
1189         const struct ipv6hdr *iph = ipv6_hdr(skb);
1190         struct net *net = dev_net(skb->dev);
1191         int flags = RT6_LOOKUP_F_HAS_SADDR;
1192         struct ip_tunnel_info *tun_info;
1193         struct flowi6 fl6 = {
1194                 .flowi6_iif = skb->dev->ifindex,
1195                 .daddr = iph->daddr,
1196                 .saddr = iph->saddr,
1197                 .flowlabel = ip6_flowinfo(iph),
1198                 .flowi6_mark = skb->mark,
1199                 .flowi6_proto = iph->nexthdr,
1200         };
1201 
1202         tun_info = skb_tunnel_info(skb);
1203         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1204                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1205         skb_dst_drop(skb);
1206         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1207 }
1208 
1209 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1210                                              struct flowi6 *fl6, int flags)
1211 {
1212         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1213 }
1214 
1215 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1216                                          struct flowi6 *fl6, int flags)
1217 {
1218         bool any_src;
1219 
1220         if (rt6_need_strict(&fl6->daddr)) {
1221                 struct dst_entry *dst;
1222 
1223                 dst = l3mdev_link_scope_lookup(net, fl6);
1224                 if (dst)
1225                         return dst;
1226         }
1227 
1228         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1229 
1230         any_src = ipv6_addr_any(&fl6->saddr);
1231         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1232             (fl6->flowi6_oif && any_src))
1233                 flags |= RT6_LOOKUP_F_IFACE;
1234 
1235         if (!any_src)
1236                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1237         else if (sk)
1238                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1239 
1240         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1241 }
1242 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1243 
1244 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1245 {
1246         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1247         struct dst_entry *new = NULL;
1248 
1249         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1250         if (rt) {
1251                 rt6_info_init(rt);
1252 
1253                 new = &rt->dst;
1254                 new->__use = 1;
1255                 new->input = dst_discard;
1256                 new->output = dst_discard_out;
1257 
1258                 dst_copy_metrics(new, &ort->dst);
1259                 rt->rt6i_idev = ort->rt6i_idev;
1260                 if (rt->rt6i_idev)
1261                         in6_dev_hold(rt->rt6i_idev);
1262 
1263                 rt->rt6i_gateway = ort->rt6i_gateway;
1264                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1265                 rt->rt6i_metric = 0;
1266 
1267                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1268 #ifdef CONFIG_IPV6_SUBTREES
1269                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1270 #endif
1271 
1272                 dst_free(new);
1273         }
1274 
1275         dst_release(dst_orig);
1276         return new ? new : ERR_PTR(-ENOMEM);
1277 }
1278 
1279 /*
1280  *      Destination cache support functions
1281  */
1282 
1283 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1284 {
1285         if (rt->dst.from &&
1286             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1287                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1288 }
1289 
1290 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1291 {
1292         if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1293                 return NULL;
1294 
1295         if (rt6_check_expired(rt))
1296                 return NULL;
1297 
1298         return &rt->dst;
1299 }
1300 
1301 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1302 {
1303         if (!__rt6_check_expired(rt) &&
1304             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1305             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1306                 return &rt->dst;
1307         else
1308                 return NULL;
1309 }
1310 
1311 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1312 {
1313         struct rt6_info *rt;
1314 
1315         rt = (struct rt6_info *) dst;
1316 
1317         /* All IPv6 dsts are created with ->obsolete set to the value
1318          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1319          * into this function always.
1320          */
1321 
1322         rt6_dst_from_metrics_check(rt);
1323 
1324         if (rt->rt6i_flags & RTF_PCPU ||
1325             (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from))
1326                 return rt6_dst_from_check(rt, cookie);
1327         else
1328                 return rt6_check(rt, cookie);
1329 }
1330 
1331 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1332 {
1333         struct rt6_info *rt = (struct rt6_info *) dst;
1334 
1335         if (rt) {
1336                 if (rt->rt6i_flags & RTF_CACHE) {
1337                         if (rt6_check_expired(rt)) {
1338                                 ip6_del_rt(rt);
1339                                 dst = NULL;
1340                         }
1341                 } else {
1342                         dst_release(dst);
1343                         dst = NULL;
1344                 }
1345         }
1346         return dst;
1347 }
1348 
1349 static void ip6_link_failure(struct sk_buff *skb)
1350 {
1351         struct rt6_info *rt;
1352 
1353         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1354 
1355         rt = (struct rt6_info *) skb_dst(skb);
1356         if (rt) {
1357                 if (rt->rt6i_flags & RTF_CACHE) {
1358                         dst_hold(&rt->dst);
1359                         ip6_del_rt(rt);
1360                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1361                         rt->rt6i_node->fn_sernum = -1;
1362                 }
1363         }
1364 }
1365 
1366 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1367 {
1368         struct net *net = dev_net(rt->dst.dev);
1369 
1370         rt->rt6i_flags |= RTF_MODIFIED;
1371         rt->rt6i_pmtu = mtu;
1372         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1373 }
1374 
1375 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1376 {
1377         return !(rt->rt6i_flags & RTF_CACHE) &&
1378                 (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
1379 }
1380 
1381 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1382                                  const struct ipv6hdr *iph, u32 mtu)
1383 {
1384         const struct in6_addr *daddr, *saddr;
1385         struct rt6_info *rt6 = (struct rt6_info *)dst;
1386 
1387         if (rt6->rt6i_flags & RTF_LOCAL)
1388                 return;
1389 
1390         if (dst_metric_locked(dst, RTAX_MTU))
1391                 return;
1392 
1393         if (iph) {
1394                 daddr = &iph->daddr;
1395                 saddr = &iph->saddr;
1396         } else if (sk) {
1397                 daddr = &sk->sk_v6_daddr;
1398                 saddr = &inet6_sk(sk)->saddr;
1399         } else {
1400                 daddr = NULL;
1401                 saddr = NULL;
1402         }
1403         dst_confirm_neigh(dst, daddr);
1404         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1405         if (mtu >= dst_mtu(dst))
1406                 return;
1407 
1408         if (!rt6_cache_allowed_for_pmtu(rt6)) {
1409                 rt6_do_update_pmtu(rt6, mtu);
1410         } else if (daddr) {
1411                 struct rt6_info *nrt6;
1412 
1413                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1414                 if (nrt6) {
1415                         rt6_do_update_pmtu(nrt6, mtu);
1416 
1417                         /* ip6_ins_rt(nrt6) will bump the
1418                          * rt6->rt6i_node->fn_sernum
1419                          * which will fail the next rt6_check() and
1420                          * invalidate the sk->sk_dst_cache.
1421                          */
1422                         ip6_ins_rt(nrt6);
1423                 }
1424         }
1425 }
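
/*
 * The invalidation described above is a generation counter: a cached dst
 * remembers the fn_sernum it was created under, and ip6_dst_check() (via
 * rt6_check()) rejects it once a later route insertion has bumped that
 * number, forcing the socket to look the route up again.  Standalone
 * sketch of the cookie scheme (toy names, not kernel code):
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_table {
	unsigned int sernum;		/* bumped on every route change */
};

struct toy_cached_dst {
	const struct toy_table *table;
	unsigned int cookie;		/* sernum captured at lookup time */
};

static struct toy_cached_dst toy_lookup(const struct toy_table *t)
{
	struct toy_cached_dst d = { .table = t, .cookie = t->sernum };

	return d;
}

static bool toy_dst_check(const struct toy_cached_dst *d)
{
	return d->cookie == d->table->sernum;
}

int main(void)
{
	struct toy_table tbl = { .sernum = 7 };
	struct toy_cached_dst cached = toy_lookup(&tbl);

	printf("fresh cache valid?  %d\n", toy_dst_check(&cached));	/* 1 */
	tbl.sernum++;			/* a new route was inserted */
	printf("after insert valid? %d\n", toy_dst_check(&cached));	/* 0 */
	return 0;
}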
1426 
1427 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1428                                struct sk_buff *skb, u32 mtu)
1429 {
1430         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1431 }
1432 
1433 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1434                      int oif, u32 mark, kuid_t uid)
1435 {
1436         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1437         struct dst_entry *dst;
1438         struct flowi6 fl6;
1439 
1440         memset(&fl6, 0, sizeof(fl6));
1441         fl6.flowi6_oif = oif;
1442         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1443         fl6.daddr = iph->daddr;
1444         fl6.saddr = iph->saddr;
1445         fl6.flowlabel = ip6_flowinfo(iph);
1446         fl6.flowi6_uid = uid;
1447 
1448         dst = ip6_route_output(net, NULL, &fl6);
1449         if (!dst->error)
1450                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1451         dst_release(dst);
1452 }
1453 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1454 
1455 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1456 {
1457         struct dst_entry *dst;
1458 
1459         ip6_update_pmtu(skb, sock_net(sk), mtu,
1460                         sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
1461 
1462         dst = __sk_dst_get(sk);
1463         if (!dst || !dst->obsolete ||
1464             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
1465                 return;
1466 
1467         bh_lock_sock(sk);
1468         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
1469                 ip6_datagram_dst_update(sk, false);
1470         bh_unlock_sock(sk);
1471 }
1472 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1473 
1474 /* Handle redirects */
1475 struct ip6rd_flowi {
1476         struct flowi6 fl6;
1477         struct in6_addr gateway;
1478 };
1479 
1480 static struct rt6_info *__ip6_route_redirect(struct net *net,
1481                                              struct fib6_table *table,
1482                                              struct flowi6 *fl6,
1483                                              int flags)
1484 {
1485         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1486         struct rt6_info *rt;
1487         struct fib6_node *fn;
1488 
1489         /* Get the "current" route for this destination and
1490          * check if the redirect has come from the appropriate router.
1491          *
1492          * RFC 4861 specifies that redirects should only be
1493          * accepted if they come from the nexthop to the target.
1494          * Due to the way the routes are chosen, this notion
1495          * is a bit fuzzy and one might need to check all possible
1496          * routes.
1497          */
1498 
1499         read_lock_bh(&table->tb6_lock);
1500         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1501 restart:
1502         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1503                 if (rt6_check_expired(rt))
1504                         continue;
1505                 if (rt->dst.error)
1506                         break;
1507                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1508                         continue;
1509                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1510                         continue;
1511                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1512                         continue;
1513                 break;
1514         }
1515 
1516         if (!rt)
1517                 rt = net->ipv6.ip6_null_entry;
1518         else if (rt->dst.error) {
1519                 rt = net->ipv6.ip6_null_entry;
1520                 goto out;
1521         }
1522 
1523         if (rt == net->ipv6.ip6_null_entry) {
1524                 fn = fib6_backtrack(fn, &fl6->saddr);
1525                 if (fn)
1526                         goto restart;
1527         }
1528 
1529 out:
1530         dst_hold(&rt->dst);
1531 
1532         read_unlock_bh(&table->tb6_lock);
1533 
1534         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1535         return rt;
1536 };
1537 
1538 static struct dst_entry *ip6_route_redirect(struct net *net,
1539                                         const struct flowi6 *fl6,
1540                                         const struct in6_addr *gateway)
1541 {
1542         int flags = RT6_LOOKUP_F_HAS_SADDR;
1543         struct ip6rd_flowi rdfl;
1544 
1545         rdfl.fl6 = *fl6;
1546         rdfl.gateway = *gateway;
1547 
1548         return fib6_rule_lookup(net, &rdfl.fl6,
1549                                 flags, __ip6_route_redirect);
1550 }
1551 
1552 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
1553                   kuid_t uid)
1554 {
1555         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1556         struct dst_entry *dst;
1557         struct flowi6 fl6;
1558 
1559         memset(&fl6, 0, sizeof(fl6));
1560         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1561         fl6.flowi6_oif = oif;
1562         fl6.flowi6_mark = mark;
1563         fl6.daddr = iph->daddr;
1564         fl6.saddr = iph->saddr;
1565         fl6.flowlabel = ip6_flowinfo(iph);
1566         fl6.flowi6_uid = uid;
1567 
1568         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1569         rt6_do_redirect(dst, NULL, skb);
1570         dst_release(dst);
1571 }
1572 EXPORT_SYMBOL_GPL(ip6_redirect);
1573 
1574 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1575                             u32 mark)
1576 {
1577         const struct ipv6hdr *iph = ipv6_hdr(skb);
1578         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1579         struct dst_entry *dst;
1580         struct flowi6 fl6;
1581 
1582         memset(&fl6, 0, sizeof(fl6));
1583         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1584         fl6.flowi6_oif = oif;
1585         fl6.flowi6_mark = mark;
1586         fl6.daddr = msg->dest;
1587         fl6.saddr = iph->daddr;
1588         fl6.flowi6_uid = sock_net_uid(net, NULL);
1589 
1590         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1591         rt6_do_redirect(dst, NULL, skb);
1592         dst_release(dst);
1593 }
1594 
1595 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1596 {
1597         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
1598                      sk->sk_uid);
1599 }
1600 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1601 
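     /* Advertised MSS for this dst: the path MTU minus the IPv6 and TCP
      * headers, clamped below by the ip6_rt_min_advmss sysctl and capped
      * at IPV6_MAXPLEN, which here means "any MSS, rely on PMTU discovery".
      */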
1602 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1603 {
1604         struct net_device *dev = dst->dev;
1605         unsigned int mtu = dst_mtu(dst);
1606         struct net *net = dev_net(dev);
1607 
1608         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1609 
1610         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1611                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1612 
1613         /*
1614          * The maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and the
1615          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1616          * IPV6_MAXPLEN is also valid and means: "any MSS,
1617          * rely only on pmtu discovery"
1618          */
1619         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1620                 mtu = IPV6_MAXPLEN;
1621         return mtu;
1622 }
1623 
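     /* MTU resolution order for a dst: the per-route PMTU (rt6i_pmtu) if
      * set, then the RTAX_MTU metric, then the device's mtu6 (falling back
      * to IPV6_MIN_MTU), capped at IP6_MAX_MTU and reduced by any lwtunnel
      * encapsulation headroom.
      */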
1624 static unsigned int ip6_mtu(const struct dst_entry *dst)
1625 {
1626         const struct rt6_info *rt = (const struct rt6_info *)dst;
1627         unsigned int mtu = rt->rt6i_pmtu;
1628         struct inet6_dev *idev;
1629 
1630         if (mtu)
1631                 goto out;
1632 
1633         mtu = dst_metric_raw(dst, RTAX_MTU);
1634         if (mtu)
1635                 goto out;
1636 
1637         mtu = IPV6_MIN_MTU;
1638 
1639         rcu_read_lock();
1640         idev = __in6_dev_get(dst->dev);
1641         if (idev)
1642                 mtu = idev->cnf.mtu6;
1643         rcu_read_unlock();
1644 
1645 out:
1646         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1647 
1648         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1649 }
1650 
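     /* dst entries created by icmp6_dst_alloc() (used by ndisc for its
      * output routes) are never inserted into the FIB; they are chained on
      * icmp6_dst_gc_list and released by icmp6_dst_gc() once unreferenced.
      */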
1651 static struct dst_entry *icmp6_dst_gc_list;
1652 static DEFINE_SPINLOCK(icmp6_dst_lock);
1653 
1654 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1655                                   struct flowi6 *fl6)
1656 {
1657         struct dst_entry *dst;
1658         struct rt6_info *rt;
1659         struct inet6_dev *idev = in6_dev_get(dev);
1660         struct net *net = dev_net(dev);
1661 
1662         if (unlikely(!idev))
1663                 return ERR_PTR(-ENODEV);
1664 
1665         rt = ip6_dst_alloc(net, dev, 0);
1666         if (unlikely(!rt)) {
1667                 in6_dev_put(idev);
1668                 dst = ERR_PTR(-ENOMEM);
1669                 goto out;
1670         }
1671 
1672         rt->dst.flags |= DST_HOST;
1673         rt->dst.output  = ip6_output;
1674         atomic_set(&rt->dst.__refcnt, 1);
1675         rt->rt6i_gateway  = fl6->daddr;
1676         rt->rt6i_dst.addr = fl6->daddr;
1677         rt->rt6i_dst.plen = 128;
1678         rt->rt6i_idev     = idev;
1679         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1680 
1681         spin_lock_bh(&icmp6_dst_lock);
1682         rt->dst.next = icmp6_dst_gc_list;
1683         icmp6_dst_gc_list = &rt->dst;
1684         spin_unlock_bh(&icmp6_dst_lock);
1685 
1686         fib6_force_start_gc(net);
1687 
1688         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1689 
1690 out:
1691         return dst;
1692 }
1693 
1694 int icmp6_dst_gc(void)
1695 {
1696         struct dst_entry *dst, **pprev;
1697         int more = 0;
1698 
1699         spin_lock_bh(&icmp6_dst_lock);
1700         pprev = &icmp6_dst_gc_list;
1701 
1702         while ((dst = *pprev) != NULL) {
1703                 if (!atomic_read(&dst->__refcnt)) {
1704                         *pprev = dst->next;
1705                         dst_free(dst);
1706                 } else {
1707                         pprev = &dst->next;
1708                         ++more;
1709                 }
1710         }
1711 
1712         spin_unlock_bh(&icmp6_dst_lock);
1713 
1714         return more;
1715 }
1716 
1717 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1718                             void *arg)
1719 {
1720         struct dst_entry *dst, **pprev;
1721 
1722         spin_lock_bh(&icmp6_dst_lock);
1723         pprev = &icmp6_dst_gc_list;
1724         while ((dst = *pprev) != NULL) {
1725                 struct rt6_info *rt = (struct rt6_info *) dst;
1726                 if (func(rt, arg)) {
1727                         *pprev = dst->next;
1728                         dst_free(dst);
1729                 } else {
1730                         pprev = &dst->next;
1731                 }
1732         }
1733         spin_unlock_bh(&icmp6_dst_lock);
1734 }
1735 
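     /* Garbage collection hook for the IPv6 dst_ops.  The sysctls
      * gc_min_interval, max_size, gc_elasticity and gc_timeout (exposed
      * under /proc/sys/net/ipv6/route/) control how aggressively
      * fib6_run_gc() is invoked; ip6_rt_gc_expire grows on each pass and
      * decays by the elasticity factor below.
      */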
1736 static int ip6_dst_gc(struct dst_ops *ops)
1737 {
1738         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1739         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1740         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1741         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1742         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1743         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1744         int entries;
1745 
1746         entries = dst_entries_get_fast(ops);
1747         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1748             entries <= rt_max_size)
1749                 goto out;
1750 
1751         net->ipv6.ip6_rt_gc_expire++;
1752         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1753         entries = dst_entries_get_slow(ops);
1754         if (entries < ops->gc_thresh)
1755                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1756 out:
1757         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1758         return entries > rt_max_size;
1759 }
1760 
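     /* Convert the RTA_METRICS netlink attributes in cfg->fc_mx into the
      * RTAX_* array carried by struct mx6_config: RTAX_CC_ALGO names are
      * translated into congestion-control keys, RTAX_HOPLIMIT is clamped
      * to 255, and ECN-capable algorithms set DST_FEATURE_ECN_CA.
      */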
1761 static int ip6_convert_metrics(struct mx6_config *mxc,
1762                                const struct fib6_config *cfg)
1763 {
1764         bool ecn_ca = false;
1765         struct nlattr *nla;
1766         int remaining;
1767         u32 *mp;
1768 
1769         if (!cfg->fc_mx)
1770                 return 0;
1771 
1772         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1773         if (unlikely(!mp))
1774                 return -ENOMEM;
1775 
1776         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1777                 int type = nla_type(nla);
1778                 u32 val;
1779 
1780                 if (!type)
1781                         continue;
1782                 if (unlikely(type > RTAX_MAX))
1783                         goto err;
1784 
1785                 if (type == RTAX_CC_ALGO) {
1786                         char tmp[TCP_CA_NAME_MAX];
1787 
1788                         nla_strlcpy(tmp, nla, sizeof(tmp));
1789                         val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1790                         if (val == TCP_CA_UNSPEC)
1791                                 goto err;
1792                 } else {
1793                         val = nla_get_u32(nla);
1794                 }
1795                 if (type == RTAX_HOPLIMIT && val > 255)
1796                         val = 255;
1797                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1798                         goto err;
1799 
1800                 mp[type - 1] = val;
1801                 __set_bit(type - 1, mxc->mx_valid);
1802         }
1803 
1804         if (ecn_ca) {
1805                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1806                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
1807         }
1808 
1809         mxc->mx = mp;
1810         return 0;
1811  err:
1812         kfree(mp);
1813         return -EINVAL;
1814 }
1815 
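     /* Resolve a configured gateway within the table the route is being
      * added to; returns NULL when that table has no usable entry so the
      * caller can fall back to a full rt6_lookup().
      */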
1816 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
1817                                             struct fib6_config *cfg,
1818                                             const struct in6_addr *gw_addr)
1819 {
1820         struct flowi6 fl6 = {
1821                 .flowi6_oif = cfg->fc_ifindex,
1822                 .daddr = *gw_addr,
1823                 .saddr = cfg->fc_prefsrc,
1824         };
1825         struct fib6_table *table;
1826         struct rt6_info *rt;
1827         int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
1828 
1829         table = fib6_get_table(net, cfg->fc_table);
1830         if (!table)
1831                 return NULL;
1832 
1833         if (!ipv6_addr_any(&cfg->fc_prefsrc))
1834                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1835 
1836         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
1837 
1838         /* if table lookup failed, fall back to full lookup */
1839         if (rt == net->ipv6.ip6_null_entry) {
1840                 ip6_rt_put(rt);
1841                 rt = NULL;
1842         }
1843 
1844         return rt;
1845 }
1846 
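     /* Validate a fib6_config and build the corresponding rt6_info without
      * inserting it into the FIB: reject/blackhole routes are bound to the
      * loopback device, RTF_GATEWAY nexthops are checked for reachability,
      * and on error the partially built route is freed.
      */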
1847 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
1848 {
1849         struct net *net = cfg->fc_nlinfo.nl_net;
1850         struct rt6_info *rt = NULL;
1851         struct net_device *dev = NULL;
1852         struct inet6_dev *idev = NULL;
1853         struct fib6_table *table;
1854         int addr_type;
1855         int err = -EINVAL;
1856 
1857         /* RTF_PCPU is an internal flag; cannot be set by userspace */
1858         if (cfg->fc_flags & RTF_PCPU)
1859                 goto out;
1860 
1861         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1862                 goto out;
1863 #ifndef CONFIG_IPV6_SUBTREES
1864         if (cfg->fc_src_len)
1865                 goto out;
1866 #endif
1867         if (cfg->fc_ifindex) {
1868                 err = -ENODEV;
1869                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1870                 if (!dev)
1871                         goto out;
1872                 idev = in6_dev_get(dev);
1873                 if (!idev)
1874                         goto out;
1875         }
1876 
1877         if (cfg->fc_metric == 0)
1878                 cfg->fc_metric = IP6_RT_PRIO_USER;
1879 
1880         err = -ENOBUFS;
1881         if (cfg->fc_nlinfo.nlh &&
1882             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1883                 table = fib6_get_table(net, cfg->fc_table);
1884                 if (!table) {
1885                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1886                         table = fib6_new_table(net, cfg->fc_table);
1887                 }
1888         } else {
1889                 table = fib6_new_table(net, cfg->fc_table);
1890         }
1891 
1892         if (!table)
1893                 goto out;
1894 
1895         rt = ip6_dst_alloc(net, NULL,
1896                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1897 
1898         if (!rt) {
1899                 err = -ENOMEM;
1900                 goto out;
1901         }
1902 
1903         if (cfg->fc_flags & RTF_EXPIRES)
1904                 rt6_set_expires(rt, jiffies +
1905                                 clock_t_to_jiffies(cfg->fc_expires));
1906         else
1907                 rt6_clean_expires(rt);
1908 
1909         if (cfg->fc_protocol == RTPROT_UNSPEC)
1910                 cfg->fc_protocol = RTPROT_BOOT;
1911         rt->rt6i_protocol = cfg->fc_protocol;
1912 
1913         addr_type = ipv6_addr_type(&cfg->fc_dst);
1914 
1915         if (addr_type & IPV6_ADDR_MULTICAST)
1916                 rt->dst.input = ip6_mc_input;
1917         else if (cfg->fc_flags & RTF_LOCAL)
1918                 rt->dst.input = ip6_input;
1919         else
1920                 rt->dst.input = ip6_forward;
1921 
1922         rt->dst.output = ip6_output;
1923 
1924         if (cfg->fc_encap) {
1925                 struct lwtunnel_state *lwtstate;
1926 
1927                 err = lwtunnel_build_state(cfg->fc_encap_type,
1928                                            cfg->fc_encap, AF_INET6, cfg,
1929                                            &lwtstate);
1930                 if (err)
1931                         goto out;
1932                 rt->dst.lwtstate = lwtstate_get(lwtstate);
1933                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
1934                         rt->dst.lwtstate->orig_output = rt->dst.output;
1935                         rt->dst.output = lwtunnel_output;
1936                 }
1937                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
1938                         rt->dst.lwtstate->orig_input = rt->dst.input;
1939                         rt->dst.input = lwtunnel_input;
1940                 }
1941         }
1942 
1943         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1944         rt->rt6i_dst.plen = cfg->fc_dst_len;
1945         if (rt->rt6i_dst.plen == 128)
1946                 rt->dst.flags |= DST_HOST;
1947 
1948 #ifdef CONFIG_IPV6_SUBTREES
1949         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1950         rt->rt6i_src.plen = cfg->fc_src_len;
1951 #endif
1952 
1953         rt->rt6i_metric = cfg->fc_metric;
1954 
1955         /* We cannot add true routes via loopback here;
1956            they would result in kernel looping.  Promote them to reject routes.
1957          */
1958         if ((cfg->fc_flags & RTF_REJECT) ||
1959             (dev && (dev->flags & IFF_LOOPBACK) &&
1960              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1961              !(cfg->fc_flags & RTF_LOCAL))) {
1962                 /* hold loopback dev/idev if we haven't done so. */
1963                 if (dev != net->loopback_dev) {
1964                         if (dev) {
1965                                 dev_put(dev);
1966                                 in6_dev_put(idev);
1967                         }
1968                         dev = net->loopback_dev;
1969                         dev_hold(dev);
1970                         idev = in6_dev_get(dev);
1971                         if (!idev) {
1972                                 err = -ENODEV;
1973                                 goto out;
1974                         }
1975                 }
1976                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1977                 switch (cfg->fc_type) {
1978                 case RTN_BLACKHOLE:
1979                         rt->dst.error = -EINVAL;
1980                         rt->dst.output = dst_discard_out;
1981                         rt->dst.input = dst_discard;
1982                         break;
1983                 case RTN_PROHIBIT:
1984                         rt->dst.error = -EACCES;
1985                         rt->dst.output = ip6_pkt_prohibit_out;
1986                         rt->dst.input = ip6_pkt_prohibit;
1987                         break;
1988                 case RTN_THROW:
1989                 case RTN_UNREACHABLE:
1990                 default:
1991                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1992                                         : (cfg->fc_type == RTN_UNREACHABLE)
1993                                         ? -EHOSTUNREACH : -ENETUNREACH;
1994                         rt->dst.output = ip6_pkt_discard_out;
1995                         rt->dst.input = ip6_pkt_discard;
1996                         break;
1997                 }
1998                 goto install_route;
1999         }
2000 
2001         if (cfg->fc_flags & RTF_GATEWAY) {
2002                 const struct in6_addr *gw_addr;
2003                 int gwa_type;
2004 
2005                 gw_addr = &cfg->fc_gateway;
2006                 gwa_type = ipv6_addr_type(gw_addr);
2007 
2008                 /* if gw_addr is local we will fail to detect this in case
2009                  * the address is still TENTATIVE (DAD in progress). rt6_lookup()
2010                  * will return the already-added prefix route via the interface
2011                  * that the prefix route was assigned to, which might be non-loopback.
2012                  */
2013                 err = -EINVAL;
2014                 if (ipv6_chk_addr_and_flags(net, gw_addr,
2015                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
2016                                             dev : NULL, 0, 0))
2017                         goto out;
2018 
2019                 rt->rt6i_gateway = *gw_addr;
2020 
2021                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2022                         struct rt6_info *grt = NULL;
2023 
2024                         /* IPv6 strictly inhibits using non-link-local
2025                            addresses as the nexthop address.
2026                            Otherwise, the router will not be able to send redirects.
2027                            That restriction is very good, but in some (rare!)
2028                            circumstances (SIT, PtP, NBMA NOARP links) it is handy
2029                            to allow some exceptions. --ANK
2030                            We allow IPv4-mapped nexthops to support RFC 4798-style
2031                            addressing.
2032                          */
2033                         if (!(gwa_type & (IPV6_ADDR_UNICAST |
2034                                           IPV6_ADDR_MAPPED)))
2035                                 goto out;
2036 
2037                         if (cfg->fc_table) {
2038                                 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2039 
2040                                 if (grt) {
2041                                         if (grt->rt6i_flags & RTF_GATEWAY ||
2042                                             (dev && dev != grt->dst.dev)) {
2043                                                 ip6_rt_put(grt);
2044                                                 grt = NULL;
2045                                         }
2046                                 }
2047                         }
2048 
2049                         if (!grt)
2050                                 grt = rt6_lookup(net, gw_addr, NULL,
2051                                                  cfg->fc_ifindex, 1);
2052 
2053                         err = -EHOSTUNREACH;
2054                         if (!grt)
2055                                 goto out;
2056                         if (dev) {
2057                                 if (dev != grt->dst.dev) {
2058                                         ip6_rt_put(grt);
2059                                         goto out;
2060                                 }
2061                         } else {
2062                                 dev = grt->dst.dev;
2063                                 idev = grt->rt6i_idev;
2064                                 dev_hold(dev);
2065                                 in6_dev_hold(grt->rt6i_idev);
2066                         }
2067                         if (!(grt->rt6i_flags & RTF_GATEWAY))
2068                                 err = 0;
2069                         ip6_rt_put(grt);
2070 
2071                         if (err)
2072                                 goto out;
2073                 }
2074                 err = -EINVAL;
2075                 if (!dev || (dev->flags & IFF_LOOPBACK))
2076                         goto out;
2077         }
2078 
2079         err = -ENODEV;
2080         if (!dev)
2081                 goto out;
2082 
2083         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2084                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2085                         err = -EINVAL;
2086                         goto out;
2087                 }
2088                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2089                 rt->rt6i_prefsrc.plen = 128;
2090         } else
2091                 rt->rt6i_prefsrc.plen = 0;
2092 
2093         rt->rt6i_flags = cfg->fc_flags;
2094 
2095 install_route:
2096         rt->dst.dev = dev;
2097         rt->rt6i_idev = idev;
2098         rt->rt6i_table = table;
2099 
2100         cfg->fc_nlinfo.nl_net = dev_net(dev);
2101 
2102         return rt;
2103 out:
2104         if (dev)
2105                 dev_put(dev);
2106         if (idev)
2107                 in6_dev_put(idev);
2108         if (rt)
2109                 dst_free(&rt->dst);
2110 
2111         return ERR_PTR(err);
2112 }
2113 
2114 int ip6_route_add(struct fib6_config *cfg)
2115 {
2116         struct mx6_config mxc = { .mx = NULL, };
2117         struct rt6_info *rt;
2118         int err;
2119 
2120         rt = ip6_route_info_create(cfg);
2121         if (IS_ERR(rt)) {
2122                 err = PTR_ERR(rt);
2123                 rt = NULL;
2124                 goto out;
2125         }
2126 
2127         err = ip6_convert_metrics(&mxc, cfg);
2128         if (err)
2129                 goto out;
2130 
2131         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
2132 
2133         kfree(mxc.mx);
2134 
2135         return err;
2136 out:
2137         if (rt)
2138                 dst_free(&rt->dst);
2139 
2140         return err;
2141 }
2142 
2143 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2144 {
2145         int err;
2146         struct fib6_table *table;
2147         struct net *net = dev_net(rt->dst.dev);
2148 
2149         if (rt == net->ipv6.ip6_null_entry ||
2150             rt->dst.flags & DST_NOCACHE) {
2151                 err = -ENOENT;
2152                 goto out;
2153         }
2154 
2155         table = rt->rt6i_table;
2156         write_lock_bh(&table->tb6_lock);
2157         err = fib6_del(rt, info);
2158         write_unlock_bh(&table->tb6_lock);
2159 
2160 out:
2161         ip6_rt_put(rt);
2162         return err;
2163 }
2164 
2165 int ip6_del_rt(struct rt6_info *rt)
2166 {
2167         struct nl_info info = {
2168                 .nl_net = dev_net(rt->dst.dev),
2169         };
2170         return __ip6_del_rt(rt, &info);
2171 }
2172 
2173 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2174 {
2175         struct nl_info *info = &cfg->fc_nlinfo;
2176         struct net *net = info->nl_net;
2177         struct sk_buff *skb = NULL;
2178         struct fib6_table *table;
2179         int err = -ENOENT;
2180 
2181         if (rt == net->ipv6.ip6_null_entry)
2182                 goto out_put;
2183         table = rt->rt6i_table;
2184         write_lock_bh(&table->tb6_lock);
2185 
2186         if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2187                 struct rt6_info *sibling, *next_sibling;
2188 
2189                 /* prefer to send a single notification with all hops */
2190                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2191                 if (skb) {
2192                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2193 
2194                         if (rt6_fill_node(net, skb, rt,
2195                                           NULL, NULL, 0, RTM_DELROUTE,
2196                                           info->portid, seq, 0) < 0) {
2197                                 kfree_skb(skb);
2198                                 skb = NULL;
2199                         } else
2200                                 info->skip_notify = 1;
2201                 }
2202 
2203                 list_for_each_entry_safe(sibling, next_sibling,
2204                                          &rt->rt6i_siblings,
2205                                          rt6i_siblings) {
2206                         err = fib6_del(sibling, info);
2207                         if (err)
2208                                 goto out_unlock;
2209                 }
2210         }
2211 
2212         err = fib6_del(rt, info);
2213 out_unlock:
2214         write_unlock_bh(&table->tb6_lock);
2215 out_put:
2216         ip6_rt_put(rt);
2217 
2218         if (skb) {
2219                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2220                             info->nlh, gfp_any());
2221         }
2222         return err;
2223 }
2224 
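     /* Delete a route matching cfg: locate the fib6 node for the prefix,
      * then filter the leaf chain by RTF_CACHE, ifindex, gateway, metric
      * and protocol.  With RTF_GATEWAY only that single nexthop is removed;
      * otherwise the siblings of a multipath route may be removed as well.
      */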
2225 static int ip6_route_del(struct fib6_config *cfg)
2226 {
2227         struct fib6_table *table;
2228         struct fib6_node *fn;
2229         struct rt6_info *rt;
2230         int err = -ESRCH;
2231 
2232         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2233         if (!table)
2234                 return err;
2235 
2236         read_lock_bh(&table->tb6_lock);
2237 
2238         fn = fib6_locate(&table->tb6_root,
2239                          &cfg->fc_dst, cfg->fc_dst_len,
2240                          &cfg->fc_src, cfg->fc_src_len);
2241 
2242         if (fn) {
2243                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2244                         if ((rt->rt6i_flags & RTF_CACHE) &&
2245                             !(cfg->fc_flags & RTF_CACHE))
2246                                 continue;
2247                         if (cfg->fc_ifindex &&
2248                             (!rt->dst.dev ||
2249                              rt->dst.dev->ifindex != cfg->fc_ifindex))
2250                                 continue;
2251                         if (cfg->fc_flags & RTF_GATEWAY &&
2252                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2253                                 continue;
2254                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2255                                 continue;
2256                         if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2257                                 continue;
2258                         dst_hold(&rt->dst);
2259                         read_unlock_bh(&table->tb6_lock);
2260 
2261                         /* if a gateway was specified, delete only that one nexthop */
2262                         if (cfg->fc_flags & RTF_GATEWAY)
2263                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2264 
2265                         return __ip6_del_rt_siblings(rt, cfg);
2266                 }
2267         }
2268         read_unlock_bh(&table->tb6_lock);
2269 
2270         return err;
2271 }
2272 
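     /* Process an ND Redirect for the dst currently in use: validate the
      * message (length, non-multicast destination, link-local target unless
      * on-link, redirects accepted and forwarding disabled, ND options),
      * update the neighbour entry for the new first hop, and install an
      * RTF_CACHE clone pointing at the new gateway.
      */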
2273 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2274 {
2275         struct netevent_redirect netevent;
2276         struct rt6_info *rt, *nrt = NULL;
2277         struct ndisc_options ndopts;
2278         struct inet6_dev *in6_dev;
2279         struct neighbour *neigh;
2280         struct rd_msg *msg;
2281         int optlen, on_link;
2282         u8 *lladdr;
2283 
2284         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2285         optlen -= sizeof(*msg);
2286 
2287         if (optlen < 0) {
2288                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2289                 return;
2290         }
2291 
2292         msg = (struct rd_msg *)icmp6_hdr(skb);
2293 
2294         if (ipv6_addr_is_multicast(&msg->dest)) {
2295                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2296                 return;
2297         }
2298 
2299         on_link = 0;
2300         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2301                 on_link = 1;
2302         } else if (ipv6_addr_type(&msg->target) !=
2303                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2304                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2305                 return;
2306         }
2307 
2308         in6_dev = __in6_dev_get(skb->dev);
2309         if (!in6_dev)
2310                 return;
2311         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2312                 return;
2313 
2314         /* RFC2461 8.1:
2315          *      The IP source address of the Redirect MUST be the same as the current
2316          *      first-hop router for the specified ICMP Destination Address.
2317          */
2318 
2319         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2320                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2321                 return;
2322         }
2323 
2324         lladdr = NULL;
2325         if (ndopts.nd_opts_tgt_lladdr) {
2326                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2327                                              skb->dev);
2328                 if (!lladdr) {
2329                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2330                         return;
2331                 }
2332         }
2333 
2334         rt = (struct rt6_info *) dst;
2335         if (rt->rt6i_flags & RTF_REJECT) {
2336                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2337                 return;
2338         }
2339 
2340         /* Redirect received -> path was valid.
2341          * Redirects are sent only in response to data packets,
2342          * so this nexthop is apparently reachable. --ANK
2343          */
2344         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
2345 
2346         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2347         if (!neigh)
2348                 return;
2349 
2350         /*
2351          *      We have finally decided to accept it.
2352          */
2353 
2354         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
2355                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
2356                      NEIGH_UPDATE_F_OVERRIDE|
2357                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2358                                      NEIGH_UPDATE_F_ISROUTER)),
2359                      NDISC_REDIRECT, &ndopts);
2360 
2361         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2362         if (!nrt)
2363                 goto out;
2364 
2365         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2366         if (on_link)
2367                 nrt->rt6i_flags &= ~RTF_GATEWAY;
2368 
2369         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2370 
2371         if (ip6_ins_rt(nrt))
2372                 goto out;
2373 
2374         netevent.old = &rt->dst;
2375         netevent.new = &nrt->dst;
2376         netevent.daddr = &msg->dest;
2377         netevent.neigh = neigh;
2378         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2379 
2380         if (rt->rt6i_flags & RTF_CACHE) {
2381                 rt = (struct rt6_info *) dst_clone(&rt->dst);
2382                 ip6_del_rt(rt);
2383         }
2384 
2385 out:
2386         neigh_release(neigh);
2387 }
2388 
2389 /*
2390  *      Misc support functions
2391  */
2392 
2393 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2394 {
2395         BUG_ON(from->dst.from);
2396 
2397         rt->rt6i_flags &= ~RTF_EXPIRES;
2398         dst_hold(&from->dst);
2399         rt->dst.from = &from->dst;
2400         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2401 }
2402 
2403 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2404 {
2405         rt->dst.input = ort->dst.input;
2406         rt->dst.output = ort->dst.output;
2407         rt->rt6i_dst = ort->rt6i_dst;
2408         rt->dst.error = ort->dst.error;
2409         rt->rt6i_idev = ort->rt6i_idev;
2410         if (rt->rt6i_idev)
2411                 in6_dev_hold(rt->rt6i_idev);
2412         rt->dst.lastuse = jiffies;
2413         rt->rt6i_gateway = ort->rt6i_gateway;
2414         rt->rt6i_flags = ort->rt6i_flags;
2415         rt6_set_from(rt, ort);
2416         rt->rt6i_metric = ort->rt6i_metric;
2417 #ifdef CONFIG_IPV6_SUBTREES
2418         rt->rt6i_src = ort->rt6i_src;
2419 #endif
2420         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2421         rt->rt6i_table = ort->rt6i_table;
2422         rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2423 }
2424 
2425 #ifdef CONFIG_IPV6_ROUTE_INFO
2426 static struct rt6_info *rt6_get_route_info(struct net *net,
2427                                            const struct in6_addr *prefix, int prefixlen,
2428                                            const struct in6_addr *gwaddr,
2429                                            struct net_device *dev)
2430 {
2431         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
2432         int ifindex = dev->ifindex;
2433         struct fib6_node *fn;
2434         struct rt6_info *rt = NULL;
2435         struct fib6_table *table;
2436 
2437         table = fib6_get_table(net, tb_id);
2438         if (!table)
2439                 return NULL;
2440 
2441         read_lock_bh(&table->tb6_lock);
2442         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2443         if (!fn)
2444                 goto out;
2445 
2446         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2447                 if (rt->dst.dev->ifindex != ifindex)
2448                         continue;
2449                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2450                         continue;
2451                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2452                         continue;
2453                 dst_hold(&rt->dst);
2454                 break;
2455         }
2456 out:
2457         read_unlock_bh(&table->tb6_lock);
2458         return rt;
2459 }
2460 
2461 static struct rt6_info *rt6_add_route_info(struct net *net,
2462                                            const struct in6_addr *prefix, int prefixlen,
2463                                            const struct in6_addr *gwaddr,
2464                                            struct net_device *dev,
2465                                            unsigned int pref)
2466 {
2467         struct fib6_config cfg = {
2468                 .fc_metric      = IP6_RT_PRIO_USER,
2469                 .fc_ifindex     = dev->ifindex,
2470                 .fc_dst_len     = prefixlen,
2471                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2472                                   RTF_UP | RTF_PREF(pref),
2473                 .fc_nlinfo.portid = 0,
2474                 .fc_nlinfo.nlh = NULL,
2475                 .fc_nlinfo.nl_net = net,
2476         };
2477 
2478         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
2479         cfg.fc_dst = *prefix;
2480         cfg.fc_gateway = *gwaddr;
2481 
2482         /* We should treat it as a default route if prefix length is 0. */
2483         if (!prefixlen)
2484                 cfg.fc_flags |= RTF_DEFAULT;
2485 
2486         ip6_route_add(&cfg);
2487 
2488         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
2489 }
2490 #endif
2491 
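     /* Default routers learned from Router Advertisements are stored as
      * RTF_ADDRCONF|RTF_DEFAULT gateway routes in RT6_TABLE_DFLT (or the
      * l3mdev table).  rt6_purge_dflt_routers() below removes them again,
      * skipping interfaces whose accept_ra is 2.
      */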
2492 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2493 {
2494         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
2495         struct rt6_info *rt;
2496         struct fib6_table *table;
2497 
2498         table = fib6_get_table(dev_net(dev), tb_id);
2499         if (!table)
2500                 return NULL;
2501 
2502         read_lock_bh(&table->tb6_lock);
2503         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2504                 if (dev == rt->dst.dev &&
2505                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2506                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
2507                         break;
2508         }
2509         if (rt)
2510                 dst_hold(&rt->dst);
2511         read_unlock_bh(&table->tb6_lock);
2512         return rt;
2513 }
2514 
2515 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2516                                      struct net_device *dev,
2517                                      unsigned int pref)
2518 {
2519         struct fib6_config cfg = {
2520                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
2521                 .fc_metric      = IP6_RT_PRIO_USER,
2522                 .fc_ifindex     = dev->ifindex,
2523                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2524                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2525                 .fc_nlinfo.portid = 0,
2526                 .fc_nlinfo.nlh = NULL,
2527                 .fc_nlinfo.nl_net = dev_net(dev),
2528         };
2529 
2530         cfg.fc_gateway = *gwaddr;
2531 
2532         if (!ip6_route_add(&cfg)) {
2533                 struct fib6_table *table;
2534 
2535                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
2536                 if (table)
2537                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
2538         }
2539 
2540         return rt6_get_dflt_router(gwaddr, dev);
2541 }
2542 
2543 static void __rt6_purge_dflt_routers(struct fib6_table *table)
2544 {
2545         struct rt6_info *rt;
2546 
2547 restart:
2548         read_lock_bh(&table->tb6_lock);
2549         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2550                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2551                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2552                         dst_hold(&rt->dst);
2553                         read_unlock_bh(&table->tb6_lock);
2554                         ip6_del_rt(rt);
2555                         goto restart;
2556                 }
2557         }
2558         read_unlock_bh(&table->tb6_lock);
2559 
2560         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
2561 }
2562 
2563 void rt6_purge_dflt_routers(struct net *net)
2564 {
2565         struct fib6_table *table;
2566         struct hlist_head *head;
2567         unsigned int h;
2568 
2569         rcu_read_lock();
2570 
2571         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2572                 head = &net->ipv6.fib_table_hash[h];
2573                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2574                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
2575                                 __rt6_purge_dflt_routers(table);
2576                 }
2577         }
2578 
2579         rcu_read_unlock();
2580 }
2581 
2582 static void rtmsg_to_fib6_config(struct net *net,
2583                                  struct in6_rtmsg *rtmsg,
2584                                  struct fib6_config *cfg)
2585 {
2586         memset(cfg, 0, sizeof(*cfg));
2587 
2588         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2589                          : RT6_TABLE_MAIN;
2590         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2591         cfg->fc_metric = rtmsg->rtmsg_metric;
2592         cfg->fc_expires = rtmsg->rtmsg_info;
2593         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2594         cfg->fc_src_len = rtmsg->rtmsg_src_len;
2595         cfg->fc_flags = rtmsg->rtmsg_flags;
2596 
2597         cfg->fc_nlinfo.nl_net = net;
2598 
2599         cfg->fc_dst = rtmsg->rtmsg_dst;
2600         cfg->fc_src = rtmsg->rtmsg_src;
2601         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2602 }
2603 
2604 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2605 {
2606         struct fib6_config cfg;
2607         struct in6_rtmsg rtmsg;
2608         int err;
2609 
2610         switch (cmd) {
2611         case SIOCADDRT:         /* Add a route */
2612         case SIOCDELRT:         /* Delete a route */
2613                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2614                         return -EPERM;
2615                 err = copy_from_user(&rtmsg, arg,
2616                                      sizeof(struct in6_rtmsg));
2617                 if (err)
2618                         return -EFAULT;
2619 
2620                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2621 
2622                 rtnl_lock();
2623                 switch (cmd) {
2624                 case SIOCADDRT:
2625                         err = ip6_route_add(&cfg);
2626                         break;
2627                 case SIOCDELRT:
2628                         err = ip6_route_del(&cfg);
2629                         break;
2630                 default:
2631                         err = -EINVAL;
2632                 }
2633                 rtnl_unlock();
2634 
2635                 return err;
2636         }
2637 
2638         return -EINVAL;
2639 }
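
     /* Usage sketch for the ioctl interface above (the legacy route(8)-style
      * path).  This is an illustrative userspace example, not part of this
      * file; the function name add_default_route is made up, and the ioctl
      * requires CAP_NET_ADMIN.  It installs ::/0 via *gw on ifindex through
      * SIOCADDRT, which lands in rtmsg_to_fib6_config() + ip6_route_add():
      *
      *   #include <netinet/in.h>
      *   #include <linux/ipv6_route.h>
      *   #include <linux/sockios.h>
      *   #include <sys/ioctl.h>
      *   #include <sys/socket.h>
      *   #include <string.h>
      *   #include <unistd.h>
      *
      *   static int add_default_route(const struct in6_addr *gw, int ifindex)
      *   {
      *           struct in6_rtmsg rt;
      *           int fd = socket(AF_INET6, SOCK_DGRAM, 0);
      *           int err;
      *
      *           if (fd < 0)
      *                   return -1;
      *           memset(&rt, 0, sizeof(rt));
      *           rt.rtmsg_gateway = *gw;
      *           rt.rtmsg_dst_len = 0;
      *           rt.rtmsg_ifindex = ifindex;
      *           rt.rtmsg_metric = 1;
      *           rt.rtmsg_flags = RTF_UP | RTF_GATEWAY;
      *           err = ioctl(fd, SIOCADDRT, &rt);
      *           close(fd);
      *           return err;
      *   }
      */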
2640 
2641 /*
2642  *      Drop the packet on the floor
2643  */
2644 
2645 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2646 {
2647         int type;
2648         struct dst_entry *dst = skb_dst(skb);
2649         switch (ipstats_mib_noroutes) {
2650         case IPSTATS_MIB_INNOROUTES:
2651                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2652                 if (type == IPV6_ADDR_ANY) {
2653                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2654                                       IPSTATS_MIB_INADDRERRORS);
2655                         break;
2656                 }
2657                 /* FALLTHROUGH */
2658         case IPSTATS_MIB_OUTNOROUTES:
2659                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2660                               ipstats_mib_noroutes);
2661                 break;
2662         }
2663         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2664         kfree_skb(skb);
2665         return 0;
2666 }
2667 
2668 static int ip6_pkt_discard(struct sk_buff *skb)
2669 {
2670         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2671 }
2672 
2673 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2674 {
2675         skb->dev = skb_dst(skb)->dev;
2676         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2677 }
2678 
2679 static int ip6_pkt_prohibit(struct sk_buff *skb)
2680 {
2681         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2682 }
2683 
2684 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2685 {
2686         skb->dev = skb_dst(skb)->dev;
2687         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2688 }
2689 
2690 /*
2691  *      Allocate a dst for local (unicast / anycast) address.
2692  */
2693 
2694 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2695                                     const struct in6_addr *addr,
2696                                     bool anycast)
2697 {
2698         u32 tb_id;
2699         struct net *net = dev_net(idev->dev);
2700         struct net_device *dev = net->loopback_dev;
2701         struct rt6_info *rt;
2702 
2703         /* use the L3 master device as loopback for host routes if the
2704          * device is enslaved and the address is not link-local or multicast
2705          */
2706         if (!rt6_need_strict(addr))
2707                 dev = l3mdev_master_dev_rcu(idev->dev) ? : dev;
2708 
2709         rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
2710         if (!rt)
2711                 return ERR_PTR(-ENOMEM);
2712 
2713         in6_dev_hold(idev);
2714 
2715         rt->dst.flags |= DST_HOST;
2716         rt->dst.input = ip6_input;
2717         rt->dst.output = ip6_output;
2718         rt->rt6i_idev = idev;
2719 
2720         rt->rt6i_protocol = RTPROT_KERNEL;
2721         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2722         if (anycast)
2723                 rt->rt6i_flags |= RTF_ANYCAST;
2724         else
2725                 rt->rt6i_flags |= RTF_LOCAL;
2726 
2727         rt->rt6i_gateway  = *addr;
2728         rt->rt6i_dst.addr = *addr;
2729         rt->rt6i_dst.plen = 128;
2730         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
2731         rt->rt6i_table = fib6_get_table(net, tb_id);
2732         rt->dst.flags |= DST_NOCACHE;
2733 
2734         atomic_set(&rt->dst.__refcnt, 1);
2735 
2736         return rt;
2737 }
2738 
2739 /* remove deleted ip from prefsrc entries */
2740 struct arg_dev_net_ip {
2741         struct net_device *dev;
2742         struct net *net;
2743         struct in6_addr *addr;
2744 };
2745 
2746 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2747 {
2748         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2749         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2750         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2751 
2752         if (((void *)rt->dst.dev == dev || !dev) &&
2753             rt != net->ipv6.ip6_null_entry &&
2754             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2755                 /* remove prefsrc entry */
2756                 rt->rt6i_prefsrc.plen = 0;
2757         }
2758         return 0;
2759 }
2760 
2761 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2762 {
2763         struct net *net = dev_net(ifp->idev->dev);
2764         struct arg_dev_net_ip adni = {
2765                 .dev = ifp->idev->dev,
2766                 .net = net,
2767                 .addr = &ifp->addr,
2768         };
2769         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2770 }
2771 
2772 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2773 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
2774 
2775 /* Remove routers and update dst entries when a gateway turns into a host. */
2776 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2777 {
2778         struct in6_addr *gateway = (struct in6_addr *)arg;
2779 
2780         if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2781              ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2782              ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2783                 return -1;
2784         }
2785         return 0;
2786 }
2787 
2788 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2789 {
2790         fib6_clean_all(net, fib6_clean_tohost, gateway);
2791 }
2792 
2793 struct arg_dev_net {
2794         struct net_device *dev;
2795         struct net *net;
2796 };
2797 
2798 /* called with write lock held for table with rt */
2799 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2800 {
2801         const struct arg_dev_net *adn = arg;
2802         const struct net_device *dev = adn->dev;
2803 
2804         if ((rt->dst.dev == dev || !dev) &&
2805             rt != adn->net->ipv6.ip6_null_entry &&
2806             (rt->rt6i_nsiblings == 0 ||
2807              (dev && netdev_unregistering(dev)) ||
2808              !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
2809                 return -1;
2810 
2811         return 0;
2812 }
2813 
2814 void rt6_ifdown(struct net *net, struct net_device *dev)
2815 {
2816         struct arg_dev_net adn = {
2817                 .dev = dev,
2818                 .net = net,
2819         };
2820 
2821         fib6_clean_all(net, fib6_ifdown, &adn);
2822         icmp6_clean_all(fib6_ifdown, &adn);
2823         if (dev)
2824                 rt6_uncached_list_flush_dev(net, dev);
2825 }
2826 
2827 struct rt6_mtu_change_arg {
2828         struct net_device *dev;
2829         unsigned int mtu;
2830 };
2831 
2832 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2833 {
2834         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2835         struct inet6_dev *idev;
2836 
2837         /* In IPv6, PMTU discovery is not optional,
2838            so the RTAX_MTU lock cannot disable it.
2839            We still use this lock to block changes
2840            caused by addrconf/ndisc.
2841         */
2842 
2843         idev = __in6_dev_get(arg->dev);
2844         if (!idev)
2845                 return 0;
2846 
2847         /* For an administrative MTU increase there is no way to discover
2848            an IPv6 PMTU increase, so the PMTU increase must be applied here.
2849            Since RFC 1981 does not cover administrative MTU increases,
2850            updating for a PMTU increase is a MUST (e.g. jumbo frames).
2851          */
2852         /*
2853            If the new MTU is less than the route PMTU, the new MTU will be
2854            the lowest MTU in the path, so update the route PMTU to reflect
2855            the decrease; if the new MTU is greater than the route PMTU and
2856            the old MTU was the lowest MTU in the path, update the route PMTU
2857            to reflect the increase.  In that case, if another node's MTU is
2858            now the lowest in the path, a Packet Too Big message will trigger
2859            PMTU discovery again.
2860          */
2861         if (rt->dst.dev == arg->dev &&
2862             dst_metric_raw(&rt->dst, RTAX_MTU) &&
2863             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2864                 if (rt->rt6i_flags & RTF_CACHE) {
2865                         /* For RTF_CACHE with rt6i_pmtu == 0
2866                          * (i.e. a redirected route),
2867                          * the metrics of its rt->dst.from have already
2868                          * been updated.
2869                          */
2870                         if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2871                                 rt->rt6i_pmtu = arg->mtu;
2872                 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2873                            (dst_mtu(&rt->dst) < arg->mtu &&
2874                             dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2875                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2876                 }
2877         }
2878         return 0;
2879 }
2880 
2881 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2882 {
2883         struct rt6_mtu_change_arg arg = {
2884                 .dev = dev,
2885                 .mtu = mtu,
2886         };
2887 
2888         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2889 }
2890 
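     /* Netlink side: rtm_ipv6_policy validates RTM_NEWROUTE/RTM_DELROUTE
      * attributes and rtm_to_fib6_config() translates them into a
      * struct fib6_config, including multipath (RTA_MULTIPATH), lwtunnel
      * encapsulation (RTA_ENCAP/RTA_ENCAP_TYPE) and expiry (RTA_EXPIRES)
      * information.
      */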
2891 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2892         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2893         [RTA_OIF]               = { .type = NLA_U32 },
2894         [RTA_IIF]               = { .type = NLA_U32 },
2895         [RTA_PRIORITY]          = { .type = NLA_U32 },
2896         [RTA_METRICS]           = { .type = NLA_NESTED },
2897         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
2898         [RTA_PREF]              = { .type = NLA_U8 },
2899         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
2900         [RTA_ENCAP]             = { .type = NLA_NESTED },
2901         [RTA_EXPIRES]           = { .type = NLA_U32 },
2902         [RTA_UID]               = { .type = NLA_U32 },
2903         [RTA_MARK]              = { .type = NLA_U32 },
2904 };
2905 
2906 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2907                               struct fib6_config *cfg)
2908 {
2909         struct rtmsg *rtm;
2910         struct nlattr *tb[RTA_MAX+1];
2911         unsigned int pref;
2912         int err;
2913 
2914         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2915         if (err < 0)
2916                 goto errout;
2917 
2918         err = -EINVAL;
2919         rtm = nlmsg_data(nlh);
2920         memset(cfg, 0, sizeof(*cfg));
2921 
2922         cfg->fc_table = rtm->rtm_table;
2923         cfg->fc_dst_len = rtm->rtm_dst_len;
2924         cfg->fc_src_len = rtm->rtm_src_len;
2925         cfg->fc_flags = RTF_UP;
2926         cfg->fc_protocol = rtm->rtm_protocol;
2927         cfg->fc_type = rtm->rtm_type;
2928 
2929         if (rtm->rtm_type == RTN_UNREACHABLE ||
2930             rtm->rtm_type == RTN_BLACKHOLE ||
2931             rtm->rtm_type == RTN_PROHIBIT ||
2932             rtm->rtm_type == RTN_THROW)
2933                 cfg->fc_flags |= RTF_REJECT;
2934 
2935         if (rtm->rtm_type == RTN_LOCAL)
2936                 cfg->fc_flags |= RTF_LOCAL;
2937 
2938         if (rtm->rtm_flags & RTM_F_CLONED)
2939                 cfg->fc_flags |= RTF_CACHE;
2940 
2941         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2942         cfg->fc_nlinfo.nlh = nlh;
2943         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2944 
2945         if (tb[RTA_GATEWAY]) {
2946                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2947                 cfg->fc_flags |= RTF_GATEWAY;
2948         }
2949 
2950         if (tb[RTA_DST]) {
2951                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2952 
2953                 if (nla_len(tb[RTA_DST]) < plen)
2954                         goto errout;
2955 
2956                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2957         }
2958 
2959         if (tb[RTA_SRC]) {
2960                 int plen = (rtm->rtm_src_len + 7) >> 3;
2961 
2962                 if (nla_len(tb[RTA_SRC]) < plen)
2963                         goto errout;
2964 
2965                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2966         }
2967 
2968         if (tb[RTA_PREFSRC])
2969                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2970 
2971         if (tb[RTA_OIF])
2972                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2973 
2974         if (tb[RTA_PRIORITY])
2975                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2976 
2977         if (tb[RTA_METRICS]) {
2978                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2979                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2980         }
2981 
2982         if (tb[RTA_TABLE])
2983                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2984 
2985         if (tb[RTA_MULTIPATH]) {
2986                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2987                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2988 
2989                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
2990                                                      cfg->fc_mp_len);
2991                 if (err < 0)
2992                         goto errout;
2993         }
2994 
2995         if (tb[RTA_PREF]) {
2996                 pref = nla_get_u8(tb[RTA_PREF]);
2997                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
2998                     pref != ICMPV6_ROUTER_PREF_HIGH)
2999                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
3000                 cfg->fc_flags |= RTF_PREF(pref);
3001         }
3002 
3003         if (tb[RTA_ENCAP])
3004                 cfg->fc_encap = tb[RTA_ENCAP];
3005 
3006         if (tb[RTA_ENCAP_TYPE]) {
3007                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3008 
3009                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type);
3010                 if (err < 0)
3011                         goto errout;
3012         }
3013 
3014         if (tb[RTA_EXPIRES]) {
3015                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3016 
3017                 if (addrconf_finite_timeout(timeout)) {
3018                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3019                         cfg->fc_flags |= RTF_EXPIRES;
3020                 }
3021         }
3022 
3023         err = 0;
3024 errout:
3025         return err;
3026 }
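
/* Illustrative userspace sketch (not part of route.c): it builds the kind of
 * RTM_NEWROUTE request that rtm_to_fib6_config() above parses, i.e. an rtmsg
 * header followed by RTA_* attributes.  The 2001:db8::/64 prefix and the
 * ifindex are made-up example values and error handling is mostly omitted.
 */
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

/* append one rtattr to the message, libnetlink addattr_l() style */
static void add_rtattr(struct nlmsghdr *nlh, int type, const void *data, int len)
{
        struct rtattr *rta = (struct rtattr *)((char *)nlh +
                                               NLMSG_ALIGN(nlh->nlmsg_len));

        rta->rta_type = type;
        rta->rta_len = RTA_LENGTH(len);
        memcpy(RTA_DATA(rta), data, len);
        nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
}

int main(void)
{
        char buf[512] = { 0 };
        struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
        struct rtmsg *rtm = NLMSG_DATA(nlh);
        struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
        struct in6_addr dst;
        unsigned int oif = 2;                   /* example ifindex */
        int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

        if (fd < 0)
                return 1;

        nlh->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm));
        nlh->nlmsg_type = RTM_NEWROUTE;
        nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL | NLM_F_ACK;

        rtm->rtm_family = AF_INET6;
        rtm->rtm_dst_len = 64;                  /* -> cfg->fc_dst_len */
        rtm->rtm_table = RT_TABLE_MAIN;         /* -> cfg->fc_table */
        rtm->rtm_protocol = RTPROT_STATIC;
        rtm->rtm_scope = RT_SCOPE_UNIVERSE;
        rtm->rtm_type = RTN_UNICAST;

        inet_pton(AF_INET6, "2001:db8::", &dst);
        add_rtattr(nlh, RTA_DST, &dst, sizeof(dst));    /* -> cfg->fc_dst */
        add_rtattr(nlh, RTA_OIF, &oif, sizeof(oif));    /* -> cfg->fc_ifindex */

        sendto(fd, nlh, nlh->nlmsg_len, 0,
               (struct sockaddr *)&kernel, sizeof(kernel));
        close(fd);
        return 0;
}
/* Roughly the request that `ip -6 route add 2001:db8::/64 dev <ifname>`
 * would generate, modulo attribute details.
 */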
3027 
3028 struct rt6_nh {
3029         struct rt6_info *rt6_info;
3030         struct fib6_config r_cfg;
3031         struct mx6_config mxc;
3032         struct list_head next;
3033 };
3034 
3035 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3036 {
3037         struct rt6_nh *nh;
3038 
3039         list_for_each_entry(nh, rt6_nh_list, next) {
3040                 pr_warn("multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3041                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3042                         nh->r_cfg.fc_ifindex);
3043         }
3044 }
3045 
3046 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3047                                  struct rt6_info *rt, struct fib6_config *r_cfg)
3048 {
3049         struct rt6_nh *nh;
3050         int err = -EEXIST;
3051 
3052         list_for_each_entry(nh, rt6_nh_list, next) {
3053                 /* check if rt6_info already exists */
3054                 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
3055                         return err;
3056         }
3057 
3058         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3059         if (!nh)
3060                 return -ENOMEM;
3061         nh->rt6_info = rt;
3062         err = ip6_convert_metrics(&nh->mxc, r_cfg);
3063         if (err) {
3064                 kfree(nh);
3065                 return err;
3066         }
3067         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3068         list_add_tail(&nh->next, rt6_nh_list);
3069 
3070         return 0;
3071 }
3072 
3073 static void ip6_route_mpath_notify(struct rt6_info *rt,
3074                                    struct rt6_info *rt_last,
3075                                    struct nl_info *info,
3076                                    __u16 nlflags)
3077 {
3078         /* if this is an APPEND route, then rt points to the first route
3079          * inserted and rt_last points to last route inserted. Userspace
3080          * wants a consistent dump of the route which starts at the first
3081          * nexthop. Since sibling routes are always added at the end of
3082          * the list, find the first sibling of the last route appended
3083          */
3084         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3085                 rt = list_first_entry(&rt_last->rt6i_siblings,
3086                                       struct rt6_info,
3087                                       rt6i_siblings);
3088         }
3089 
3090         if (rt)
3091                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3092 }
3093 
3094 static int ip6_route_multipath_add(struct fib6_config *cfg)
3095 {
3096         struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3097         struct nl_info *info = &cfg->fc_nlinfo;
3098         struct fib6_config r_cfg;
3099         struct rtnexthop *rtnh;
3100         struct rt6_info *rt;
3101         struct rt6_nh *err_nh;
3102         struct rt6_nh *nh, *nh_safe;
3103         __u16 nlflags;
3104         int remaining;
3105         int attrlen;
3106         int err = 1;
3107         int nhn = 0;
3108         int replace = (cfg->fc_nlinfo.nlh &&
3109                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3110         LIST_HEAD(rt6_nh_list);
3111 
3112         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3113         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3114                 nlflags |= NLM_F_APPEND;
3115 
3116         remaining = cfg->fc_mp_len;
3117         rtnh = (struct rtnexthop *)cfg->fc_mp;
3118 
3119         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
3120          * rt6_info structs per nexthop
3121          */
3122         while (rtnh_ok(rtnh, remaining)) {
3123                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3124                 if (rtnh->rtnh_ifindex)
3125                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3126 
3127                 attrlen = rtnh_attrlen(rtnh);
3128                 if (attrlen > 0) {
3129                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3130 
3131                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3132                         if (nla) {
3133                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
3134                                 r_cfg.fc_flags |= RTF_GATEWAY;
3135                         }
3136                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3137                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3138                         if (nla)
3139                                 r_cfg.fc_encap_type = nla_get_u16(nla);
3140                 }
3141 
3142                 rt = ip6_route_info_create(&r_cfg);
3143                 if (IS_ERR(rt)) {
3144                         err = PTR_ERR(rt);
3145                         rt = NULL;
3146                         goto cleanup;
3147                 }
3148 
3149                 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3150                 if (err) {
3151                         dst_free(&rt->dst);
3152                         goto cleanup;
3153                 }
3154 
3155                 rtnh = rtnh_next(rtnh, &remaining);
3156         }
3157 
3158         /* for add and replace send one notification with all nexthops.
3159          * Skip the notification in fib6_add_rt2node and send one with
3160          * the full route when done
3161          */
3162         info->skip_notify = 1;
3163 
3164         err_nh = NULL;
3165         list_for_each_entry(nh, &rt6_nh_list, next) {
3166                 rt_last = nh->rt6_info;
3167                 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc);
3168                 /* save reference to first route for notification */
3169                 if (!rt_notif && !err)
3170                         rt_notif = nh->rt6_info;
3171 
3172                 /* nh->rt6_info is used or freed at this point, reset to NULL */
3173                 nh->rt6_info = NULL;
3174                 if (err) {
3175                         if (replace && nhn)
3176                                 ip6_print_replace_route_err(&rt6_nh_list);
3177                         err_nh = nh;
3178                         goto add_errout;
3179                 }
3180 
3181                 /* Because each route is added like a single route, we remove
3182                  * these flags after the first nexthop: if there is a collision,
3183                  * we have already failed to add the first nexthop, since
3184                  * fib6_add_rt2node() has rejected it; when replacing, the old
3185                  * nexthops have been replaced by the first new one, and the
3186                  * rest should be added to it.
3187                  */
3188                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3189                                                      NLM_F_REPLACE);
3190                 nhn++;
3191         }
3192 
3193         /* success ... tell user about new route */
3194         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3195         goto cleanup;
3196 
3197 add_errout:
3198         /* send notification for routes that were added so that
3199          * the delete notifications sent by ip6_route_del are
3200          * coherent
3201          */
3202         if (rt_notif)
3203                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3204 
3205         /* Delete routes that were already added */
3206         list_for_each_entry(nh, &rt6_nh_list, next) {
3207                 if (err_nh == nh)
3208                         break;
3209                 ip6_route_del(&nh->r_cfg);
3210         }
3211 
3212 cleanup:
3213         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3214                 if (nh->rt6_info)
3215                         dst_free(&nh->rt6_info->dst);
3216                 kfree(nh->mxc.mx);
3217                 list_del(&nh->next);
3218                 kfree(nh);
3219         }
3220 
3221         return err;
3222 }
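
/* A sketch of the RTA_MULTIPATH wire layout consumed above: the attribute
 * payload is a sequence of
 *
 *      struct rtnexthop                (rtnh_len covers the rtnexthop itself
 *      [ RTA_GATEWAY ]                  plus the per-nexthop attributes that
 *      [ RTA_ENCAP / RTA_ENCAP_TYPE ]   follow it)
 *      struct rtnexthop
 *      ...
 *
 * which rtnh_ok()/rtnh_next() step through; per-nexthop attributes override
 * the top-level fib6_config copied at the start of each loop iteration.
 * Userspace typically generates such a request with something like
 *
 *      ip -6 route add 2001:db8::/64 \
 *              nexthop via fe80::1 dev eth0 \
 *              nexthop via fe80::2 dev eth1
 *
 * (prefix, gateways and interface names are illustrative).
 */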
3223 
3224 static int ip6_route_multipath_del(struct fib6_config *cfg)
3225 {
3226         struct fib6_config r_cfg;
3227         struct rtnexthop *rtnh;
3228         int remaining;
3229         int attrlen;
3230         int err = 1, last_err = 0;
3231 
3232         remaining = cfg->fc_mp_len;
3233         rtnh = (struct rtnexthop *)cfg->fc_mp;
3234 
3235         /* Parse a Multipath Entry */
3236         while (rtnh_ok(rtnh, remaining)) {
3237                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3238                 if (rtnh->rtnh_ifindex)
3239                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3240 
3241                 attrlen = rtnh_attrlen(rtnh);
3242                 if (attrlen > 0) {
3243                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3244 
3245                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3246                         if (nla) {
3247                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3248                                 r_cfg.fc_flags |= RTF_GATEWAY;
3249                         }
3250                 }
3251                 err = ip6_route_del(&r_cfg);
3252                 if (err)
3253                         last_err = err;
3254 
3255                 rtnh = rtnh_next(rtnh, &remaining);
3256         }
3257 
3258         return last_err;
3259 }
3260 
3261 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3262 {
3263         struct fib6_config cfg;
3264         int err;
3265 
3266         err = rtm_to_fib6_config(skb, nlh, &cfg);
3267         if (err < 0)
3268                 return err;
3269 
3270         if (cfg.fc_mp)
3271                 return ip6_route_multipath_del(&cfg);
3272         else {
3273                 cfg.fc_delete_all_nh = 1;
3274                 return ip6_route_del(&cfg);
3275         }
3276 }
3277 
3278 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3279 {
3280         struct fib6_config cfg;
3281         int err;
3282 
3283         err = rtm_to_fib6_config(skb, nlh, &cfg);
3284         if (err < 0)
3285                 return err;
3286 
3287         if (cfg.fc_mp)
3288                 return ip6_route_multipath_add(&cfg);
3289         else
3290                 return ip6_route_add(&cfg);
3291 }
3292 
3293 static size_t rt6_nlmsg_size(struct rt6_info *rt)
3294 {
3295         int nexthop_len = 0;
3296 
3297         if (rt->rt6i_nsiblings) {
3298                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
3299                             + NLA_ALIGN(sizeof(struct rtnexthop))
3300                             + nla_total_size(16) /* RTA_GATEWAY */
3301                             + lwtunnel_get_encap_size(rt->dst.lwtstate);
3302 
3303                 nexthop_len *= rt->rt6i_nsiblings;
3304         }
3305 
3306         return NLMSG_ALIGN(sizeof(struct rtmsg))
3307                + nla_total_size(16) /* RTA_SRC */
3308                + nla_total_size(16) /* RTA_DST */
3309                + nla_total_size(16) /* RTA_GATEWAY */
3310                + nla_total_size(16) /* RTA_PREFSRC */
3311                + nla_total_size(4) /* RTA_TABLE */
3312                + nla_total_size(4) /* RTA_IIF */
3313                + nla_total_size(4) /* RTA_OIF */
3314                + nla_total_size(4) /* RTA_PRIORITY */
3315                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3316                + nla_total_size(sizeof(struct rta_cacheinfo))
3317                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3318                + nla_total_size(1) /* RTA_PREF */
3319                + lwtunnel_get_encap_size(rt->dst.lwtstate)
3320                + nexthop_len;
3321 }
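
/* Sizing note (illustrative): nla_total_size(n) is NLA_ALIGN(NLA_HDRLEN + n),
 * so nla_total_size(16) is 20 bytes, nla_total_size(4) is 8 and
 * nla_total_size(1) is 8.  The estimate is deliberately generous (for example
 * RTA_GATEWAY is counted even for routes without a gateway) so that
 * rt6_fill_node() cannot overrun an skb allocated from this size; that is
 * why -EMSGSIZE from it trips the WARN_ON() in inet6_rt_notify() below.
 */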
3322 
3323 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3324                             unsigned int *flags, bool skip_oif)
3325 {
3326         if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3327                 *flags |= RTNH_F_LINKDOWN;
3328                 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3329                         *flags |= RTNH_F_DEAD;
3330         }
3331 
3332         if (rt->rt6i_flags & RTF_GATEWAY) {
3333                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3334                         goto nla_put_failure;
3335         }
3336 
3337         /* not needed for multipath encoding because it has an rtnexthop struct */
3338         if (!skip_oif && rt->dst.dev &&
3339             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3340                 goto nla_put_failure;
3341 
3342         if (rt->dst.lwtstate &&
3343             lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
3344                 goto nla_put_failure;
3345 
3346         return 0;
3347 
3348 nla_put_failure:
3349         return -EMSGSIZE;
3350 }
3351 
3352 /* add multipath next hop */
3353 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
3354 {
3355         struct rtnexthop *rtnh;
3356         unsigned int flags = 0;
3357 
3358         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
3359         if (!rtnh)
3360                 goto nla_put_failure;
3361 
3362         rtnh->rtnh_hops = 0;
3363         rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
3364 
3365         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
3366                 goto nla_put_failure;
3367 
3368         rtnh->rtnh_flags = flags;
3369 
3370         /* length of rtnetlink header + attributes */
3371         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
3372 
3373         return 0;
3374 
3375 nla_put_failure:
3376         return -EMSGSIZE;
3377 }
3378 
3379 static int rt6_fill_node(struct net *net,
3380                          struct sk_buff *skb, struct rt6_info *rt,
3381                          struct in6_addr *dst, struct in6_addr *src,
3382                          int iif, int type, u32 portid, u32 seq,
3383                          unsigned int flags)
3384 {
3385         u32 metrics[RTAX_MAX];
3386         struct rtmsg *rtm;
3387         struct nlmsghdr *nlh;
3388         long expires;
3389         u32 table;
3390 
3391         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
3392         if (!nlh)
3393                 return -EMSGSIZE;
3394 
3395         rtm = nlmsg_data(nlh);
3396         rtm->rtm_family = AF_INET6;
3397         rtm->rtm_dst_len = rt->rt6i_dst.plen;
3398         rtm->rtm_src_len = rt->rt6i_src.plen;
3399         rtm->rtm_tos = 0;
3400         if (rt->rt6i_table)
3401                 table = rt->rt6i_table->tb6_id;
3402         else
3403                 table = RT6_TABLE_UNSPEC;
3404         rtm->rtm_table = table;
3405         if (nla_put_u32(skb, RTA_TABLE, table))
3406                 goto nla_put_failure;
3407         if (rt->rt6i_flags & RTF_REJECT) {
3408                 switch (rt->dst.error) {
3409                 case -EINVAL:
3410                         rtm->rtm_type = RTN_BLACKHOLE;
3411                         break;
3412                 case -EACCES:
3413                         rtm->rtm_type = RTN_PROHIBIT;
3414                         break;
3415                 case -EAGAIN:
3416                         rtm->rtm_type = RTN_THROW;
3417                         break;
3418                 default:
3419                         rtm->rtm_type = RTN_UNREACHABLE;
3420                         break;
3421                 }
3422         }
3423         else if (rt->rt6i_flags & RTF_LOCAL)
3424                 rtm->rtm_type = RTN_LOCAL;
3425         else if (rt->rt6i_flags & RTF_ANYCAST)
3426                 rtm->rtm_type = RTN_ANYCAST;
3427         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
3428                 rtm->rtm_type = RTN_LOCAL;
3429         else
3430                 rtm->rtm_type = RTN_UNICAST;
3431         rtm->rtm_flags = 0;
3432         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
3433         rtm->rtm_protocol = rt->rt6i_protocol;
3434         if (rt->rt6i_flags & RTF_DYNAMIC)
3435                 rtm->rtm_protocol = RTPROT_REDIRECT;
3436         else if (rt->rt6i_flags & RTF_ADDRCONF) {
3437                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
3438                         rtm->rtm_protocol = RTPROT_RA;
3439                 else
3440                         rtm->rtm_protocol = RTPROT_KERNEL;
3441         }
3442 
3443         if (rt->rt6i_flags & RTF_CACHE)
3444                 rtm->rtm_flags |= RTM_F_CLONED;
3445 
3446         if (dst) {
3447                 if (nla_put_in6_addr(skb, RTA_DST, dst))
3448                         goto nla_put_failure;
3449                 rtm->rtm_dst_len = 128;
3450         } else if (rtm->rtm_dst_len)
3451                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
3452                         goto nla_put_failure;
3453 #ifdef CONFIG_IPV6_SUBTREES
3454         if (src) {
3455                 if (nla_put_in6_addr(skb, RTA_SRC, src))
3456                         goto nla_put_failure;
3457                 rtm->rtm_src_len = 128;
3458         } else if (rtm->rtm_src_len &&
3459                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
3460                 goto nla_put_failure;
3461 #endif
3462         if (iif) {
3463 #ifdef CONFIG_IPV6_MROUTE
3464                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
3465                         int err = ip6mr_get_route(net, skb, rtm, portid);
3466 
3467                         if (err == 0)
3468                                 return 0;
3469                         if (err < 0)
3470                                 goto nla_put_failure;
3471                 } else
3472 #endif
3473                         if (nla_put_u32(skb, RTA_IIF, iif))
3474                                 goto nla_put_failure;
3475         } else if (dst) {
3476                 struct in6_addr saddr_buf;
3477                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
3478                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3479                         goto nla_put_failure;
3480         }
3481 
3482         if (rt->rt6i_prefsrc.plen) {
3483                 struct in6_addr saddr_buf;
3484                 saddr_buf = rt->rt6i_prefsrc.addr;
3485                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3486                         goto nla_put_failure;
3487         }
3488 
3489         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3490         if (rt->rt6i_pmtu)
3491                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
3492         if (rtnetlink_put_metrics(skb, metrics) < 0)
3493                 goto nla_put_failure;
3494 
3495         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
3496                 goto nla_put_failure;
3497 
3498         /* For multipath routes, walk the siblings list and add
3499          * each as a nexthop within RTA_MULTIPATH.
3500          */
3501         if (rt->rt6i_nsiblings) {
3502                 struct rt6_info *sibling, *next_sibling;
3503                 struct nlattr *mp;
3504 
3505                 mp = nla_nest_start(skb, RTA_MULTIPATH);
3506                 if (!mp)
3507                         goto nla_put_failure;
3508 
3509                 if (rt6_add_nexthop(skb, rt) < 0)
3510                         goto nla_put_failure;
3511 
3512                 list_for_each_entry_safe(sibling, next_sibling,
3513                                          &rt->rt6i_siblings, rt6i_siblings) {
3514                         if (rt6_add_nexthop(skb, sibling) < 0)
3515                                 goto nla_put_failure;
3516                 }
3517 
3518                 nla_nest_end(skb, mp);
3519         } else {
3520                 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
3521                         goto nla_put_failure;
3522         }
3523 
3524         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
3525 
3526         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
3527                 goto nla_put_failure;
3528 
3529         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
3530                 goto nla_put_failure;
3531 
3532 
3533         nlmsg_end(skb, nlh);
3534         return 0;
3535 
3536 nla_put_failure:
3537         nlmsg_cancel(skb, nlh);
3538         return -EMSGSIZE;
3539 }
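
/* The resulting message is an rtmsg header followed, in the order emitted
 * above, by RTA_TABLE, RTA_DST/RTA_SRC where applicable, RTA_IIF or
 * RTA_PREFSRC, RTA_METRICS, RTA_PRIORITY, either an RTA_MULTIPATH nest or the
 * single-nexthop attributes from rt6_nexthop_info(), RTA_CACHEINFO and
 * RTA_PREF.  This is what `ip -6 route show` / `ip -6 route get` decode.
 */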
3540 
3541 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3542 {
3543         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3544         struct net *net = arg->net;
3545 
3546         if (rt == net->ipv6.ip6_null_entry)
3547                 return 0;
3548 
3549         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3550                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3551 
3552                 /* user wants prefix routes only */
3553                 if (rtm->rtm_flags & RTM_F_PREFIX &&
3554                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
3555                         /* success since this is not a prefix route */
3556                         return 1;
3557                 }
3558         }
3559 
3560         return rt6_fill_node(net,
3561                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3562                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3563                      NLM_F_MULTI);
3564 }
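
/* Called for every fib6 entry during an RTM_GETROUTE dump, e.g. from
 * `ip -6 route show` or `ip -6 route show table all`; entries that fail the
 * RTM_F_PREFIX filter are skipped instead of being encoded.
 */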
3565 
3566 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3567 {
3568         struct net *net = sock_net(in_skb->sk);
3569         struct nlattr *tb[RTA_MAX+1];
3570         struct rt6_info *rt;
3571         struct sk_buff *skb;
3572         struct rtmsg *rtm;
3573         struct flowi6 fl6;
3574         int err, iif = 0, oif = 0;
3575 
3576         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
3577         if (err < 0)
3578                 goto errout;
3579 
3580         err = -EINVAL;
3581         memset(&fl6, 0, sizeof(fl6));
3582         rtm = nlmsg_data(nlh);
3583         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
3584 
3585         if (tb[RTA_SRC]) {
3586                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3587                         goto errout;
3588 
3589                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3590         }
3591 
3592         if (tb[RTA_DST]) {
3593                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3594                         goto errout;
3595 
3596                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3597         }
3598 
3599         if (tb[RTA_IIF])
3600                 iif = nla_get_u32(tb[RTA_IIF]);
3601 
3602         if (tb[RTA_OIF])
3603                 oif = nla_get_u32(tb[RTA_OIF]);
3604 
3605         if (tb[RTA_MARK])
3606                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3607 
3608         if (tb[RTA_UID])
3609                 fl6.flowi6_uid = make_kuid(current_user_ns(),
3610                                            nla_get_u32(tb[RTA_UID]));
3611         else
3612                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
3613 
3614         if (iif) {
3615                 struct net_device *dev;
3616                 int flags = 0;
3617 
3618                 dev = __dev_get_by_index(net, iif);
3619                 if (!dev) {
3620                         err = -ENODEV;
3621                         goto errout;
3622                 }
3623 
3624                 fl6.flowi6_iif = iif;
3625 
3626                 if (!ipv6_addr_any(&fl6.saddr))
3627                         flags |= RT6_LOOKUP_F_HAS_SADDR;
3628 
3629                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
3630                                                                flags);
3631         } else {
3632                 fl6.flowi6_oif = oif;
3633 
3634                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3635         }
3636 
3637         if (rt == net->ipv6.ip6_null_entry) {
3638                 err = rt->dst.error;
3639                 ip6_rt_put(rt);
3640                 goto errout;
3641         }
3642 
3643         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3644         if (!skb) {
3645                 ip6_rt_put(rt);
3646                 err = -ENOBUFS;
3647                 goto errout;
3648         }
3649 
3650         skb_dst_set(skb, &rt->dst);
3651 
3652         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3653                             RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3654                             nlh->nlmsg_seq, 0);
3655         if (err < 0) {
3656                 kfree_skb(skb);
3657                 goto errout;
3658         }
3659 
3660         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3661 errout:
3662         return err;
3663 }
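
/* RTM_GETROUTE request path, e.g. `ip -6 route get 2001:db8::1` (address
 * illustrative): the RTA_DST/RTA_SRC/RTA_IIF/RTA_OIF/RTA_MARK/RTA_UID
 * attributes are turned into a flowi6, the route lookup result is attached
 * to a fresh skb, and a single rt6_fill_node() reply is unicast back to the
 * requesting socket.
 */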
3664 
3665 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3666                      unsigned int nlm_flags)
3667 {
3668         struct sk_buff *skb;
3669         struct net *net = info->nl_net;
3670         u32 seq;
3671         int err;
3672 
3673         err = -ENOBUFS;
3674         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3675 
3676         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3677         if (!skb)
3678                 goto errout;
3679 
3680         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3681                                 event, info->portid, seq, nlm_flags);
3682         if (err < 0) {
3683                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3684                 WARN_ON(err == -EMSGSIZE);
3685                 kfree_skb(skb);
3686                 goto errout;
3687         }
3688         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3689                     info->nlh, gfp_any());
3690         return;
3691 errout:
3692         if (err < 0)
3693                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3694 }
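
/* Illustrative userspace sketch (not part of route.c): a minimal listener for
 * the RTM_NEWROUTE/RTM_DELROUTE notifications that inet6_rt_notify() above
 * multicasts to RTNLGRP_IPV6_ROUTE, subscribed via the legacy
 * RTMGRP_IPV6_ROUTE bind-time group mask.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

int main(void)
{
        char buf[8192];
        struct sockaddr_nl addr = {
                .nl_family = AF_NETLINK,
                .nl_groups = RTMGRP_IPV6_ROUTE,
        };
        int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

        if (fd < 0 || bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
                return 1;

        for (;;) {
                int len = recv(fd, buf, sizeof(buf), 0);
                struct nlmsghdr *nlh;

                if (len <= 0)
                        break;
                for (nlh = (struct nlmsghdr *)buf; NLMSG_OK(nlh, len);
                     nlh = NLMSG_NEXT(nlh, len))
                        printf("%s\n", nlh->nlmsg_type == RTM_NEWROUTE ?
                               "route added" : "route deleted");
        }
        close(fd);
        return 0;
}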
3695 
3696 static int ip6_route_dev_notify(struct notifier_block *this,
3697                                 unsigned long event, void *ptr)
3698 {
3699         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3700         struct net *net = dev_net(dev);
3701 
3702         if (!(dev->flags & IFF_LOOPBACK))
3703                 return NOTIFY_OK;
3704 
3705         if (event == NETDEV_REGISTER) {
3706                 net->ipv6.ip6_null_entry->dst.dev = dev;
3707                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3708 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3709                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3710                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3711                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3712                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3713 #endif
3714         } else if (event == NETDEV_UNREGISTER) {
3715                 in6_dev_put(net->ipv6.ip6_null_entry->rt6i_idev);
3716 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3717                 in6_dev_put(net->ipv6.ip6_prohibit_entry->rt6i_idev);
3718                 in6_dev_put(net->ipv6.ip6_blk_hole_entry->rt6i_idev);
3719 #endif
3720         }
3721 
3722         return NOTIFY_OK;
3723 }
3724 
3725 /*
3726  *      /proc
3727  */
3728 
3729 #ifdef CONFIG_PROC_FS
3730 
3731 static const struct file_operations ipv6_route_proc_fops = {
3732         .owner          = THIS_MODULE,
3733         .open           = ipv6_route_open,
3734         .read           = seq_read,
3735         .llseek         = seq_lseek,
3736         .release        = seq_release_net,
3737 };
3738 
3739 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3740 {
3741         struct net *net = (struct net *)seq->private;
3742         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3743                    net->ipv6.rt6_stats->fib_nodes,
3744                    net->ipv6.rt6_stats->fib_route_nodes,
3745                    net->ipv6.rt6_stats->fib_rt_alloc,
3746                    net->ipv6.rt6_stats->fib_rt_entries,
3747                    net->ipv6.rt6_stats->fib_rt_cache,
3748                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3749                    net->ipv6.rt6_stats->fib_discarded_routes);
3750 
3751         return 0;
3752 }
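
/* Backs /proc/net/rt6_stats (registered in ip6_route_net_init_late() below);
 * the seven hex fields are, in order: fib nodes, route nodes, rt6_info
 * allocations, route entries, cached routes, the slow-path dst entry count
 * and discarded routes.
 */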
3753 
3754 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3755 {
3756         return single_open_net(inode, file, rt6_stats_seq_show);
3757 }
3758 
3759 static const struct file_operations rt6_stats_seq_fops = {
3760         .owner   = THIS_MODULE,
3761         .open    = rt6_stats_seq_open,
3762         .read    = seq_read,
3763         .llseek  = seq_lseek,
3764         .release = single_release_net,
3765 };
3766 #endif  /* CONFIG_PROC_FS */
3767 
3768 #ifdef CONFIG_SYSCTL
3769 
3770 static
3771 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3772                               void __user *buffer, size_t *lenp, loff_t *ppos)
3773 {
3774         struct net *net;
3775         int delay;
3776         if (!write)
3777                 return -EINVAL;
3778 
3779         net = (struct net *)ctl->extra1;
3780         delay = net->ipv6.sysctl.flush_delay;
3781         proc_dointvec(ctl, write, buffer, lenp, ppos);
3782         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3783         return 0;
3784 }
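
/* Exposed below as the write-only /proc/sys/net/ipv6/route/flush entry: any
 * write (e.g. `echo 0 > /proc/sys/net/ipv6/route/flush`) passes the
 * flush_delay sysctl value to fib6_run_gc() for a garbage-collection pass.
 */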
3785 
3786 struct ctl_table ipv6_route_table_template[] = {
3787         {
3788                 .procname       =       "flush",
3789                 .data           =       &init_net.ipv6.sysctl.flush_delay,
3790                 .maxlen         =       sizeof(int),
3791                 .mode           =       0200,
3792                 .proc_handler   =       ipv6_sysctl_rtcache_flush
3793         },
3794         {
3795                 .procname       =       "gc_thresh",
3796                 .data           =       &ip6_dst_ops_template.gc_thresh,
3797                 .maxlen         =       sizeof(int),
3798                 .mode           =       0644,
3799                 .proc_handler   =       proc_dointvec,
3800         },
3801         {
3802                 .procname       =       "max_size",
3803                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
3804                 .maxlen         =       sizeof(int),
3805                 .mode           =       0644,
3806                 .proc_handler   =       proc_dointvec,
3807         },
3808         {
3809                 .procname       =       "gc_min_interval",
3810                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3811                 .maxlen         =       sizeof(int),
3812                 .mode           =       0644,
3813                 .proc_handler   =       proc_dointvec_jiffies,
3814         },
3815         {
3816                 .procname       =       "gc_timeout",
3817                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3818                 .maxlen         =       sizeof(int),
3819                 .mode           =       0644,
3820                 .proc_handler   =       proc_dointvec_jiffies,
3821         },
3822         {
3823                 .procname       =       "gc_interval",
3824                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3825                 .maxlen         =       sizeof(int),
3826                 .mode           =       0644,
3827                 .proc_handler   =       proc_dointvec_jiffies,
3828         },
3829         {
3830                 .procname       =       "gc_elasticity",
3831                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3832                 .maxlen         =       sizeof(int),
3833                 .mode           =       0644,
3834                 .proc_handler   =       proc_dointvec,
3835         },
3836         {
3837                 .procname       =       "mtu_expires",
3838                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3839                 .maxlen         =       sizeof(int),
3840                 .mode           =       0644,
3841                 .proc_handler   =       proc_dointvec_jiffies,
3842         },
3843         {
3844                 .procname       =       "min_adv_mss",
3845                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3846                 .maxlen         =       sizeof(int),
3847                 .mode           =       0644,
3848                 .proc_handler   =       proc_dointvec,
3849         },
3850         {
3851                 .procname       =       "gc_min_interval_ms",
3852                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3853                 .maxlen         =       sizeof(int),
3854                 .mode           =       0644,
3855                 .proc_handler   =       proc_dointvec_ms_jiffies,
3856         },
3857         { }
3858 };
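
/* These show up as /proc/sys/net/ipv6/route/{flush,gc_thresh,max_size,
 * gc_min_interval,gc_timeout,gc_interval,gc_elasticity,mtu_expires,
 * min_adv_mss,gc_min_interval_ms}.  ipv6_route_sysctl_init() below repoints
 * each .data member at the per-netns copy and clears the "flush" procname
 * for non-init user namespaces.
 */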
3859 
3860 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3861 {
3862         struct ctl_table *table;
3863 
3864         table = kmemdup(ipv6_route_table_template,
3865                         sizeof(ipv6_route_table_template),
3866                         GFP_KERNEL);
3867 
3868         if (table) {
3869                 table[0].data = &net->ipv6.sysctl.flush_delay;
3870                 table[0].extra1 = net;
3871                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3872                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3873                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3874                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3875                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3876                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3877                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3878                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3879                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3880 
3881                 /* Don't export sysctls to unprivileged users */
3882                 if (net->user_ns != &init_user_ns)
3883                         table[0].procname = NULL;
3884         }
3885 
3886         return table;
3887 }
3888 #endif
3889 
3890 static int __net_init ip6_route_net_init(struct net *net)
3891 {
3892         int ret = -ENOMEM;
3893 
3894         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3895                sizeof(net->ipv6.ip6_dst_ops));
3896 
3897         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3898                 goto out_ip6_dst_ops;
3899 
3900         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3901                                            sizeof(*net->ipv6.ip6_null_entry),
3902                                            GFP_KERNEL);
3903         if (!net->ipv6.ip6_null_entry)
3904                 goto out_ip6_dst_entries;
3905         net->ipv6.ip6_null_entry->dst.path =
3906                 (struct dst_entry *)net->ipv6.ip6_null_entry;
3907         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3908         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3909                          ip6_template_metrics, true);
3910 
3911 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3912         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3913                                                sizeof(*net->ipv6.ip6_prohibit_entry),
3914                                                GFP_KERNEL);
3915         if (!net->ipv6.ip6_prohibit_entry)
3916                 goto out_ip6_null_entry;
3917         net->ipv6.ip6_prohibit_entry->dst.path =
3918                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3919         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3920         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3921                          ip6_template_metrics, true);
3922 
3923         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3924                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
3925                                                GFP_KERNEL);
3926         if (!net->ipv6.ip6_blk_hole_entry)
3927                 goto out_ip6_prohibit_entry;
3928         net->ipv6.ip6_blk_hole_entry->dst.path =
3929                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3930         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3931         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3932                          ip6_template_metrics, true);
3933 #endif
3934 
3935         net->ipv6.sysctl.flush_delay = 0;
3936         net->ipv6.sysctl.ip6_rt_max_size = 4096;
3937         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3938         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3939         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3940         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3941         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3942         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3943 
3944         net->ipv6.ip6_rt_gc_expire = 30*HZ;
3945 
3946         ret = 0;
3947 out:
3948         return ret;
3949 
3950 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3951 out_ip6_prohibit_entry:
3952         kfree(net->ipv6.ip6_prohibit_entry);
3953 out_ip6_null_entry:
3954         kfree(net->ipv6.ip6_null_entry);
3955 #endif
3956 out_ip6_dst_entries:
3957         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3958 out_ip6_dst_ops:
3959         goto out;
3960 }
3961 
3962 static void __net_exit ip6_route_net_exit(struct net *net)
3963 {
3964         kfree(net->ipv6.ip6_null_entry);
3965 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3966         kfree(net->ipv6.ip6_prohibit_entry);
3967         kfree(net->ipv6.ip6_blk_hole_entry);
3968 #endif
3969         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3970 }
3971 
3972 static int __net_init ip6_route_net_init_late(struct net *net)
3973 {
3974 #ifdef CONFIG_PROC_FS
3975         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3976         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3977 #endif
3978         return 0;
3979 }
3980 
3981 static void __net_exit ip6_route_net_exit_late(struct net *net)
3982 {
3983 #ifdef CONFIG_PROC_FS
3984         remove_proc_entry("ipv6_route", net->proc_net);
3985         remove_proc_entry("rt6_stats", net->proc_net);
3986 #endif
3987 }
3988 
3989 static struct pernet_operations ip6_route_net_ops = {
3990         .init = ip6_route_net_init,
3991         .exit = ip6_route_net_exit,
3992 };
3993 
3994 static int __net_init ipv6_inetpeer_init(struct net *net)
3995 {
3996         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3997 
3998         if (!bp)
3999                 return -ENOMEM;
4000         inet_peer_base_init(bp);
4001         net->ipv6.peers = bp;
4002         return 0;
4003 }
4004 
4005 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4006 {
4007         struct inet_peer_base *bp = net->ipv6.peers;
4008 
4009         net->ipv6.peers = NULL;
4010         inetpeer_invalidate_tree(bp);
4011         kfree(bp);
4012 }
4013 
4014 static struct pernet_operations ipv6_inetpeer_ops = {
4015         .init   =       ipv6_inetpeer_init,
4016         .exit   =       ipv6_inetpeer_exit,
4017 };
4018 
4019 static struct pernet_operations ip6_route_net_late_ops = {
4020         .init = ip6_route_net_init_late,
4021         .exit = ip6_route_net_exit_late,
4022 };
4023 
4024 static struct notifier_block ip6_route_dev_notifier = {
4025         .notifier_call = ip6_route_dev_notify,
4026         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
4027 };
4028 
4029 void __init ip6_route_init_special_entries(void)
4030 {
4031         /* Registration of the loopback device happens before this code runs,
4032          * so the loopback reference in rt6_info will not have been taken; do
4033          * it manually for init_net */
4034         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4035         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4036 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4037         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4038         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4039         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4040         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4041 #endif
4042 }
4043 
4044 int __init ip6_route_init(void)
4045 {
4046         int ret;
4047         int cpu;
4048 
4049         ret = -ENOMEM;
4050         ip6_dst_ops_template.kmem_cachep =
4051                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
4052                                   SLAB_HWCACHE_ALIGN, NULL);
4053         if (!ip6_dst_ops_template.kmem_cachep)
4054                 goto out;
4055 
4056         ret = dst_entries_init(&ip6_dst_blackhole_ops);
4057         if (ret)
4058                 goto out_kmem_cache;
4059 
4060         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
4061         if (ret)
4062                 goto out_dst_entries;
4063 
4064         ret = register_pernet_subsys(&ip6_route_net_ops);
4065         if (ret)
4066                 goto out_register_inetpeer;
4067 
4068         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
4069 
4070         ret = fib6_init();
4071         if (ret)
4072                 goto out_register_subsys;
4073 
4074         ret = xfrm6_init();
4075         if (ret)
4076                 goto out_fib6_init;
4077 
4078         ret = fib6_rules_init();
4079         if (ret)
4080                 goto xfrm6_init;
4081 
4082         ret = register_pernet_subsys(&ip6_route_net_late_ops);
4083         if (ret)
4084                 goto fib6_rules_init;
4085 
4086         ret = -ENOBUFS;
4087         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
4088             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
4089             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
4090                 goto out_register_late_subsys;
4091 
4092         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
4093         if (ret)
4094                 goto out_register_late_subsys;
4095 
4096         for_each_possible_cpu(cpu) {
4097                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
4098 
4099                 INIT_LIST_HEAD(&ul->head);
4100                 spin_lock_init(&ul->lock);
4101         }
4102 
4103 out:
4104         return ret;
4105 
4106 out_register_late_subsys:
4107         unregister_pernet_subsys(&ip6_route_net_late_ops);
4108 fib6_rules_init:
4109         fib6_rules_cleanup();
4110 xfrm6_init:
4111         xfrm6_fini();
4112 out_fib6_init:
4113         fib6_gc_cleanup();
4114 out_register_subsys:
4115         unregister_pernet_subsys(&ip6_route_net_ops);
4116 out_register_inetpeer:
4117         unregister_pernet_subsys(&ipv6_inetpeer_ops);
4118 out_dst_entries:
4119         dst_entries_destroy(&ip6_dst_blackhole_ops);
4120 out_kmem_cache:
4121         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4122         goto out;
4123 }
4124 
4125 void ip6_route_cleanup(void)
4126 {
4127         unregister_netdevice_notifier(&ip6_route_dev_notifier);
4128         unregister_pernet_subsys(&ip6_route_net_late_ops);
4129         fib6_rules_cleanup();
4130         xfrm6_fini();
4131         fib6_gc_cleanup();
4132         unregister_pernet_subsys(&ipv6_inetpeer_ops);
4133         unregister_pernet_subsys(&ip6_route_net_ops);
4134         dst_entries_destroy(&ip6_dst_blackhole_ops);
4135         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4136 }
4137 
