~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/net/ipv4/route.c

Version: ~ [ linux-5.11 ] ~ [ linux-5.10.17 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.99 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.176 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.221 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.257 ] ~ [ linux-4.8.17 ] ~ [ linux-4.7.10 ] ~ [ linux-4.6.7 ] ~ [ linux-4.5.7 ] ~ [ linux-4.4.257 ] ~ [ linux-4.3.6 ] ~ [ linux-4.2.8 ] ~ [ linux-4.1.52 ] ~ [ linux-4.0.9 ] ~ [ linux-3.19.8 ] ~ [ linux-3.18.140 ] ~ [ linux-3.17.8 ] ~ [ linux-3.16.85 ] ~ [ linux-3.15.10 ] ~ [ linux-3.14.79 ] ~ [ linux-3.13.11 ] ~ [ linux-3.12.74 ] ~ [ linux-3.11.10 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.5 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 // SPDX-License-Identifier: GPL-2.0-or-later
  2 /*
  3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
  4  *              operating system.  INET is implemented using the  BSD Socket
  5  *              interface as the means of communication with the user level.
  6  *
  7  *              ROUTE - implementation of the IP router.
  8  *
  9  * Authors:     Ross Biro
 10  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 11  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 12  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 13  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 14  *
 15  * Fixes:
 16  *              Alan Cox        :       Verify area fixes.
 17  *              Alan Cox        :       cli() protects routing changes
 18  *              Rui Oliveira    :       ICMP routing table updates
 19  *              (rco@di.uminho.pt)      Routing table insertion and update
 20  *              Linus Torvalds  :       Rewrote bits to be sensible
 21  *              Alan Cox        :       Added BSD route gw semantics
 22  *              Alan Cox        :       Super /proc >4K
 23  *              Alan Cox        :       MTU in route table
 24  *              Alan Cox        :       MSS actually. Also added the window
 25  *                                      clamper.
 26  *              Sam Lantinga    :       Fixed route matching in rt_del()
 27  *              Alan Cox        :       Routing cache support.
 28  *              Alan Cox        :       Removed compatibility cruft.
 29  *              Alan Cox        :       RTF_REJECT support.
 30  *              Alan Cox        :       TCP irtt support.
 31  *              Jonathan Naylor :       Added Metric support.
 32  *      Miquel van Smoorenburg  :       BSD API fixes.
 33  *      Miquel van Smoorenburg  :       Metrics.
 34  *              Alan Cox        :       Use __u32 properly
 35  *              Alan Cox        :       Aligned routing errors more closely with BSD
 36  *                                      our system is still very different.
 37  *              Alan Cox        :       Faster /proc handling
 38  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 39  *                                      routing caches and better behaviour.
 40  *
 41  *              Olaf Erb        :       irtt wasn't being copied right.
 42  *              Bjorn Ekwall    :       Kerneld route support.
 43  *              Alan Cox        :       Multicast fixed (I hope)
 44  *              Pavel Krauz     :       Limited broadcast fixed
 45  *              Mike McLagan    :       Routing by source
 46  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 47  *                                      route.c and rewritten from scratch.
 48  *              Andi Kleen      :       Load-limit warning messages.
 49  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 50  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 51  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 52  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 53  *              Marc Boucher    :       routing by fwmark
 54  *      Robert Olsson           :       Added rt_cache statistics
 55  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 56  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 57  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 58  *      Ilia Sotnikov           :       Removed TOS from hash calculations
 59  */
 60 
 61 #define pr_fmt(fmt) "IPv4: " fmt
 62 
 63 #include <linux/module.h>
 64 #include <linux/uaccess.h>
 65 #include <linux/bitops.h>
 66 #include <linux/types.h>
 67 #include <linux/kernel.h>
 68 #include <linux/mm.h>
 69 #include <linux/string.h>
 70 #include <linux/socket.h>
 71 #include <linux/sockios.h>
 72 #include <linux/errno.h>
 73 #include <linux/in.h>
 74 #include <linux/inet.h>
 75 #include <linux/netdevice.h>
 76 #include <linux/proc_fs.h>
 77 #include <linux/init.h>
 78 #include <linux/skbuff.h>
 79 #include <linux/inetdevice.h>
 80 #include <linux/igmp.h>
 81 #include <linux/pkt_sched.h>
 82 #include <linux/mroute.h>
 83 #include <linux/netfilter_ipv4.h>
 84 #include <linux/random.h>
 85 #include <linux/rcupdate.h>
 86 #include <linux/times.h>
 87 #include <linux/slab.h>
 88 #include <linux/jhash.h>
 89 #include <net/dst.h>
 90 #include <net/dst_metadata.h>
 91 #include <net/net_namespace.h>
 92 #include <net/protocol.h>
 93 #include <net/ip.h>
 94 #include <net/route.h>
 95 #include <net/inetpeer.h>
 96 #include <net/sock.h>
 97 #include <net/ip_fib.h>
 98 #include <net/nexthop.h>
 99 #include <net/arp.h>
100 #include <net/tcp.h>
101 #include <net/icmp.h>
102 #include <net/xfrm.h>
103 #include <net/lwtunnel.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #endif
109 #include <net/secure_seq.h>
110 #include <net/ip_tunnels.h>
111 #include <net/l3mdev.h>
112 
113 #include "fib_lookup.h"
114 
/* Keep only the IPTOS_RT_MASK bits and the RTO_ONLINK flag of a flowi4 TOS. */
#define RT_FL_TOS(fl4) \
        ((fl4)->flowi4_tos & (RTO_ONLINK | IPTOS_RT_MASK))
117 
118 #define RT_GC_TIMEOUT (300*HZ)
119 
120 static int ip_rt_max_size;
121 static int ip_rt_redirect_number __read_mostly  = 9;
122 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
123 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
124 static int ip_rt_error_cost __read_mostly       = HZ;
125 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
126 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
127 static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
128 static int ip_rt_min_advmss __read_mostly       = 256;
129 
130 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
131 
132 /*
133  *      Interface to generic destination cache.
134  */
135 
136 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
137 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
138 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
139 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
140 static void              ipv4_link_failure(struct sk_buff *skb);
141 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
142                                            struct sk_buff *skb, u32 mtu,
143                                            bool confirm_neigh);
144 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
145                                         struct sk_buff *skb);
146 static void             ipv4_dst_destroy(struct dst_entry *dst);
147 
148 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
149 {
150         WARN_ON(1);
151         return NULL;
152 }
153 
154 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
155                                            struct sk_buff *skb,
156                                            const void *daddr);
157 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
158 
159 static struct dst_ops ipv4_dst_ops = {
160         .family =               AF_INET,
161         .check =                ipv4_dst_check,
162         .default_advmss =       ipv4_default_advmss,
163         .mtu =                  ipv4_mtu,
164         .cow_metrics =          ipv4_cow_metrics,
165         .destroy =              ipv4_dst_destroy,
166         .negative_advice =      ipv4_negative_advice,
167         .link_failure =         ipv4_link_failure,
168         .update_pmtu =          ip_rt_update_pmtu,
169         .redirect =             ip_do_redirect,
170         .local_out =            __ip_local_out,
171         .neigh_lookup =         ipv4_neigh_lookup,
172         .confirm_neigh =        ipv4_confirm_neigh,
173 };
174 
175 #define ECN_OR_COST(class)      TC_PRIO_##class
176 
177 const __u8 ip_tos2prio[16] = {
178         TC_PRIO_BESTEFFORT,
179         ECN_OR_COST(BESTEFFORT),
180         TC_PRIO_BESTEFFORT,
181         ECN_OR_COST(BESTEFFORT),
182         TC_PRIO_BULK,
183         ECN_OR_COST(BULK),
184         TC_PRIO_BULK,
185         ECN_OR_COST(BULK),
186         TC_PRIO_INTERACTIVE,
187         ECN_OR_COST(INTERACTIVE),
188         TC_PRIO_INTERACTIVE,
189         ECN_OR_COST(INTERACTIVE),
190         TC_PRIO_INTERACTIVE_BULK,
191         ECN_OR_COST(INTERACTIVE_BULK),
192         TC_PRIO_INTERACTIVE_BULK,
193         ECN_OR_COST(INTERACTIVE_BULK)
194 };
195 EXPORT_SYMBOL(ip_tos2prio);
196 
197 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
198 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
199 
200 #ifdef CONFIG_PROC_FS
201 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
202 {
203         if (*pos)
204                 return NULL;
205         return SEQ_START_TOKEN;
206 }
207 
208 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
209 {
210         ++*pos;
211         return NULL;
212 }
213 
/* seq_file ->stop: ->start acquires nothing, so there is nothing to undo. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
217 
218 static int rt_cache_seq_show(struct seq_file *seq, void *v)
219 {
220         if (v == SEQ_START_TOKEN)
221                 seq_printf(seq, "%-127s\n",
222                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
223                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
224                            "HHUptod\tSpecDst");
225         return 0;
226 }
227 
228 static const struct seq_operations rt_cache_seq_ops = {
229         .start  = rt_cache_seq_start,
230         .next   = rt_cache_seq_next,
231         .stop   = rt_cache_seq_stop,
232         .show   = rt_cache_seq_show,
233 };
234 
235 static int rt_cache_seq_open(struct inode *inode, struct file *file)
236 {
237         return seq_open(file, &rt_cache_seq_ops);
238 }
239 
240 static const struct file_operations rt_cache_seq_fops = {
241         .open    = rt_cache_seq_open,
242         .read    = seq_read,
243         .llseek  = seq_lseek,
244         .release = seq_release,
245 };
246 
247 
248 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
249 {
250         int cpu;
251 
252         if (*pos == 0)
253                 return SEQ_START_TOKEN;
254 
255         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
256                 if (!cpu_possible(cpu))
257                         continue;
258                 *pos = cpu+1;
259                 return &per_cpu(rt_cache_stat, cpu);
260         }
261         return NULL;
262 }
263 
264 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
265 {
266         int cpu;
267 
268         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
269                 if (!cpu_possible(cpu))
270                         continue;
271                 *pos = cpu+1;
272                 return &per_cpu(rt_cache_stat, cpu);
273         }
274         (*pos)++;
275         return NULL;
276 
277 }
278 
/* seq_file ->stop: the iterator holds no resources, nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
283 
284 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
285 {
286         struct rt_cache_stat *st = v;
287 
288         if (v == SEQ_START_TOKEN) {
289                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
290                 return 0;
291         }
292 
293         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
294                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
295                    dst_entries_get_slow(&ipv4_dst_ops),
296                    0, /* st->in_hit */
297                    st->in_slow_tot,
298                    st->in_slow_mc,
299                    st->in_no_route,
300                    st->in_brd,
301                    st->in_martian_dst,
302                    st->in_martian_src,
303 
304                    0, /* st->out_hit */
305                    st->out_slow_tot,
306                    st->out_slow_mc,
307 
308                    0, /* st->gc_total */
309                    0, /* st->gc_ignored */
310                    0, /* st->gc_goal_miss */
311                    0, /* st->gc_dst_overflow */
312                    0, /* st->in_hlist_search */
313                    0  /* st->out_hlist_search */
314                 );
315         return 0;
316 }
317 
318 static const struct seq_operations rt_cpu_seq_ops = {
319         .start  = rt_cpu_seq_start,
320         .next   = rt_cpu_seq_next,
321         .stop   = rt_cpu_seq_stop,
322         .show   = rt_cpu_seq_show,
323 };
324 
325 
326 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
327 {
328         return seq_open(file, &rt_cpu_seq_ops);
329 }
330 
331 static const struct file_operations rt_cpu_seq_fops = {
332         .open    = rt_cpu_seq_open,
333         .read    = seq_read,
334         .llseek  = seq_lseek,
335         .release = seq_release,
336 };
337 
338 #ifdef CONFIG_IP_ROUTE_CLASSID
339 static int rt_acct_proc_show(struct seq_file *m, void *v)
340 {
341         struct ip_rt_acct *dst, *src;
342         unsigned int i, j;
343 
344         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
345         if (!dst)
346                 return -ENOMEM;
347 
348         for_each_possible_cpu(i) {
349                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
350                 for (j = 0; j < 256; j++) {
351                         dst[j].o_bytes   += src[j].o_bytes;
352                         dst[j].o_packets += src[j].o_packets;
353                         dst[j].i_bytes   += src[j].i_bytes;
354                         dst[j].i_packets += src[j].i_packets;
355                 }
356         }
357 
358         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
359         kfree(dst);
360         return 0;
361 }
362 #endif
363 
364 static int __net_init ip_rt_do_proc_init(struct net *net)
365 {
366         struct proc_dir_entry *pde;
367 
368         pde = proc_create("rt_cache", 0444, net->proc_net,
369                           &rt_cache_seq_fops);
370         if (!pde)
371                 goto err1;
372 
373         pde = proc_create("rt_cache", 0444,
374                           net->proc_net_stat, &rt_cpu_seq_fops);
375         if (!pde)
376                 goto err2;
377 
378 #ifdef CONFIG_IP_ROUTE_CLASSID
379         pde = proc_create_single("rt_acct", 0, net->proc_net,
380                         rt_acct_proc_show);
381         if (!pde)
382                 goto err3;
383 #endif
384         return 0;
385 
386 #ifdef CONFIG_IP_ROUTE_CLASSID
387 err3:
388         remove_proc_entry("rt_cache", net->proc_net_stat);
389 #endif
390 err2:
391         remove_proc_entry("rt_cache", net->proc_net);
392 err1:
393         return -ENOMEM;
394 }
395 
396 static void __net_exit ip_rt_do_proc_exit(struct net *net)
397 {
398         remove_proc_entry("rt_cache", net->proc_net_stat);
399         remove_proc_entry("rt_cache", net->proc_net);
400 #ifdef CONFIG_IP_ROUTE_CLASSID
401         remove_proc_entry("rt_acct", net->proc_net);
402 #endif
403 }
404 
405 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
406         .init = ip_rt_do_proc_init,
407         .exit = ip_rt_do_proc_exit,
408 };
409 
410 static int __init ip_rt_proc_init(void)
411 {
412         return register_pernet_subsys(&ip_rt_proc_ops);
413 }
414 
415 #else
/* Without CONFIG_PROC_FS there is nothing to register: report success. */
static inline int ip_rt_proc_init(void)
{
        return 0;
}
420 #endif /* CONFIG_PROC_FS */
421 
422 static inline bool rt_is_expired(const struct rtable *rth)
423 {
424         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
425 }
426 
/*
 * Bump the IPv4 route generation id of @net, so that rt_is_expired()
 * reports routes stamped with the previous id as expired.
 */
void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}
431 
432 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
433                                            struct sk_buff *skb,
434                                            const void *daddr)
435 {
436         const struct rtable *rt = container_of(dst, struct rtable, dst);
437         struct net_device *dev = dst->dev;
438         struct neighbour *n;
439 
440         rcu_read_lock_bh();
441 
442         if (likely(rt->rt_gw_family == AF_INET)) {
443                 n = ip_neigh_gw4(dev, rt->rt_gw4);
444         } else if (rt->rt_gw_family == AF_INET6) {
445                 n = ip_neigh_gw6(dev, &rt->rt_gw6);
446         } else {
447                 __be32 pkey;
448 
449                 pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
450                 n = ip_neigh_gw4(dev, pkey);
451         }
452 
453         if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
454                 n = NULL;
455 
456         rcu_read_unlock_bh();
457 
458         return n;
459 }
460 
461 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
462 {
463         const struct rtable *rt = container_of(dst, struct rtable, dst);
464         struct net_device *dev = dst->dev;
465         const __be32 *pkey = daddr;
466 
467         if (rt->rt_gw_family == AF_INET) {
468                 pkey = (const __be32 *)&rt->rt_gw4;
469         } else if (rt->rt_gw_family == AF_INET6) {
470                 return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
471         } else if (!daddr ||
472                  (rt->rt_flags &
473                   (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
474                 return;
475         }
476         __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
477 }
478 
479 #define IP_IDENTS_SZ 2048u
480 
481 static atomic_t *ip_idents __read_mostly;
482 static u32 *ip_tstamps __read_mostly;
483 
484 /* In order to protect privacy, we add a perturbation to identifiers
485  * if one generator is seldom used. This makes hard for an attacker
486  * to infer how many packets were sent between two points in time.
487  */
488 u32 ip_idents_reserve(u32 hash, int segs)
489 {
490         u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
491         atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
492         u32 old = READ_ONCE(*p_tstamp);
493         u32 now = (u32)jiffies;
494         u32 new, delta = 0;
495 
496         if (old != now && cmpxchg(p_tstamp, old, now) == old)
497                 delta = prandom_u32_max(now - old);
498 
499         /* Do not use atomic_add_return() as it makes UBSAN unhappy */
500         do {
501                 old = (u32)atomic_read(p_id);
502                 new = old + delta + segs;
503         } while (atomic_cmpxchg(p_id, old, new) != old);
504 
505         return new - segs;
506 }
507 EXPORT_SYMBOL(ip_idents_reserve);
508 
509 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
510 {
511         u32 hash, id;
512 
513         /* Note the following code is not safe, but this is okay. */
514         if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
515                 get_random_bytes(&net->ipv4.ip_id_key,
516                                  sizeof(net->ipv4.ip_id_key));
517 
518         hash = siphash_3u32((__force u32)iph->daddr,
519                             (__force u32)iph->saddr,
520                             iph->protocol,
521                             &net->ipv4.ip_id_key);
522         id = ip_idents_reserve(hash, segs);
523         iph->id = htons(id);
524 }
525 EXPORT_SYMBOL(__ip_select_ident);
526 
527 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
528                              const struct sock *sk,
529                              const struct iphdr *iph,
530                              int oif, u8 tos,
531                              u8 prot, u32 mark, int flow_flags)
532 {
533         if (sk) {
534                 const struct inet_sock *inet = inet_sk(sk);
535 
536                 oif = sk->sk_bound_dev_if;
537                 mark = sk->sk_mark;
538                 tos = RT_CONN_FLAGS(sk);
539                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
540         }
541         flowi4_init_output(fl4, oif, mark, tos,
542                            RT_SCOPE_UNIVERSE, prot,
543                            flow_flags,
544                            iph->daddr, iph->saddr, 0, 0,
545                            sock_net_uid(net, sk));
546 }
547 
548 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
549                                const struct sock *sk)
550 {
551         const struct net *net = dev_net(skb->dev);
552         const struct iphdr *iph = ip_hdr(skb);
553         int oif = skb->dev->ifindex;
554         u8 tos = RT_TOS(iph->tos);
555         u8 prot = iph->protocol;
556         u32 mark = skb->mark;
557 
558         __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
559 }
560 
561 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
562 {
563         const struct inet_sock *inet = inet_sk(sk);
564         const struct ip_options_rcu *inet_opt;
565         __be32 daddr = inet->inet_daddr;
566 
567         rcu_read_lock();
568         inet_opt = rcu_dereference(inet->inet_opt);
569         if (inet_opt && inet_opt->opt.srr)
570                 daddr = inet_opt->opt.faddr;
571         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
572                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
573                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
574                            inet_sk_flowi_flags(sk),
575                            daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
576         rcu_read_unlock();
577 }
578 
/* Build @fl4 from @skb when one is supplied, otherwise from @sk alone. */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (!skb) {
                build_sk_flow_key(fl4, sk);
                return;
        }

        build_skb_flow_key(fl4, skb, sk);
}
587 
/* Held (BH-disabled) by update_or_create_fnhe() while it edits exceptions. */
static DEFINE_SPINLOCK(fnhe_lock);
589 
590 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
591 {
592         struct rtable *rt;
593 
594         rt = rcu_dereference(fnhe->fnhe_rth_input);
595         if (rt) {
596                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
597                 dst_dev_put(&rt->dst);
598                 dst_release(&rt->dst);
599         }
600         rt = rcu_dereference(fnhe->fnhe_rth_output);
601         if (rt) {
602                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
603                 dst_dev_put(&rt->dst);
604                 dst_release(&rt->dst);
605         }
606 }
607 
608 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
609 {
610         struct fib_nh_exception *fnhe, *oldest;
611 
612         oldest = rcu_dereference(hash->chain);
613         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
614              fnhe = rcu_dereference(fnhe->fnhe_next)) {
615                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
616                         oldest = fnhe;
617         }
618         fnhe_flush_routes(oldest);
619         return oldest;
620 }
621 
622 static inline u32 fnhe_hashfun(__be32 daddr)
623 {
624         static u32 fnhe_hashrnd __read_mostly;
625         u32 hval;
626 
627         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
628         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
629         return hash_32(hval, FNHE_HASH_SHIFT);
630 }
631 
632 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
633 {
634         rt->rt_pmtu = fnhe->fnhe_pmtu;
635         rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
636         rt->dst.expires = fnhe->fnhe_expires;
637 
638         if (fnhe->fnhe_gw) {
639                 rt->rt_flags |= RTCF_REDIRECTED;
640                 rt->rt_uses_gateway = 1;
641                 rt->rt_gw_family = AF_INET;
642                 rt->rt_gw4 = fnhe->fnhe_gw;
643         }
644 }
645 
646 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
647                                   __be32 gw, u32 pmtu, bool lock,
648                                   unsigned long expires)
649 {
650         struct fnhe_hash_bucket *hash;
651         struct fib_nh_exception *fnhe;
652         struct rtable *rt;
653         u32 genid, hval;
654         unsigned int i;
655         int depth;
656 
657         genid = fnhe_genid(dev_net(nhc->nhc_dev));
658         hval = fnhe_hashfun(daddr);
659 
660         spin_lock_bh(&fnhe_lock);
661 
662         hash = rcu_dereference(nhc->nhc_exceptions);
663         if (!hash) {
664                 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
665                 if (!hash)
666                         goto out_unlock;
667                 rcu_assign_pointer(nhc->nhc_exceptions, hash);
668         }
669 
670         hash += hval;
671 
672         depth = 0;
673         for (fnhe = rcu_dereference(hash->chain); fnhe;
674              fnhe = rcu_dereference(fnhe->fnhe_next)) {
675                 if (fnhe->fnhe_daddr == daddr)
676                         break;
677                 depth++;
678         }
679 
680         if (fnhe) {
681                 if (fnhe->fnhe_genid != genid)
682                         fnhe->fnhe_genid = genid;
683                 if (gw)
684                         fnhe->fnhe_gw = gw;
685                 if (pmtu) {
686                         fnhe->fnhe_pmtu = pmtu;
687                         fnhe->fnhe_mtu_locked = lock;
688                 }
689                 fnhe->fnhe_expires = max(1UL, expires);
690                 /* Update all cached dsts too */
691                 rt = rcu_dereference(fnhe->fnhe_rth_input);
692                 if (rt)
693                         fill_route_from_fnhe(rt, fnhe);
694                 rt = rcu_dereference(fnhe->fnhe_rth_output);
695                 if (rt)
696                         fill_route_from_fnhe(rt, fnhe);
697         } else {
698                 if (depth > FNHE_RECLAIM_DEPTH)
699                         fnhe = fnhe_oldest(hash);
700                 else {
701                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
702                         if (!fnhe)
703                                 goto out_unlock;
704 
705                         fnhe->fnhe_next = hash->chain;
706                         rcu_assign_pointer(hash->chain, fnhe);
707                 }
708                 fnhe->fnhe_genid = genid;
709                 fnhe->fnhe_daddr = daddr;
710                 fnhe->fnhe_gw = gw;
711                 fnhe->fnhe_pmtu = pmtu;
712                 fnhe->fnhe_mtu_locked = lock;
713                 fnhe->fnhe_expires = max(1UL, expires);
714 
715                 /* Exception created; mark the cached routes for the nexthop
716                  * stale, so anyone caching it rechecks if this exception
717                  * applies to them.
718                  */
719                 rt = rcu_dereference(nhc->nhc_rth_input);
720                 if (rt)
721                         rt->dst.obsolete = DST_OBSOLETE_KILL;
722 
723                 for_each_possible_cpu(i) {
724                         struct rtable __rcu **prt;
725                         prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
726                         rt = rcu_dereference(*prt);
727                         if (rt)
728                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
729                 }
730         }
731 
732         fnhe->fnhe_stamp = jiffies;
733 
734 out_unlock:
735         spin_unlock_bh(&fnhe_lock);
736 }
737 
738 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
739                              bool kill_route)
740 {
741         __be32 new_gw = icmp_hdr(skb)->un.gateway;
742         __be32 old_gw = ip_hdr(skb)->saddr;
743         struct net_device *dev = skb->dev;
744         struct in_device *in_dev;
745         struct fib_result res;
746         struct neighbour *n;
747         struct net *net;
748 
749         switch (icmp_hdr(skb)->code & 7) {
750         case ICMP_REDIR_NET:
751         case ICMP_REDIR_NETTOS:
752         case ICMP_REDIR_HOST:
753         case ICMP_REDIR_HOSTTOS:
754                 break;
755 
756         default:
757                 return;
758         }
759 
760         if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
761                 return;
762 
763         in_dev = __in_dev_get_rcu(dev);
764         if (!in_dev)
765                 return;
766 
767         net = dev_net(dev);
768         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
769             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
770             ipv4_is_zeronet(new_gw))
771                 goto reject_redirect;
772 
773         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
774                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
775                         goto reject_redirect;
776                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
777                         goto reject_redirect;
778         } else {
779                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
780                         goto reject_redirect;
781         }
782 
783         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
784         if (!n)
785                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
786         if (!IS_ERR(n)) {
787                 if (!(n->nud_state & NUD_VALID)) {
788                         neigh_event_send(n, NULL);
789                 } else {
790                         if (fib_lookup(net, fl4, &res, 0) == 0) {
791                                 struct fib_nh_common *nhc = FIB_RES_NHC(res);
792 
793                                 update_or_create_fnhe(nhc, fl4->daddr, new_gw,
794                                                 0, false,
795                                                 jiffies + ip_rt_gc_timeout);
796                         }
797                         if (kill_route)
798                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
799                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
800                 }
801                 neigh_release(n);
802         }
803         return;
804 
805 reject_redirect:
806 #ifdef CONFIG_IP_ROUTE_VERBOSE
807         if (IN_DEV_LOG_MARTIANS(in_dev)) {
808                 const struct iphdr *iph = (const struct iphdr *) skb->data;
809                 __be32 daddr = iph->daddr;
810                 __be32 saddr = iph->saddr;
811 
812                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
813                                      "  Advised path = %pI4 -> %pI4\n",
814                                      &old_gw, dev->name, &new_gw,
815                                      &saddr, &daddr);
816         }
817 #endif
818         ;
819 }
820 
821 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
822 {
823         struct rtable *rt;
824         struct flowi4 fl4;
825         const struct iphdr *iph = (const struct iphdr *) skb->data;
826         struct net *net = dev_net(skb->dev);
827         int oif = skb->dev->ifindex;
828         u8 tos = RT_TOS(iph->tos);
829         u8 prot = iph->protocol;
830         u32 mark = skb->mark;
831 
832         rt = (struct rtable *) dst;
833 
834         __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
835         __ip_do_redirect(rt, skb, &fl4, true);
836 }
837 
838 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
839 {
840         struct rtable *rt = (struct rtable *)dst;
841         struct dst_entry *ret = dst;
842 
843         if (rt) {
844                 if (dst->obsolete > 0) {
845                         ip_rt_put(rt);
846                         ret = NULL;
847                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
848                            rt->dst.expires) {
849                         ip_rt_put(rt);
850                         ret = NULL;
851                 }
852         }
853         return ret;
854 }
855 
856 /*
857  * Algorithm:
858  *      1. The first ip_rt_redirect_number redirects are sent
859  *         with exponential backoff, then we stop sending them at all,
860  *         assuming that the host ignores our redirects.
861  *      2. If we did not see packets requiring redirects
862  *         during ip_rt_redirect_silence, we assume that the host
863  *         forgot redirected route and start to send redirects again.
864  *
865  * This algorithm is much cheaper and more intelligent than dumb load limiting
866  * in icmp.c.
867  *
868  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
869  * and "frag. need" (breaks PMTU discovery) in icmp.c.
870  */
871 
872 void ip_rt_send_redirect(struct sk_buff *skb)
873 {
874         struct rtable *rt = skb_rtable(skb);
875         struct in_device *in_dev;
876         struct inet_peer *peer;
877         struct net *net;
878         int log_martians;
879         int vif;
880 
881         rcu_read_lock();
882         in_dev = __in_dev_get_rcu(rt->dst.dev);
883         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
884                 rcu_read_unlock();
885                 return;
886         }
887         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
888         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
889         rcu_read_unlock();
890 
891         net = dev_net(rt->dst.dev);
892         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
893         if (!peer) {
894                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
895                           rt_nexthop(rt, ip_hdr(skb)->daddr));
896                 return;
897         }
898 
899         /* No redirected packets during ip_rt_redirect_silence;
900          * reset the algorithm.
901          */
902         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
903                 peer->rate_tokens = 0;
904                 peer->n_redirects = 0;
905         }
906 
907         /* Too many ignored redirects; do not send anything
908          * set dst.rate_last to the last seen redirected packet.
909          */
910         if (peer->n_redirects >= ip_rt_redirect_number) {
911                 peer->rate_last = jiffies;
912                 goto out_put_peer;
913         }
914 
915         /* Check for load limit; set rate_last to the latest sent
916          * redirect.
917          */
918         if (peer->rate_tokens == 0 ||
919             time_after(jiffies,
920                        (peer->rate_last +
921                         (ip_rt_redirect_load << peer->n_redirects)))) {
922                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
923 
924                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
925                 peer->rate_last = jiffies;
926                 ++peer->n_redirects;
927 #ifdef CONFIG_IP_ROUTE_VERBOSE
928                 if (log_martians &&
929                     peer->n_redirects == ip_rt_redirect_number)
930                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
931                                              &ip_hdr(skb)->saddr, inet_iif(skb),
932                                              &ip_hdr(skb)->daddr, &gw);
933 #endif
934         }
935 out_put_peer:
936         inet_putpeer(peer);
937 }
938 
939 static int ip_error(struct sk_buff *skb)
940 {
941         struct rtable *rt = skb_rtable(skb);
942         struct net_device *dev = skb->dev;
943         struct in_device *in_dev;
944         struct inet_peer *peer;
945         unsigned long now;
946         struct net *net;
947         bool send;
948         int code;
949 
950         if (netif_is_l3_master(skb->dev)) {
951                 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
952                 if (!dev)
953                         goto out;
954         }
955 
956         in_dev = __in_dev_get_rcu(dev);
957 
958         /* IP on this device is disabled. */
959         if (!in_dev)
960                 goto out;
961 
962         net = dev_net(rt->dst.dev);
963         if (!IN_DEV_FORWARD(in_dev)) {
964                 switch (rt->dst.error) {
965                 case EHOSTUNREACH:
966                         __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
967                         break;
968 
969                 case ENETUNREACH:
970                         __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
971                         break;
972                 }
973                 goto out;
974         }
975 
976         switch (rt->dst.error) {
977         case EINVAL:
978         default:
979                 goto out;
980         case EHOSTUNREACH:
981                 code = ICMP_HOST_UNREACH;
982                 break;
983         case ENETUNREACH:
984                 code = ICMP_NET_UNREACH;
985                 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
986                 break;
987         case EACCES:
988                 code = ICMP_PKT_FILTERED;
989                 break;
990         }
991 
992         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
993                                l3mdev_master_ifindex(skb->dev), 1);
994 
995         send = true;
996         if (peer) {
997                 now = jiffies;
998                 peer->rate_tokens += now - peer->rate_last;
999                 if (peer->rate_tokens > ip_rt_error_burst)
1000                         peer->rate_tokens = ip_rt_error_burst;
1001                 peer->rate_last = now;
1002                 if (peer->rate_tokens >= ip_rt_error_cost)
1003                         peer->rate_tokens -= ip_rt_error_cost;
1004                 else
1005                         send = false;
1006                 inet_putpeer(peer);
1007         }
1008         if (send)
1009                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1010 
1011 out:    kfree_skb(skb);
1012         return 0;
1013 }
1014 
1015 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1016 {
1017         struct dst_entry *dst = &rt->dst;
1018         u32 old_mtu = ipv4_mtu(dst);
1019         struct fib_result res;
1020         bool lock = false;
1021 
1022         if (ip_mtu_locked(dst))
1023                 return;
1024 
1025         if (old_mtu < mtu)
1026                 return;
1027 
1028         if (mtu < ip_rt_min_pmtu) {
1029                 lock = true;
1030                 mtu = min(old_mtu, ip_rt_min_pmtu);
1031         }
1032 
1033         if (rt->rt_pmtu == mtu && !lock &&
1034             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1035                 return;
1036 
1037         rcu_read_lock();
1038         if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1039                 struct fib_nh_common *nhc = FIB_RES_NHC(res);
1040 
1041                 update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1042                                       jiffies + ip_rt_mtu_expires);
1043         }
1044         rcu_read_unlock();
1045 }
1046 
1047 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1048                               struct sk_buff *skb, u32 mtu,
1049                               bool confirm_neigh)
1050 {
1051         struct rtable *rt = (struct rtable *) dst;
1052         struct flowi4 fl4;
1053 
1054         ip_rt_build_flow_key(&fl4, sk, skb);
1055         __ip_rt_update_pmtu(rt, &fl4, mtu);
1056 }
1057 
1058 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1059                       int oif, u8 protocol)
1060 {
1061         const struct iphdr *iph = (const struct iphdr *) skb->data;
1062         struct flowi4 fl4;
1063         struct rtable *rt;
1064         u32 mark = IP4_REPLY_MARK(net, skb->mark);
1065 
1066         __build_flow_key(net, &fl4, NULL, iph, oif,
1067                          RT_TOS(iph->tos), protocol, mark, 0);
1068         rt = __ip_route_output_key(net, &fl4);
1069         if (!IS_ERR(rt)) {
1070                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1071                 ip_rt_put(rt);
1072         }
1073 }
1074 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1075 
1076 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1077 {
1078         const struct iphdr *iph = (const struct iphdr *) skb->data;
1079         struct flowi4 fl4;
1080         struct rtable *rt;
1081 
1082         __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1083 
1084         if (!fl4.flowi4_mark)
1085                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1086 
1087         rt = __ip_route_output_key(sock_net(sk), &fl4);
1088         if (!IS_ERR(rt)) {
1089                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1090                 ip_rt_put(rt);
1091         }
1092 }
1093 
/*
 * Update the path MTU for a socket's flow and, where needed, replace the
 * socket's cached route.
 *
 * Runs with the socket bh-locked.  Nothing happens if the socket does not
 * accept PMTU updates.  When the socket is owned by user context, or has
 * no cached dst, the update is done against a freshly looked-up route via
 * __ipv4_sk_update_pmtu() and the socket's dst is left alone.
 *
 * Otherwise the cached dst is revalidated (a new route is looked up if it
 * is obsolete and fails ->check()), the MTU is recorded on the route's
 * xfrm path, and the route is checked again: if the update invalidated
 * it, one more lookup is done.  A route obtained by lookup ("new") is
 * installed on the socket with sk_dst_set().
 *
 * "odst" holds the reference taken by sk_dst_get() and is released on
 * every exit path at the "out" label.
 */
1094 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1095 {
1096         const struct iphdr *iph = (const struct iphdr *) skb->data;
1097         struct flowi4 fl4;
1098         struct rtable *rt;
1099         struct dst_entry *odst = NULL;
1100         bool new = false;
1101         struct net *net = sock_net(sk);
1102 
1103         bh_lock_sock(sk);
1104 
1105         if (!ip_sk_accept_pmtu(sk))
1106                 goto out;
1107 
1108         odst = sk_dst_get(sk);
1109 
1110         if (sock_owned_by_user(sk) || !odst) {
1111                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1112                 goto out;
1113         }
1114 
1115         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1116 
1117         rt = (struct rtable *)odst;
             /* Cached route no longer valid: work on a fresh one instead. */
1118         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1119                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1120                 if (IS_ERR(rt))
1121                         goto out;
1122 
1123                 new = true;
1124         }
1125 
1126         __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1127 
             /* Re-check after the update; drop our own lookup result before
              * replacing it so its reference is not leaked.
              */
1128         if (!dst_check(&rt->dst, 0)) {
1129                 if (new)
1130                         dst_release(&rt->dst);
1131 
1132                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1133                 if (IS_ERR(rt))
1134                         goto out;
1135 
1136                 new = true;
1137         }
1138 
1139         if (new)
1140                 sk_dst_set(sk, &rt->dst);
1141 
1142 out:
1143         bh_unlock_sock(sk);
1144         dst_release(odst);
1145 }
1146 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1147 
1148 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1149                    int oif, u8 protocol)
1150 {
1151         const struct iphdr *iph = (const struct iphdr *) skb->data;
1152         struct flowi4 fl4;
1153         struct rtable *rt;
1154 
1155         __build_flow_key(net, &fl4, NULL, iph, oif,
1156                          RT_TOS(iph->tos), protocol, 0, 0);
1157         rt = __ip_route_output_key(net, &fl4);
1158         if (!IS_ERR(rt)) {
1159                 __ip_do_redirect(rt, skb, &fl4, false);
1160                 ip_rt_put(rt);
1161         }
1162 }
1163 EXPORT_SYMBOL_GPL(ipv4_redirect);
1164 
1165 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1166 {
1167         const struct iphdr *iph = (const struct iphdr *) skb->data;
1168         struct flowi4 fl4;
1169         struct rtable *rt;
1170         struct net *net = sock_net(sk);
1171 
1172         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1173         rt = __ip_route_output_key(net, &fl4);
1174         if (!IS_ERR(rt)) {
1175                 __ip_do_redirect(rt, skb, &fl4, false);
1176                 ip_rt_put(rt);
1177         }
1178 }
1179 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1180 
1181 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1182 {
1183         struct rtable *rt = (struct rtable *) dst;
1184 
1185         /* All IPV4 dsts are created with ->obsolete set to the value
1186          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1187          * into this function always.
1188          *
1189          * When a PMTU/redirect information update invalidates a route,
1190          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1191          * DST_OBSOLETE_DEAD.
1192          */
1193         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1194                 return NULL;
1195         return dst;
1196 }
1197 
1198 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1199 {
1200         struct ip_options opt;
1201         int res;
1202 
1203         /* Recompile ip options since IPCB may not be valid anymore.
1204          * Also check we have a reasonable ipv4 header.
1205          */
1206         if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1207             ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1208                 return;
1209 
1210         memset(&opt, 0, sizeof(opt));
1211         if (ip_hdr(skb)->ihl > 5) {
1212                 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1213                         return;
1214                 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1215 
1216                 rcu_read_lock();
1217                 res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1218                 rcu_read_unlock();
1219 
1220                 if (res)
1221                         return;
1222         }
1223         __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1224 }
1225 
1226 static void ipv4_link_failure(struct sk_buff *skb)
1227 {
1228         struct rtable *rt;
1229 
1230         ipv4_send_dest_unreach(skb);
1231 
1232         rt = skb_rtable(skb);
1233         if (rt)
1234                 dst_set_expires(&rt->dst, 0);
1235 }
1236 
1237 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1238 {
1239         pr_debug("%s: %pI4 -> %pI4, %s\n",
1240                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1241                  skb->dev ? skb->dev->name : "?");
1242         kfree_skb(skb);
1243         WARN_ON(1);
1244         return 0;
1245 }
1246 
1247 /*
1248    We do not cache source address of outgoing interface,
1249    because it is used only by IP RR, TS and SRR options,
1250    so that it out of fast path.
1251 
1252    BTW remember: "addr" is allowed to be not aligned
1253    in IP options!
1254  */
1255 
1256 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1257 {
1258         __be32 src;
1259 
1260         if (rt_is_output_route(rt))
1261                 src = ip_hdr(skb)->saddr;
1262         else {
1263                 struct fib_result res;
1264                 struct iphdr *iph = ip_hdr(skb);
1265                 struct flowi4 fl4 = {
1266                         .daddr = iph->daddr,
1267                         .saddr = iph->saddr,
1268                         .flowi4_tos = RT_TOS(iph->tos),
1269                         .flowi4_oif = rt->dst.dev->ifindex,
1270                         .flowi4_iif = skb->dev->ifindex,
1271                         .flowi4_mark = skb->mark,
1272                 };
1273 
1274                 rcu_read_lock();
1275                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1276                         src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1277                 else
1278                         src = inet_select_addr(rt->dst.dev,
1279                                                rt_nexthop(rt, iph->daddr),
1280                                                RT_SCOPE_UNIVERSE);
1281                 rcu_read_unlock();
1282         }
1283         memcpy(addr, &src, 4);
1284 }
1285 
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid, one 16-bit half at a time.
 * A half that is already non-zero in the route is left as it is.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        u32 low_half = rt->dst.tclassid & 0xFFFF;

        if (low_half == 0)
                rt->dst.tclassid |= tag & 0xFFFF;

        if ((rt->dst.tclassid & 0xFFFF0000) == 0)
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
1295 
1296 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1297 {
1298         unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1299         unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1300                                     ip_rt_min_advmss);
1301 
1302         return min(advmss, IPV4_MAX_PMTU - header_size);
1303 }
1304 
1305 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1306 {
1307         const struct rtable *rt = (const struct rtable *) dst;
1308         unsigned int mtu = rt->rt_pmtu;
1309 
1310         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1311                 mtu = dst_metric_raw(dst, RTAX_MTU);
1312 
1313         if (mtu)
1314                 return mtu;
1315 
1316         mtu = READ_ONCE(dst->dev->mtu);
1317 
1318         if (unlikely(ip_mtu_locked(dst))) {
1319                 if (rt->rt_uses_gateway && mtu > 576)
1320                         mtu = 576;
1321         }
1322 
1323         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1324 
1325         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1326 }
1327 
1328 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1329 {
1330         struct fnhe_hash_bucket *hash;
1331         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1332         u32 hval = fnhe_hashfun(daddr);
1333 
1334         spin_lock_bh(&fnhe_lock);
1335 
1336         hash = rcu_dereference_protected(nhc->nhc_exceptions,
1337                                          lockdep_is_held(&fnhe_lock));
1338         hash += hval;
1339 
1340         fnhe_p = &hash->chain;
1341         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1342         while (fnhe) {
1343                 if (fnhe->fnhe_daddr == daddr) {
1344                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1345                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1346                         /* set fnhe_daddr to 0 to ensure it won't bind with
1347                          * new dsts in rt_bind_exception().
1348                          */
1349                         fnhe->fnhe_daddr = 0;
1350                         fnhe_flush_routes(fnhe);
1351                         kfree_rcu(fnhe, rcu);
1352                         break;
1353                 }
1354                 fnhe_p = &fnhe->fnhe_next;
1355                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1356                                                  lockdep_is_held(&fnhe_lock));
1357         }
1358 
1359         spin_unlock_bh(&fnhe_lock);
1360 }
1361 
1362 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1363                                                __be32 daddr)
1364 {
1365         struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1366         struct fib_nh_exception *fnhe;
1367         u32 hval;
1368 
1369         if (!hash)
1370                 return NULL;
1371 
1372         hval = fnhe_hashfun(daddr);
1373 
1374         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1375              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1376                 if (fnhe->fnhe_daddr == daddr) {
1377                         if (fnhe->fnhe_expires &&
1378                             time_after(jiffies, fnhe->fnhe_expires)) {
1379                                 ip_del_fnhe(nhc, daddr);
1380                                 break;
1381                         }
1382                         return fnhe;
1383                 }
1384         }
1385         return NULL;
1386 }
1387 
1388 /* MTU selection:
1389  * 1. mtu on route is locked - use it
1390  * 2. mtu from nexthop exception
1391  * 3. mtu from egress device
1392  */
1393 
1394 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1395 {
1396         struct fib_nh_common *nhc = res->nhc;
1397         struct net_device *dev = nhc->nhc_dev;
1398         struct fib_info *fi = res->fi;
1399         u32 mtu = 0;
1400 
1401         if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1402             fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1403                 mtu = fi->fib_mtu;
1404 
1405         if (likely(!mtu)) {
1406                 struct fib_nh_exception *fnhe;
1407 
1408                 fnhe = find_exception(nhc, daddr);
1409                 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1410                         mtu = fnhe->fnhe_pmtu;
1411         }
1412 
1413         if (likely(!mtu))
1414                 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1415 
1416         return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1417 }
1418 
/*
 * Bind route @rt to nexthop exception @fnhe for destination @daddr.
 *
 * Everything happens under fnhe_lock.  Nothing is done (and false is
 * returned) when @fnhe no longer belongs to @daddr; ip_del_fnhe() clears
 * fnhe_daddr on removal precisely so that a stale entry fails this test.
 *
 * If the exception's generation id differs from the namespace's current
 * one, its gateway, PMTU, expiry and lock state are reset and its cached
 * routes are flushed before use.  The exception's data is then copied
 * into @rt, and @daddr becomes the route's gateway if it has none.
 *
 * With @do_cache set, @rt (with an extra reference) replaces the route
 * cached in the exception's input or output slot, and the previous one is
 * released.
 *
 * Returns true only if @rt was stored in the exception.
 */
1419 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1420                               __be32 daddr, const bool do_cache)
1421 {
1422         bool ret = false;
1423 
1424         spin_lock_bh(&fnhe_lock);
1425 
1426         if (daddr == fnhe->fnhe_daddr) {
1427                 struct rtable __rcu **porig;
1428                 struct rtable *orig;
1429                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1430 
1431                 if (rt_is_input_route(rt))
1432                         porig = &fnhe->fnhe_rth_input;
1433                 else
1434                         porig = &fnhe->fnhe_rth_output;
1435                 orig = rcu_dereference(*porig);
1436 
                     /* Stale generation: wipe the exception's state.  The
                      * flush has dealt with the old cached routes, so there
                      * is no "orig" left to release below.
                      */
1437                 if (fnhe->fnhe_genid != genid) {
1438                         fnhe->fnhe_genid = genid;
1439                         fnhe->fnhe_gw = 0;
1440                         fnhe->fnhe_pmtu = 0;
1441                         fnhe->fnhe_expires = 0;
1442                         fnhe->fnhe_mtu_locked = false;
1443                         fnhe_flush_routes(fnhe);
1444                         orig = NULL;
1445                 }
1446                 fill_route_from_fnhe(rt, fnhe);
1447                 if (!rt->rt_gw4) {
1448                         rt->rt_gw4 = daddr;
1449                         rt->rt_gw_family = AF_INET;
1450                 }
1451 
1452                 if (do_cache) {
                             /* Take the cache's reference before publishing. */
1453                         dst_hold(&rt->dst);
1454                         rcu_assign_pointer(*porig, rt);
1455                         if (orig) {
1456                                 dst_dev_put(&orig->dst);
1457                                 dst_release(&orig->dst);
1458                         }
1459                         ret = true;
1460                 }
1461 
1462                 fnhe->fnhe_stamp = jiffies;
1463         }
1464         spin_unlock_bh(&fnhe_lock);
1465 
1466         return ret;
1467 }
1468 
1469 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1470 {
1471         struct rtable *orig, *prev, **p;
1472         bool ret = true;
1473 
1474         if (rt_is_input_route(rt)) {
1475                 p = (struct rtable **)&nhc->nhc_rth_input;
1476         } else {
1477                 p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1478         }
1479         orig = *p;
1480 
1481         /* hold dst before doing cmpxchg() to avoid race condition
1482          * on this dst
1483          */
1484         dst_hold(&rt->dst);
1485         prev = cmpxchg(p, orig, rt);
1486         if (prev == orig) {
1487                 if (orig) {
1488                         rt_add_uncached_list(orig);
1489                         dst_release(&orig->dst);
1490                 }
1491         } else {
1492                 dst_release(&rt->dst);
1493                 ret = false;
1494         }
1495 
1496         return ret;
1497 }
1498 
/*
 * Per-CPU list of routes that are not held in a nexthop or exception
 * cache.  Routes are linked through rtable::rt_uncached and remember the
 * list they sit on in rtable::rt_uncached_list, so they can be unlinked
 * from any CPU.
 */
1499 struct uncached_list {
1500         spinlock_t              lock;      /* protects head */
1501         struct list_head        head;      /* routes linked via rt_uncached */
1502 };
1503 
1504 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1505 
1506 void rt_add_uncached_list(struct rtable *rt)
1507 {
1508         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1509 
1510         rt->rt_uncached_list = ul;
1511 
1512         spin_lock_bh(&ul->lock);
1513         list_add_tail(&rt->rt_uncached, &ul->head);
1514         spin_unlock_bh(&ul->lock);
1515 }
1516 
1517 void rt_del_uncached_list(struct rtable *rt)
1518 {
1519         if (!list_empty(&rt->rt_uncached)) {
1520                 struct uncached_list *ul = rt->rt_uncached_list;
1521 
1522                 spin_lock_bh(&ul->lock);
1523                 list_del(&rt->rt_uncached);
1524                 spin_unlock_bh(&ul->lock);
1525         }
1526 }
1527 
/*
 * Destructor for IPv4 dsts: release the metrics reference, then take the
 * route off the uncached list if it is on one.
 */
static void ipv4_dst_destroy(struct dst_entry *dst)
{
        ip_dst_metrics_put(dst);
        rt_del_uncached_list((struct rtable *)dst);
}
1535 
1536 void rt_flush_dev(struct net_device *dev)
1537 {
1538         struct rtable *rt;
1539         int cpu;
1540 
1541         for_each_possible_cpu(cpu) {
1542                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1543 
1544                 spin_lock_bh(&ul->lock);
1545                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1546                         if (rt->dst.dev != dev)
1547                                 continue;
1548                         rt->dst.dev = blackhole_netdev;
1549                         dev_hold(rt->dst.dev);
1550                         dev_put(dev);
1551                 }
1552                 spin_unlock_bh(&ul->lock);
1553         }
1554 }
1555 
1556 static bool rt_cache_valid(const struct rtable *rt)
1557 {
1558         return  rt &&
1559                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1560                 !rt_is_expired(rt);
1561 }
1562 
/*
 * Fill in the nexthop-dependent parts of a freshly allocated route and
 * try to cache it.
 *
 * @rt:       route being set up
 * @daddr:    destination address; used as gateway when the route ends up
 *            uncached and has no IPv4 gateway of its own
 * @res:      FIB lookup result the nexthop is taken from
 * @fnhe:     matching nexthop exception, or NULL
 * @fi:       FIB info, or NULL when the route has none
 * @type:     not referenced in this function body
 * @itag:     class tag merged into the route (CONFIG_IP_ROUTE_CLASSID)
 * @do_cache: whether the caller wants the route cached
 *
 * With @fi set, the gateway (for link-scope nexthops that have one),
 * metrics, class id and lwtunnel state are copied from the nexthop.  The
 * route is then bound to @fnhe if there is one, or else cached on the
 * nexthop if @do_cache is set.  A route that was not cached, and any
 * route without @fi, goes onto the uncached list.
 */
1563 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1564                            const struct fib_result *res,
1565                            struct fib_nh_exception *fnhe,
1566                            struct fib_info *fi, u16 type, u32 itag,
1567                            const bool do_cache)
1568 {
1569         bool cached = false;
1570 
1571         if (fi) {
1572                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1573 
1574                 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1575                         rt->rt_uses_gateway = 1;
1576                         rt->rt_gw_family = nhc->nhc_gw_family;
1577                         /* only INET and INET6 are supported */
1578                         if (likely(nhc->nhc_gw_family == AF_INET))
1579                                 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1580                         else
1581                                 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1582                 }
1583 
1584                 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1585 
1586 #ifdef CONFIG_IP_ROUTE_CLASSID
1587                 if (nhc->nhc_family == AF_INET) {
1588                         struct fib_nh *nh;
1589 
1590                         nh = container_of(nhc, struct fib_nh, nh_common);
1591                         rt->dst.tclassid = nh->nh_tclassid;
1592                 }
1593 #endif
1594                 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
                     /* An exception takes precedence over the nexthop cache. */
1595                 if (unlikely(fnhe))
1596                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1597                 else if (do_cache)
1598                         cached = rt_cache_route(nhc, rt);
1599                 if (unlikely(!cached)) {
1600                         /* Routes we intend to cache in nexthop exception or
1601                          * FIB nexthop have the DST_NOCACHE bit clear.
1602                          * However, if we are unsuccessful at storing this
1603                          * route into the cache we really need to set it.
1604                          */
1605                         if (!rt->rt_gw4) {
1606                                 rt->rt_gw_family = AF_INET;
1607                                 rt->rt_gw4 = daddr;
1608                         }
1609                         rt_add_uncached_list(rt);
1610                 }
1611         } else
1612                 rt_add_uncached_list(rt);
1613 
1614 #ifdef CONFIG_IP_ROUTE_CLASSID
1615 #ifdef CONFIG_IP_MULTIPLE_TABLES
1616         set_class_tag(rt, res->tclassid);
1617 #endif
1618         set_class_tag(rt, itag);
1619 #endif
1620 }
1621 
1622 struct rtable *rt_dst_alloc(struct net_device *dev,
1623                             unsigned int flags, u16 type,
1624                             bool nopolicy, bool noxfrm, bool will_cache)
1625 {
1626         struct rtable *rt;
1627 
1628         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1629                        (will_cache ? 0 : DST_HOST) |
1630                        (nopolicy ? DST_NOPOLICY : 0) |
1631                        (noxfrm ? DST_NOXFRM : 0));
1632 
1633         if (rt) {
1634                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1635                 rt->rt_flags = flags;
1636                 rt->rt_type = type;
1637                 rt->rt_is_input = 0;
1638                 rt->rt_iif = 0;
1639                 rt->rt_pmtu = 0;
1640                 rt->rt_mtu_locked = 0;
1641                 rt->rt_uses_gateway = 0;
1642                 rt->rt_gw_family = 0;
1643                 rt->rt_gw4 = 0;
1644                 INIT_LIST_HEAD(&rt->rt_uncached);
1645 
1646                 rt->dst.output = ip_output;
1647                 if (flags & RTCF_LOCAL)
1648                         rt->dst.input = ip_local_deliver;
1649         }
1650 
1651         return rt;
1652 }
1653 EXPORT_SYMBOL(rt_dst_alloc);
1654 
1655 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1656 {
1657         struct rtable *new_rt;
1658 
1659         new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1660                            rt->dst.flags);
1661 
1662         if (new_rt) {
1663                 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1664                 new_rt->rt_flags = rt->rt_flags;
1665                 new_rt->rt_type = rt->rt_type;
1666                 new_rt->rt_is_input = rt->rt_is_input;
1667                 new_rt->rt_iif = rt->rt_iif;
1668                 new_rt->rt_pmtu = rt->rt_pmtu;
1669                 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1670                 new_rt->rt_gw_family = rt->rt_gw_family;
1671                 if (rt->rt_gw_family == AF_INET)
1672                         new_rt->rt_gw4 = rt->rt_gw4;
1673                 else if (rt->rt_gw_family == AF_INET6)
1674                         new_rt->rt_gw6 = rt->rt_gw6;
1675                 INIT_LIST_HEAD(&new_rt->rt_uncached);
1676 
1677                 new_rt->dst.flags |= DST_HOST;
1678                 new_rt->dst.input = rt->dst.input;
1679                 new_rt->dst.output = rt->dst.output;
1680                 new_rt->dst.error = rt->dst.error;
1681                 new_rt->dst.lastuse = jiffies;
1682                 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1683         }
1684         return new_rt;
1685 }
1686 EXPORT_SYMBOL(rt_dst_clone);
1687 
1688 /* called in rcu_read_lock() section */
1689 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1690                           u8 tos, struct net_device *dev,
1691                           struct in_device *in_dev, u32 *itag)
1692 {
1693         int err;
1694 
1695         /* Primary sanity checks. */
1696         if (!in_dev)
1697                 return -EINVAL;
1698 
1699         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1700             skb->protocol != htons(ETH_P_IP))
1701                 return -EINVAL;
1702 
1703         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1704                 return -EINVAL;
1705 
1706         if (ipv4_is_zeronet(saddr)) {
1707                 if (!ipv4_is_local_multicast(daddr) &&
1708                     ip_hdr(skb)->protocol != IPPROTO_IGMP)
1709                         return -EINVAL;
1710         } else {
1711                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1712                                           in_dev, itag);
1713                 if (err < 0)
1714                         return err;
1715         }
1716         return 0;
1717 }
1718 
1719 /* called in rcu_read_lock() section */
1720 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1721                              u8 tos, struct net_device *dev, int our)
1722 {
1723         struct in_device *in_dev = __in_dev_get_rcu(dev);
1724         unsigned int flags = RTCF_MULTICAST;
1725         struct rtable *rth;
1726         u32 itag = 0;
1727         int err;
1728 
1729         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1730         if (err)
1731                 return err;
1732 
1733         if (our)
1734                 flags |= RTCF_LOCAL;
1735 
1736         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1737                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1738         if (!rth)
1739                 return -ENOBUFS;
1740 
1741 #ifdef CONFIG_IP_ROUTE_CLASSID
1742         rth->dst.tclassid = itag;
1743 #endif
1744         rth->dst.output = ip_rt_bug;
1745         rth->rt_is_input= 1;
1746 
1747 #ifdef CONFIG_IP_MROUTE
1748         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1749                 rth->dst.input = ip_mr_input;
1750 #endif
1751         RT_CACHE_STAT_INC(in_slow_mc);
1752 
1753         skb_dst_set(skb, &rth->dst);
1754         return 0;
1755 }
1756 
1757 
1758 static void ip_handle_martian_source(struct net_device *dev,
1759                                      struct in_device *in_dev,
1760                                      struct sk_buff *skb,
1761                                      __be32 daddr,
1762                                      __be32 saddr)
1763 {
1764         RT_CACHE_STAT_INC(in_martian_src);
1765 #ifdef CONFIG_IP_ROUTE_VERBOSE
1766         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1767                 /*
1768                  *      RFC1812 recommendation, if source is martian,
1769                  *      the only hint is MAC header.
1770                  */
1771                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1772                         &daddr, &saddr, dev->name);
1773                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1774                         print_hex_dump(KERN_WARNING, "ll header: ",
1775                                        DUMP_PREFIX_OFFSET, 16, 1,
1776                                        skb_mac_header(skb),
1777                                        dev->hard_header_len, false);
1778                 }
1779         }
1780 #endif
1781 }
1782 
1783 /* called in rcu_read_lock() section */
1784 static int __mkroute_input(struct sk_buff *skb,
1785                            const struct fib_result *res,
1786                            struct in_device *in_dev,
1787                            __be32 daddr, __be32 saddr, u32 tos)
1788 {
1789         struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1790         struct net_device *dev = nhc->nhc_dev;
1791         struct fib_nh_exception *fnhe;
1792         struct rtable *rth;
1793         int err;
1794         struct in_device *out_dev;
1795         bool do_cache;
1796         u32 itag = 0;
1797 
1798         /* get a working reference to the output device */
1799         out_dev = __in_dev_get_rcu(dev);
1800         if (!out_dev) {
1801                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1802                 return -EINVAL;
1803         }
1804 
1805         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1806                                   in_dev->dev, in_dev, &itag);
1807         if (err < 0) {
1808                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1809                                          saddr);
1810 
1811                 goto cleanup;
1812         }
1813 
1814         do_cache = res->fi && !itag;
1815         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1816             skb->protocol == htons(ETH_P_IP)) {
1817                 __be32 gw;
1818 
1819                 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1820                 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1821                     inet_addr_onlink(out_dev, saddr, gw))
1822                         IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1823         }
1824 
1825         if (skb->protocol != htons(ETH_P_IP)) {
1826                 /* Not IP (i.e. ARP). Do not create route, if it is
1827                  * invalid for proxy arp. DNAT routes are always valid.
1828                  *
1829                  * Proxy arp feature have been extended to allow, ARP
1830                  * replies back to the same interface, to support
1831                  * Private VLAN switch technologies. See arp.c.
1832                  */
1833                 if (out_dev == in_dev &&
1834                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1835                         err = -EINVAL;
1836                         goto cleanup;
1837                 }
1838         }
1839 
1840         fnhe = find_exception(nhc, daddr);
1841         if (do_cache) {
1842                 if (fnhe)
1843                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1844                 else
1845                         rth = rcu_dereference(nhc->nhc_rth_input);
1846                 if (rt_cache_valid(rth)) {
1847                         skb_dst_set_noref(skb, &rth->dst);
1848                         goto out;
1849                 }
1850         }
1851 
1852         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1853                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1854                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1855         if (!rth) {
1856                 err = -ENOBUFS;
1857                 goto cleanup;
1858         }
1859 
1860         rth->rt_is_input = 1;
1861         RT_CACHE_STAT_INC(in_slow_tot);
1862 
1863         rth->dst.input = ip_forward;
1864 
1865         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1866                        do_cache);
1867         lwtunnel_set_redirect(&rth->dst);
1868         skb_dst_set(skb, &rth->dst);
1869 out:
1870         err = 0;
1871  cleanup:
1872         return err;
1873 }
1874 
1875 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1876 /* To make ICMP packets follow the right flow, the multipath hash is
1877  * calculated from the inner IP addresses.
1878  */
1879 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1880                                  struct flow_keys *hash_keys)
1881 {
1882         const struct iphdr *outer_iph = ip_hdr(skb);
1883         const struct iphdr *key_iph = outer_iph;
1884         const struct iphdr *inner_iph;
1885         const struct icmphdr *icmph;
1886         struct iphdr _inner_iph;
1887         struct icmphdr _icmph;
1888 
1889         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1890                 goto out;
1891 
1892         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1893                 goto out;
1894 
1895         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1896                                    &_icmph);
1897         if (!icmph)
1898                 goto out;
1899 
1900         if (!icmp_is_err(icmph->type))
1901                 goto out;
1902 
1903         inner_iph = skb_header_pointer(skb,
1904                                        outer_iph->ihl * 4 + sizeof(_icmph),
1905                                        sizeof(_inner_iph), &_inner_iph);
1906         if (!inner_iph)
1907                 goto out;
1908 
1909         key_iph = inner_iph;
1910 out:
1911         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1912         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1913 }
1914 
1915 /* if skb is set it will be used and fl4 can be NULL */
1916 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1917                        const struct sk_buff *skb, struct flow_keys *flkeys)
1918 {
1919         u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1920         struct flow_keys hash_keys;
1921         u32 mhash;
1922 
1923         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1924         case 0:
1925                 memset(&hash_keys, 0, sizeof(hash_keys));
1926                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1927                 if (skb) {
1928                         ip_multipath_l3_keys(skb, &hash_keys);
1929                 } else {
1930                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1931                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1932                 }
1933                 break;
1934         case 1:
1935                 /* skb is currently provided only when forwarding */
1936                 if (skb) {
1937                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1938                         struct flow_keys keys;
1939 
1940                         /* short-circuit if we already have L4 hash present */
1941                         if (skb->l4_hash)
1942                                 return skb_get_hash_raw(skb) >> 1;
1943 
1944                         memset(&hash_keys, 0, sizeof(hash_keys));
1945 
1946                         if (!flkeys) {
1947                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1948                                 flkeys = &keys;
1949                         }
1950 
1951                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1952                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1953                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1954                         hash_keys.ports.src = flkeys->ports.src;
1955                         hash_keys.ports.dst = flkeys->ports.dst;
1956                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1957                 } else {
1958                         memset(&hash_keys, 0, sizeof(hash_keys));
1959                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1960                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1961                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1962                         hash_keys.ports.src = fl4->fl4_sport;
1963                         hash_keys.ports.dst = fl4->fl4_dport;
1964                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1965                 }
1966                 break;
1967         case 2:
1968                 memset(&hash_keys, 0, sizeof(hash_keys));
1969                 /* skb is currently provided only when forwarding */
1970                 if (skb) {
1971                         struct flow_keys keys;
1972 
1973                         skb_flow_dissect_flow_keys(skb, &keys, 0);
1974                         /* Inner can be v4 or v6 */
1975                         if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1976                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1977                                 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1978                                 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1979                         } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1980                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1981                                 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1982                                 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1983                                 hash_keys.tags.flow_label = keys.tags.flow_label;
1984                                 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1985                         } else {
1986                                 /* Same as case 0 */
1987                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1988                                 ip_multipath_l3_keys(skb, &hash_keys);
1989                         }
1990                 } else {
1991                         /* Same as case 0 */
1992                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1993                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1994                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1995                 }
1996                 break;
1997         }
1998         mhash = flow_hash_from_keys(&hash_keys);
1999 
2000         if (multipath_hash)
2001                 mhash = jhash_2words(mhash, multipath_hash, 0);
2002 
2003         return mhash >> 1;
2004 }
2005 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
2006 
2007 static int ip_mkroute_input(struct sk_buff *skb,
2008                             struct fib_result *res,
2009                             struct in_device *in_dev,
2010                             __be32 daddr, __be32 saddr, u32 tos,
2011                             struct flow_keys *hkeys)
2012 {
2013 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2014         if (res->fi && fib_info_num_path(res->fi) > 1) {
2015                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2016 
2017                 fib_select_multipath(res, h);
2018         }
2019 #endif
2020 
2021         /* create a routing cache entry */
2022         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2023 }
2024 
2025 /* Implements all the saddr-related checks as ip_route_input_slow(),
2026  * assuming daddr is valid and the destination is not a local broadcast one.
2027  * Uses the provided hint instead of performing a route lookup.
2028  */
2029 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2030                       u8 tos, struct net_device *dev,
2031                       const struct sk_buff *hint)
2032 {
2033         struct in_device *in_dev = __in_dev_get_rcu(dev);
2034         struct rtable *rt = (struct rtable *)hint;
2035         struct net *net = dev_net(dev);
2036         int err = -EINVAL;
2037         u32 tag = 0;
2038 
2039         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2040                 goto martian_source;
2041 
2042         if (ipv4_is_zeronet(saddr))
2043                 goto martian_source;
2044 
2045         if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2046                 goto martian_source;
2047 
2048         if (rt->rt_type != RTN_LOCAL)
2049                 goto skip_validate_source;
2050 
2051         tos &= IPTOS_RT_MASK;
2052         err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2053         if (err < 0)
2054                 goto martian_source;
2055 
2056 skip_validate_source:
2057         skb_dst_copy(skb, hint);
2058         return 0;
2059 
2060 martian_source:
2061         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2062         return err;
2063 }
2064 
2065 /*
2066  *      NOTE. We drop all the packets that has local source
2067  *      addresses, because every properly looped back packet
2068  *      must have correct destination already attached by output routine.
2069  *      Changes in the enforced policies must be applied also to
2070  *      ip_route_use_hint().
2071  *
2072  *      Such approach solves two big problems:
2073  *      1. Not simplex devices are handled properly.
2074  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2075  *      called with rcu_read_lock()
      *
      *      Summary of the flow below:
      *      - martian source/destination addresses are rejected before and
      *        after the FIB lookup;
      *      - RTN_LOCAL and broadcast results share the local_input tail,
      *        which reuses a valid cached route or allocates a new one;
      *      - a failed lookup (or forwarding being disabled) goes through
      *        no_route and still attaches a route, of type RTN_UNREACHABLE,
      *        whose input handler is ip_error;
      *      - everything else is handed to ip_mkroute_input().
      *
      *      Returns 0 when a dst was attached to the skb, otherwise a
      *      negative errno (-EINVAL, -ENOBUFS, the fib_validate_source()
      *      error, or the ip_mkroute_input() result).
2076  */
2077 
2078 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2079                                u8 tos, struct net_device *dev,
2080                                struct fib_result *res)
2081 {
2082         struct in_device *in_dev = __in_dev_get_rcu(dev);
2083         struct flow_keys *flkeys = NULL, _flkeys;
2084         struct net    *net = dev_net(dev);
2085         struct ip_tunnel_info *tun_info;
2086         int             err = -EINVAL;
2087         unsigned int    flags = 0;
2088         u32             itag = 0;
2089         struct rtable   *rth;
2090         struct flowi4   fl4;
2091         bool do_cache = true;
2092 
2093         /* IP on this device is disabled. */
2094 
2095         if (!in_dev)
2096                 goto out;
2097 
2098         /* Check for the most weird martians, which can be not detected
2099            by fib_lookup.
2100          */
2101 
             /* Take the tunnel id from the skb's tunnel info unless that info
              * is marked IP_TUNNEL_INFO_TX, then drop the dst the skb arrived
              * with.
              */
2102         tun_info = skb_tunnel_info(skb);
2103         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2104                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2105         else
2106                 fl4.flowi4_tun_key.tun_id = 0;
2107         skb_dst_drop(skb);
2108 
2109         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2110                 goto martian_source;
2111 
             /* No FIB result yet: brd_input/local_input below rely on
              * res->fi being NULL in that case.
              */
2112         res->fi = NULL;
2113         res->table = NULL;
2114         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2115                 goto brd_input;
2116 
2117         /* Accept zero addresses only to limited broadcast;
2118          * I even do not know to fix it or not. Waiting for complains :-)
2119          */
2120         if (ipv4_is_zeronet(saddr))
2121                 goto martian_source;
2122 
2123         if (ipv4_is_zeronet(daddr))
2124                 goto martian_destination;
2125 
2126         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2127          * and call it once if daddr or/and saddr are loopback addresses
2128          */
2129         if (ipv4_is_loopback(daddr)) {
2130                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2131                         goto martian_destination;
2132         } else if (ipv4_is_loopback(saddr)) {
2133                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2134                         goto martian_source;
2135         }
2136 
2137         /*
2138          *      Now we are ready to route packet.
2139          */
2140         fl4.flowi4_oif = 0;
2141         fl4.flowi4_iif = dev->ifindex;
2142         fl4.flowi4_mark = skb->mark;
2143         fl4.flowi4_tos = tos;
2144         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2145         fl4.flowi4_flags = 0;
2146         fl4.daddr = daddr;
2147         fl4.saddr = saddr;
2148         fl4.flowi4_uid = sock_net_uid(net, NULL);
2149 
             /* When early dissection is not performed, the L4 parts of the
              * flow key are zeroed and flkeys stays NULL.
              */
2150         if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2151                 flkeys = &_flkeys;
2152         } else {
2153                 fl4.flowi4_proto = 0;
2154                 fl4.fl4_sport = 0;
2155                 fl4.fl4_dport = 0;
2156         }
2157 
2158         err = fib_lookup(net, &fl4, res, 0);
2159         if (err != 0) {
2160                 if (!IN_DEV_FORWARD(in_dev))
2161                         err = -EHOSTUNREACH;
2162                 goto no_route;
2163         }
2164 
2165         if (res->type == RTN_BROADCAST) {
2166                 if (IN_DEV_BFORWARD(in_dev))
2167                         goto make_route;
2168                 /* not do cache if bc_forwarding is enabled */
2169                 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2170                         do_cache = false;
2171                 goto brd_input;
2172         }
2173 
2174         if (res->type == RTN_LOCAL) {
2175                 err = fib_validate_source(skb, saddr, daddr, tos,
2176                                           0, dev, in_dev, &itag);
2177                 if (err < 0)
2178                         goto martian_source;
2179                 goto local_input;
2180         }
2181 
2182         if (!IN_DEV_FORWARD(in_dev)) {
2183                 err = -EHOSTUNREACH;
2184                 goto no_route;
2185         }
2186         if (res->type != RTN_UNICAST)
2187                 goto martian_destination;
2188 
2189 make_route:
2190         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2191 out:    return err;
2192 
     /* Broadcast delivery: only IP frames are accepted, and a non-zero-net
      * source address is validated first.
      */
2193 brd_input:
2194         if (skb->protocol != htons(ETH_P_IP))
2195                 goto e_inval;
2196 
2197         if (!ipv4_is_zeronet(saddr)) {
2198                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2199                                           in_dev, &itag);
2200                 if (err < 0)
2201                         goto martian_source;
2202         }
2203         flags |= RTCF_BROADCAST;
2204         res->type = RTN_BROADCAST;
2205         RT_CACHE_STAT_INC(in_brd);
2206 
     /* Shared tail for local, broadcast and (through no_route) unreachable
      * results.  Caching needs a fib_info and a zero itag.
      */
2207 local_input:
2208         do_cache &= res->fi && !itag;
2209         if (do_cache) {
2210                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2211 
2212                 rth = rcu_dereference(nhc->nhc_rth_input);
2213                 if (rt_cache_valid(rth)) {
2214                         skb_dst_set_noref(skb, &rth->dst);
2215                         err = 0;
2216                         goto out;
2217                 }
2218         }
2219 
             /* Allocate against the L3 master device when there is one,
              * otherwise against the namespace loopback device.
              */
2220         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2221                            flags | RTCF_LOCAL, res->type,
2222                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2223         if (!rth)
2224                 goto e_nobufs;
2225 
2226         rth->dst.output= ip_rt_bug;
2227 #ifdef CONFIG_IP_ROUTE_CLASSID
2228         rth->dst.tclassid = itag;
2229 #endif
2230         rth->rt_is_input = 1;
2231 
2232         RT_CACHE_STAT_INC(in_slow_tot);
             /* Set up by no_route: the route hands packets to ip_error with
              * the negated lookup error and loses its RTCF_LOCAL flag.
              */
2233         if (res->type == RTN_UNREACHABLE) {
2234                 rth->dst.input= ip_error;
2235                 rth->dst.error= -err;
2236                 rth->rt_flags   &= ~RTCF_LOCAL;
2237         }
2238 
2239         if (do_cache) {
2240                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2241 
2242                 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2243                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2244                         WARN_ON(rth->dst.input == lwtunnel_input);
2245                         rth->dst.lwtstate->orig_input = rth->dst.input;
2246                         rth->dst.input = lwtunnel_input;
2247                 }
2248 
                     /* If the route could not be stored in the nexthop
                      * cache, put it on the uncached list instead.
                      */
2249                 if (unlikely(!rt_cache_route(nhc, rth)))
2250                         rt_add_uncached_list(rth);
2251         }
2252         skb_dst_set(skb, &rth->dst);
2253         err = 0;
2254         goto out;
2255 
     /* Lookup failed or forwarding is disabled: build an RTN_UNREACHABLE
      * route via local_input.  res->fi is cleared, so it is never cached.
      */
2256 no_route:
2257         RT_CACHE_STAT_INC(in_no_route);
2258         res->type = RTN_UNREACHABLE;
2259         res->fi = NULL;
2260         res->table = NULL;
2261         goto local_input;
2262 
2263         /*
2264          *      Do not cache martian addresses: they should be logged (RFC1812)
2265          */
2266 martian_destination:
2267         RT_CACHE_STAT_INC(in_martian_dst);
2268 #ifdef CONFIG_IP_ROUTE_VERBOSE
2269         if (IN_DEV_LOG_MARTIANS(in_dev))
2270                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2271                                      &daddr, &saddr, dev->name);
2272 #endif
2273 
2274 e_inval:
2275         err = -EINVAL;
2276         goto out;
2277 
2278 e_nobufs:
2279         err = -ENOBUFS;
2280         goto out;
2281 
     /* err is either the initial -EINVAL or the negative value returned by
      * fib_validate_source() when control arrives here.
      */
2282 martian_source:
2283         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2284         goto out;
2285 }
2286 
2287 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2288                          u8 tos, struct net_device *dev)
2289 {
2290         struct fib_result res;
2291         int err;
2292 
2293         tos &= IPTOS_RT_MASK;
2294         rcu_read_lock();
2295         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2296         rcu_read_unlock();
2297 
2298         return err;
2299 }
2300 EXPORT_SYMBOL(ip_route_input_noref);
2301 
2302 /* called with rcu_read_lock held */
2303 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2304                        u8 tos, struct net_device *dev, struct fib_result *res)
2305 {
2306         /* Multicast recognition logic is moved from route cache to here.
2307            The problem was that too many Ethernet cards have broken/missing
2308            hardware multicast filters :-( As result the host on multicasting
2309            network acquires a lot of useless route cache entries, sort of
2310            SDR messages from all the world. Now we try to get rid of them.
2311            Really, provided software IP multicast filter is organized
2312            reasonably (at least, hashed), it does not result in a slowdown
2313            comparing with route cache reject entries.
2314            Note, that multicast routers are not affected, because
2315            route cache entry is created eventually.
2316          */
2317         if (ipv4_is_multicast(daddr)) {
2318                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2319                 int our = 0;
2320                 int err = -EINVAL;
2321 
2322                 if (!in_dev)
2323                         return err;
2324                 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2325                                       ip_hdr(skb)->protocol);
2326 
2327                 /* check l3 master if no match yet */
2328                 if (!our && netif_is_l3_slave(dev)) {
2329                         struct in_device *l3_in_dev;
2330 
2331                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2332                         if (l3_in_dev)
2333                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2334                                                       ip_hdr(skb)->protocol);
2335                 }
2336 
2337                 if (our
2338 #ifdef CONFIG_IP_MROUTE
2339                         ||
2340                     (!ipv4_is_local_multicast(daddr) &&
2341                      IN_DEV_MFORWARD(in_dev))
2342 #endif
2343                    ) {
2344                         err = ip_route_input_mc(skb, daddr, saddr,
2345                                                 tos, dev, our);
2346                 }
2347                 return err;
2348         }
2349 
2350         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2351 }
2352 
2353 /* called with rcu_read_lock() */
     /*
      * Build, or fetch from the nexthop/exception cache, the output route
      * for flow fl4 leaving through dev_out.
      *
      * res      - FIB lookup result; its type may be overridden below based
      *            on the destination address.
      * orig_oif - output interface index as originally requested; stored in
      *            rt_iif of a newly allocated route.
      * flags    - initial RTCF_* flags; more are added here.
      *
      * Returns a route, or ERR_PTR(-EINVAL) / ERR_PTR(-ENOBUFS).  A route
      * taken from the cache is returned only after dst_hold_safe()
      * succeeded on it.
      */
2354 static struct rtable *__mkroute_output(const struct fib_result *res,
2355                                        const struct flowi4 *fl4, int orig_oif,
2356                                        struct net_device *dev_out,
2357                                        unsigned int flags)
2358 {
2359         struct fib_info *fi = res->fi;
2360         struct fib_nh_exception *fnhe;
2361         struct in_device *in_dev;
2362         u16 type = res->type;
2363         struct rtable *rth;
2364         bool do_cache;
2365 
2366         in_dev = __in_dev_get_rcu(dev_out);
2367         if (!in_dev)
2368                 return ERR_PTR(-EINVAL);
2369 
             /* Without route_localnet, a loopback source address is only
              * allowed on a loopback or L3 master output device.
              */
2370         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2371                 if (ipv4_is_loopback(fl4->saddr) &&
2372                     !(dev_out->flags & IFF_LOOPBACK) &&
2373                     !netif_is_l3_master(dev_out))
2374                         return ERR_PTR(-EINVAL);
2375 
             /* The destination address overrides the looked-up route type. */
2376         if (ipv4_is_lbcast(fl4->daddr))
2377                 type = RTN_BROADCAST;
2378         else if (ipv4_is_multicast(fl4->daddr))
2379                 type = RTN_MULTICAST;
2380         else if (ipv4_is_zeronet(fl4->daddr))
2381                 return ERR_PTR(-EINVAL);
2382 
2383         if (dev_out->flags & IFF_LOOPBACK)
2384                 flags |= RTCF_LOCAL;
2385 
2386         do_cache = true;
2387         if (type == RTN_BROADCAST) {
2388                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2389                 fi = NULL;
2390         } else if (type == RTN_MULTICAST) {
2391                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
                     /* RTCF_LOCAL is kept only when ip_check_mc_rcu() reports
                      * a match; such routes are not cached.
                      */
2392                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2393                                      fl4->flowi4_proto))
2394                         flags &= ~RTCF_LOCAL;
2395                 else
2396                         do_cache = false;
2397                 /* If multicast route do not exist use
2398                  * default one, but do not gateway in this case.
2399                  * Yes, it is hack.
2400                  */
2401                 if (fi && res->prefixlen < 4)
2402                         fi = NULL;
2403         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2404                    (orig_oif != dev_out->ifindex)) {
2405                 /* For local routes that require a particular output interface
2406                  * we do not want to cache the result.  Caching the result
2407                  * causes incorrect behaviour when there are multiple source
2408                  * addresses on the interface, the end result being that if the
2409                  * intended recipient is waiting on that interface for the
2410                  * packet he won't receive it because it will be delivered on
2411                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2412                  * be set to the loopback interface as well.
2413                  */
2414                 do_cache = false;
2415         }
2416 
2417         fnhe = NULL;
             /* Caching is impossible without a fib_info. */
2418         do_cache &= fi != NULL;
2419         if (fi) {
2420                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2421                 struct rtable __rcu **prth;
2422 
                     /* The exception is looked up even when caching is off,
                      * because rt_set_nexthop() below receives it.
                      */
2423                 fnhe = find_exception(nhc, fl4->daddr);
2424                 if (!do_cache)
2425                         goto add;
2426                 if (fnhe) {
2427                         prth = &fnhe->fnhe_rth_output;
2428                 } else {
                             /* FLOWI_FLAG_KNOWN_NH flows bypass the per-cpu
                              * cache unless the nexthop has a gateway and
                              * link scope.
                              */
2429                         if (unlikely(fl4->flowi4_flags &
2430                                      FLOWI_FLAG_KNOWN_NH &&
2431                                      !(nhc->nhc_gw_family &&
2432                                        nhc->nhc_scope == RT_SCOPE_LINK))) {
2433                                 do_cache = false;
2434                                 goto add;
2435                         }
2436                         prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2437                 }
2438                 rth = rcu_dereference(*prth);
2439                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2440                         return rth;
2441         }
2442 
     /* No usable cached route: allocate a new one. */
2443 add:
2444         rth = rt_dst_alloc(dev_out, flags, type,
2445                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2446                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2447                            do_cache);
2448         if (!rth)
2449                 return ERR_PTR(-ENOBUFS);
2450 
2451         rth->rt_iif = orig_oif;
2452 
2453         RT_CACHE_STAT_INC(out_slow_tot);
2454 
2455         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
                     /* Local broadcast/multicast leaving through a
                      * non-loopback device is sent with ip_mc_output.
                      */
2456                 if (flags & RTCF_LOCAL &&
2457                     !(dev_out->flags & IFF_LOOPBACK)) {
2458                         rth->dst.output = ip_mc_output;
2459                         RT_CACHE_STAT_INC(out_slow_mc);
2460                 }
2461 #ifdef CONFIG_IP_MROUTE
2462                 if (type == RTN_MULTICAST) {
2463                         if (IN_DEV_MFORWARD(in_dev) &&
2464                             !ipv4_is_local_multicast(fl4->daddr)) {
2465                                 rth->dst.input = ip_mr_input;
2466                                 rth->dst.output = ip_mc_output;
2467                         }
2468                 }
2469 #endif
2470         }
2471 
2472         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2473         lwtunnel_set_redirect(&rth->dst);
2474 
2475         return rth;
2476 }
2477 
2478 /*
2479  * Major route resolver routine.
2480  */
2481 
2482 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2483                                         const struct sk_buff *skb)
2484 {
2485         __u8 tos = RT_FL_TOS(fl4);
2486         struct fib_result res = {
2487                 .type           = RTN_UNSPEC,
2488                 .fi             = NULL,
2489                 .table          = NULL,
2490                 .tclassid       = 0,
2491         };
2492         struct rtable *rth;
2493 
2494         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2495         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2496         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2497                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2498 
2499         rcu_read_lock();
2500         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2501         rcu_read_unlock();
2502 
2503         return rth;
2504 }
2505 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2506 
/*
 * ip_route_output_key_hash_rcu - resolve an output route for @fl4.
 * @net: network namespace used for the device and FIB lookups
 * @fl4: flow key; saddr, daddr and flowi4_oif may be filled in or
 *       overwritten here while the route is being resolved
 * @res: FIB lookup result; type, fi and table may be written here
 * @skb: handed through to fib_select_path()
 *
 * The callers visible in this file invoke this function between
 * rcu_read_lock() and rcu_read_unlock().
 *
 * Returns whatever __mkroute_output() returns once a route can be built,
 * or an ERR_PTR():
 *   -EINVAL       saddr is multicast, limited broadcast or zeronet
 *   -ENETUNREACH  saddr is not found on a local device (this check is
 *                 skipped for FLOWI_FLAG_ANYSRC on the non-multicast path),
 *                 or the requested oif is down or has no in_device
 *   -ENODEV       the requested oif does not resolve to a device
 *   other         the error returned by fib_lookup()
 */
2507 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2508                                             struct fib_result *res,
2509                                             const struct sk_buff *skb)
2510 {
2511         struct net_device *dev_out = NULL;
2512         int orig_oif = fl4->flowi4_oif;
2513         unsigned int flags = 0;
2514         struct rtable *rth;
2515         int err;
2516 
             /* A source address was supplied: sanity-check it first. */
2517         if (fl4->saddr) {
2518                 if (ipv4_is_multicast(fl4->saddr) ||
2519                     ipv4_is_lbcast(fl4->saddr) ||
2520                     ipv4_is_zeronet(fl4->saddr)) {
2521                         rth = ERR_PTR(-EINVAL);
2522                         goto out;
2523                 }
2524 
                     /* Error returned by every "goto out" below in this branch. */
2525                 rth = ERR_PTR(-ENETUNREACH);
2526 
2527                 /* I removed check for oif == dev_out->oif here.
2528                    It was wrong for two reasons:
2529                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2530                       is assigned to multiple interfaces.
2531                    2. Moreover, we are allowed to send packets with saddr
2532                       of another iface. --ANK
2533                  */
2534 
2535                 if (fl4->flowi4_oif == 0 &&
2536                     (ipv4_is_multicast(fl4->daddr) ||
2537                      ipv4_is_lbcast(fl4->daddr))) {
2538                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2539                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2540                         if (!dev_out)
2541                                 goto out;
2542 
2543                         /* Special hack: user can direct multicasts
2544                            and limited broadcast via necessary interface
2545                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2546                            This hack is not just for fun, it allows
2547                            vic,vat and friends to work.
2548                            They bind socket to loopback, set ttl to zero
2549                            and expect that it will work.
2550                            From the viewpoint of routing cache they are broken,
2551                            because we are not allowed to build multicast path
2552                            with loopback source addr (look, routing cache
2553                            cannot know, that ttl is zero, so that packet
2554                            will not leave this host and route is valid).
2555                            Luckily, this hack is good workaround.
2556                          */
2557 
2558                         fl4->flowi4_oif = dev_out->ifindex;
2559                         goto make_route;
2560                 }
2561 
2562                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2563                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2564                         if (!__ip_dev_find(net, fl4->saddr, false))
2565                                 goto out;
2566                 }
2567         }
2568 
2569 
             /* An output interface was requested: it must exist, be up and
              * have an in_device before it is used.
              */
2570         if (fl4->flowi4_oif) {
2571                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2572                 rth = ERR_PTR(-ENODEV);
2573                 if (!dev_out)
2574                         goto out;
2575 
2576                 /* RACE: Check return value of inet_select_addr instead. */
2577                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2578                         rth = ERR_PTR(-ENETUNREACH);
2579                         goto out;
2580                 }
                     /* Local multicast, limited broadcast and IGMP skip the
                      * FIB lookup and go straight out of the requested device.
                      */
2581                 if (ipv4_is_local_multicast(fl4->daddr) ||
2582                     ipv4_is_lbcast(fl4->daddr) ||
2583                     fl4->flowi4_proto == IPPROTO_IGMP) {
2584                         if (!fl4->saddr)
2585                                 fl4->saddr = inet_select_addr(dev_out, 0,
2586                                                               RT_SCOPE_LINK);
2587                         goto make_route;
2588                 }
2589                 if (!fl4->saddr) {
2590                         if (ipv4_is_multicast(fl4->daddr))
2591                                 fl4->saddr = inet_select_addr(dev_out, 0,
2592                                                               fl4->flowi4_scope);
2593                         else if (!fl4->daddr)
2594                                 fl4->saddr = inet_select_addr(dev_out, 0,
2595                                                               RT_SCOPE_HOST);
2596                 }
2597         }
2598 
             /* No destination: use the source address as destination (or
              * INADDR_LOOPBACK for both when there is no source either) and
              * build a local route over the loopback device.
              */
2599         if (!fl4->daddr) {
2600                 fl4->daddr = fl4->saddr;
2601                 if (!fl4->daddr)
2602                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2603                 dev_out = net->loopback_dev;
2604                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2605                 res->type = RTN_LOCAL;
2606                 flags |= RTCF_LOCAL;
2607                 goto make_route;
2608         }
2609 
2610         err = fib_lookup(net, fl4, res, 0);
2611         if (err) {
2612                 res->fi = NULL;
2613                 res->table = NULL;
2614                 if (fl4->flowi4_oif &&
2615                     (ipv4_is_multicast(fl4->daddr) ||
2616                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2617                         /* Apparently, routing tables are wrong. Assume,
2618                            that the destination is on link.
2619 
2620                            WHY? DW.
2621                            Because we are allowed to send to iface
2622                            even if it has NO routes and NO assigned
2623                            addresses. When oif is specified, routing
2624                            tables are looked up with only one purpose:
2625                            to catch if destination is gatewayed, rather than
2626                            direct. Moreover, if MSG_DONTROUTE is set,
2627                            we send packet, ignoring both routing tables
2628                            and ifaddr state. --ANK
2629 
2630 
2631                            We could make it even if oif is unknown,
2632                            likely IPv6, but we do not.
2633                          */
2634 
2635                         if (fl4->saddr == 0)
2636                                 fl4->saddr = inet_select_addr(dev_out, 0,
2637                                                               RT_SCOPE_LINK);
2638                         res->type = RTN_UNICAST;
2639                         goto make_route;
2640                 }
2641                 rth = ERR_PTR(err);
2642                 goto out;
2643         }
2644 
2645         if (res->type == RTN_LOCAL) {
                     /* Prefer the route's preferred source, else reuse daddr. */
2646                 if (!fl4->saddr) {
2647                         if (res->fi->fib_prefsrc)
2648                                 fl4->saddr = res->fi->fib_prefsrc;
2649                         else
2650                                 fl4->saddr = fl4->daddr;
2651                 }
2652 
2653                 /* L3 master device is the loopback for that domain */
2654                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2655                         net->loopback_dev;
2656 
2657                 /* make sure orig_oif points to fib result device even
2658                  * though packet rx/tx happens over loopback or l3mdev
2659                  */
2660                 orig_oif = FIB_RES_OIF(*res);
2661 
2662                 fl4->flowi4_oif = dev_out->ifindex;
2663                 flags |= RTCF_LOCAL;
2664                 goto make_route;
2665         }
2666 
2667         fib_select_path(net, res, fl4, skb);
2668 
2669         dev_out = FIB_RES_DEV(*res);
2670         fl4->flowi4_oif = dev_out->ifindex;
2671 
2672 
2673 make_route:
2674         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2675 
2676 out:
2677         return rth;
2678 }
2679 
2680 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2681 {
2682         return NULL;
2683 }
2684 
2685 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2686 {
2687         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2688 
2689         return mtu ? : dst->dev->mtu;
2690 }
2691 
2692 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2693                                           struct sk_buff *skb, u32 mtu,
2694                                           bool confirm_neigh)
2695 {
2696 }
2697 
/* dst_ops->redirect callback for blackhole routes: intentionally a no-op. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst,
				       struct sock *sk,
				       struct sk_buff *skb)
{
}
2702 
2703 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2704                                           unsigned long old)
2705 {
2706         return NULL;
2707 }
2708 
2709 static struct dst_ops ipv4_dst_blackhole_ops = {
2710         .family                 =       AF_INET,
2711         .check                  =       ipv4_blackhole_dst_check,
2712         .mtu                    =       ipv4_blackhole_mtu,
2713         .default_advmss         =       ipv4_default_advmss,
2714         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2715         .redirect               =       ipv4_rt_blackhole_redirect,
2716         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2717         .neigh_lookup           =       ipv4_neigh_lookup,
2718 };
2719 
2720 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2721 {
2722         struct rtable *ort = (struct rtable *) dst_orig;
2723         struct rtable *rt;
2724 
2725         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2726         if (rt) {
2727                 struct dst_entry *new = &rt->dst;
2728 
2729                 new->__use = 1;
2730                 new->input = dst_discard;
2731                 new->output = dst_discard_out;
2732 
2733                 new->dev = net->loopback_dev;
2734                 if (new->dev)
2735                         dev_hold(new->dev);
2736 
2737                 rt->rt_is_input = ort->rt_is_input;
2738                 rt->rt_iif = ort->rt_iif;
2739                 rt->rt_pmtu = ort->rt_pmtu;
2740                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2741 
2742                 rt->rt_genid = rt_genid_ipv4(net);
2743                 rt->rt_flags = ort->rt_flags;
2744                 rt->rt_type = ort->rt_type;
2745                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2746                 rt->rt_gw_family = ort->rt_gw_family;
2747                 if (rt->rt_gw_family == AF_INET)
2748                         rt->rt_gw4 = ort->rt_gw4;
2749                 else if (rt->rt_gw_family == AF_INET6)
2750                         rt->rt_gw6 = ort->rt_gw6;
2751 
2752                 INIT_LIST_HEAD(&rt->rt_uncached);
2753         }
2754 
2755         dst_release(dst_orig);
2756 
2757         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2758 }
2759 
2760 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2761                                     const struct sock *sk)
2762 {
2763         struct rtable *rt = __ip_route_output_key(net, flp4);
2764 
2765         if (IS_ERR(rt))
2766                 return rt;
2767 
2768         if (flp4->flowi4_proto)
2769                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2770                                                         flowi4_to_flowi(flp4),
2771                                                         sk, 0);
2772 
2773         return rt;
2774 }
2775 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2776 
/*
 * rt_fill_info - append an RTM_NEWROUTE message describing @rt to @skb.
 * @net:      namespace, used for the multicast-forwarding check
 * @dst:      destination address, reported as RTA_DST
 * @src:      source address; reported as RTA_SRC only when non-zero
 * @rt:       route to describe
 * @table_id: reported as RTA_TABLE; rtm_table gets RT_TABLE_COMPAT when
 *            the id does not fit below 256
 * @fl4:      flow the route was resolved for; may be NULL (the exception
 *            dump in this file passes NULL), in which case the flow
 *            derived attributes are omitted
 * @skb:      message buffer being filled
 * @portid:   netlink port id placed in the message header
 * @seq:      netlink sequence number placed in the message header
 * @flags:    netlink message flags
 *
 * Returns 0 on success, or -EMSGSIZE when the header or an attribute does
 * not fit; in the attribute case the partial message is cancelled.
 */
2777 /* called with rcu_read_lock held */
2778 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2779                         struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2780                         struct sk_buff *skb, u32 portid, u32 seq,
2781                         unsigned int flags)
2782 {
2783         struct rtmsg *r;
2784         struct nlmsghdr *nlh;
2785         unsigned long expires = 0;
2786         u32 error;
2787         u32 metrics[RTAX_MAX];
2788 
2789         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2790         if (!nlh)
2791                 return -EMSGSIZE;
2792 
2793         r = nlmsg_data(nlh);
2794         r->rtm_family    = AF_INET;
2795         r->rtm_dst_len  = 32;
2796         r->rtm_src_len  = 0;
2797         r->rtm_tos      = fl4 ? fl4->flowi4_tos : 0;
2798         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2799         if (nla_put_u32(skb, RTA_TABLE, table_id))
2800                 goto nla_put_failure;
2801         r->rtm_type     = rt->rt_type;
2802         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2803         r->rtm_protocol = RTPROT_UNSPEC;
             /* Only the rt_flags bits above the low 16 are copied through. */
2804         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2805         if (rt->rt_flags & RTCF_NOTIFY)
2806                 r->rtm_flags |= RTM_F_NOTIFY;
2807         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2808                 r->rtm_flags |= RTCF_DOREDIRECT;
2809 
2810         if (nla_put_in_addr(skb, RTA_DST, dst))
2811                 goto nla_put_failure;
2812         if (src) {
2813                 r->rtm_src_len = 32;
2814                 if (nla_put_in_addr(skb, RTA_SRC, src))
2815                         goto nla_put_failure;
2816         }
2817         if (rt->dst.dev &&
2818             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2819                 goto nla_put_failure;
2820 #ifdef CONFIG_IP_ROUTE_CLASSID
2821         if (rt->dst.tclassid &&
2822             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2823                 goto nla_put_failure;
2824 #endif
             /* For output routes, report the flow's source address as the
              * preferred source when it differs from @src.
              */
2825         if (fl4 && !rt_is_input_route(rt) &&
2826             fl4->saddr != src) {
2827                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2828                         goto nla_put_failure;
2829         }
2830         if (rt->rt_uses_gateway) {
2831                 if (rt->rt_gw_family == AF_INET &&
2832                     nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2833                         goto nla_put_failure;
2834                 } else if (rt->rt_gw_family == AF_INET6) {
2835                         int alen = sizeof(struct in6_addr);
2836                         struct nlattr *nla;
2837                         struct rtvia *via;
2838 
                             /* NOTE(review): the 2 extra bytes presumably cover
                              * the rtvia_family field that precedes the address;
                              * confirm against struct rtvia.
                              */
2839                         nla = nla_reserve(skb, RTA_VIA, alen + 2);
2840                         if (!nla)
2841                                 goto nla_put_failure;
2842 
2843                         via = nla_data(nla);
2844                         via->rtvia_family = AF_INET6;
2845                         memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2846                 }
2847         }
2848 
             /* Turn the absolute expiry into a remaining time; 0 when the
              * route has no expiry or it has already passed.
              */
2849         expires = rt->dst.expires;
2850         if (expires) {
2851                 unsigned long now = jiffies;
2852 
2853                 if (time_before(now, expires))
2854                         expires -= now;
2855                 else
2856                         expires = 0;
2857         }
2858 
             /* Work on a local copy of the metrics so that a still-valid PMTU
              * and MTU lock can be overlaid without touching the dst itself.
              */
2859         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2860         if (rt->rt_pmtu && expires)
2861                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2862         if (rt->rt_mtu_locked && expires)
2863                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2864         if (rtnetlink_put_metrics(skb, metrics) < 0)
2865                 goto nla_put_failure;
2866 
2867         if (fl4) {
2868                 if (fl4->flowi4_mark &&
2869                     nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2870                         goto nla_put_failure;
2871 
2872                 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2873                     nla_put_u32(skb, RTA_UID,
2874                                 from_kuid_munged(current_user_ns(),
2875                                                  fl4->flowi4_uid)))
2876                         goto nla_put_failure;
2877 
2878                 if (rt_is_input_route(rt)) {
2879 #ifdef CONFIG_IP_MROUTE
2880                         if (ipv4_is_multicast(dst) &&
2881                             !ipv4_is_local_multicast(dst) &&
2882                             IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2883                                 int err = ipmr_get_route(net, skb,
2884                                                          fl4->saddr, fl4->daddr,
2885                                                          r, portid);
2886 
                                     /* NOTE(review): a 0 return leaves here without
                                      * nlmsg_end(); presumably ipmr_get_route() has
                                      * taken over the message - confirm.
                                      */
2887                                 if (err <= 0) {
2888                                         if (err == 0)
2889                                                 return 0;
2890                                         goto nla_put_failure;
2891                                 }
2892                         } else
2893 #endif
2894                                 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2895                                         goto nla_put_failure;
2896                 }
2897         }
2898 
2899         error = rt->dst.error;
2900 
2901         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2902                 goto nla_put_failure;
2903 
2904         nlmsg_end(skb, nlh);
2905         return 0;
2906 
2907 nla_put_failure:
2908         nlmsg_cancel(skb, nlh);
2909         return -EMSGSIZE;
2910 }
2911 
2912 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2913                             struct netlink_callback *cb, u32 table_id,
2914                             struct fnhe_hash_bucket *bucket, int genid,
2915                             int *fa_index, int fa_start, unsigned int flags)
2916 {
2917         int i;
2918 
2919         for (i = 0; i < FNHE_HASH_SIZE; i++) {
2920                 struct fib_nh_exception *fnhe;
2921 
2922                 for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2923                      fnhe = rcu_dereference(fnhe->fnhe_next)) {
2924                         struct rtable *rt;
2925                         int err;
2926 
2927                         if (*fa_index < fa_start)
2928                                 goto next;
2929 
2930                         if (fnhe->fnhe_genid != genid)
2931                                 goto next;
2932 
2933                         if (fnhe->fnhe_expires &&
2934                             time_after(jiffies, fnhe->fnhe_expires))
2935                                 goto next;
2936 
2937                         rt = rcu_dereference(fnhe->fnhe_rth_input);
2938                         if (!rt)
2939                                 rt = rcu_dereference(fnhe->fnhe_rth_output);
2940                         if (!rt)
2941                                 goto next;
2942 
2943                         err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
2944                                            table_id, NULL, skb,
2945                                            NETLINK_CB(cb->skb).portid,
2946                                            cb->nlh->nlmsg_seq, flags);
2947                         if (err)
2948                                 return err;
2949 next:
2950                         (*fa_index)++;
2951                 }
2952         }
2953 
2954         return 0;
2955 }
2956 
2957 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
2958                        u32 table_id, struct fib_info *fi,
2959                        int *fa_index, int fa_start, unsigned int flags)
2960 {
2961         struct net *net = sock_net(cb->skb->sk);
2962         int nhsel, genid = fnhe_genid(net);
2963 
2964         for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
2965                 struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
2966                 struct fnhe_hash_bucket *bucket;
2967                 int err;
2968 
2969                 if (nhc->nhc_flags & RTNH_F_DEAD)
2970                         continue;
2971 
2972                 rcu_read_lock();
2973                 bucket = rcu_dereference(nhc->nhc_exceptions);
2974                 err = 0;
2975                 if (bucket)
2976                         err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
2977                                                genid, fa_index, fa_start,
2978                                                flags);
2979                 rcu_read_unlock();
2980                 if (err)
2981                         return err;
2982         }
2983 
2984         return 0;
2985 }
2986 
2987 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2988                                                    u8 ip_proto, __be16 sport,
2989                                                    __be16 dport)
2990 {
2991         struct sk_buff *skb;
2992         struct iphdr *iph;
2993 
2994         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2995         if (!skb)
2996                 return NULL;
2997 
2998         /* Reserve room for dummy headers, this skb can pass
2999          * through good chunk of routing engine.
3000          */
3001         skb_reset_mac_header(skb);
3002         skb_reset_network_header(skb);
3003         skb->protocol = htons(ETH_P_IP);
3004         iph = skb_put(skb, sizeof(struct iphdr));
3005         iph->protocol = ip_proto;
3006         iph->saddr = src;
3007         iph->daddr = dst;
3008         iph->version = 0x4;
3009         iph->frag_off = 0;
3010         iph->ihl = 0x5;
3011         skb_set_transport_header(skb, skb->len);
3012 
3013         switch (iph->protocol) {
3014         case IPPROTO_UDP: {
3015                 struct udphdr *udph;
3016 
3017                 udph = skb_put_zero(skb, sizeof(struct udphdr));
3018                 udph->source = sport;
3019                 udph->dest = dport;
3020                 udph->len = sizeof(struct udphdr);
3021                 udph->check = 0;
3022                 break;
3023         }
3024         case IPPROTO_TCP: {
3025                 struct tcphdr *tcph;
3026 
3027                 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3028                 tcph->source    = sport;
3029                 tcph->dest      = dport;
3030                 tcph->doff      = sizeof(struct tcphdr) / 4;
3031                 tcph->rst = 1;
3032                 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3033                                             src, dst, 0);
3034                 break;
3035         }
3036         case IPPROTO_ICMP: {
3037                 struct icmphdr *icmph;
3038 
3039                 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3040                 icmph->type = ICMP_ECHO;
3041                 icmph->code = 0;
3042         }
3043         }
3044 
3045         return skb;
3046 }
3047 
3048 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3049                                        const struct nlmsghdr *nlh,
3050                                        struct nlattr **tb,
3051                                        struct netlink_ext_ack *extack)
3052 {
3053         struct rtmsg *rtm;
3054         int i, err;
3055 
3056         if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3057                 NL_SET_ERR_MSG(extack,
3058                                "ipv4: Invalid header for route get request");
3059                 return -EINVAL;
3060         }
3061 
3062         if (!netlink_strict_get_check(skb))
3063                 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3064                                               rtm_ipv4_policy, extack);
3065 
3066         rtm = nlmsg_data(nlh);
3067         if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3068             (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3069             rtm->rtm_table || rtm->rtm_protocol ||
3070             rtm->rtm_scope || rtm->rtm_type) {
3071                 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3072                 return -EINVAL;
3073         }
3074 
3075         if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3076                                RTM_F_LOOKUP_TABLE |
3077                                RTM_F_FIB_MATCH)) {
3078                 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3079                 return -EINVAL;
3080         }
3081 
3082         err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3083                                             rtm_ipv4_policy, extack);
3084         if (err)
3085                 return err;
3086 
3087         if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3088             (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3089                 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3090                 return -EINVAL;
3091         }
3092 
3093         for (i = 0; i <= RTA_MAX; i++) {
3094                 if (!tb[i])
3095                         continue;
3096 
3097                 switch (i) {
3098                 case RTA_IIF:
3099                 case RTA_OIF:
3100                 case RTA_SRC:
3101                 case RTA_DST:
3102                 case RTA_IP_PROTO:
3103                 case RTA_SPORT:
3104                 case RTA_DPORT:
3105                 case RTA_MARK:
3106                 case RTA_UID:
3107                         break;
3108                 default:
3109                         NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3110                         return -EINVAL;
3111                 }
3112         }
3113 
3114         return 0;
3115 }
3116 
3117 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3118                              struct netlink_ext_ack *extack)
3119 {
3120         struct net *net = sock_net(in_skb->sk);
3121         struct nlattr *tb[RTA_MAX+1];
3122         u32 table_id = RT_TABLE_MAIN;
3123         __be16 sport = 0, dport = 0;
3124         struct fib_result res = {};
3125         u8 ip_proto = IPPROTO_UDP;
3126         struct rtable *rt = NULL;
3127         struct sk_buff *skb;
3128         struct rtmsg *rtm;
3129         struct flowi4 fl4 = {};
3130         __be32 dst = 0;
3131         __be32 src = 0;
3132         kuid_t uid;
3133         u32 iif;
3134         int err;
3135         int mark;
3136 
3137         err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3138         if (err < 0)
3139                 return err;
3140 
3141         rtm = nlmsg_data(nlh);
3142         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3143         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3144         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3145         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3146         if (tb[RTA_UID])
3147                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3148         else
3149                 uid = (iif ? INVALID_UID : current_uid());
3150 
3151         if (tb[RTA_IP_PROTO]) {
3152                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3153                                                   &ip_proto, AF_INET, extack);
3154                 if (err)
3155                         return err;
3156         }
3157 
3158         if (tb[RTA_SPORT])
3159                 sport = nla_get_be16(tb[RTA_SPORT]);
3160 
3161         if (tb[RTA_DPORT])
3162                 dport = nla_get_be16(tb[RTA_DPORT]);
3163 
3164         skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3165         if (!skb)
3166                 return -ENOBUFS;
3167 
3168         fl4.daddr = dst;
3169         fl4.saddr = src;
3170         fl4.flowi4_tos = rtm->rtm_tos;
3171         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3172         fl4.flowi4_mark = mark;
3173         fl4.flowi4_uid = uid;
3174         if (sport)
3175                 fl4.fl4_sport = sport;
3176         if (dport)
3177                 fl4.fl4_dport = dport;
3178         fl4.flowi4_proto = ip_proto;
3179 
3180         rcu_read_lock();
3181 
3182         if (iif) {
3183                 struct net_device *dev;
3184 
3185                 dev = dev_get_by_index_rcu(net, iif);
3186                 if (!dev) {
3187                         err = -ENODEV;
3188                         goto errout_rcu;
3189                 }
3190 
3191                 fl4.flowi4_iif = iif; /* for rt_fill_info */
3192                 skb->dev        = dev;
3193                 skb->mark       = mark;
3194                 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
3195                                          dev, &res);
3196 
3197                 rt = skb_rtable(skb);
3198                 if (err == 0 && rt->dst.error)
3199                         err = -rt->dst.error;
3200         } else {
3201                 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3202                 skb->dev = net->loopback_dev;
3203                 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3204                 err = 0;
3205                 if (IS_ERR(rt))
3206                         err = PTR_ERR(rt);
3207                 else
3208                         skb_dst_set(skb, &rt->dst);
3209         }
3210 
3211         if (err)
3212                 goto errout_rcu;
3213 
3214         if (rtm->rtm_flags & RTM_F_NOTIFY)
3215                 rt->rt_flags |= RTCF_NOTIFY;
3216 
3217         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3218                 table_id = res.table ? res.table->tb_id : 0;
3219 
3220         /* reset skb for netlink reply msg */
3221         skb_trim(skb, 0);
3222         skb_reset_network_header(skb);
3223         skb_reset_transport_header(skb);
3224         skb_reset_mac_header(skb);
3225 
3226         if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3227                 if (!res.fi) {
3228                         err = fib_props[res.type].error;
3229                         if (!err)
3230                                 err = -EHOSTUNREACH;
3231                         goto errout_rcu;
3232                 }
3233                 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3234                                     nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
3235                                     rt->rt_type, res.prefix, res.prefixlen,
3236                                     fl4.flowi4_tos, res.fi, 0);
3237         } else {
3238                 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3239                                    NETLINK_CB(in_skb).portid,
3240                                    nlh->nlmsg_seq, 0);
3241         }
3242         if (err < 0)
3243                 goto errout_rcu;
3244 
3245         rcu_read_unlock();
3246 
3247         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3248 
3249 errout_free:
3250         return err;
3251 errout_rcu:
3252         rcu_read_unlock();
3253         kfree_skb(skb);
3254         goto errout_free;
3255 }
3256 
3257 void ip_rt_multicast_event(struct in_device *in_dev)
3258 {
3259         rt_cache_flush(dev_net(in_dev->dev));
3260 }
3261 
3262 #ifdef CONFIG_SYSCTL
3263 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3264 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
3265 static int ip_rt_gc_elasticity __read_mostly    = 8;
3266 static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
3267 
3268 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3269                                         void __user *buffer,
3270                                         size_t *lenp, loff_t *ppos)
3271 {
3272         struct net *net = (struct net *)__ctl->extra1;
3273 
3274         if (write) {
3275                 rt_cache_flush(net);
3276                 fnhe_genid_bump(net);
3277                 return 0;
3278         }
3279 
3280         return -EINVAL;
3281 }
3282 
3283 static struct ctl_table ipv4_route_table[] = {
3284         {
3285                 .procname       = "gc_thresh",
3286                 .data           = &ipv4_dst_ops.gc_thresh,
3287                 .maxlen         = sizeof(int),
3288                 .mode           = 0644,
3289                 .proc_handler   = proc_dointvec,
3290         },
3291         {
3292                 .procname       = "max_size",
3293                 .data           = &ip_rt_max_size,
3294                 .maxlen         = sizeof(int),
3295                 .mode           = 0644,
3296                 .proc_handler   = proc_dointvec,
3297         },
3298         {
3299                 /*  Deprecated. Use gc_min_interval_ms */
3300 
3301                 .procname       = "gc_min_interval",
3302                 .data           = &ip_rt_gc_min_interval,
3303                 .maxlen         = sizeof(int),
3304                 .mode           = 0644,
3305                 .proc_handler   = proc_dointvec_jiffies,
3306         },
3307         {
3308                 .procname       = "gc_min_interval_ms",
3309                 .data           = &ip_rt_gc_min_interval,
3310                 .maxlen         = sizeof(int),
3311                 .mode           = 0644,
3312                 .proc_handler   = proc_dointvec_ms_jiffies,
3313         },
3314         {
3315                 .procname       = "gc_timeout",
3316                 .data           = &ip_rt_gc_timeout,
3317                 .maxlen         = sizeof(int),
3318                 .mode           = 0644,
3319                 .proc_handler   = proc_dointvec_jiffies,
3320         },
3321         {
3322                 .procname       = "gc_interval",
3323                 .data           = &ip_rt_gc_interval,
3324                 .maxlen         = sizeof(int),
3325                 .mode           = 0644,
3326                 .proc_handler   = proc_dointvec_jiffies,
3327         },
3328         {
3329                 .procname       = "redirect_load",
3330                 .data           = &ip_rt_redirect_load,
3331                 .maxlen         = sizeof(int),
3332                 .mode           = 0644,
3333                 .proc_handler   = proc_dointvec,
3334         },
3335         {
3336                 .procname       = "redirect_number",
3337                 .data           = &ip_rt_redirect_number,
3338                 .maxlen         = sizeof(int),
3339                 .mode           = 0644,
3340                 .proc_handler   = proc_dointvec,
3341         },
3342         {
3343                 .procname       = "redirect_silence",
3344                 .data           = &ip_rt_redirect_silence,
3345                 .maxlen         = sizeof(int),
3346                 .mode           = 0644,
3347                 .proc_handler   = proc_dointvec,
3348         },
3349         {
3350                 .procname       = "error_cost",
3351                 .data           = &ip_rt_error_cost,
3352                 .maxlen         = sizeof(int),
3353                 .mode           = 0644,
3354                 .proc_handler   = proc_dointvec,
3355         },
3356         {
3357                 .procname       = "error_burst",
3358                 .data           = &ip_rt_error_burst,
3359                 .maxlen         = sizeof(int),
3360                 .mode           = 0644,
3361                 .proc_handler   = proc_dointvec,
3362         },
3363         {
3364                 .procname       = "gc_elasticity",
3365                 .data           = &ip_rt_gc_elasticity,
3366                 .maxlen         = sizeof(int),
3367                 .mode           = 0644,
3368                 .proc_handler   = proc_dointvec,
3369         },
3370         {
3371                 .procname       = "mtu_expires",
3372                 .data           = &ip_rt_mtu_expires,
3373                 .maxlen         = sizeof(int),
3374                 .mode           = 0644,
3375                 .proc_handler   = proc_dointvec_jiffies,
3376         },
3377         {
3378                 .procname       = "min_pmtu",
3379                 .data           = &ip_rt_min_pmtu,
3380                 .maxlen         = sizeof(int),
3381                 .mode           = 0644,
3382                 .proc_handler   = proc_dointvec_minmax,
3383                 .extra1         = &ip_min_valid_pmtu,
3384         },
3385         {
3386                 .procname       = "min_adv_mss",
3387                 .data           = &ip_rt_min_advmss,
3388                 .maxlen         = sizeof(int),
3389                 .mode           = 0644,
3390                 .proc_handler   = proc_dointvec,
3391         },
3392         { }
3393 };
3394 
3395 static const char ipv4_route_flush_procname[] = "flush";
3396 
3397 static struct ctl_table ipv4_route_flush_table[] = {
3398         {
3399                 .procname       = ipv4_route_flush_procname,
3400                 .maxlen         = sizeof(int),
3401                 .mode           = 0200,
3402                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3403         },
3404         { },
3405 };
3406 
3407 static __net_init int sysctl_route_net_init(struct net *net)
3408 {
3409         struct ctl_table *tbl;
3410 
3411         tbl = ipv4_route_flush_table;
3412         if (!net_eq(net, &init_net)) {
3413                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3414                 if (!tbl)
3415                         goto err_dup;
3416 
3417                 /* Don't export non-whitelisted sysctls to unprivileged users */
3418                 if (net->user_ns != &init_user_ns) {
3419                         if (tbl[0].procname != ipv4_route_flush_procname)
3420                                 tbl[0].procname = NULL;
3421                 }
3422         }
3423         tbl[0].extra1 = net;
3424 
3425         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3426         if (!net->ipv4.route_hdr)
3427                 goto err_reg;
3428         return 0;
3429 
3430 err_reg:
3431         if (tbl != ipv4_route_flush_table)
3432                 kfree(tbl);
3433 err_dup:
3434         return -ENOMEM;
3435 }
3436 
3437 static __net_exit void sysctl_route_net_exit(struct net *net)
3438 {
3439         struct ctl_table *tbl;
3440 
3441         tbl = net->ipv4.route_hdr->ctl_table_arg;
3442         unregister_net_sysctl_table(net->ipv4.route_hdr);
3443         BUG_ON(tbl == ipv4_route_flush_table);
3444         kfree(tbl);
3445 }
3446 
3447 static __net_initdata struct pernet_operations sysctl_route_ops = {
3448         .init = sysctl_route_net_init,
3449         .exit = sysctl_route_net_exit,
3450 };
3451 #endif
3452 
3453 static __net_init int rt_genid_init(struct net *net)
3454 {
3455         atomic_set(&net->ipv4.rt_genid, 0);
3456         atomic_set(&net->fnhe_genid, 0);
3457         atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3458         return 0;
3459 }
3460 
3461 static __net_initdata struct pernet_operations rt_genid_ops = {
3462         .init = rt_genid_init,
3463 };
3464 
3465 static int __net_init ipv4_inetpeer_init(struct net *net)
3466 {
3467         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3468 
3469         if (!bp)
3470                 return -ENOMEM;
3471         inet_peer_base_init(bp);
3472         net->ipv4.peers = bp;
3473         return 0;
3474 }
3475 
3476 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3477 {
3478         struct inet_peer_base *bp = net->ipv4.peers;
3479 
3480         net->ipv4.peers = NULL;
3481         inetpeer_invalidate_tree(bp);
3482         kfree(bp);
3483 }
3484 
3485 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3486         .init   =       ipv4_inetpeer_init,
3487         .exit   =       ipv4_inetpeer_exit,
3488 };
3489 
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu accounting area; ip_rt_init() allocates 256 entries for it. */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
3493 
3494 int __init ip_rt_init(void)
3495 {
3496         int cpu;
3497 
3498         ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
3499                                   GFP_KERNEL);
3500         if (!ip_idents)
3501                 panic("IP: failed to allocate ip_idents\n");
3502 
3503         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3504 
3505         ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3506         if (!ip_tstamps)
3507                 panic("IP: failed to allocate ip_tstamps\n");
3508 
3509         for_each_possible_cpu(cpu) {
3510                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3511 
3512                 INIT_LIST_HEAD(&ul->head);
3513                 spin_lock_init(&ul->lock);
3514         }
3515 #ifdef CONFIG_IP_ROUTE_CLASSID
3516         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3517         if (!ip_rt_acct)
3518                 panic("IP: failed to allocate ip_rt_acct\n");
3519 #endif
3520 
3521         ipv4_dst_ops.kmem_cachep =
3522                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3523                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3524 
3525         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3526 
3527         if (dst_entries_init(&ipv4_dst_ops) < 0)
3528                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3529 
3530         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3531                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3532 
3533         ipv4_dst_ops.gc_thresh = ~0;
3534         ip_rt_max_size = INT_MAX;
3535 
3536         devinet_init();
3537         ip_fib_init();
3538 
3539         if (ip_rt_proc_init())
3540                 pr_err("Unable to create route proc files\n");
3541 #ifdef CONFIG_XFRM
3542         xfrm_init();
3543         xfrm4_init();
3544 #endif
3545         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3546                       RTNL_FLAG_DOIT_UNLOCKED);
3547 
3548 #ifdef CONFIG_SYSCTL
3549         register_pernet_subsys(&sysctl_route_ops);
3550 #endif
3551         register_pernet_subsys(&rt_genid_ops);
3552         register_pernet_subsys(&ipv4_inetpeer_ops);
3553         return 0;
3554 }
3555 
3556 #ifdef CONFIG_SYSCTL
3557 /*
3558  * We really need to sanitize the damn ipv4 init order, then all
3559  * this nonsense will go away.
3560  */
3561 void __init ip_static_sysctl_init(void)
3562 {
3563         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3564 }
3565 #endif
3566 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | Wiki (Japanese) | Wiki (English) | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

osdn.jp