Linux/net/ipv4/route.c

// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
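
/* The values above are only compile-time defaults; most of them are, as
 * far as this section shows, exposed as net.ipv4.route.* sysctls further
 * down in this file. Worked example, assuming HZ == 1000:
 * ip_rt_redirect_silence is (HZ / 50) << (9 + 1) == 20 * 1024 == 20480
 * jiffies, i.e. roughly 20 seconds.
 */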

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu,
                                           bool confirm_neigh);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
        .confirm_neigh =        ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

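/* The table below is indexed by the IPv4 TOS bits shifted right by one
 * (rt_tos2priority() in include/net/route.h does
 * ip_tos2prio[IPTOS_TOS(tos) >> 1]), so for example IPTOS_LOWDELAY
 * (0x10) lands on index 8, TC_PRIO_INTERACTIVE.
 */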
const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct proc_ops rt_cache_proc_ops = {
        .proc_open      = rt_cache_seq_open,
        .proc_read      = seq_read,
        .proc_lseek     = seq_lseek,
        .proc_release   = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        (*pos)++;
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct proc_ops rt_cpu_proc_ops = {
        .proc_open      = rt_cpu_seq_open,
        .proc_read      = seq_read,
        .proc_lseek     = seq_lseek,
        .proc_release   = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", 0444, net->proc_net,
                          &rt_cache_proc_ops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", 0444,
                          net->proc_net_stat, &rt_cpu_proc_ops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create_single("rt_acct", 0, net->proc_net,
                        rt_acct_proc_show);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

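/* A cache "flush" does not walk or free anything: rt_cache_flush() just
 * bumps the per-netns IPv4 route generation id, and rt_is_expired() then
 * disqualifies every dst minted under an older generation the next time
 * it is validated.
 */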
static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        struct neighbour *n;

        rcu_read_lock_bh();

        if (likely(rt->rt_gw_family == AF_INET)) {
                n = ip_neigh_gw4(dev, rt->rt_gw4);
        } else if (rt->rt_gw_family == AF_INET6) {
                n = ip_neigh_gw6(dev, &rt->rt_gw6);
        } else {
                __be32 pkey;

                pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
                n = ip_neigh_gw4(dev, pkey);
        }

        if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
                n = NULL;

        rcu_read_unlock_bh();

        return n;
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;

        if (rt->rt_gw_family == AF_INET) {
                pkey = (const __be32 *)&rt->rt_gw4;
        } else if (rt->rt_gw_family == AF_INET6) {
                return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
        } else if (!daddr ||
                 (rt->rt_flags &
                  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
                return;
        }
        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
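/* Illustration: if a generator was last used (now - old) jiffies ago,
 * delta below is drawn uniformly from [0, now - old), so the IP IDs a
 * peer observes no longer reveal the exact number of segments sent in
 * between.
 */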
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = READ_ONCE(*p_tstamp);
        u32 now = (u32)jiffies;
        u32 delta = 0;

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* If UBSAN reports an error here, please make sure your compiler
         * supports -fno-strict-overflow before reporting it: that was a
         * bug in UBSAN itself, and it has been fixed in GCC 8.
         */
        return atomic_add_return(segs + delta, p_id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        u32 hash, id;

        /* Note: the lazy siphash key initialization below is racy, but
         * the race is harmless.
         */
        if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
                get_random_bytes(&net->ipv4.ip_id_key,
                                 sizeof(net->ipv4.ip_id_key));

        hash = siphash_3u32((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol,
                            &net->ipv4.ip_id_key);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

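/* Build the flowi4 key for a route lookup. When a socket is supplied,
 * its bound device, mark, TOS and protocol override the values derived
 * from the packet headers.
 */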
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                             const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct net *net = dev_net(skb->dev);
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

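/* Writers of the per-nexthop exception (fnhe) tables all serialize on
 * this single spinlock; lookups run locklessly under RCU.
 */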
static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
}

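/* Return the least recently touched exception in a bucket (smallest
 * fnhe_stamp), with its cached routes flushed so the caller can safely
 * recycle the entry.
 */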
static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_uses_gateway = 1;
                rt->rt_gw_family = AF_INET;
                rt->rt_gw4 = fnhe->fnhe_gw;
        }
}

static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
                                  __be32 gw, u32 pmtu, bool lock,
                                  unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        u32 genid, hval;
        unsigned int i;
        int depth;

        genid = fnhe_genid(dev_net(nhc->nhc_dev));
        hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nhc->nhc_exceptions);
        if (!hash) {
                hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nhc->nhc_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (fnhe->fnhe_genid != genid)
                        fnhe->fnhe_genid = genid;
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_mtu_locked = lock;
                }
                fnhe->fnhe_expires = max(1UL, expires);
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = genid;
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_mtu_locked = lock;
                fnhe->fnhe_expires = max(1UL, expires);

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nhc->nhc_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

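/* Handle an ICMP redirect: validate the advertised gateway (it must be
 * unicast, on-link and genuinely different from the old one), then
 * record it as a per-destination nexthop exception so that subsequent
 * lookups pick it up.
 */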
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh_common *nhc;

                                fib_select_path(net, &res, fl4, skb);
                                nhc = FIB_RES_NHC(res);
                                update_or_create_fnhe(nhc, fl4->daddr, new_gw,
                                                0, false,
                                                jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct net *net = dev_net(skb->dev);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

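/* Worked example (assuming HZ == 1000): with ip_rt_redirect_load == HZ/50,
 * the k-th consecutive redirect is sent no sooner than
 * rate_last + (20 << k) jiffies, so the gaps grow 40ms, 80ms, 160ms, ...
 * After ip_rt_redirect_number (9) ignored redirects we go silent until
 * ip_rt_redirect_silence (~20s) has passed without further triggers.
 */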
void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
                peer->rate_tokens = 0;
                peer->n_redirects = 0;
        }

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the last seen redirected packet.
         */
        if (peer->n_redirects >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->n_redirects == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->n_redirects)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->n_redirects == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

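/* ip_error() rate-limits the ICMP errors it generates with a simple
 * per-peer token bucket: tokens accrue one per jiffy up to
 * ip_rt_error_burst (5 * HZ) and each ICMP costs ip_rt_error_cost (HZ),
 * i.e. roughly one error per second with bursts of up to five.
 */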
static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        if (netif_is_l3_master(skb->dev)) {
                dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
                if (!dev)
                        goto out;
        }

        in_dev = __in_dev_get_rcu(dev);

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

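/* Record a freshly learned path MTU as a nexthop exception. Values below
 * ip_rt_min_pmtu are clamped and the entry is locked, which blunts
 * forged ICMP "fragmentation needed" messages advertising absurdly small
 * MTUs.
 */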
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct net *net = dev_net(dst->dev);
        u32 old_mtu = ipv4_mtu(dst);
        struct fib_result res;
        bool lock = false;

        if (ip_mtu_locked(dst))
                return;

        if (old_mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu) {
                lock = true;
                mtu = min(old_mtu, ip_rt_min_pmtu);
        }

        if (rt->rt_pmtu == mtu && !lock &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(net, fl4, &res, 0) == 0) {
                struct fib_nh_common *nhc;

                fib_select_path(net, &res, fl4, NULL);
                nhc = FIB_RES_NHC(res);
                update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu,
                              bool confirm_neigh)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);

        /* Don't make lookup fail for bridged encapsulations */
        if (skb && netif_is_any_bridge_port(skb->dev))
                fl4.flowi4_oif = 0;

        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        u32 mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct net *net = sock_net(sk);

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPv4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK, which always forces validation calls
         * down into this function.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD.
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
        struct ip_options opt;
        int res;

        /* Recompile IP options since IPCB may not be valid anymore.
         * Also check that we have a reasonable IPv4 header.
         */
        if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
            ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
                return;

        memset(&opt, 0, sizeof(opt));
        if (ip_hdr(skb)->ihl > 5) {
                if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
                        return;
                opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

                rcu_read_lock();
                res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
                rcu_read_unlock();

                if (res)
                        return;
        }
        __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        ipv4_send_dest_unreach(skb);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
 * We do not cache the source address of the outgoing interface, because
 * it is used only by the IP RR, TS and SRR options, so it is out of the
 * fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct iphdr *iph = ip_hdr(skb);
                struct flowi4 fl4 = {
                        .daddr = iph->daddr,
                        .saddr = iph->saddr,
                        .flowi4_tos = RT_TOS(iph->tos),
                        .flowi4_oif = rt->dst.dev->ifindex,
                        .flowi4_iif = skb->dev->ifindex,
                        .flowi4_mark = skb->mark,
                };

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
        unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
                                    ip_rt_min_advmss);

        return min(advmss, IPV4_MAX_PMTU - header_size);
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = READ_ONCE(dst->dev->mtu);

        if (unlikely(ip_mtu_locked(dst))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe, __rcu **fnhe_p;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference_protected(nhc->nhc_exceptions,
                                         lockdep_is_held(&fnhe_lock));
        hash += hval;

        fnhe_p = &hash->chain;
        fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
        while (fnhe) {
                if (fnhe->fnhe_daddr == daddr) {
                        rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
                                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
                        /* set fnhe_daddr to 0 to ensure it won't bind with
                         * new dsts in rt_bind_exception().
                         */
                        fnhe->fnhe_daddr = 0;
                        fnhe_flush_routes(fnhe);
                        kfree_rcu(fnhe, rcu);
                        break;
                }
                fnhe_p = &fnhe->fnhe_next;
                fnhe = rcu_dereference_protected(fnhe->fnhe_next,
                                                 lockdep_is_held(&fnhe_lock));
        }

        spin_unlock_bh(&fnhe_lock);
}

static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
                                               __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr) {
                        if (fnhe->fnhe_expires &&
                            time_after(jiffies, fnhe->fnhe_expires)) {
                                ip_del_fnhe(nhc, daddr);
                                break;
                        }
                        return fnhe;
                }
        }
        return NULL;
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */

u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
        struct fib_nh_common *nhc = res->nhc;
        struct net_device *dev = nhc->nhc_dev;
        struct fib_info *fi = res->fi;
        u32 mtu = 0;

        if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
            fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
                mtu = fi->fib_mtu;

        if (likely(!mtu)) {
                struct fib_nh_exception *fnhe;

                fnhe = find_exception(nhc, daddr);
                if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
                        mtu = fnhe->fnhe_pmtu;
        }

        if (likely(!mtu))
                mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

        return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr, const bool do_cache)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe->fnhe_mtu_locked = false;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gw4) {
                        rt->rt_gw4 = daddr;
                        rt->rt_gw_family = AF_INET;
                }

                if (do_cache) {
                        dst_hold(&rt->dst);
                        rcu_assign_pointer(*porig, rt);
                        if (orig) {
                                dst_dev_put(&orig->dst);
                                dst_release(&orig->dst);
                        }
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

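/* Publish rt as the cached route for a nexthop (the input slot, or this
 * CPU's output slot) via cmpxchg(); on success the previous occupant is
 * moved to the uncached list and released, on failure rt's extra hold is
 * dropped and false is returned.
 */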
1477 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1478 {
1479         struct rtable *orig, *prev, **p;
1480         bool ret = true;
1481 
1482         if (rt_is_input_route(rt)) {
1483                 p = (struct rtable **)&nhc->nhc_rth_input;
1484         } else {
1485                 p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1486         }
1487         orig = *p;
1488 
1489         /* hold dst before doing cmpxchg() to avoid race condition
1490          * on this dst
1491          */
1492         dst_hold(&rt->dst);
1493         prev = cmpxchg(p, orig, rt);
1494         if (prev == orig) {
1495                 if (orig) {
1496                         rt_add_uncached_list(orig);
1497                         dst_release(&orig->dst);
1498                 }
1499         } else {
1500                 dst_release(&rt->dst);
1501                 ret = false;
1502         }
1503 
1504         return ret;
1505 }
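
/* [Editor's sketch, hypothetical helper] The publish pattern used by
 * rt_cache_route() above, reduced to its core: take a reference
 * *before* the compare-and-swap so the slot never holds an
 * unreferenced object, and undo the hold if the race is lost.
 */
static bool publish_sketch(struct rtable **slot, struct rtable *old,
			   struct rtable *new)
{
	dst_hold(&new->dst);			/* pin before publishing */
	if (cmpxchg(slot, old, new) == old) {
		if (old)
			dst_release(&old->dst);	/* old occupant's slot ref */
		return true;
	}
	dst_release(&new->dst);			/* lost the race: undo hold */
	return false;
}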
1506 
1507 struct uncached_list {
1508         spinlock_t              lock;
1509         struct list_head        head;
1510 };
1511 
1512 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1513 
1514 void rt_add_uncached_list(struct rtable *rt)
1515 {
1516         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1517 
1518         rt->rt_uncached_list = ul;
1519 
1520         spin_lock_bh(&ul->lock);
1521         list_add_tail(&rt->rt_uncached, &ul->head);
1522         spin_unlock_bh(&ul->lock);
1523 }
1524 
1525 void rt_del_uncached_list(struct rtable *rt)
1526 {
1527         if (!list_empty(&rt->rt_uncached)) {
1528                 struct uncached_list *ul = rt->rt_uncached_list;
1529 
1530                 spin_lock_bh(&ul->lock);
1531                 list_del(&rt->rt_uncached);
1532                 spin_unlock_bh(&ul->lock);
1533         }
1534 }
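
/* [Editor's note, with a hypothetical sketch] The two helpers above
 * implement a per-CPU "uncached" registry: each CPU owns a lock+list
 * pair to spread contention, and every rtable remembers which pair it
 * joined (rt->rt_uncached_list), so removal takes the right lock even
 * when it runs on a different CPU than the insertion did.
 */
struct bucket_sketch {
	spinlock_t		lock;
	struct list_head	head;
};

static void add_sketch(struct bucket_sketch *b, struct list_head *item,
		       struct bucket_sketch **home)
{
	*home = b;			/* remember the owning bucket */
	spin_lock_bh(&b->lock);
	list_add_tail(item, &b->head);
	spin_unlock_bh(&b->lock);
	/* removal later locks (*home)->lock, whatever CPU it runs on */
}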
1535 
1536 static void ipv4_dst_destroy(struct dst_entry *dst)
1537 {
1538         struct rtable *rt = (struct rtable *)dst;
1539 
1540         ip_dst_metrics_put(dst);
1541         rt_del_uncached_list(rt);
1542 }
1543 
1544 void rt_flush_dev(struct net_device *dev)
1545 {
1546         struct rtable *rt;
1547         int cpu;
1548 
1549         for_each_possible_cpu(cpu) {
1550                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1551 
1552                 spin_lock_bh(&ul->lock);
1553                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1554                         if (rt->dst.dev != dev)
1555                                 continue;
1556                         rt->dst.dev = blackhole_netdev;
1557                         dev_hold(rt->dst.dev);
1558                         dev_put(dev);
1559                 }
1560                 spin_unlock_bh(&ul->lock);
1561         }
1562 }
1563 
1564 static bool rt_cache_valid(const struct rtable *rt)
1565 {
1566         return  rt &&
1567                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1568                 !rt_is_expired(rt);
1569 }
1570 
1571 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1572                            const struct fib_result *res,
1573                            struct fib_nh_exception *fnhe,
1574                            struct fib_info *fi, u16 type, u32 itag,
1575                            const bool do_cache)
1576 {
1577         bool cached = false;
1578 
1579         if (fi) {
1580                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1581 
1582                 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1583                         rt->rt_uses_gateway = 1;
1584                         rt->rt_gw_family = nhc->nhc_gw_family;
1585                         /* only INET and INET6 are supported */
1586                         if (likely(nhc->nhc_gw_family == AF_INET))
1587                                 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1588                         else
1589                                 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1590                 }
1591 
1592                 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1593 
1594 #ifdef CONFIG_IP_ROUTE_CLASSID
1595                 if (nhc->nhc_family == AF_INET) {
1596                         struct fib_nh *nh;
1597 
1598                         nh = container_of(nhc, struct fib_nh, nh_common);
1599                         rt->dst.tclassid = nh->nh_tclassid;
1600                 }
1601 #endif
1602                 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1603                 if (unlikely(fnhe))
1604                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1605                 else if (do_cache)
1606                         cached = rt_cache_route(nhc, rt);
1607                 if (unlikely(!cached)) {
1608                         /* Routes we intend to cache in nexthop exception or
1609                          * FIB nexthop have the DST_NOCACHE bit clear.
1610                          * However, if we are unsuccessful at storing this
1611                          * route into the cache we really need to set it.
1612                          */
1613                         if (!rt->rt_gw4) {
1614                                 rt->rt_gw_family = AF_INET;
1615                                 rt->rt_gw4 = daddr;
1616                         }
1617                         rt_add_uncached_list(rt);
1618                 }
1619         } else
1620                 rt_add_uncached_list(rt);
1621 
1622 #ifdef CONFIG_IP_ROUTE_CLASSID
1623 #ifdef CONFIG_IP_MULTIPLE_TABLES
1624         set_class_tag(rt, res->tclassid);
1625 #endif
1626         set_class_tag(rt, itag);
1627 #endif
1628 }
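
/* [Editor's note] The caching decision in rt_set_nexthop() above, in
 * brief: with a fib_info, an exception entry (fnhe) takes priority and
 * the route is bound into it; otherwise do_cache stores it in the
 * per-nexthop cache; if neither sticks (or there is no fib_info at
 * all), the route joins the uncached list so rt_flush_dev() can still
 * find and retarget it on device removal.
 */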
1629 
1630 struct rtable *rt_dst_alloc(struct net_device *dev,
1631                             unsigned int flags, u16 type,
1632                             bool nopolicy, bool noxfrm)
1633 {
1634         struct rtable *rt;
1635 
1636         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1637                        (nopolicy ? DST_NOPOLICY : 0) |
1638                        (noxfrm ? DST_NOXFRM : 0));
1639 
1640         if (rt) {
1641                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1642                 rt->rt_flags = flags;
1643                 rt->rt_type = type;
1644                 rt->rt_is_input = 0;
1645                 rt->rt_iif = 0;
1646                 rt->rt_pmtu = 0;
1647                 rt->rt_mtu_locked = 0;
1648                 rt->rt_uses_gateway = 0;
1649                 rt->rt_gw_family = 0;
1650                 rt->rt_gw4 = 0;
1651                 INIT_LIST_HEAD(&rt->rt_uncached);
1652 
1653                 rt->dst.output = ip_output;
1654                 if (flags & RTCF_LOCAL)
1655                         rt->dst.input = ip_local_deliver;
1656         }
1657 
1658         return rt;
1659 }
1660 EXPORT_SYMBOL(rt_dst_alloc);
1661 
1662 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1663 {
1664         struct rtable *new_rt;
1665 
1666         new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1667                            rt->dst.flags);
1668 
1669         if (new_rt) {
1670                 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1671                 new_rt->rt_flags = rt->rt_flags;
1672                 new_rt->rt_type = rt->rt_type;
1673                 new_rt->rt_is_input = rt->rt_is_input;
1674                 new_rt->rt_iif = rt->rt_iif;
1675                 new_rt->rt_pmtu = rt->rt_pmtu;
1676                 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1677                 new_rt->rt_gw_family = rt->rt_gw_family;
1678                 if (rt->rt_gw_family == AF_INET)
1679                         new_rt->rt_gw4 = rt->rt_gw4;
1680                 else if (rt->rt_gw_family == AF_INET6)
1681                         new_rt->rt_gw6 = rt->rt_gw6;
1682                 INIT_LIST_HEAD(&new_rt->rt_uncached);
1683 
1684                 new_rt->dst.input = rt->dst.input;
1685                 new_rt->dst.output = rt->dst.output;
1686                 new_rt->dst.error = rt->dst.error;
1687                 new_rt->dst.lastuse = jiffies;
1688                 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1689         }
1690         return new_rt;
1691 }
1692 EXPORT_SYMBOL(rt_dst_clone);
1693 
1694 /* called in rcu_read_lock() section */
1695 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1696                           u8 tos, struct net_device *dev,
1697                           struct in_device *in_dev, u32 *itag)
1698 {
1699         int err;
1700 
1701         /* Primary sanity checks. */
1702         if (!in_dev)
1703                 return -EINVAL;
1704 
1705         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1706             skb->protocol != htons(ETH_P_IP))
1707                 return -EINVAL;
1708 
1709         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1710                 return -EINVAL;
1711 
1712         if (ipv4_is_zeronet(saddr)) {
1713                 if (!ipv4_is_local_multicast(daddr) &&
1714                     ip_hdr(skb)->protocol != IPPROTO_IGMP)
1715                         return -EINVAL;
1716         } else {
1717                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1718                                           in_dev, itag);
1719                 if (err < 0)
1720                         return err;
1721         }
1722         return 0;
1723 }
1724 
1725 /* called in rcu_read_lock() section */
1726 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1727                              u8 tos, struct net_device *dev, int our)
1728 {
1729         struct in_device *in_dev = __in_dev_get_rcu(dev);
1730         unsigned int flags = RTCF_MULTICAST;
1731         struct rtable *rth;
1732         u32 itag = 0;
1733         int err;
1734 
1735         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1736         if (err)
1737                 return err;
1738 
1739         if (our)
1740                 flags |= RTCF_LOCAL;
1741 
1742         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1743                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1744         if (!rth)
1745                 return -ENOBUFS;
1746 
1747 #ifdef CONFIG_IP_ROUTE_CLASSID
1748         rth->dst.tclassid = itag;
1749 #endif
1750         rth->dst.output = ip_rt_bug;
1751         rth->rt_is_input = 1;
1752 
1753 #ifdef CONFIG_IP_MROUTE
1754         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1755                 rth->dst.input = ip_mr_input;
1756 #endif
1757         RT_CACHE_STAT_INC(in_slow_mc);
1758 
1759         skb_dst_set(skb, &rth->dst);
1760         return 0;
1761 }
1762 
1763 
1764 static void ip_handle_martian_source(struct net_device *dev,
1765                                      struct in_device *in_dev,
1766                                      struct sk_buff *skb,
1767                                      __be32 daddr,
1768                                      __be32 saddr)
1769 {
1770         RT_CACHE_STAT_INC(in_martian_src);
1771 #ifdef CONFIG_IP_ROUTE_VERBOSE
1772         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1773                 /*
1774                  *      RFC1812 recommendation: if the source is martian,
1775                  *      the only hint is the MAC header.
1776                  */
1777                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1778                         &daddr, &saddr, dev->name);
1779                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1780                         print_hex_dump(KERN_WARNING, "ll header: ",
1781                                        DUMP_PREFIX_OFFSET, 16, 1,
1782                                        skb_mac_header(skb),
1783                                        dev->hard_header_len, false);
1784                 }
1785         }
1786 #endif
1787 }
1788 
1789 /* called in rcu_read_lock() section */
1790 static int __mkroute_input(struct sk_buff *skb,
1791                            const struct fib_result *res,
1792                            struct in_device *in_dev,
1793                            __be32 daddr, __be32 saddr, u32 tos)
1794 {
1795         struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1796         struct net_device *dev = nhc->nhc_dev;
1797         struct fib_nh_exception *fnhe;
1798         struct rtable *rth;
1799         int err;
1800         struct in_device *out_dev;
1801         bool do_cache;
1802         u32 itag = 0;
1803 
1804         /* get a working reference to the output device */
1805         out_dev = __in_dev_get_rcu(dev);
1806         if (!out_dev) {
1807                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1808                 return -EINVAL;
1809         }
1810 
1811         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1812                                   in_dev->dev, in_dev, &itag);
1813         if (err < 0) {
1814                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1815                                          saddr);
1816 
1817                 goto cleanup;
1818         }
1819 
1820         do_cache = res->fi && !itag;
1821         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1822             skb->protocol == htons(ETH_P_IP)) {
1823                 __be32 gw;
1824 
1825                 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1826                 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1827                     inet_addr_onlink(out_dev, saddr, gw))
1828                         IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1829         }
1830 
1831         if (skb->protocol != htons(ETH_P_IP)) {
1832                 /* Not IP (i.e. ARP). Do not create a route if it is
1833                  * invalid for proxy ARP. DNAT routes are always valid.
1834                  *
1835                  * The proxy ARP feature has been extended to allow ARP
1836                  * replies back to the same interface, to support
1837                  * Private VLAN switch technologies. See arp.c.
1838                  */
1839                 if (out_dev == in_dev &&
1840                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1841                         err = -EINVAL;
1842                         goto cleanup;
1843                 }
1844         }
1845 
1846         fnhe = find_exception(nhc, daddr);
1847         if (do_cache) {
1848                 if (fnhe)
1849                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1850                 else
1851                         rth = rcu_dereference(nhc->nhc_rth_input);
1852                 if (rt_cache_valid(rth)) {
1853                         skb_dst_set_noref(skb, &rth->dst);
1854                         goto out;
1855                 }
1856         }
1857 
1858         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1859                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1860                            IN_DEV_CONF_GET(out_dev, NOXFRM));
1861         if (!rth) {
1862                 err = -ENOBUFS;
1863                 goto cleanup;
1864         }
1865 
1866         rth->rt_is_input = 1;
1867         RT_CACHE_STAT_INC(in_slow_tot);
1868 
1869         rth->dst.input = ip_forward;
1870 
1871         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1872                        do_cache);
1873         lwtunnel_set_redirect(&rth->dst);
1874         skb_dst_set(skb, &rth->dst);
1875 out:
1876         err = 0;
1877  cleanup:
1878         return err;
1879 }
1880 
1881 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1882 /* To make ICMP packets follow the right flow, the multipath hash is
1883  * calculated from the inner IP addresses.
1884  */
1885 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1886                                  struct flow_keys *hash_keys)
1887 {
1888         const struct iphdr *outer_iph = ip_hdr(skb);
1889         const struct iphdr *key_iph = outer_iph;
1890         const struct iphdr *inner_iph;
1891         const struct icmphdr *icmph;
1892         struct iphdr _inner_iph;
1893         struct icmphdr _icmph;
1894 
1895         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1896                 goto out;
1897 
1898         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1899                 goto out;
1900 
1901         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1902                                    &_icmph);
1903         if (!icmph)
1904                 goto out;
1905 
1906         if (!icmp_is_err(icmph->type))
1907                 goto out;
1908 
1909         inner_iph = skb_header_pointer(skb,
1910                                        outer_iph->ihl * 4 + sizeof(_icmph),
1911                                        sizeof(_inner_iph), &_inner_iph);
1912         if (!inner_iph)
1913                 goto out;
1914 
1915         key_iph = inner_iph;
1916 out:
1917         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1918         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1919 }
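
/* [Editor's sketch, hypothetical helper over a plain byte buffer] The
 * header walk above in byte offsets: for an ICMP error, the offending
 * datagram's IP header starts right after the 8-byte ICMP header, so
 * the hash keys live at fixed offsets past it.
 */
static int icmp_inner_keys_sketch(const u8 *pkt, unsigned int len,
				  __be32 *saddr, __be32 *daddr)
{
	unsigned int ihl, off;

	if (len < 20)				/* need the outer IP header */
		return -1;
	ihl = (pkt[0] & 0x0f) * 4;		/* outer header length */
	off = ihl + 8;				/* skip outer IP + ICMP header */
	if (len < off + 20)			/* need a full inner IP header */
		return -1;
	memcpy(saddr, pkt + off + 12, 4);	/* inner iphdr.saddr */
	memcpy(daddr, pkt + off + 16, 4);	/* inner iphdr.daddr */
	return 0;
}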
1920 
1921 /* if skb is set it will be used and fl4 can be NULL */
1922 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1923                        const struct sk_buff *skb, struct flow_keys *flkeys)
1924 {
1925         u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1926         struct flow_keys hash_keys;
1927         u32 mhash;
1928 
1929         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1930         case 0:
1931                 memset(&hash_keys, 0, sizeof(hash_keys));
1932                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1933                 if (skb) {
1934                         ip_multipath_l3_keys(skb, &hash_keys);
1935                 } else {
1936                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1937                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1938                 }
1939                 break;
1940         case 1:
1941                 /* skb is currently provided only when forwarding */
1942                 if (skb) {
1943                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1944                         struct flow_keys keys;
1945 
1946                         /* short-circuit if we already have L4 hash present */
1947                         if (skb->l4_hash)
1948                                 return skb_get_hash_raw(skb) >> 1;
1949 
1950                         memset(&hash_keys, 0, sizeof(hash_keys));
1951 
1952                         if (!flkeys) {
1953                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1954                                 flkeys = &keys;
1955                         }
1956 
1957                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1958                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1959                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1960                         hash_keys.ports.src = flkeys->ports.src;
1961                         hash_keys.ports.dst = flkeys->ports.dst;
1962                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1963                 } else {
1964                         memset(&hash_keys, 0, sizeof(hash_keys));
1965                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1966                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1967                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1968                         hash_keys.ports.src = fl4->fl4_sport;
1969                         hash_keys.ports.dst = fl4->fl4_dport;
1970                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1971                 }
1972                 break;
1973         case 2:
1974                 memset(&hash_keys, 0, sizeof(hash_keys));
1975                 /* skb is currently provided only when forwarding */
1976                 if (skb) {
1977                         struct flow_keys keys;
1978 
1979                         skb_flow_dissect_flow_keys(skb, &keys, 0);
1980                         /* Inner can be v4 or v6 */
1981                         if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1982                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1983                                 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1984                                 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1985                         } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1986                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1987                                 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1988                                 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1989                                 hash_keys.tags.flow_label = keys.tags.flow_label;
1990                                 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1991                         } else {
1992                                 /* Same as case 0 */
1993                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1994                                 ip_multipath_l3_keys(skb, &hash_keys);
1995                         }
1996                 } else {
1997                         /* Same as case 0 */
1998                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1999                         hash_keys.addrs.v4addrs.src = fl4->saddr;
2000                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
2001                 }
2002                 break;
2003         }
2004         mhash = flow_hash_from_keys(&hash_keys);
2005 
2006         if (multipath_hash)
2007                 mhash = jhash_2words(mhash, multipath_hash, 0);
2008 
2009         return mhash >> 1;
2010 }
2011 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
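
/* [Editor's note, with a hypothetical consumer] fib_multipath_hash()
 * returns the 32-bit flow hash shifted right by one, so callers get a
 * 31-bit value. A minimal consumer mapping it onto N equal-weight
 * paths could use a plain modulo, as sketched below; the kernel proper
 * instead compares the hash against per-nexthop upper bounds
 * (hash-threshold) in fib_select_multipath() so weights are honoured.
 */
static unsigned int pick_path_sketch(u32 mhash, unsigned int num_paths)
{
	return mhash % num_paths;	/* simplified equal-weight choice */
}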
2012 
2013 static int ip_mkroute_input(struct sk_buff *skb,
2014                             struct fib_result *res,
2015                             struct in_device *in_dev,
2016                             __be32 daddr, __be32 saddr, u32 tos,
2017                             struct flow_keys *hkeys)
2018 {
2019 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2020         if (res->fi && fib_info_num_path(res->fi) > 1) {
2021                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2022 
2023                 fib_select_multipath(res, h);
2024         }
2025 #endif
2026 
2027         /* create a routing cache entry */
2028         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2029 }
2030 
2031 /* Implements the same saddr-related checks as ip_route_input_slow(),
2032  * assuming daddr is valid and the destination is not a local broadcast one.
2033  * Uses the provided hint instead of performing a route lookup.
2034  */
2035 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2036                       u8 tos, struct net_device *dev,
2037                       const struct sk_buff *hint)
2038 {
2039         struct in_device *in_dev = __in_dev_get_rcu(dev);
2040         struct rtable *rt = skb_rtable(hint);
2041         struct net *net = dev_net(dev);
2042         int err = -EINVAL;
2043         u32 tag = 0;
2044 
2045         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2046                 goto martian_source;
2047 
2048         if (ipv4_is_zeronet(saddr))
2049                 goto martian_source;
2050 
2051         if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2052                 goto martian_source;
2053 
2054         if (rt->rt_type != RTN_LOCAL)
2055                 goto skip_validate_source;
2056 
2057         tos &= IPTOS_RT_MASK;
2058         err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2059         if (err < 0)
2060                 goto martian_source;
2061 
2062 skip_validate_source:
2063         skb_dst_copy(skb, hint);
2064         return 0;
2065 
2066 martian_source:
2067         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2068         return err;
2069 }
2070 
2071 /*
2072  *      NOTE. We drop all packets that have local source
2073  *      addresses, because every properly looped-back packet
2074  *      must already have the correct destination attached by the output routine.
2075  *      Changes in the enforced policies must also be applied to
2076  *      ip_route_use_hint().
2077  *
2078  *      This approach solves two big problems:
2079  *      1. Non-simplex devices are handled properly.
2080  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2081  *      called with rcu_read_lock()
2082  */
2083 
2084 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2085                                u8 tos, struct net_device *dev,
2086                                struct fib_result *res)
2087 {
2088         struct in_device *in_dev = __in_dev_get_rcu(dev);
2089         struct flow_keys *flkeys = NULL, _flkeys;
2090         struct net    *net = dev_net(dev);
2091         struct ip_tunnel_info *tun_info;
2092         int             err = -EINVAL;
2093         unsigned int    flags = 0;
2094         u32             itag = 0;
2095         struct rtable   *rth;
2096         struct flowi4   fl4;
2097         bool do_cache = true;
2098 
2099         /* IP on this device is disabled. */
2100 
2101         if (!in_dev)
2102                 goto out;
2103 
2104         /* Check for the weirdest martians, which cannot be detected
2105            by fib_lookup.
2106          */
2107 
2108         tun_info = skb_tunnel_info(skb);
2109         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2110                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2111         else
2112                 fl4.flowi4_tun_key.tun_id = 0;
2113         skb_dst_drop(skb);
2114 
2115         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2116                 goto martian_source;
2117 
2118         res->fi = NULL;
2119         res->table = NULL;
2120         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2121                 goto brd_input;
2122 
2123         /* Accept zero addresses only for limited broadcast;
2124          * I do not even know whether to fix this or not. Waiting for complaints :-)
2125          */
2126         if (ipv4_is_zeronet(saddr))
2127                 goto martian_source;
2128 
2129         if (ipv4_is_zeronet(daddr))
2130                 goto martian_destination;
2131 
2132         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2133          * and calls it at most once when daddr and/or saddr is a loopback address
2134          */
2135         if (ipv4_is_loopback(daddr)) {
2136                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2137                         goto martian_destination;
2138         } else if (ipv4_is_loopback(saddr)) {
2139                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2140                         goto martian_source;
2141         }
2142 
2143         /*
2144          *      Now we are ready to route the packet.
2145          */
2146         fl4.flowi4_oif = 0;
2147         fl4.flowi4_iif = dev->ifindex;
2148         fl4.flowi4_mark = skb->mark;
2149         fl4.flowi4_tos = tos;
2150         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2151         fl4.flowi4_flags = 0;
2152         fl4.daddr = daddr;
2153         fl4.saddr = saddr;
2154         fl4.flowi4_uid = sock_net_uid(net, NULL);
2155         fl4.flowi4_multipath_hash = 0;
2156 
2157         if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2158                 flkeys = &_flkeys;
2159         } else {
2160                 fl4.flowi4_proto = 0;
2161                 fl4.fl4_sport = 0;
2162                 fl4.fl4_dport = 0;
2163         }
2164 
2165         err = fib_lookup(net, &fl4, res, 0);
2166         if (err != 0) {
2167                 if (!IN_DEV_FORWARD(in_dev))
2168                         err = -EHOSTUNREACH;
2169                 goto no_route;
2170         }
2171 
2172         if (res->type == RTN_BROADCAST) {
2173                 if (IN_DEV_BFORWARD(in_dev))
2174                         goto make_route;
2175                 /* do not cache if bc_forwarding is enabled */
2176                 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2177                         do_cache = false;
2178                 goto brd_input;
2179         }
2180 
2181         if (res->type == RTN_LOCAL) {
2182                 err = fib_validate_source(skb, saddr, daddr, tos,
2183                                           0, dev, in_dev, &itag);
2184                 if (err < 0)
2185                         goto martian_source;
2186                 goto local_input;
2187         }
2188 
2189         if (!IN_DEV_FORWARD(in_dev)) {
2190                 err = -EHOSTUNREACH;
2191                 goto no_route;
2192         }
2193         if (res->type != RTN_UNICAST)
2194                 goto martian_destination;
2195 
2196 make_route:
2197         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2198 out:    return err;
2199 
2200 brd_input:
2201         if (skb->protocol != htons(ETH_P_IP))
2202                 goto e_inval;
2203 
2204         if (!ipv4_is_zeronet(saddr)) {
2205                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2206                                           in_dev, &itag);
2207                 if (err < 0)
2208                         goto martian_source;
2209         }
2210         flags |= RTCF_BROADCAST;
2211         res->type = RTN_BROADCAST;
2212         RT_CACHE_STAT_INC(in_brd);
2213 
2214 local_input:
2215         do_cache &= res->fi && !itag;
2216         if (do_cache) {
2217                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2218 
2219                 rth = rcu_dereference(nhc->nhc_rth_input);
2220                 if (rt_cache_valid(rth)) {
2221                         skb_dst_set_noref(skb, &rth->dst);
2222                         err = 0;
2223                         goto out;
2224                 }
2225         }
2226 
2227         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2228                            flags | RTCF_LOCAL, res->type,
2229                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2230         if (!rth)
2231                 goto e_nobufs;
2232 
2233         rth->dst.output = ip_rt_bug;
2234 #ifdef CONFIG_IP_ROUTE_CLASSID
2235         rth->dst.tclassid = itag;
2236 #endif
2237         rth->rt_is_input = 1;
2238 
2239         RT_CACHE_STAT_INC(in_slow_tot);
2240         if (res->type == RTN_UNREACHABLE) {
2241                 rth->dst.input = ip_error;
2242                 rth->dst.error = -err;
2243                 rth->rt_flags   &= ~RTCF_LOCAL;
2244         }
2245 
2246         if (do_cache) {
2247                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2248 
2249                 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2250                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2251                         WARN_ON(rth->dst.input == lwtunnel_input);
2252                         rth->dst.lwtstate->orig_input = rth->dst.input;
2253                         rth->dst.input = lwtunnel_input;
2254                 }
2255 
2256                 if (unlikely(!rt_cache_route(nhc, rth)))
2257                         rt_add_uncached_list(rth);
2258         }
2259         skb_dst_set(skb, &rth->dst);
2260         err = 0;
2261         goto out;
2262 
2263 no_route:
2264         RT_CACHE_STAT_INC(in_no_route);
2265         res->type = RTN_UNREACHABLE;
2266         res->fi = NULL;
2267         res->table = NULL;
2268         goto local_input;
2269 
2270         /*
2271          *      Do not cache martian addresses: they should be logged (RFC1812)
2272          */
2273 martian_destination:
2274         RT_CACHE_STAT_INC(in_martian_dst);
2275 #ifdef CONFIG_IP_ROUTE_VERBOSE
2276         if (IN_DEV_LOG_MARTIANS(in_dev))
2277                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2278                                      &daddr, &saddr, dev->name);
2279 #endif
2280 
2281 e_inval:
2282         err = -EINVAL;
2283         goto out;
2284 
2285 e_nobufs:
2286         err = -ENOBUFS;
2287         goto out;
2288 
2289 martian_source:
2290         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2291         goto out;
2292 }
2293 
2294 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2295                          u8 tos, struct net_device *dev)
2296 {
2297         struct fib_result res;
2298         int err;
2299 
2300         tos &= IPTOS_RT_MASK;
2301         rcu_read_lock();
2302         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2303         rcu_read_unlock();
2304 
2305         return err;
2306 }
2307 EXPORT_SYMBOL(ip_route_input_noref);
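
/* [Editor's sketch, hypothetical caller] Typical ingress-path usage of
 * ip_route_input_noref(): feed it the addresses and TOS taken from the
 * IP header, then let the attached dst drive further delivery. The
 * handler name and context are illustrative only.
 */
static int rx_route_sketch(struct sk_buff *skb, struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);
	int err;

	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
				   iph->tos, dev);
	if (err)
		return err;		/* no route: the caller drops skb */
	return dst_input(skb);		/* continue via skb_dst(skb)->input */
}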
2308 
2309 /* called with rcu_read_lock held */
2310 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2311                        u8 tos, struct net_device *dev, struct fib_result *res)
2312 {
2313         /* Multicast recognition logic was moved from the route cache to here.
2314            The problem was that too many Ethernet cards have broken/missing
2315            hardware multicast filters :-( As a result, a host on a multicast
2316            network acquires a lot of useless route cache entries, a sort of
2317            SDR messages from all over the world. Now we try to get rid of them.
2318            Really, provided the software IP multicast filter is organized
2319            reasonably (at least, hashed), it does not result in a slowdown
2320            compared with route cache reject entries.
2321            Note that multicast routers are not affected, because a
2322            route cache entry is created eventually.
2323          */
2324         if (ipv4_is_multicast(daddr)) {
2325                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2326                 int our = 0;
2327                 int err = -EINVAL;
2328 
2329                 if (!in_dev)
2330                         return err;
2331                 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2332                                       ip_hdr(skb)->protocol);
2333 
2334                 /* check l3 master if no match yet */
2335                 if (!our && netif_is_l3_slave(dev)) {
2336                         struct in_device *l3_in_dev;
2337 
2338                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2339                         if (l3_in_dev)
2340                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2341                                                       ip_hdr(skb)->protocol);
2342                 }
2343 
2344                 if (our
2345 #ifdef CONFIG_IP_MROUTE
2346                         ||
2347                     (!ipv4_is_local_multicast(daddr) &&
2348                      IN_DEV_MFORWARD(in_dev))
2349 #endif
2350                    ) {
2351                         err = ip_route_input_mc(skb, daddr, saddr,
2352                                                 tos, dev, our);
2353                 }
2354                 return err;
2355         }
2356 
2357         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2358 }
2359 
2360 /* called with rcu_read_lock() */
2361 static struct rtable *__mkroute_output(const struct fib_result *res,
2362                                        const struct flowi4 *fl4, int orig_oif,
2363                                        struct net_device *dev_out,
2364                                        unsigned int flags)
2365 {
2366         struct fib_info *fi = res->fi;
2367         struct fib_nh_exception *fnhe;
2368         struct in_device *in_dev;
2369         u16 type = res->type;
2370         struct rtable *rth;
2371         bool do_cache;
2372 
2373         in_dev = __in_dev_get_rcu(dev_out);
2374         if (!in_dev)
2375                 return ERR_PTR(-EINVAL);
2376 
2377         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2378                 if (ipv4_is_loopback(fl4->saddr) &&
2379                     !(dev_out->flags & IFF_LOOPBACK) &&
2380                     !netif_is_l3_master(dev_out))
2381                         return ERR_PTR(-EINVAL);
2382 
2383         if (ipv4_is_lbcast(fl4->daddr))
2384                 type = RTN_BROADCAST;
2385         else if (ipv4_is_multicast(fl4->daddr))
2386                 type = RTN_MULTICAST;
2387         else if (ipv4_is_zeronet(fl4->daddr))
2388                 return ERR_PTR(-EINVAL);
2389 
2390         if (dev_out->flags & IFF_LOOPBACK)
2391                 flags |= RTCF_LOCAL;
2392 
2393         do_cache = true;
2394         if (type == RTN_BROADCAST) {
2395                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2396                 fi = NULL;
2397         } else if (type == RTN_MULTICAST) {
2398                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2399                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2400                                      fl4->flowi4_proto))
2401                         flags &= ~RTCF_LOCAL;
2402                 else
2403                         do_cache = false;
2404                 /* If no multicast route exists, use the
2405                  * default one, but do not use a gateway in this case.
2406                  * Yes, it is a hack.
2407                  */
2408                 if (fi && res->prefixlen < 4)
2409                         fi = NULL;
2410         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2411                    (orig_oif != dev_out->ifindex)) {
2412                 /* For local routes that require a particular output interface
2413                  * we do not want to cache the result.  Caching the result
2414                  * causes incorrect behaviour when there are multiple source
2415                  * addresses on the interface, the end result being that if the
2416                  * intended recipient is waiting on that interface for the
2417                  * packet he won't receive it because it will be delivered on
2418                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2419                  * be set to the loopback interface as well.
2420                  */
2421                 do_cache = false;
2422         }
2423 
2424         fnhe = NULL;
2425         do_cache &= fi != NULL;
2426         if (fi) {
2427                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2428                 struct rtable __rcu **prth;
2429 
2430                 fnhe = find_exception(nhc, fl4->daddr);
2431                 if (!do_cache)
2432                         goto add;
2433                 if (fnhe) {
2434                         prth = &fnhe->fnhe_rth_output;
2435                 } else {
2436                         if (unlikely(fl4->flowi4_flags &
2437                                      FLOWI_FLAG_KNOWN_NH &&
2438                                      !(nhc->nhc_gw_family &&
2439                                        nhc->nhc_scope == RT_SCOPE_LINK))) {
2440                                 do_cache = false;
2441                                 goto add;
2442                         }
2443                         prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2444                 }
2445                 rth = rcu_dereference(*prth);
2446                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2447                         return rth;
2448         }
2449 
2450 add:
2451         rth = rt_dst_alloc(dev_out, flags, type,
2452                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2453                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2454         if (!rth)
2455                 return ERR_PTR(-ENOBUFS);
2456 
2457         rth->rt_iif = orig_oif;
2458 
2459         RT_CACHE_STAT_INC(out_slow_tot);
2460 
2461         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2462                 if (flags & RTCF_LOCAL &&
2463                     !(dev_out->flags & IFF_LOOPBACK)) {
2464                         rth->dst.output = ip_mc_output;
2465                         RT_CACHE_STAT_INC(out_slow_mc);
2466                 }
2467 #ifdef CONFIG_IP_MROUTE
2468                 if (type == RTN_MULTICAST) {
2469                         if (IN_DEV_MFORWARD(in_dev) &&
2470                             !ipv4_is_local_multicast(fl4->daddr)) {
2471                                 rth->dst.input = ip_mr_input;
2472                                 rth->dst.output = ip_mc_output;
2473                         }
2474                 }
2475 #endif
2476         }
2477 
2478         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2479         lwtunnel_set_redirect(&rth->dst);
2480 
2481         return rth;
2482 }
2483 
2484 /*
2485  * Major route resolver routine.
2486  */
2487 
2488 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2489                                         const struct sk_buff *skb)
2490 {
2491         __u8 tos = RT_FL_TOS(fl4);
2492         struct fib_result res = {
2493                 .type           = RTN_UNSPEC,
2494                 .fi             = NULL,
2495                 .table          = NULL,
2496                 .tclassid       = 0,
2497         };
2498         struct rtable *rth;
2499 
2500         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2501         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2502         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2503                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2504 
2505         rcu_read_lock();
2506         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2507         rcu_read_unlock();
2508 
2509         return rth;
2510 }
2511 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
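
/* [Editor's sketch, hypothetical caller] The usual entry point for
 * output lookups is ip_route_output_key(), a thin wrapper that ends up
 * in the resolver above. A caller fills a flowi4 and checks for an
 * ERR_PTR result; the function and its error handling are illustrative.
 */
static int out_route_sketch(struct net *net, __be32 daddr, __be32 saddr)
{
	struct flowi4 fl4;
	struct rtable *rt;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;
	fl4.saddr = saddr;		/* 0 lets the resolver choose one */

	rt = ip_route_output_key(net, &fl4);
	if (IS_ERR(rt))
		return PTR_ERR(rt);	/* e.g. -ENETUNREACH */
	/* ... use rt->dst here ... */
	ip_rt_put(rt);			/* drop the reference when done */
	return 0;
}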
2512 
2513 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2514                                             struct fib_result *res,
2515                                             const struct sk_buff *skb)
2516 {
2517         struct net_device *dev_out = NULL;
2518         int orig_oif = fl4->flowi4_oif;
2519         unsigned int flags = 0;
2520         struct rtable *rth;
2521         int err;
2522 
2523         if (fl4->saddr) {
2524                 if (ipv4_is_multicast(fl4->saddr) ||
2525                     ipv4_is_lbcast(fl4->saddr) ||
2526                     ipv4_is_zeronet(fl4->saddr)) {
2527                         rth = ERR_PTR(-EINVAL);
2528                         goto out;
2529                 }
2530 
2531                 rth = ERR_PTR(-ENETUNREACH);
2532 
2533                 /* I removed check for oif == dev_out->oif here.
2534                    It was wrong for two reasons:
2535                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2536                       is assigned to multiple interfaces.
2537                    2. Moreover, we are allowed to send packets with the saddr
2538                       of another iface. --ANK
2539                  */
2540 
2541                 if (fl4->flowi4_oif == 0 &&
2542                     (ipv4_is_multicast(fl4->daddr) ||
2543                      ipv4_is_lbcast(fl4->daddr))) {
2544                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2545                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2546                         if (!dev_out)
2547                                 goto out;
2548 
2549                         /* Special hack: the user can direct multicasts
2550                            and limited broadcast via the necessary interface
2551                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2552                            This hack is not just for fun; it allows
2553                            vic, vat and friends to work.
2554                            They bind a socket to loopback, set ttl to zero
2555                            and expect that it will work.
2556                            From the viewpoint of the routing cache they are broken,
2557                            because we are not allowed to build a multicast path
2558                            with a loopback source addr (look, the routing cache
2559                            cannot know that ttl is zero, so the packet
2560                            will not leave this host and the route is valid).
2561                            Luckily, this hack is a good workaround.
2562                          */
2563 
2564                         fl4->flowi4_oif = dev_out->ifindex;
2565                         goto make_route;
2566                 }
2567 
2568                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2569                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2570                         if (!__ip_dev_find(net, fl4->saddr, false))
2571                                 goto out;
2572                 }
2573         }
2574 
2575 
2576         if (fl4->flowi4_oif) {
2577                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2578                 rth = ERR_PTR(-ENODEV);
2579                 if (!dev_out)
2580                         goto out;
2581 
2582                 /* RACE: Check return value of inet_select_addr instead. */
2583                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2584                         rth = ERR_PTR(-ENETUNREACH);
2585                         goto out;
2586                 }
2587                 if (ipv4_is_local_multicast(fl4->daddr) ||
2588                     ipv4_is_lbcast(fl4->daddr) ||
2589                     fl4->flowi4_proto == IPPROTO_IGMP) {
2590                         if (!fl4->saddr)
2591                                 fl4->saddr = inet_select_addr(dev_out, 0,
2592                                                               RT_SCOPE_LINK);
2593                         goto make_route;
2594                 }
2595                 if (!fl4->saddr) {
2596                         if (ipv4_is_multicast(fl4->daddr))
2597                                 fl4->saddr = inet_select_addr(dev_out, 0,
2598                                                               fl4->flowi4_scope);
2599                         else if (!fl4->daddr)
2600                                 fl4->saddr = inet_select_addr(dev_out, 0,
2601                                                               RT_SCOPE_HOST);
2602                 }
2603         }
2604 
2605         if (!fl4->daddr) {
2606                 fl4->daddr = fl4->saddr;
2607                 if (!fl4->daddr)
2608                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2609                 dev_out = net->loopback_dev;
2610                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2611                 res->type = RTN_LOCAL;
2612                 flags |= RTCF_LOCAL;
2613                 goto make_route;
2614         }
2615 
2616         err = fib_lookup(net, fl4, res, 0);
2617         if (err) {
2618                 res->fi = NULL;
2619                 res->table = NULL;
2620                 if (fl4->flowi4_oif &&
2621                     (ipv4_is_multicast(fl4->daddr) ||
2622                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2623                         /* Apparently, the routing tables are wrong. Assume
2624                            that the destination is on-link.
2625 
2626                            WHY? DW.
2627                            Because we are allowed to send to an iface
2628                            even if it has NO routes and NO assigned
2629                            addresses. When oif is specified, the routing
2630                            tables are looked up with only one purpose:
2631                            to catch whether the destination is gatewayed rather
2632                            than direct. Moreover, if MSG_DONTROUTE is set,
2633                            we send the packet, ignoring both routing tables
2634                            and ifaddr state. --ANK
2635 
2636 
2637                            We could do this even if oif is unknown,
2638                            as IPv6 likely does, but we do not.
2639                          */
2640 
2641                         if (fl4->saddr == 0)
2642                                 fl4->saddr = inet_select_addr(dev_out, 0,
2643                                                               RT_SCOPE_LINK);
2644                         res->type = RTN_UNICAST;
2645                         goto make_route;
2646                 }
2647                 rth = ERR_PTR(err);
2648                 goto out;
2649         }
2650 
2651         if (res->type == RTN_LOCAL) {
2652                 if (!fl4->saddr) {
2653                         if (res->fi->fib_prefsrc)
2654                                 fl4->saddr = res->fi->fib_prefsrc;
2655                         else
2656                                 fl4->saddr = fl4->daddr;
2657                 }
2658 
2659                 /* L3 master device is the loopback for that domain */
2660                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2661                         net->loopback_dev;
2662 
2663                 /* make sure orig_oif points to fib result device even
2664                  * though packet rx/tx happens over loopback or l3mdev
2665                  */
2666                 orig_oif = FIB_RES_OIF(*res);
2667 
2668                 fl4->flowi4_oif = dev_out->ifindex;
2669                 flags |= RTCF_LOCAL;
2670                 goto make_route;
2671         }
2672 
2673         fib_select_path(net, res, fl4, skb);
2674 
2675         dev_out = FIB_RES_DEV(*res);
2676 
2677 make_route:
2678         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2679 
2680 out:
2681         return rth;
2682 }
2683 
2684 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2685 {
2686         return NULL;
2687 }
2688 
2689 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2690 {
2691         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2692 
2693         return mtu ? : dst->dev->mtu;
2694 }
2695 
2696 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2697                                           struct sk_buff *skb, u32 mtu,
2698                                           bool confirm_neigh)
2699 {
2700 }
2701 
2702 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2703                                        struct sk_buff *skb)
2704 {
2705 }
2706 
2707 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2708                                           unsigned long old)
2709 {
2710         return NULL;
2711 }
2712 
2713 static struct dst_ops ipv4_dst_blackhole_ops = {
2714         .family                 =       AF_INET,
2715         .check                  =       ipv4_blackhole_dst_check,
2716         .mtu                    =       ipv4_blackhole_mtu,
2717         .default_advmss         =       ipv4_default_advmss,
2718         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2719         .redirect               =       ipv4_rt_blackhole_redirect,
2720         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2721         .neigh_lookup           =       ipv4_neigh_lookup,
2722 };
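
/* [Editor's note] The blackhole ops above form a null-object table:
 * .check always reports the dst as invalid, .update_pmtu and .redirect
 * do nothing, and .cow_metrics returns NULL. A dst converted by
 * ipv4_blackhole_route() below therefore keeps its identity fields but
 * can never be revalidated, re-routed or have metrics written; traffic
 * on it is discarded via dst_discard/dst_discard_out.
 */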
2723 
2724 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2725 {
2726         struct rtable *ort = (struct rtable *) dst_orig;
2727         struct rtable *rt;
2728 
2729         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2730         if (rt) {
2731                 struct dst_entry *new = &rt->dst;
2732 
2733                 new->__use = 1;
2734                 new->input = dst_discard;
2735                 new->output = dst_discard_out;
2736 
2737                 new->dev = net->loopback_dev;
2738                 if (new->dev)
2739                         dev_hold(new->dev);
2740 
2741                 rt->rt_is_input = ort->rt_is_input;
2742                 rt->rt_iif = ort->rt_iif;
2743                 rt->rt_pmtu = ort->rt_pmtu;
2744                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2745 
2746                 rt->rt_genid = rt_genid_ipv4(net);
2747                 rt->rt_flags = ort->rt_flags;
2748                 rt->rt_type = ort->rt_type;
2749                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2750                 rt->rt_gw_family = ort->rt_gw_family;
2751                 if (rt->rt_gw_family == AF_INET)
2752                         rt->rt_gw4 = ort->rt_gw4;
2753                 else if (rt->rt_gw_family == AF_INET6)
2754                         rt->rt_gw6 = ort->rt_gw6;
2755 
2756                 INIT_LIST_HEAD(&rt->rt_uncached);
2757         }
2758 
2759         dst_release(dst_orig);
2760 
2761         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2762 }
2763 
2764 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2765                                     const struct sock *sk)
2766 {
2767         struct rtable *rt = __ip_route_output_key(net, flp4);
2768 
2769         if (IS_ERR(rt))
2770                 return rt;
2771 
2772         if (flp4->flowi4_proto) {
2773                 flp4->flowi4_oif = rt->dst.dev->ifindex;
2774                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2775                                                         flowi4_to_flowi(flp4),
2776                                                         sk, 0);
2777         }
2778 
2779         return rt;
2780 }
2781 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2782 
2783 struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2784                                       struct net_device *dev,
2785                                       struct net *net, __be32 *saddr,
2786                                       const struct ip_tunnel_info *info,
2787                                       u8 protocol, bool use_cache)
2788 {
2789 #ifdef CONFIG_DST_CACHE
2790         struct dst_cache *dst_cache;
2791 #endif
2792         struct rtable *rt = NULL;
2793         struct flowi4 fl4;
2794         __u8 tos;
2795 
2796 #ifdef CONFIG_DST_CACHE
2797         dst_cache = (struct dst_cache *)&info->dst_cache;
2798         if (use_cache) {
2799                 rt = dst_cache_get_ip4(dst_cache, saddr);
2800                 if (rt)
2801                         return rt;
2802         }
2803 #endif
2804         memset(&fl4, 0, sizeof(fl4));
2805         fl4.flowi4_mark = skb->mark;
2806         fl4.flowi4_proto = protocol;
2807         fl4.daddr = info->key.u.ipv4.dst;
2808         fl4.saddr = info->key.u.ipv4.src;
2809         tos = info->key.tos;
2810         fl4.flowi4_tos = RT_TOS(tos);
2811 
2812         rt = ip_route_output_key(net, &fl4);
2813         if (IS_ERR(rt)) {
2814                 netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2815                 return ERR_PTR(-ENETUNREACH);
2816         }
2817         if (rt->dst.dev == dev) { /* is this necessary? */
2818                 netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2819                 ip_rt_put(rt);
2820                 return ERR_PTR(-ELOOP);
2821         }
2822 #ifdef CONFIG_DST_CACHE
2823         if (use_cache)
2824                 dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2825 #endif
2826         *saddr = fl4.saddr;
2827         return rt;
2828 }
2829 EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
2830 
2831 /* called with rcu_read_lock held */
2832 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2833                         struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2834                         struct sk_buff *skb, u32 portid, u32 seq,
2835                         unsigned int flags)
2836 {
2837         struct rtmsg *r;
2838         struct nlmsghdr *nlh;
2839         unsigned long expires = 0;
2840         u32 error;
2841         u32 metrics[RTAX_MAX];
2842 
2843         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2844         if (!nlh)
2845                 return -EMSGSIZE;
2846 
2847         r = nlmsg_data(nlh);
2848         r->rtm_family    = AF_INET;
2849         r->rtm_dst_len  = 32;
2850         r->rtm_src_len  = 0;
2851         r->rtm_tos      = fl4 ? fl4->flowi4_tos : 0;
2852         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2853         if (nla_put_u32(skb, RTA_TABLE, table_id))
2854                 goto nla_put_failure;
2855         r->rtm_type     = rt->rt_type;
2856         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2857         r->rtm_protocol = RTPROT_UNSPEC;
2858         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2859         if (rt->rt_flags & RTCF_NOTIFY)
2860                 r->rtm_flags |= RTM_F_NOTIFY;
2861         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2862                 r->rtm_flags |= RTCF_DOREDIRECT;
2863 
2864         if (nla_put_in_addr(skb, RTA_DST, dst))
2865                 goto nla_put_failure;
2866         if (src) {
2867                 r->rtm_src_len = 32;
2868                 if (nla_put_in_addr(skb, RTA_SRC, src))
2869                         goto nla_put_failure;
2870         }
2871         if (rt->dst.dev &&
2872             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2873                 goto nla_put_failure;
2874 #ifdef CONFIG_IP_ROUTE_CLASSID
2875         if (rt->dst.tclassid &&
2876             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2877                 goto nla_put_failure;
2878 #endif
2879         if (fl4 && !rt_is_input_route(rt) &&
2880             fl4->saddr != src) {
2881                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2882                         goto nla_put_failure;
2883         }
2884         if (rt->rt_uses_gateway) {
2885                 if (rt->rt_gw_family == AF_INET &&
2886                     nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2887                         goto nla_put_failure;
2888                 } else if (rt->rt_gw_family == AF_INET6) {
2889                         int alen = sizeof(struct in6_addr);
2890                         struct nlattr *nla;
2891                         struct rtvia *via;
2892 
2893                         nla = nla_reserve(skb, RTA_VIA, alen + 2);
2894                         if (!nla)
2895                                 goto nla_put_failure;
2896 
2897                         via = nla_data(nla);
2898                         via->rtvia_family = AF_INET6;
2899                         memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2900                 }
2901         }
2902 
2903         expires = rt->dst.expires;
2904         if (expires) {
2905                 unsigned long now = jiffies;
2906 
2907                 if (time_before(now, expires))
2908                         expires -= now;
2909                 else
2910                         expires = 0;
2911         }
2912 
2913         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2914         if (rt->rt_pmtu && expires)
2915                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2916         if (rt->rt_mtu_locked && expires)
2917                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2918         if (rtnetlink_put_metrics(skb, metrics) < 0)
2919                 goto nla_put_failure;
2920 
2921         if (fl4) {
2922                 if (fl4->flowi4_mark &&
2923                     nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2924                         goto nla_put_failure;
2925 
2926                 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2927                     nla_put_u32(skb, RTA_UID,
2928                                 from_kuid_munged(current_user_ns(),
2929                                                  fl4->flowi4_uid)))
2930                         goto nla_put_failure;
2931 
2932                 if (rt_is_input_route(rt)) {
2933 #ifdef CONFIG_IP_MROUTE
2934                         if (ipv4_is_multicast(dst) &&
2935                             !ipv4_is_local_multicast(dst) &&
2936                             IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2937                                 int err = ipmr_get_route(net, skb,
2938                                                          fl4->saddr, fl4->daddr,
2939                                                          r, portid);
2940 
2941                                 if (err <= 0) {
2942                                         if (err == 0)
2943                                                 return 0;
2944                                         goto nla_put_failure;
2945                                 }
2946                         } else
2947 #endif
2948                                 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2949                                         goto nla_put_failure;
2950                 }
2951         }
2952 
2953         error = rt->dst.error;
2954 
2955         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2956                 goto nla_put_failure;
2957 
2958         nlmsg_end(skb, nlh);
2959         return 0;
2960 
2961 nla_put_failure:
2962         nlmsg_cancel(skb, nlh);
2963         return -EMSGSIZE;
2964 }
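/*
 * [Editor's sketch, not part of route.c] The netlink fill pattern that
 * rt_fill_info() follows, reduced to its skeleton: reserve the message
 * header, append attributes, then either finalize or cancel on
 * overflow.  The function name is hypothetical.
 */
static int example_fill_skeleton(struct sk_buff *skb, u32 portid, u32 seq,
                                 __be32 dst)
{
        struct nlmsghdr *nlh;
        struct rtmsg *r;

        nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
        if (!nlh)
                return -EMSGSIZE;

        r = nlmsg_data(nlh);
        memset(r, 0, sizeof(*r));
        r->rtm_family = AF_INET;

        if (nla_put_in_addr(skb, RTA_DST, dst))
                goto nla_put_failure;

        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_cancel(skb, nlh); /* roll back the partial message */
        return -EMSGSIZE;
}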
2965 
2966 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2967                             struct netlink_callback *cb, u32 table_id,
2968                             struct fnhe_hash_bucket *bucket, int genid,
2969                             int *fa_index, int fa_start, unsigned int flags)
2970 {
2971         int i;
2972 
2973         for (i = 0; i < FNHE_HASH_SIZE; i++) {
2974                 struct fib_nh_exception *fnhe;
2975 
2976                 for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2977                      fnhe = rcu_dereference(fnhe->fnhe_next)) {
2978                         struct rtable *rt;
2979                         int err;
2980 
2981                         if (*fa_index < fa_start)
2982                                 goto next;
2983 
2984                         if (fnhe->fnhe_genid != genid)
2985                                 goto next;
2986 
2987                         if (fnhe->fnhe_expires &&
2988                             time_after(jiffies, fnhe->fnhe_expires))
2989                                 goto next;
2990 
2991                         rt = rcu_dereference(fnhe->fnhe_rth_input);
2992                         if (!rt)
2993                                 rt = rcu_dereference(fnhe->fnhe_rth_output);
2994                         if (!rt)
2995                                 goto next;
2996 
2997                         err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
2998                                            table_id, NULL, skb,
2999                                            NETLINK_CB(cb->skb).portid,
3000                                            cb->nlh->nlmsg_seq, flags);
3001                         if (err)
3002                                 return err;
3003 next:
3004                         (*fa_index)++;
3005                 }
3006         }
3007 
3008         return 0;
3009 }
3010 
3011 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3012                        u32 table_id, struct fib_info *fi,
3013                        int *fa_index, int fa_start, unsigned int flags)
3014 {
3015         struct net *net = sock_net(cb->skb->sk);
3016         int nhsel, genid = fnhe_genid(net);
3017 
3018         for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3019                 struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3020                 struct fnhe_hash_bucket *bucket;
3021                 int err;
3022 
3023                 if (nhc->nhc_flags & RTNH_F_DEAD)
3024                         continue;
3025 
3026                 rcu_read_lock();
3027                 bucket = rcu_dereference(nhc->nhc_exceptions);
3028                 err = 0;
3029                 if (bucket)
3030                         err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3031                                                genid, fa_index, fa_start,
3032                                                flags);
3033                 rcu_read_unlock();
3034                 if (err)
3035                         return err;
3036         }
3037 
3038         return 0;
3039 }
3040 
3041 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3042                                                    u8 ip_proto, __be16 sport,
3043                                                    __be16 dport)
3044 {
3045         struct sk_buff *skb;
3046         struct iphdr *iph;
3047 
3048         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3049         if (!skb)
3050                 return NULL;
3051 
3052         /* Reserve room for dummy headers; this skb can pass
3053          * through a good chunk of the routing engine.
3054          */
3055         skb_reset_mac_header(skb);
3056         skb_reset_network_header(skb);
3057         skb->protocol = htons(ETH_P_IP);
3058         iph = skb_put(skb, sizeof(struct iphdr));
3059         iph->protocol = ip_proto;
3060         iph->saddr = src;
3061         iph->daddr = dst;
3062         iph->version = 0x4;
3063         iph->frag_off = 0;
3064         iph->ihl = 0x5;
3065         skb_set_transport_header(skb, skb->len);
3066 
3067         switch (iph->protocol) {
3068         case IPPROTO_UDP: {
3069                 struct udphdr *udph;
3070 
3071                 udph = skb_put_zero(skb, sizeof(struct udphdr));
3072                 udph->source = sport;
3073                 udph->dest = dport;
3074                 udph->len = sizeof(struct udphdr);
3075                 udph->check = 0;
3076                 break;
3077         }
3078         case IPPROTO_TCP: {
3079                 struct tcphdr *tcph;
3080 
3081                 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3082                 tcph->source    = sport;
3083                 tcph->dest      = dport;
3084                 tcph->doff      = sizeof(struct tcphdr) / 4;
3085                 tcph->rst = 1;
3086                 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3087                                             src, dst, 0);
3088                 break;
3089         }
3090         case IPPROTO_ICMP: {
3091                 struct icmphdr *icmph;
3092 
3093                 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3094                 icmph->type = ICMP_ECHO;
3095                 icmph->code = 0;
3096         }
3097         }
3098 
3099         return skb;
3100 }
3101 
3102 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3103                                        const struct nlmsghdr *nlh,
3104                                        struct nlattr **tb,
3105                                        struct netlink_ext_ack *extack)
3106 {
3107         struct rtmsg *rtm;
3108         int i, err;
3109 
3110         if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3111                 NL_SET_ERR_MSG(extack,
3112                                "ipv4: Invalid header for route get request");
3113                 return -EINVAL;
3114         }
3115 
3116         if (!netlink_strict_get_check(skb))
3117                 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3118                                               rtm_ipv4_policy, extack);
3119 
3120         rtm = nlmsg_data(nlh);
3121         if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3122             (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3123             rtm->rtm_table || rtm->rtm_protocol ||
3124             rtm->rtm_scope || rtm->rtm_type) {
3125                 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3126                 return -EINVAL;
3127         }
3128 
3129         if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3130                                RTM_F_LOOKUP_TABLE |
3131                                RTM_F_FIB_MATCH)) {
3132                 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3133                 return -EINVAL;
3134         }
3135 
3136         err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3137                                             rtm_ipv4_policy, extack);
3138         if (err)
3139                 return err;
3140 
3141         if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3142             (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3143                 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3144                 return -EINVAL;
3145         }
3146 
3147         for (i = 0; i <= RTA_MAX; i++) {
3148                 if (!tb[i])
3149                         continue;
3150 
3151                 switch (i) {
3152                 case RTA_IIF:
3153                 case RTA_OIF:
3154                 case RTA_SRC:
3155                 case RTA_DST:
3156                 case RTA_IP_PROTO:
3157                 case RTA_SPORT:
3158                 case RTA_DPORT:
3159                 case RTA_MARK:
3160                 case RTA_UID:
3161                         break;
3162                 default:
3163                         NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3164                         return -EINVAL;
3165                 }
3166         }
3167 
3168         return 0;
3169 }
3170 
3171 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3172                              struct netlink_ext_ack *extack)
3173 {
3174         struct net *net = sock_net(in_skb->sk);
3175         struct nlattr *tb[RTA_MAX+1];
3176         u32 table_id = RT_TABLE_MAIN;
3177         __be16 sport = 0, dport = 0;
3178         struct fib_result res = {};
3179         u8 ip_proto = IPPROTO_UDP;
3180         struct rtable *rt = NULL;
3181         struct sk_buff *skb;
3182         struct rtmsg *rtm;
3183         struct flowi4 fl4 = {};
3184         __be32 dst = 0;
3185         __be32 src = 0;
3186         kuid_t uid;
3187         u32 iif;
3188         int err;
3189         int mark;
3190 
3191         err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3192         if (err < 0)
3193                 return err;
3194 
3195         rtm = nlmsg_data(nlh);
3196         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3197         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3198         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3199         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3200         if (tb[RTA_UID])
3201                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3202         else
3203                 uid = (iif ? INVALID_UID : current_uid());
3204 
3205         if (tb[RTA_IP_PROTO]) {
3206                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3207                                                   &ip_proto, AF_INET, extack);
3208                 if (err)
3209                         return err;
3210         }
3211 
3212         if (tb[RTA_SPORT])
3213                 sport = nla_get_be16(tb[RTA_SPORT]);
3214 
3215         if (tb[RTA_DPORT])
3216                 dport = nla_get_be16(tb[RTA_DPORT]);
3217 
3218         skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3219         if (!skb)
3220                 return -ENOBUFS;
3221 
3222         fl4.daddr = dst;
3223         fl4.saddr = src;
3224         fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3225         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3226         fl4.flowi4_mark = mark;
3227         fl4.flowi4_uid = uid;
3228         if (sport)
3229                 fl4.fl4_sport = sport;
3230         if (dport)
3231                 fl4.fl4_dport = dport;
3232         fl4.flowi4_proto = ip_proto;
3233 
3234         rcu_read_lock();
3235 
3236         if (iif) {
3237                 struct net_device *dev;
3238 
3239                 dev = dev_get_by_index_rcu(net, iif);
3240                 if (!dev) {
3241                         err = -ENODEV;
3242                         goto errout_rcu;
3243                 }
3244 
3245                 fl4.flowi4_iif = iif; /* for rt_fill_info */
3246                 skb->dev        = dev;
3247                 skb->mark       = mark;
3248                 err = ip_route_input_rcu(skb, dst, src,
3249                                          rtm->rtm_tos & IPTOS_RT_MASK, dev,
3250                                          &res);
3251 
3252                 rt = skb_rtable(skb);
3253                 if (err == 0 && rt->dst.error)
3254                         err = -rt->dst.error;
3255         } else {
3256                 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3257                 skb->dev = net->loopback_dev;
3258                 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3259                 err = 0;
3260                 if (IS_ERR(rt))
3261                         err = PTR_ERR(rt);
3262                 else
3263                         skb_dst_set(skb, &rt->dst);
3264         }
3265 
3266         if (err)
3267                 goto errout_rcu;
3268 
3269         if (rtm->rtm_flags & RTM_F_NOTIFY)
3270                 rt->rt_flags |= RTCF_NOTIFY;
3271 
3272         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3273                 table_id = res.table ? res.table->tb_id : 0;
3274 
3275         /* reset skb for netlink reply msg */
3276         skb_trim(skb, 0);
3277         skb_reset_network_header(skb);
3278         skb_reset_transport_header(skb);
3279         skb_reset_mac_header(skb);
3280 
3281         if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3282                 struct fib_rt_info fri;
3283 
3284                 if (!res.fi) {
3285                         err = fib_props[res.type].error;
3286                         if (!err)
3287                                 err = -EHOSTUNREACH;
3288                         goto errout_rcu;
3289                 }
3290                 fri.fi = res.fi;
3291                 fri.tb_id = table_id;
3292                 fri.dst = res.prefix;
3293                 fri.dst_len = res.prefixlen;
3294                 fri.tos = fl4.flowi4_tos;
3295                 fri.type = rt->rt_type;
3296                 fri.offload = 0;
3297                 fri.trap = 0;
3298                 if (res.fa_head) {
3299                         struct fib_alias *fa;
3300 
3301                         hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3302                                 u8 slen = 32 - fri.dst_len;
3303 
3304                                 if (fa->fa_slen == slen &&
3305                                     fa->tb_id == fri.tb_id &&
3306                                     fa->fa_tos == fri.tos &&
3307                                     fa->fa_info == res.fi &&
3308                                     fa->fa_type == fri.type) {
3309                                         fri.offload = fa->offload;
3310                                         fri.trap = fa->trap;
3311                                         break;
3312                                 }
3313                         }
3314                 }
3315                 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3316                                     nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3317         } else {
3318                 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3319                                    NETLINK_CB(in_skb).portid,
3320                                    nlh->nlmsg_seq, 0);
3321         }
3322         if (err < 0)
3323                 goto errout_rcu;
3324 
3325         rcu_read_unlock();
3326 
3327         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3328 
3329 errout_free:
3330         return err;
3331 errout_rcu:
3332         rcu_read_unlock();
3333         kfree_skb(skb);
3334         goto errout_free;
3335 }
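/*
 * [Editor's sketch, not part of route.c] inet_rtm_getroute() is the
 * kernel-side handler behind "ip route get".  A minimal userspace
 * RTM_GETROUTE request might look like this; error handling is elided
 * and the request layout is illustrative only.
 */
#if 0   /* userspace example, not kernel code */
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int example_route_get(__be32 dst)
{
        struct {
                struct nlmsghdr nlh;
                struct rtmsg rtm;
                struct rtattr rta;
                __be32 addr;
        } req;
        int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

        memset(&req, 0, sizeof(req));
        req.nlh.nlmsg_len = sizeof(req);
        req.nlh.nlmsg_type = RTM_GETROUTE;
        req.nlh.nlmsg_flags = NLM_F_REQUEST;
        req.rtm.rtm_family = AF_INET;
        req.rtm.rtm_dst_len = 32;
        req.rta.rta_type = RTA_DST;
        req.rta.rta_len = RTA_LENGTH(sizeof(__be32));
        req.addr = dst;

        send(fd, &req, sizeof(req), 0);
        /* ... recv() the RTM_NEWROUTE reply filled by rt_fill_info() ... */
        close(fd);
        return 0;
}
#endif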
3336 
3337 void ip_rt_multicast_event(struct in_device *in_dev)
3338 {
3339         rt_cache_flush(dev_net(in_dev->dev));
3340 }
3341 
3342 #ifdef CONFIG_SYSCTL
3343 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3344 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
3345 static int ip_rt_gc_elasticity __read_mostly    = 8;
3346 static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
3347 
3348 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3349                 void *buffer, size_t *lenp, loff_t *ppos)
3350 {
3351         struct net *net = (struct net *)__ctl->extra1;
3352 
3353         if (write) {
3354                 rt_cache_flush(net);
3355                 fnhe_genid_bump(net);
3356                 return 0;
3357         }
3358 
3359         return -EINVAL;
3360 }
3361 
3362 static struct ctl_table ipv4_route_table[] = {
3363         {
3364                 .procname       = "gc_thresh",
3365                 .data           = &ipv4_dst_ops.gc_thresh,
3366                 .maxlen         = sizeof(int),
3367                 .mode           = 0644,
3368                 .proc_handler   = proc_dointvec,
3369         },
3370         {
3371                 .procname       = "max_size",
3372                 .data           = &ip_rt_max_size,
3373                 .maxlen         = sizeof(int),
3374                 .mode           = 0644,
3375                 .proc_handler   = proc_dointvec,
3376         },
3377         {
3378                 /*  Deprecated. Use gc_min_interval_ms */
3379 
3380                 .procname       = "gc_min_interval",
3381                 .data           = &ip_rt_gc_min_interval,
3382                 .maxlen         = sizeof(int),
3383                 .mode           = 0644,
3384                 .proc_handler   = proc_dointvec_jiffies,
3385         },
3386         {
3387                 .procname       = "gc_min_interval_ms",
3388                 .data           = &ip_rt_gc_min_interval,
3389                 .maxlen         = sizeof(int),
3390                 .mode           = 0644,
3391                 .proc_handler   = proc_dointvec_ms_jiffies,
3392         },
3393         {
3394                 .procname       = "gc_timeout",
3395                 .data           = &ip_rt_gc_timeout,
3396                 .maxlen         = sizeof(int),
3397                 .mode           = 0644,
3398                 .proc_handler   = proc_dointvec_jiffies,
3399         },
3400         {
3401                 .procname       = "gc_interval",
3402                 .data           = &ip_rt_gc_interval,
3403                 .maxlen         = sizeof(int),
3404                 .mode           = 0644,
3405                 .proc_handler   = proc_dointvec_jiffies,
3406         },
3407         {
3408                 .procname       = "redirect_load",
3409                 .data           = &ip_rt_redirect_load,
3410                 .maxlen         = sizeof(int),
3411                 .mode           = 0644,
3412                 .proc_handler   = proc_dointvec,
3413         },
3414         {
3415                 .procname       = "redirect_number",
3416                 .data           = &ip_rt_redirect_number,
3417                 .maxlen         = sizeof(int),
3418                 .mode           = 0644,
3419                 .proc_handler   = proc_dointvec,
3420         },
3421         {
3422                 .procname       = "redirect_silence",
3423                 .data           = &ip_rt_redirect_silence,
3424                 .maxlen         = sizeof(int),
3425                 .mode           = 0644,
3426                 .proc_handler   = proc_dointvec,
3427         },
3428         {
3429                 .procname       = "error_cost",
3430                 .data           = &ip_rt_error_cost,
3431                 .maxlen         = sizeof(int),
3432                 .mode           = 0644,
3433                 .proc_handler   = proc_dointvec,
3434         },
3435         {
3436                 .procname       = "error_burst",
3437                 .data           = &ip_rt_error_burst,
3438                 .maxlen         = sizeof(int),
3439                 .mode           = 0644,
3440                 .proc_handler   = proc_dointvec,
3441         },
3442         {
3443                 .procname       = "gc_elasticity",
3444                 .data           = &ip_rt_gc_elasticity,
3445                 .maxlen         = sizeof(int),
3446                 .mode           = 0644,
3447                 .proc_handler   = proc_dointvec,
3448         },
3449         {
3450                 .procname       = "mtu_expires",
3451                 .data           = &ip_rt_mtu_expires,
3452                 .maxlen         = sizeof(int),
3453                 .mode           = 0644,
3454                 .proc_handler   = proc_dointvec_jiffies,
3455         },
3456         {
3457                 .procname       = "min_pmtu",
3458                 .data           = &ip_rt_min_pmtu,
3459                 .maxlen         = sizeof(int),
3460                 .mode           = 0644,
3461                 .proc_handler   = proc_dointvec_minmax,
3462                 .extra1         = &ip_min_valid_pmtu,
3463         },
3464         {
3465                 .procname       = "min_adv_mss",
3466                 .data           = &ip_rt_min_advmss,
3467                 .maxlen         = sizeof(int),
3468                 .mode           = 0644,
3469                 .proc_handler   = proc_dointvec,
3470         },
3471         { }
3472 };
3473 
3474 static const char ipv4_route_flush_procname[] = "flush";
3475 
3476 static struct ctl_table ipv4_route_flush_table[] = {
3477         {
3478                 .procname       = ipv4_route_flush_procname,
3479                 .maxlen         = sizeof(int),
3480                 .mode           = 0200,
3481                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3482         },
3483         { },
3484 };
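/*
 * [Editor's note] Both tables register under /proc/sys/net/ipv4/route/.
 * The write-only "flush" entry invokes ipv4_sysctl_rtcache_flush()
 * above, so the cache can be flushed from userspace with e.g.:
 *
 *      echo 1 > /proc/sys/net/ipv4/route/flush
 */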
3485 
3486 static __net_init int sysctl_route_net_init(struct net *net)
3487 {
3488         struct ctl_table *tbl;
3489 
3490         tbl = ipv4_route_flush_table;
3491         if (!net_eq(net, &init_net)) {
3492                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3493                 if (!tbl)
3494                         goto err_dup;
3495 
3496                 /* Don't export non-whitelisted sysctls to unprivileged users */
3497                 if (net->user_ns != &init_user_ns) {
3498                         if (tbl[0].procname != ipv4_route_flush_procname)
3499                                 tbl[0].procname = NULL;
3500                 }
3501         }
3502         tbl[0].extra1 = net;
3503 
3504         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3505         if (!net->ipv4.route_hdr)
3506                 goto err_reg;
3507         return 0;
3508 
3509 err_reg:
3510         if (tbl != ipv4_route_flush_table)
3511                 kfree(tbl);
3512 err_dup:
3513         return -ENOMEM;
3514 }
3515 
3516 static __net_exit void sysctl_route_net_exit(struct net *net)
3517 {
3518         struct ctl_table *tbl;
3519 
3520         tbl = net->ipv4.route_hdr->ctl_table_arg;
3521         unregister_net_sysctl_table(net->ipv4.route_hdr);
3522         BUG_ON(tbl == ipv4_route_flush_table);
3523         kfree(tbl);
3524 }
3525 
3526 static __net_initdata struct pernet_operations sysctl_route_ops = {
3527         .init = sysctl_route_net_init,
3528         .exit = sysctl_route_net_exit,
3529 };
3530 #endif
3531 
3532 static __net_init int rt_genid_init(struct net *net)
3533 {
3534         atomic_set(&net->ipv4.rt_genid, 0);
3535         atomic_set(&net->fnhe_genid, 0);
3536         atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3537         return 0;
3538 }
3539 
3540 static __net_initdata struct pernet_operations rt_genid_ops = {
3541         .init = rt_genid_init,
3542 };
3543 
3544 static int __net_init ipv4_inetpeer_init(struct net *net)
3545 {
3546         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3547 
3548         if (!bp)
3549                 return -ENOMEM;
3550         inet_peer_base_init(bp);
3551         net->ipv4.peers = bp;
3552         return 0;
3553 }
3554 
3555 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3556 {
3557         struct inet_peer_base *bp = net->ipv4.peers;
3558 
3559         net->ipv4.peers = NULL;
3560         inetpeer_invalidate_tree(bp);
3561         kfree(bp);
3562 }
3563 
3564 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3565         .init   =       ipv4_inetpeer_init,
3566         .exit   =       ipv4_inetpeer_exit,
3567 };
3568 
3569 #ifdef CONFIG_IP_ROUTE_CLASSID
3570 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3571 #endif /* CONFIG_IP_ROUTE_CLASSID */
3572 
3573 int __init ip_rt_init(void)
3574 {
3575         int cpu;
3576 
3577         ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
3578                                   GFP_KERNEL);
3579         if (!ip_idents)
3580                 panic("IP: failed to allocate ip_idents\n");
3581 
3582         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3583 
3584         ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3585         if (!ip_tstamps)
3586                 panic("IP: failed to allocate ip_tstamps\n");
3587 
3588         for_each_possible_cpu(cpu) {
3589                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3590 
3591                 INIT_LIST_HEAD(&ul->head);
3592                 spin_lock_init(&ul->lock);
3593         }
3594 #ifdef CONFIG_IP_ROUTE_CLASSID
3595         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3596         if (!ip_rt_acct)
3597                 panic("IP: failed to allocate ip_rt_acct\n");
3598 #endif
3599 
3600         ipv4_dst_ops.kmem_cachep =
3601                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3602                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3603 
3604         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3605 
3606         if (dst_entries_init(&ipv4_dst_ops) < 0)
3607                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3608 
3609         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3610                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3611 
3612         ipv4_dst_ops.gc_thresh = ~0;
3613         ip_rt_max_size = INT_MAX;
3614 
3615         devinet_init();
3616         ip_fib_init();
3617 
3618         if (ip_rt_proc_init())
3619                 pr_err("Unable to create route proc files\n");
3620 #ifdef CONFIG_XFRM
3621         xfrm_init();
3622         xfrm4_init();
3623 #endif
3624         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3625                       RTNL_FLAG_DOIT_UNLOCKED);
3626 
3627 #ifdef CONFIG_SYSCTL
3628         register_pernet_subsys(&sysctl_route_ops);
3629 #endif
3630         register_pernet_subsys(&rt_genid_ops);
3631         register_pernet_subsys(&ipv4_inetpeer_ops);
3632         return 0;
3633 }
3634 
3635 #ifdef CONFIG_SYSCTL
3636 /*
3637  * We really need to sanitize the damn ipv4 init order, then all
3638  * this nonsense will go away.
3639  */
3640 void __init ip_static_sysctl_init(void)
3641 {
3642         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3643 }
3644 #endif
3645 
