
TOMOYO Linux Cross Reference
Linux/net/ipv4/route.c


  1 /*
  2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
  3  *              operating system.  INET is implemented using the  BSD Socket
  4  *              interface as the means of communication with the user level.
  5  *
  6  *              ROUTE - implementation of the IP router.
  7  *
  8  * Authors:     Ross Biro
  9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 13  *
 14  * Fixes:
 15  *              Alan Cox        :       Verify area fixes.
 16  *              Alan Cox        :       cli() protects routing changes
 17  *              Rui Oliveira    :       ICMP routing table updates
 18  *              (rco@di.uminho.pt)      Routing table insertion and update
 19  *              Linus Torvalds  :       Rewrote bits to be sensible
 20  *              Alan Cox        :       Added BSD route gw semantics
 21  *              Alan Cox        :       Super /proc >4K
 22  *              Alan Cox        :       MTU in route table
 23  *              Alan Cox        :       MSS actually. Also added the window
 24  *                                      clamper.
 25  *              Sam Lantinga    :       Fixed route matching in rt_del()
 26  *              Alan Cox        :       Routing cache support.
 27  *              Alan Cox        :       Removed compatibility cruft.
 28  *              Alan Cox        :       RTF_REJECT support.
 29  *              Alan Cox        :       TCP irtt support.
 30  *              Jonathan Naylor :       Added Metric support.
 31  *      Miquel van Smoorenburg  :       BSD API fixes.
 32  *      Miquel van Smoorenburg  :       Metrics.
 33  *              Alan Cox        :       Use __u32 properly
 34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      though our system is still very different.
 36  *              Alan Cox        :       Faster /proc handling
 37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 38  *                                      routing caches and better behaviour.
 39  *
 40  *              Olaf Erb        :       irtt wasn't being copied right.
 41  *              Bjorn Ekwall    :       Kerneld route support.
 42  *              Alan Cox        :       Multicast fixed (I hope)
 43  *              Pavel Krauz     :       Limited broadcast fixed
 44  *              Mike McLagan    :       Routing by source
 45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 46  *                                      route.c and rewritten from scratch.
 47  *              Andi Kleen      :       Load-limit warning messages.
 48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 52  *              Marc Boucher    :       routing by fwmark
 53  *      Robert Olsson           :       Added rt_cache statistics
 54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
 58  *
 59  *              This program is free software; you can redistribute it and/or
 60  *              modify it under the terms of the GNU General Public License
 61  *              as published by the Free Software Foundation; either version
 62  *              2 of the License, or (at your option) any later version.
 63  */
 64 
 65 #define pr_fmt(fmt) "IPv4: " fmt
 66 
 67 #include <linux/module.h>
 68 #include <asm/uaccess.h>
 69 #include <linux/bitops.h>
 70 #include <linux/types.h>
 71 #include <linux/kernel.h>
 72 #include <linux/mm.h>
 73 #include <linux/string.h>
 74 #include <linux/socket.h>
 75 #include <linux/sockios.h>
 76 #include <linux/errno.h>
 77 #include <linux/in.h>
 78 #include <linux/inet.h>
 79 #include <linux/netdevice.h>
 80 #include <linux/proc_fs.h>
 81 #include <linux/init.h>
 82 #include <linux/skbuff.h>
 83 #include <linux/inetdevice.h>
 84 #include <linux/igmp.h>
 85 #include <linux/pkt_sched.h>
 86 #include <linux/mroute.h>
 87 #include <linux/netfilter_ipv4.h>
 88 #include <linux/random.h>
 89 #include <linux/rcupdate.h>
 90 #include <linux/times.h>
 91 #include <linux/slab.h>
 92 #include <linux/jhash.h>
 93 #include <net/dst.h>
 94 #include <net/net_namespace.h>
 95 #include <net/protocol.h>
 96 #include <net/ip.h>
 97 #include <net/route.h>
 98 #include <net/inetpeer.h>
 99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #include <linux/kmemleak.h>
110 #endif
111 #include <net/secure_seq.h>
112 
113 #define RT_FL_TOS(oldflp4) \
114         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
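/* Note on the macro above: IPTOS_RT_MASK keeps only the routing-relevant
 * TOS bits, and RTO_ONLINK is a flag that callers overload into the low
 * bit of the tos field, so the mask preserves both a TOS value and a
 * link-scope-only request in one lookup key.
 */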
115 
 116 /* The IPv4 datagram length is stored in a 16-bit field (tot_len). */
117 #define IP_MAX_MTU      0xFFFF
118 
119 #define RT_GC_TIMEOUT (300*HZ)
120 
121 static int ip_rt_max_size;
122 static int ip_rt_redirect_number __read_mostly  = 9;
123 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly       = HZ;
126 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
127 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
128 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
129 static int ip_rt_min_advmss __read_mostly       = 256;
130 
131 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
132 /*
133  *      Interface to generic destination cache.
134  */
135 
136 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
137 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
138 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
139 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
140 static void              ipv4_link_failure(struct sk_buff *skb);
141 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
142                                            struct sk_buff *skb, u32 mtu);
143 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
144                                         struct sk_buff *skb);
145 static void             ipv4_dst_destroy(struct dst_entry *dst);
146 
147 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
148                             int how)
149 {
150 }
151 
152 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
153 {
154         WARN_ON(1);
155         return NULL;
156 }
157 
158 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
159                                            struct sk_buff *skb,
160                                            const void *daddr);
161 
162 static struct dst_ops ipv4_dst_ops = {
163         .family =               AF_INET,
164         .protocol =             cpu_to_be16(ETH_P_IP),
165         .check =                ipv4_dst_check,
166         .default_advmss =       ipv4_default_advmss,
167         .mtu =                  ipv4_mtu,
168         .cow_metrics =          ipv4_cow_metrics,
169         .destroy =              ipv4_dst_destroy,
170         .ifdown =               ipv4_dst_ifdown,
171         .negative_advice =      ipv4_negative_advice,
172         .link_failure =         ipv4_link_failure,
173         .update_pmtu =          ip_rt_update_pmtu,
174         .redirect =             ip_do_redirect,
175         .local_out =            __ip_local_out,
176         .neigh_lookup =         ipv4_neigh_lookup,
177 };
178 
179 #define ECN_OR_COST(class)      TC_PRIO_##class
180 
181 const __u8 ip_tos2prio[16] = {
182         TC_PRIO_BESTEFFORT,
183         ECN_OR_COST(BESTEFFORT),
184         TC_PRIO_BESTEFFORT,
185         ECN_OR_COST(BESTEFFORT),
186         TC_PRIO_BULK,
187         ECN_OR_COST(BULK),
188         TC_PRIO_BULK,
189         ECN_OR_COST(BULK),
190         TC_PRIO_INTERACTIVE,
191         ECN_OR_COST(INTERACTIVE),
192         TC_PRIO_INTERACTIVE,
193         ECN_OR_COST(INTERACTIVE),
194         TC_PRIO_INTERACTIVE_BULK,
195         ECN_OR_COST(INTERACTIVE_BULK),
196         TC_PRIO_INTERACTIVE_BULK,
197         ECN_OR_COST(INTERACTIVE_BULK)
198 };
199 EXPORT_SYMBOL(ip_tos2prio);
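/* A minimal usage sketch: rt_tos2priority() in include/net/route.h
 * indexes this table as ip_tos2prio[IPTOS_TOS(tos) >> 1].  For example,
 * a TOS of IPTOS_LOWDELAY (0x10) gives index 0x10 >> 1 = 8, i.e.
 * TC_PRIO_INTERACTIVE; the odd entries cover the same classes with the
 * low ECN/cost bit set (hence ECN_OR_COST).
 */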
200 
201 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
202 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
203 
204 #ifdef CONFIG_PROC_FS
205 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
206 {
207         if (*pos)
208                 return NULL;
209         return SEQ_START_TOKEN;
210 }
211 
212 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
213 {
214         ++*pos;
215         return NULL;
216 }
217 
218 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
219 {
220 }
221 
222 static int rt_cache_seq_show(struct seq_file *seq, void *v)
223 {
224         if (v == SEQ_START_TOKEN)
225                 seq_printf(seq, "%-127s\n",
226                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
227                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
228                            "HHUptod\tSpecDst");
229         return 0;
230 }
231 
232 static const struct seq_operations rt_cache_seq_ops = {
233         .start  = rt_cache_seq_start,
234         .next   = rt_cache_seq_next,
235         .stop   = rt_cache_seq_stop,
236         .show   = rt_cache_seq_show,
237 };
238 
239 static int rt_cache_seq_open(struct inode *inode, struct file *file)
240 {
241         return seq_open(file, &rt_cache_seq_ops);
242 }
243 
244 static const struct file_operations rt_cache_seq_fops = {
245         .owner   = THIS_MODULE,
246         .open    = rt_cache_seq_open,
247         .read    = seq_read,
248         .llseek  = seq_lseek,
249         .release = seq_release,
250 };
251 
252 
253 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
254 {
255         int cpu;
256 
257         if (*pos == 0)
258                 return SEQ_START_TOKEN;
259 
260         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
261                 if (!cpu_possible(cpu))
262                         continue;
263                 *pos = cpu+1;
264                 return &per_cpu(rt_cache_stat, cpu);
265         }
266         return NULL;
267 }
268 
269 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
270 {
271         int cpu;
272 
273         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
274                 if (!cpu_possible(cpu))
275                         continue;
276                 *pos = cpu+1;
277                 return &per_cpu(rt_cache_stat, cpu);
278         }
279         return NULL;
280 
281 }
282 
283 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
284 {
285 
286 }
287 
288 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
289 {
290         struct rt_cache_stat *st = v;
291 
292         if (v == SEQ_START_TOKEN) {
293                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
294                 return 0;
295         }
296 
297         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
298                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
299                    dst_entries_get_slow(&ipv4_dst_ops),
300                    st->in_hit,
301                    st->in_slow_tot,
302                    st->in_slow_mc,
303                    st->in_no_route,
304                    st->in_brd,
305                    st->in_martian_dst,
306                    st->in_martian_src,
307 
308                    st->out_hit,
309                    st->out_slow_tot,
310                    st->out_slow_mc,
311 
312                    st->gc_total,
313                    st->gc_ignored,
314                    st->gc_goal_miss,
315                    st->gc_dst_overflow,
316                    st->in_hlist_search,
317                    st->out_hlist_search
318                 );
319         return 0;
320 }
321 
322 static const struct seq_operations rt_cpu_seq_ops = {
323         .start  = rt_cpu_seq_start,
324         .next   = rt_cpu_seq_next,
325         .stop   = rt_cpu_seq_stop,
326         .show   = rt_cpu_seq_show,
327 };
328 
329 
330 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
331 {
332         return seq_open(file, &rt_cpu_seq_ops);
333 }
334 
335 static const struct file_operations rt_cpu_seq_fops = {
336         .owner   = THIS_MODULE,
337         .open    = rt_cpu_seq_open,
338         .read    = seq_read,
339         .llseek  = seq_lseek,
340         .release = seq_release,
341 };
342 
343 #ifdef CONFIG_IP_ROUTE_CLASSID
344 static int rt_acct_proc_show(struct seq_file *m, void *v)
345 {
346         struct ip_rt_acct *dst, *src;
347         unsigned int i, j;
348 
349         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
350         if (!dst)
351                 return -ENOMEM;
352 
353         for_each_possible_cpu(i) {
354                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
355                 for (j = 0; j < 256; j++) {
356                         dst[j].o_bytes   += src[j].o_bytes;
357                         dst[j].o_packets += src[j].o_packets;
358                         dst[j].i_bytes   += src[j].i_bytes;
359                         dst[j].i_packets += src[j].i_packets;
360                 }
361         }
362 
363         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
364         kfree(dst);
365         return 0;
366 }
367 
368 static int rt_acct_proc_open(struct inode *inode, struct file *file)
369 {
370         return single_open(file, rt_acct_proc_show, NULL);
371 }
372 
373 static const struct file_operations rt_acct_proc_fops = {
374         .owner          = THIS_MODULE,
375         .open           = rt_acct_proc_open,
376         .read           = seq_read,
377         .llseek         = seq_lseek,
378         .release        = single_release,
379 };
380 #endif
381 
382 static int __net_init ip_rt_do_proc_init(struct net *net)
383 {
384         struct proc_dir_entry *pde;
385 
386         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
387                           &rt_cache_seq_fops);
388         if (!pde)
389                 goto err1;
390 
391         pde = proc_create("rt_cache", S_IRUGO,
392                           net->proc_net_stat, &rt_cpu_seq_fops);
393         if (!pde)
394                 goto err2;
395 
396 #ifdef CONFIG_IP_ROUTE_CLASSID
397         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
398         if (!pde)
399                 goto err3;
400 #endif
401         return 0;
402 
403 #ifdef CONFIG_IP_ROUTE_CLASSID
404 err3:
405         remove_proc_entry("rt_cache", net->proc_net_stat);
406 #endif
407 err2:
408         remove_proc_entry("rt_cache", net->proc_net);
409 err1:
410         return -ENOMEM;
411 }
412 
413 static void __net_exit ip_rt_do_proc_exit(struct net *net)
414 {
415         remove_proc_entry("rt_cache", net->proc_net_stat);
416         remove_proc_entry("rt_cache", net->proc_net);
417 #ifdef CONFIG_IP_ROUTE_CLASSID
418         remove_proc_entry("rt_acct", net->proc_net);
419 #endif
420 }
421 
422 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
423         .init = ip_rt_do_proc_init,
424         .exit = ip_rt_do_proc_exit,
425 };
426 
427 static int __init ip_rt_proc_init(void)
428 {
429         return register_pernet_subsys(&ip_rt_proc_ops);
430 }
431 
432 #else
433 static inline int ip_rt_proc_init(void)
434 {
435         return 0;
436 }
437 #endif /* CONFIG_PROC_FS */
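/* Summary of the proc interface above: /proc/net/rt_cache now yields
 * only its header line (the route cache itself is gone),
 * /proc/net/stat/rt_cache dumps the per-CPU rt_cache_stat counters in
 * hex, one row per possible CPU, and, with CONFIG_IP_ROUTE_CLASSID,
 * /proc/net/rt_acct exposes the 256-entry per-class traffic accounting.
 */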
438 
439 static inline bool rt_is_expired(const struct rtable *rth)
440 {
441         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
442 }
443 
444 void rt_cache_flush(struct net *net)
445 {
446         rt_genid_bump_ipv4(net);
447 }
448 
449 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
450                                            struct sk_buff *skb,
451                                            const void *daddr)
452 {
453         struct net_device *dev = dst->dev;
454         const __be32 *pkey = daddr;
455         const struct rtable *rt;
456         struct neighbour *n;
457 
458         rt = (const struct rtable *) dst;
459         if (rt->rt_gateway)
460                 pkey = (const __be32 *) &rt->rt_gateway;
461         else if (skb)
462                 pkey = &ip_hdr(skb)->daddr;
463 
464         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
465         if (n)
466                 return n;
467         return neigh_create(&arp_tbl, pkey, dev);
468 }
469 
470 #define IP_IDENTS_SZ 2048u
471 struct ip_ident_bucket {
472         atomic_t        id;
473         u32             stamp32;
474 };
475 
476 static struct ip_ident_bucket *ip_idents __read_mostly;
477 
478 /* In order to protect privacy, we add a perturbation to identifiers
 479  * if one generator is seldom used. This makes it hard for an attacker
 480  * to infer how many packets were sent between two points in time.
481  */
482 u32 ip_idents_reserve(u32 hash, int segs)
483 {
484         struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
485         u32 old = ACCESS_ONCE(bucket->stamp32);
486         u32 now = (u32)jiffies;
487         u32 delta = 0;
488 
489         if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) {
490                 u64 x = prandom_u32();
491 
492                 x *= (now - old);
493                 delta = (u32)(x >> 32);
494         }
495 
496         return atomic_add_return(segs + delta, &bucket->id) - segs;
497 }
498 EXPORT_SYMBOL(ip_idents_reserve);
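/* A worked example of the perturbation (a reading aid, not normative):
 * prandom_u32() is uniform over [0, 2^32), so (x * (now - old)) >> 32 is
 * roughly uniform over [0, now - old).  A bucket idle for 1000 jiffies
 * thus jumps forward by a random 0..999 on its next use, hiding the true
 * number of IDs consumed from an off-path observer.
 */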
499 
500 void __ip_select_ident(struct iphdr *iph, int segs)
501 {
502         static u32 ip_idents_hashrnd __read_mostly;
503         static bool hashrnd_initialized = false;
504         u32 hash, id;
505 
506         if (unlikely(!hashrnd_initialized)) {
507                 hashrnd_initialized = true;
508                 get_random_bytes(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
509         }
510 
511         hash = jhash_3words((__force u32)iph->daddr,
512                             (__force u32)iph->saddr,
513                             iph->protocol,
514                             ip_idents_hashrnd);
515         id = ip_idents_reserve(hash, segs);
516         iph->id = htons(id);
517 }
518 EXPORT_SYMBOL(__ip_select_ident);
519 
520 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
521                              const struct iphdr *iph,
522                              int oif, u8 tos,
523                              u8 prot, u32 mark, int flow_flags)
524 {
525         if (sk) {
526                 const struct inet_sock *inet = inet_sk(sk);
527 
528                 oif = sk->sk_bound_dev_if;
529                 mark = sk->sk_mark;
530                 tos = RT_CONN_FLAGS(sk);
531                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
532         }
533         flowi4_init_output(fl4, oif, mark, tos,
534                            RT_SCOPE_UNIVERSE, prot,
535                            flow_flags,
536                            iph->daddr, iph->saddr, 0, 0);
537 }
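/* Note: when a socket is supplied, its bound device, mark, connection
 * TOS and protocol deliberately override the values taken from the IP
 * header, so socket-originated lookups match what the socket itself
 * would use on output.
 */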
538 
539 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
540                                const struct sock *sk)
541 {
542         const struct iphdr *iph = ip_hdr(skb);
543         int oif = skb->dev->ifindex;
544         u8 tos = RT_TOS(iph->tos);
545         u8 prot = iph->protocol;
546         u32 mark = skb->mark;
547 
548         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
549 }
550 
551 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
552 {
553         const struct inet_sock *inet = inet_sk(sk);
554         const struct ip_options_rcu *inet_opt;
555         __be32 daddr = inet->inet_daddr;
556 
557         rcu_read_lock();
558         inet_opt = rcu_dereference(inet->inet_opt);
559         if (inet_opt && inet_opt->opt.srr)
560                 daddr = inet_opt->opt.faddr;
561         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
562                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
563                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
564                            inet_sk_flowi_flags(sk),
565                            daddr, inet->inet_saddr, 0, 0);
566         rcu_read_unlock();
567 }
568 
569 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
570                                  const struct sk_buff *skb)
571 {
572         if (skb)
573                 build_skb_flow_key(fl4, skb, sk);
574         else
575                 build_sk_flow_key(fl4, sk);
576 }
577 
578 static inline void rt_free(struct rtable *rt)
579 {
580         call_rcu(&rt->dst.rcu_head, dst_rcu_free);
581 }
582 
583 static DEFINE_SPINLOCK(fnhe_lock);
584 
585 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
586 {
587         struct rtable *rt;
588 
589         rt = rcu_dereference(fnhe->fnhe_rth_input);
590         if (rt) {
591                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
592                 rt_free(rt);
593         }
594         rt = rcu_dereference(fnhe->fnhe_rth_output);
595         if (rt) {
596                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
597                 rt_free(rt);
598         }
599 }
600 
601 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
602 {
603         struct fib_nh_exception *fnhe, *oldest;
604 
605         oldest = rcu_dereference(hash->chain);
606         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
607              fnhe = rcu_dereference(fnhe->fnhe_next)) {
608                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
609                         oldest = fnhe;
610         }
611         fnhe_flush_routes(oldest);
612         return oldest;
613 }
614 
615 static inline u32 fnhe_hashfun(__be32 daddr)
616 {
617         u32 hval;
618 
619         hval = (__force u32) daddr;
620         hval ^= (hval >> 11) ^ (hval >> 22);
621 
622         return hval & (FNHE_HASH_SIZE - 1);
623 }
624 
625 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
626 {
627         rt->rt_pmtu = fnhe->fnhe_pmtu;
628         rt->dst.expires = fnhe->fnhe_expires;
629 
630         if (fnhe->fnhe_gw) {
631                 rt->rt_flags |= RTCF_REDIRECTED;
632                 rt->rt_gateway = fnhe->fnhe_gw;
633                 rt->rt_uses_gateway = 1;
634         }
635 }
636 
637 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
638                                   u32 pmtu, unsigned long expires)
639 {
640         struct fnhe_hash_bucket *hash;
641         struct fib_nh_exception *fnhe;
642         struct rtable *rt;
643         unsigned int i;
644         int depth;
645         u32 hval = fnhe_hashfun(daddr);
646 
647         spin_lock_bh(&fnhe_lock);
648 
649         hash = nh->nh_exceptions;
650         if (!hash) {
651                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
652                 if (!hash)
653                         goto out_unlock;
654                 nh->nh_exceptions = hash;
655         }
656 
657         hash += hval;
658 
659         depth = 0;
660         for (fnhe = rcu_dereference(hash->chain); fnhe;
661              fnhe = rcu_dereference(fnhe->fnhe_next)) {
662                 if (fnhe->fnhe_daddr == daddr)
663                         break;
664                 depth++;
665         }
666 
667         if (fnhe) {
668                 if (gw)
669                         fnhe->fnhe_gw = gw;
670                 if (pmtu) {
671                         fnhe->fnhe_pmtu = pmtu;
672                         fnhe->fnhe_expires = max(1UL, expires);
673                 }
674                 /* Update all cached dsts too */
675                 rt = rcu_dereference(fnhe->fnhe_rth_input);
676                 if (rt)
677                         fill_route_from_fnhe(rt, fnhe);
678                 rt = rcu_dereference(fnhe->fnhe_rth_output);
679                 if (rt)
680                         fill_route_from_fnhe(rt, fnhe);
681         } else {
682                 if (depth > FNHE_RECLAIM_DEPTH)
683                         fnhe = fnhe_oldest(hash);
684                 else {
685                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
686                         if (!fnhe)
687                                 goto out_unlock;
688 
689                         fnhe->fnhe_next = hash->chain;
690                         rcu_assign_pointer(hash->chain, fnhe);
691                 }
692                 fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
693                 fnhe->fnhe_daddr = daddr;
694                 fnhe->fnhe_gw = gw;
695                 fnhe->fnhe_pmtu = pmtu;
696                 fnhe->fnhe_expires = expires;
697 
698                 /* Exception created; mark the cached routes for the nexthop
699                  * stale, so anyone caching it rechecks if this exception
700                  * applies to them.
701                  */
702                 rt = rcu_dereference(nh->nh_rth_input);
703                 if (rt)
704                         rt->dst.obsolete = DST_OBSOLETE_KILL;
705 
706                 for_each_possible_cpu(i) {
707                         struct rtable __rcu **prt;
708                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
709                         rt = rcu_dereference(*prt);
710                         if (rt)
711                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
712                 }
713         }
714 
715         fnhe->fnhe_stamp = jiffies;
716 
717 out_unlock:
718         spin_unlock_bh(&fnhe_lock);
719         return;
720 }
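/* Usage examples from later in this file: __ip_do_redirect() calls
 *
 *      update_or_create_fnhe(nh, fl4->daddr, new_gw, 0,
 *                            jiffies + ip_rt_gc_timeout);
 *
 * so a learned redirect gateway expires after ip_rt_gc_timeout (five
 * minutes by default), while __ip_rt_update_pmtu() stores a learned
 * PMTU with an ip_rt_mtu_expires lifetime instead.
 */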
721 
722 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
723                              bool kill_route)
724 {
725         __be32 new_gw = icmp_hdr(skb)->un.gateway;
726         __be32 old_gw = ip_hdr(skb)->saddr;
727         struct net_device *dev = skb->dev;
728         struct in_device *in_dev;
729         struct fib_result res;
730         struct neighbour *n;
731         struct net *net;
732 
733         switch (icmp_hdr(skb)->code & 7) {
734         case ICMP_REDIR_NET:
735         case ICMP_REDIR_NETTOS:
736         case ICMP_REDIR_HOST:
737         case ICMP_REDIR_HOSTTOS:
738                 break;
739 
740         default:
741                 return;
742         }
743 
744         if (rt->rt_gateway != old_gw)
745                 return;
746 
747         in_dev = __in_dev_get_rcu(dev);
748         if (!in_dev)
749                 return;
750 
751         net = dev_net(dev);
752         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
753             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
754             ipv4_is_zeronet(new_gw))
755                 goto reject_redirect;
756 
757         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
758                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
759                         goto reject_redirect;
760                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
761                         goto reject_redirect;
762         } else {
763                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
764                         goto reject_redirect;
765         }
766 
767         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
768         if (!n)
769                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
770         if (!IS_ERR(n)) {
771                 if (!(n->nud_state & NUD_VALID)) {
772                         neigh_event_send(n, NULL);
773                 } else {
774                         if (fib_lookup(net, fl4, &res) == 0) {
775                                 struct fib_nh *nh = &FIB_RES_NH(res);
776 
777                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
778                                                 0, jiffies + ip_rt_gc_timeout);
779                         }
780                         if (kill_route)
781                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
782                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
783                 }
784                 neigh_release(n);
785         }
786         return;
787 
788 reject_redirect:
789 #ifdef CONFIG_IP_ROUTE_VERBOSE
790         if (IN_DEV_LOG_MARTIANS(in_dev)) {
791                 const struct iphdr *iph = (const struct iphdr *) skb->data;
792                 __be32 daddr = iph->daddr;
793                 __be32 saddr = iph->saddr;
794 
795                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
796                                      "  Advised path = %pI4 -> %pI4\n",
797                                      &old_gw, dev->name, &new_gw,
798                                      &saddr, &daddr);
799         }
800 #endif
801         ;
802 }
803 
804 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
805 {
806         struct rtable *rt;
807         struct flowi4 fl4;
808         const struct iphdr *iph = (const struct iphdr *) skb->data;
809         int oif = skb->dev->ifindex;
810         u8 tos = RT_TOS(iph->tos);
811         u8 prot = iph->protocol;
812         u32 mark = skb->mark;
813 
814         rt = (struct rtable *) dst;
815 
816         __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
817         __ip_do_redirect(rt, skb, &fl4, true);
818 }
819 
820 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
821 {
822         struct rtable *rt = (struct rtable *)dst;
823         struct dst_entry *ret = dst;
824 
825         if (rt) {
826                 if (dst->obsolete > 0) {
827                         ip_rt_put(rt);
828                         ret = NULL;
829                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
830                            rt->dst.expires) {
831                         ip_rt_put(rt);
832                         ret = NULL;
833                 }
834         }
835         return ret;
836 }
837 
838 /*
839  * Algorithm:
 840  *      1. The first ip_rt_redirect_number redirects are sent
 841  *         with exponential backoff; then we stop sending them altogether,
 842  *         assuming that the host ignores our redirects.
 843  *      2. If we did not see packets requiring redirects
 844  *         during ip_rt_redirect_silence, we assume that the host has
 845  *         forgotten the redirected route and start sending redirects again.
846  *
847  * This algorithm is much cheaper and more intelligent than dumb load limiting
848  * in icmp.c.
849  *
850  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
851  * and "frag. need" (breaks PMTU discovery) in icmp.c.
852  */
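/* Worked numbers, assuming the default sysctl values above: successive
 * redirects are spaced at ip_rt_redirect_load << rate_tokens, i.e.
 * roughly 20 ms, 40 ms, 80 ms, ... up to about 5.12 s; once
 * ip_rt_redirect_number (9) have been sent and ignored we go quiet, and
 * resume only after ip_rt_redirect_silence (about 20 s) with no
 * redirected traffic.
 */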
853 
854 void ip_rt_send_redirect(struct sk_buff *skb)
855 {
856         struct rtable *rt = skb_rtable(skb);
857         struct in_device *in_dev;
858         struct inet_peer *peer;
859         struct net *net;
860         int log_martians;
861 
862         rcu_read_lock();
863         in_dev = __in_dev_get_rcu(rt->dst.dev);
864         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
865                 rcu_read_unlock();
866                 return;
867         }
868         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
869         rcu_read_unlock();
870 
871         net = dev_net(rt->dst.dev);
872         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
873         if (!peer) {
874                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
875                           rt_nexthop(rt, ip_hdr(skb)->daddr));
876                 return;
877         }
878 
879         /* No redirected packets during ip_rt_redirect_silence;
880          * reset the algorithm.
881          */
882         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
883                 peer->rate_tokens = 0;
884 
 885         /* Too many ignored redirects; do not send anything;
 886          * set peer->rate_last to the last seen redirected packet.
887          */
888         if (peer->rate_tokens >= ip_rt_redirect_number) {
889                 peer->rate_last = jiffies;
890                 goto out_put_peer;
891         }
892 
893         /* Check for load limit; set rate_last to the latest sent
894          * redirect.
895          */
896         if (peer->rate_tokens == 0 ||
897             time_after(jiffies,
898                        (peer->rate_last +
899                         (ip_rt_redirect_load << peer->rate_tokens)))) {
900                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
901 
902                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
903                 peer->rate_last = jiffies;
904                 ++peer->rate_tokens;
905 #ifdef CONFIG_IP_ROUTE_VERBOSE
906                 if (log_martians &&
907                     peer->rate_tokens == ip_rt_redirect_number)
908                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
909                                              &ip_hdr(skb)->saddr, inet_iif(skb),
910                                              &ip_hdr(skb)->daddr, &gw);
911 #endif
912         }
913 out_put_peer:
914         inet_putpeer(peer);
915 }
916 
917 static int ip_error(struct sk_buff *skb)
918 {
919         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
920         struct rtable *rt = skb_rtable(skb);
921         struct inet_peer *peer;
922         unsigned long now;
923         struct net *net;
924         bool send;
925         int code;
926 
927         /* IP on this device is disabled. */
928         if (!in_dev)
929                 goto out;
930 
931         net = dev_net(rt->dst.dev);
932         if (!IN_DEV_FORWARD(in_dev)) {
933                 switch (rt->dst.error) {
934                 case EHOSTUNREACH:
935                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
936                         break;
937 
938                 case ENETUNREACH:
939                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
940                         break;
941                 }
942                 goto out;
943         }
944 
945         switch (rt->dst.error) {
946         case EINVAL:
947         default:
948                 goto out;
949         case EHOSTUNREACH:
950                 code = ICMP_HOST_UNREACH;
951                 break;
952         case ENETUNREACH:
953                 code = ICMP_NET_UNREACH;
954                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
955                 break;
956         case EACCES:
957                 code = ICMP_PKT_FILTERED;
958                 break;
959         }
960 
961         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
962 
963         send = true;
964         if (peer) {
965                 now = jiffies;
966                 peer->rate_tokens += now - peer->rate_last;
967                 if (peer->rate_tokens > ip_rt_error_burst)
968                         peer->rate_tokens = ip_rt_error_burst;
969                 peer->rate_last = now;
970                 if (peer->rate_tokens >= ip_rt_error_cost)
971                         peer->rate_tokens -= ip_rt_error_cost;
972                 else
973                         send = false;
974                 inet_putpeer(peer);
975         }
976         if (send)
977                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
978 
979 out:    kfree_skb(skb);
980         return 0;
981 }
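/* The peer block above is a token bucket (a reading aid, not new
 * behaviour): tokens accrue at one per jiffy, capped at ip_rt_error_burst
 * (5 * HZ), and each ICMP error costs ip_rt_error_cost (HZ).  With the
 * defaults a given source can elicit at most one ICMP_DEST_UNREACH per
 * second sustained, with bursts of up to five.
 */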
982 
983 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
984 {
985         struct dst_entry *dst = &rt->dst;
986         struct fib_result res;
987 
988         if (dst_metric_locked(dst, RTAX_MTU))
989                 return;
990 
991         if (ipv4_mtu(dst) < mtu)
992                 return;
993 
994         if (mtu < ip_rt_min_pmtu)
995                 mtu = ip_rt_min_pmtu;
996 
997         if (rt->rt_pmtu == mtu &&
998             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
999                 return;
1000 
1001         rcu_read_lock();
1002         if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
1003                 struct fib_nh *nh = &FIB_RES_NH(res);
1004 
1005                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
1006                                       jiffies + ip_rt_mtu_expires);
1007         }
1008         rcu_read_unlock();
1009 }
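/* Example with the defaults above: an ICMP "fragmentation needed"
 * advertising mtu = 300 is raised to ip_rt_min_pmtu (512 + 20 + 20 = 552)
 * before being stored, and the resulting exception ages out after
 * ip_rt_mtu_expires (ten minutes), after which ipv4_mtu() falls back to
 * the metric or device MTU.
 */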
1010 
1011 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1012                               struct sk_buff *skb, u32 mtu)
1013 {
1014         struct rtable *rt = (struct rtable *) dst;
1015         struct flowi4 fl4;
1016 
1017         ip_rt_build_flow_key(&fl4, sk, skb);
1018         __ip_rt_update_pmtu(rt, &fl4, mtu);
1019 }
1020 
1021 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1022                       int oif, u32 mark, u8 protocol, int flow_flags)
1023 {
1024         const struct iphdr *iph = (const struct iphdr *) skb->data;
1025         struct flowi4 fl4;
1026         struct rtable *rt;
1027 
1028         __build_flow_key(&fl4, NULL, iph, oif,
1029                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1030         rt = __ip_route_output_key(net, &fl4);
1031         if (!IS_ERR(rt)) {
1032                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1033                 ip_rt_put(rt);
1034         }
1035 }
1036 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
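/* Caller sketch (hypothetical parameters, for illustration only): a
 * tunnel error handler reacting to ICMP_FRAG_NEEDED could report the
 * advertised next-hop MTU for the flow it manages, e.g.
 *
 *      ipv4_update_pmtu(skb, dev_net(skb->dev), info,
 *                       0, 0, IPPROTO_GRE, 0);
 *
 * where "info" holds the MTU taken from the ICMP header.
 */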
1037 
1038 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1039 {
1040         const struct iphdr *iph = (const struct iphdr *) skb->data;
1041         struct flowi4 fl4;
1042         struct rtable *rt;
1043 
1044         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1045         rt = __ip_route_output_key(sock_net(sk), &fl4);
1046         if (!IS_ERR(rt)) {
1047                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1048                 ip_rt_put(rt);
1049         }
1050 }
1051 
1052 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1053 {
1054         const struct iphdr *iph = (const struct iphdr *) skb->data;
1055         struct flowi4 fl4;
1056         struct rtable *rt;
1057         struct dst_entry *odst = NULL;
1058         bool new = false;
1059 
1060         bh_lock_sock(sk);
1061         odst = sk_dst_get(sk);
1062 
1063         if (sock_owned_by_user(sk) || !odst) {
1064                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1065                 goto out;
1066         }
1067 
1068         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1069 
1070         rt = (struct rtable *)odst;
1071         if (odst->obsolete && odst->ops->check(odst, 0) == NULL) {
1072                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1073                 if (IS_ERR(rt))
1074                         goto out;
1075 
1076                 new = true;
1077         }
1078 
1079         __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1080 
1081         if (!dst_check(&rt->dst, 0)) {
1082                 if (new)
1083                         dst_release(&rt->dst);
1084 
1085                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1086                 if (IS_ERR(rt))
1087                         goto out;
1088 
1089                 new = true;
1090         }
1091 
1092         if (new)
1093                 sk_dst_set(sk, &rt->dst);
1094 
1095 out:
1096         bh_unlock_sock(sk);
1097         dst_release(odst);
1098 }
1099 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1100 
1101 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1102                    int oif, u32 mark, u8 protocol, int flow_flags)
1103 {
1104         const struct iphdr *iph = (const struct iphdr *) skb->data;
1105         struct flowi4 fl4;
1106         struct rtable *rt;
1107 
1108         __build_flow_key(&fl4, NULL, iph, oif,
1109                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1110         rt = __ip_route_output_key(net, &fl4);
1111         if (!IS_ERR(rt)) {
1112                 __ip_do_redirect(rt, skb, &fl4, false);
1113                 ip_rt_put(rt);
1114         }
1115 }
1116 EXPORT_SYMBOL_GPL(ipv4_redirect);
1117 
1118 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1119 {
1120         const struct iphdr *iph = (const struct iphdr *) skb->data;
1121         struct flowi4 fl4;
1122         struct rtable *rt;
1123 
1124         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1125         rt = __ip_route_output_key(sock_net(sk), &fl4);
1126         if (!IS_ERR(rt)) {
1127                 __ip_do_redirect(rt, skb, &fl4, false);
1128                 ip_rt_put(rt);
1129         }
1130 }
1131 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1132 
1133 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1134 {
1135         struct rtable *rt = (struct rtable *) dst;
1136 
1137         /* All IPv4 dsts are created with ->obsolete set to the value
1138          * DST_OBSOLETE_FORCE_CHK, which always forces validation calls
1139          * down into this function.
1140          *
1141          * When a PMTU/redirect information update invalidates a route,
1142          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1143          * DST_OBSOLETE_DEAD by dst_free().
1144          */
1145         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1146                 return NULL;
1147         return dst;
1148 }
1149 
1150 static void ipv4_link_failure(struct sk_buff *skb)
1151 {
1152         struct rtable *rt;
1153 
1154         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1155 
1156         rt = skb_rtable(skb);
1157         if (rt)
1158                 dst_set_expires(&rt->dst, 0);
1159 }
1160 
1161 static int ip_rt_bug(struct sk_buff *skb)
1162 {
1163         pr_debug("%s: %pI4 -> %pI4, %s\n",
1164                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1165                  skb->dev ? skb->dev->name : "?");
1166         kfree_skb(skb);
1167         WARN_ON(1);
1168         return 0;
1169 }
1170 
1171 /*
1172    We do not cache the source address of the outgoing interface,
1173    because it is used only by the IP RR, TS and SRR options,
1174    so it is out of the fast path.
1175 
1176    BTW remember: "addr" is allowed to be unaligned
1177    in IP options!
1178  */
1179 
1180 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1181 {
1182         __be32 src;
1183 
1184         if (rt_is_output_route(rt))
1185                 src = ip_hdr(skb)->saddr;
1186         else {
1187                 struct fib_result res;
1188                 struct flowi4 fl4;
1189                 struct iphdr *iph;
1190 
1191                 iph = ip_hdr(skb);
1192 
1193                 memset(&fl4, 0, sizeof(fl4));
1194                 fl4.daddr = iph->daddr;
1195                 fl4.saddr = iph->saddr;
1196                 fl4.flowi4_tos = RT_TOS(iph->tos);
1197                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1198                 fl4.flowi4_iif = skb->dev->ifindex;
1199                 fl4.flowi4_mark = skb->mark;
1200 
1201                 rcu_read_lock();
1202                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1203                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1204                 else
1205                         src = inet_select_addr(rt->dst.dev,
1206                                                rt_nexthop(rt, iph->daddr),
1207                                                RT_SCOPE_UNIVERSE);
1208                 rcu_read_unlock();
1209         }
1210         memcpy(addr, &src, 4);
1211 }
1212 
1213 #ifdef CONFIG_IP_ROUTE_CLASSID
1214 static void set_class_tag(struct rtable *rt, u32 tag)
1215 {
1216         if (!(rt->dst.tclassid & 0xFFFF))
1217                 rt->dst.tclassid |= tag & 0xFFFF;
1218         if (!(rt->dst.tclassid & 0xFFFF0000))
1219                 rt->dst.tclassid |= tag & 0xFFFF0000;
1220 }
1221 #endif
1222 
1223 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1224 {
1225         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1226 
1227         if (advmss == 0) {
1228                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1229                                ip_rt_min_advmss);
1230                 if (advmss > 65535 - 40)
1231                         advmss = 65535 - 40;
1232         }
1233         return advmss;
1234 }
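/* Example: on a 1500-byte MTU device the advertised MSS defaults to
 * 1500 - 40 = 1460 (leaving room for 20-byte IP and TCP headers),
 * floored at ip_rt_min_advmss (256) and capped at 65535 - 40.
 */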
1235 
1236 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1237 {
1238         const struct rtable *rt = (const struct rtable *) dst;
1239         unsigned int mtu = rt->rt_pmtu;
1240 
1241         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1242                 mtu = dst_metric_raw(dst, RTAX_MTU);
1243 
1244         if (mtu)
1245                 return mtu;
1246 
1247         mtu = dst->dev->mtu;
1248 
1249         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1250                 if (rt->rt_uses_gateway && mtu > 576)
1251                         mtu = 576;
1252         }
1253 
1254         return min_t(unsigned int, mtu, IP_MAX_MTU);
1255 }
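/* Resolution order above, for reference: an unexpired rt_pmtu learned
 * via a nexthop exception wins, then an explicit RTAX_MTU metric, then
 * the device MTU; a locked RTAX_MTU on a gatewayed route is clamped to
 * the classic 576-byte minimum, and the result is capped at IP_MAX_MTU.
 */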
1256 
1257 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1258 {
1259         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1260         struct fib_nh_exception *fnhe;
1261         u32 hval;
1262 
1263         if (!hash)
1264                 return NULL;
1265 
1266         hval = fnhe_hashfun(daddr);
1267 
1268         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1269              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1270                 if (fnhe->fnhe_daddr == daddr)
1271                         return fnhe;
1272         }
1273         return NULL;
1274 }
1275 
1276 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1277                               __be32 daddr)
1278 {
1279         bool ret = false;
1280 
1281         spin_lock_bh(&fnhe_lock);
1282 
1283         if (daddr == fnhe->fnhe_daddr) {
1284                 struct rtable __rcu **porig;
1285                 struct rtable *orig;
1286                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1287 
1288                 if (rt_is_input_route(rt))
1289                         porig = &fnhe->fnhe_rth_input;
1290                 else
1291                         porig = &fnhe->fnhe_rth_output;
1292                 orig = rcu_dereference(*porig);
1293 
1294                 if (fnhe->fnhe_genid != genid) {
1295                         fnhe->fnhe_genid = genid;
1296                         fnhe->fnhe_gw = 0;
1297                         fnhe->fnhe_pmtu = 0;
1298                         fnhe->fnhe_expires = 0;
1299                         fnhe_flush_routes(fnhe);
1300                         orig = NULL;
1301                 }
1302                 fill_route_from_fnhe(rt, fnhe);
1303                 if (!rt->rt_gateway)
1304                         rt->rt_gateway = daddr;
1305 
1306                 if (!(rt->dst.flags & DST_NOCACHE)) {
1307                         rcu_assign_pointer(*porig, rt);
1308                         if (orig)
1309                                 rt_free(orig);
1310                         ret = true;
1311                 }
1312 
1313                 fnhe->fnhe_stamp = jiffies;
1314         }
1315         spin_unlock_bh(&fnhe_lock);
1316 
1317         return ret;
1318 }
1319 
1320 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1321 {
1322         struct rtable *orig, *prev, **p;
1323         bool ret = true;
1324 
1325         if (rt_is_input_route(rt)) {
1326                 p = (struct rtable **)&nh->nh_rth_input;
1327         } else {
1328                 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1329         }
1330         orig = *p;
1331 
1332         prev = cmpxchg(p, orig, rt);
1333         if (prev == orig) {
1334                 if (orig)
1335                         rt_free(orig);
1336         } else
1337                 ret = false;
1338 
1339         return ret;
1340 }
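/* Design note: the cmpxchg() above publishes the route locklessly; if
 * another CPU installed a different route first, prev != orig, false is
 * returned, and rt_set_nexthop() below falls back to marking the route
 * DST_NOCACHE and parking it on the uncached list.
 */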
1341 
1342 static DEFINE_SPINLOCK(rt_uncached_lock);
1343 static LIST_HEAD(rt_uncached_list);
1344 
1345 static void rt_add_uncached_list(struct rtable *rt)
1346 {
1347         spin_lock_bh(&rt_uncached_lock);
1348         list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1349         spin_unlock_bh(&rt_uncached_lock);
1350 }
1351 
1352 static void ipv4_dst_destroy(struct dst_entry *dst)
1353 {
1354         struct rtable *rt = (struct rtable *) dst;
1355 
1356         if (!list_empty(&rt->rt_uncached)) {
1357                 spin_lock_bh(&rt_uncached_lock);
1358                 list_del(&rt->rt_uncached);
1359                 spin_unlock_bh(&rt_uncached_lock);
1360         }
1361 }
1362 
1363 void rt_flush_dev(struct net_device *dev)
1364 {
1365         if (!list_empty(&rt_uncached_list)) {
1366                 struct net *net = dev_net(dev);
1367                 struct rtable *rt;
1368 
1369                 spin_lock_bh(&rt_uncached_lock);
1370                 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1371                         if (rt->dst.dev != dev)
1372                                 continue;
1373                         rt->dst.dev = net->loopback_dev;
1374                         dev_hold(rt->dst.dev);
1375                         dev_put(dev);
1376                 }
1377                 spin_unlock_bh(&rt_uncached_lock);
1378         }
1379 }
1380 
1381 static bool rt_cache_valid(const struct rtable *rt)
1382 {
1383         return  rt &&
1384                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1385                 !rt_is_expired(rt);
1386 }
1387 
1388 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1389                            const struct fib_result *res,
1390                            struct fib_nh_exception *fnhe,
1391                            struct fib_info *fi, u16 type, u32 itag)
1392 {
1393         bool cached = false;
1394 
1395         if (fi) {
1396                 struct fib_nh *nh = &FIB_RES_NH(*res);
1397 
1398                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1399                         rt->rt_gateway = nh->nh_gw;
1400                         rt->rt_uses_gateway = 1;
1401                 }
1402                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1403 #ifdef CONFIG_IP_ROUTE_CLASSID
1404                 rt->dst.tclassid = nh->nh_tclassid;
1405 #endif
1406                 if (unlikely(fnhe))
1407                         cached = rt_bind_exception(rt, fnhe, daddr);
1408                 else if (!(rt->dst.flags & DST_NOCACHE))
1409                         cached = rt_cache_route(nh, rt);
1410                 if (unlikely(!cached)) {
1411                         /* Routes we intend to cache in a nexthop exception or
1412                          * a FIB nexthop have the DST_NOCACHE bit clear.
1413                          * However, if we are unsuccessful at storing this
1414                          * route in the cache, we really do need to set it.
1415                          */
1416                         rt->dst.flags |= DST_NOCACHE;
1417                         if (!rt->rt_gateway)
1418                                 rt->rt_gateway = daddr;
1419                         rt_add_uncached_list(rt);
1420                 }
1421         } else
1422                 rt_add_uncached_list(rt);
1423 
1424 #ifdef CONFIG_IP_ROUTE_CLASSID
1425 #ifdef CONFIG_IP_MULTIPLE_TABLES
1426         set_class_tag(rt, res->tclassid);
1427 #endif
1428         set_class_tag(rt, itag);
1429 #endif
1430 }
1431 
1432 static struct rtable *rt_dst_alloc(struct net_device *dev,
1433                                    bool nopolicy, bool noxfrm, bool will_cache)
1434 {
1435         return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1436                          (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1437                          (nopolicy ? DST_NOPOLICY : 0) |
1438                          (noxfrm ? DST_NOXFRM : 0));
1439 }
1440 
1441 /* called in rcu_read_lock() section */
1442 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1443                                 u8 tos, struct net_device *dev, int our)
1444 {
1445         struct rtable *rth;
1446         struct in_device *in_dev = __in_dev_get_rcu(dev);
1447         u32 itag = 0;
1448         int err;
1449 
1450         /* Primary sanity checks. */
1451 
1452         if (in_dev == NULL)
1453                 return -EINVAL;
1454 
1455         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1456             skb->protocol != htons(ETH_P_IP))
1457                 goto e_inval;
1458 
1459         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1460                 if (ipv4_is_loopback(saddr))
1461                         goto e_inval;
1462 
1463         if (ipv4_is_zeronet(saddr)) {
1464                 if (!ipv4_is_local_multicast(daddr))
1465                         goto e_inval;
1466         } else {
1467                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1468                                           in_dev, &itag);
1469                 if (err < 0)
1470                         goto e_err;
1471         }
1472         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1473                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1474         if (!rth)
1475                 goto e_nobufs;
1476 
1477 #ifdef CONFIG_IP_ROUTE_CLASSID
1478         rth->dst.tclassid = itag;
1479 #endif
1480         rth->dst.output = ip_rt_bug;
1481 
1482         rth->rt_genid   = rt_genid_ipv4(dev_net(dev));
1483         rth->rt_flags   = RTCF_MULTICAST;
1484         rth->rt_type    = RTN_MULTICAST;
1485         rth->rt_is_input= 1;
1486         rth->rt_iif     = 0;
1487         rth->rt_pmtu    = 0;
1488         rth->rt_gateway = 0;
1489         rth->rt_uses_gateway = 0;
1490         INIT_LIST_HEAD(&rth->rt_uncached);
1491         if (our) {
1492                 rth->dst.input= ip_local_deliver;
1493                 rth->rt_flags |= RTCF_LOCAL;
1494         }
1495 
1496 #ifdef CONFIG_IP_MROUTE
1497         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1498                 rth->dst.input = ip_mr_input;
1499 #endif
1500         RT_CACHE_STAT_INC(in_slow_mc);
1501 
1502         skb_dst_set(skb, &rth->dst);
1503         return 0;
1504 
1505 e_nobufs:
1506         return -ENOBUFS;
1507 e_inval:
1508         return -EINVAL;
1509 e_err:
1510         return err;
1511 }
1512 
1513 
1514 static void ip_handle_martian_source(struct net_device *dev,
1515                                      struct in_device *in_dev,
1516                                      struct sk_buff *skb,
1517                                      __be32 daddr,
1518                                      __be32 saddr)
1519 {
1520         RT_CACHE_STAT_INC(in_martian_src);
1521 #ifdef CONFIG_IP_ROUTE_VERBOSE
1522         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1523                 /*
1524                  *      RFC 1812 recommendation: if the source is martian,
1525                  *      the only hint we can give is the MAC header.
1526                  */
1527                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1528                         &daddr, &saddr, dev->name);
1529                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1530                         print_hex_dump(KERN_WARNING, "ll header: ",
1531                                        DUMP_PREFIX_OFFSET, 16, 1,
1532                                        skb_mac_header(skb),
1533                                        dev->hard_header_len, true);
1534                 }
1535         }
1536 #endif
1537 }
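
/* Example of the log output the above produces (illustrative addresses;
 * hex dump truncated, ascii column omitted):
 *
 *   martian source 203.0.113.7 from 127.0.0.1, on dev eth0
 *   ll header: 00000000: ff ff ff ff ff ff 52 54 00 12 34 56 08 00
 *
 * Note the argument order in the pr_warn() above: the destination
 * (daddr) is printed first, the offending source (saddr) second.
 */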
1538 
1539 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1540 {
1541         struct fnhe_hash_bucket *hash;
1542         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1543         u32 hval = fnhe_hashfun(daddr);
1544 
1545         spin_lock_bh(&fnhe_lock);
1546 
1547         hash = rcu_dereference_protected(nh->nh_exceptions,
1548                                          lockdep_is_held(&fnhe_lock));
1549         hash += hval;
1550 
1551         fnhe_p = &hash->chain;
1552         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1553         while (fnhe) {
1554                 if (fnhe->fnhe_daddr == daddr) {
1555                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1556                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1557                         fnhe_flush_routes(fnhe);
1558                         kfree_rcu(fnhe, rcu);
1559                         break;
1560                 }
1561                 fnhe_p = &fnhe->fnhe_next;
1562                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1563                                                  lockdep_is_held(&fnhe_lock));
1564         }
1565 
1566         spin_unlock_bh(&fnhe_lock);
1567 }
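
/* Concurrency notes on the unlink above (annotation): readers walk the
 * exception chain under rcu_read_lock() (see find_exception()), while
 * writers serialize on fnhe_lock.  rcu_assign_pointer() swings *fnhe_p
 * past the dying node, so a concurrent reader sees either the old or
 * the new chain but never a half-updated one, and kfree_rcu() defers
 * the actual free until all pre-existing readers have finished.
 */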
1568 
1569 /* called in rcu_read_lock() section */
1570 static int __mkroute_input(struct sk_buff *skb,
1571                            const struct fib_result *res,
1572                            struct in_device *in_dev,
1573                            __be32 daddr, __be32 saddr, u32 tos)
1574 {
1575         struct fib_nh_exception *fnhe;
1576         struct rtable *rth;
1577         int err;
1578         struct in_device *out_dev;
1579         unsigned int flags = 0;
1580         bool do_cache;
1581         u32 itag = 0;
1582 
1583         /* get a working reference to the output device */
1584         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1585         if (out_dev == NULL) {
1586                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1587                 return -EINVAL;
1588         }
1589 
1590         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1591                                   in_dev->dev, in_dev, &itag);
1592         if (err < 0) {
1593                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1594                                          saddr);
1595 
1596                 goto cleanup;
1597         }
1598 
1599         do_cache = res->fi && !itag;
1600         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1601             skb->protocol == htons(ETH_P_IP) &&
1602             (IN_DEV_SHARED_MEDIA(out_dev) ||
1603              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1604                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1605 
1606         if (skb->protocol != htons(ETH_P_IP)) {
1607                 /* Not IP (i.e. ARP). Do not create a route if it is
1608                  * invalid for proxy arp. DNAT routes are always valid.
1609                  *
1610                  * The proxy arp feature has been extended to allow ARP
1611                  * replies back to the same interface, to support
1612                  * Private VLAN switch technologies. See arp.c.
1613                  */
1614                 if (out_dev == in_dev &&
1615                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1616                         err = -EINVAL;
1617                         goto cleanup;
1618                 }
1619         }
1620 
1621         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1622         if (do_cache) {
1623                 if (fnhe) {
1624                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1625                         if (rth && rth->dst.expires &&
1626                             time_after(jiffies, rth->dst.expires)) {
1627                                 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1628                                 fnhe = NULL;
1629                         } else {
1630                                 goto rt_cache;
1631                         }
1632                 }
1633 
1634                 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1635 
1636 rt_cache:
1637                 if (rt_cache_valid(rth)) {
1638                         skb_dst_set_noref(skb, &rth->dst);
1639                         goto out;
1640                 }
1641         }
1642 
1643         rth = rt_dst_alloc(out_dev->dev,
1644                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1645                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1646         if (!rth) {
1647                 err = -ENOBUFS;
1648                 goto cleanup;
1649         }
1650 
1651         rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
1652         rth->rt_flags = flags;
1653         rth->rt_type = res->type;
1654         rth->rt_is_input = 1;
1655         rth->rt_iif     = 0;
1656         rth->rt_pmtu    = 0;
1657         rth->rt_gateway = 0;
1658         rth->rt_uses_gateway = 0;
1659         INIT_LIST_HEAD(&rth->rt_uncached);
1660         RT_CACHE_STAT_INC(in_slow_tot);
1661 
1662         rth->dst.input = ip_forward;
1663         rth->dst.output = ip_output;
1664 
1665         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1666         skb_dst_set(skb, &rth->dst);
1667 out:
1668         err = 0;
1669  cleanup:
1670         return err;
1671 }
1672 
1673 static int ip_mkroute_input(struct sk_buff *skb,
1674                             struct fib_result *res,
1675                             const struct flowi4 *fl4,
1676                             struct in_device *in_dev,
1677                             __be32 daddr, __be32 saddr, u32 tos)
1678 {
1679 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1680         if (res->fi && res->fi->fib_nhs > 1)
1681                 fib_select_multipath(res);
1682 #endif
1683 
1684         /* create a routing cache entry */
1685         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1686 }
1687 
1688 /*
1689  *      NOTE. We drop all packets that have a local source
1690  *      address, because every properly looped-back packet
1691  *      must already have the correct destination attached by the output routine.
1692  *
1693  *      Such an approach solves two big problems:
1694  *      1. Non-simplex devices are handled properly.
1695  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1696  *      Called with rcu_read_lock().
1697  */
1698 
1699 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1700                                u8 tos, struct net_device *dev)
1701 {
1702         struct fib_result res;
1703         struct in_device *in_dev = __in_dev_get_rcu(dev);
1704         struct flowi4   fl4;
1705         unsigned int    flags = 0;
1706         u32             itag = 0;
1707         struct rtable   *rth;
1708         int             err = -EINVAL;
1709         struct net    *net = dev_net(dev);
1710         bool do_cache;
1711 
1712         /* IP on this device is disabled. */
1713 
1714         if (!in_dev)
1715                 goto out;
1716 
1717         /* Check for the weirdest martians, which cannot be detected
1718            by fib_lookup.
1719          */
1720 
1721         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1722                 goto martian_source;
1723 
1724         res.fi = NULL;
1725         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1726                 goto brd_input;
1727 
1728         /* Accept zero addresses only for limited broadcast;
1729          * I do not even know whether to fix it or not. Waiting for complaints :-)
1730          */
1731         if (ipv4_is_zeronet(saddr))
1732                 goto martian_source;
1733 
1734         if (ipv4_is_zeronet(daddr))
1735                 goto martian_destination;
1736 
1737         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1738          * calling it at most once when daddr and/or saddr is a loopback address.
1739          */
1740         if (ipv4_is_loopback(daddr)) {
1741                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1742                         goto martian_destination;
1743         } else if (ipv4_is_loopback(saddr)) {
1744                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1745                         goto martian_source;
1746         }
1747 
1748         /*
1749          *      Now we are ready to route the packet.
1750          */
1751         fl4.flowi4_oif = 0;
1752         fl4.flowi4_iif = dev->ifindex;
1753         fl4.flowi4_mark = skb->mark;
1754         fl4.flowi4_tos = tos;
1755         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1756         fl4.daddr = daddr;
1757         fl4.saddr = saddr;
1758         err = fib_lookup(net, &fl4, &res);
1759         if (err != 0)
1760                 goto no_route;
1761 
1762         if (res.type == RTN_BROADCAST)
1763                 goto brd_input;
1764 
1765         if (res.type == RTN_LOCAL) {
1766                 err = fib_validate_source(skb, saddr, daddr, tos,
1767                                           LOOPBACK_IFINDEX,
1768                                           dev, in_dev, &itag);
1769                 if (err < 0)
1770                         goto martian_source_keep_err;
1771                 goto local_input;
1772         }
1773 
1774         if (!IN_DEV_FORWARD(in_dev))
1775                 goto no_route;
1776         if (res.type != RTN_UNICAST)
1777                 goto martian_destination;
1778 
1779         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1780 out:    return err;
1781 
1782 brd_input:
1783         if (skb->protocol != htons(ETH_P_IP))
1784                 goto e_inval;
1785 
1786         if (!ipv4_is_zeronet(saddr)) {
1787                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1788                                           in_dev, &itag);
1789                 if (err < 0)
1790                         goto martian_source_keep_err;
1791         }
1792         flags |= RTCF_BROADCAST;
1793         res.type = RTN_BROADCAST;
1794         RT_CACHE_STAT_INC(in_brd);
1795 
1796 local_input:
1797         do_cache = false;
1798         if (res.fi) {
1799                 if (!itag) {
1800                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1801                         if (rt_cache_valid(rth)) {
1802                                 skb_dst_set_noref(skb, &rth->dst);
1803                                 err = 0;
1804                                 goto out;
1805                         }
1806                         do_cache = true;
1807                 }
1808         }
1809 
1810         rth = rt_dst_alloc(net->loopback_dev,
1811                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1812         if (!rth)
1813                 goto e_nobufs;
1814 
1815         rth->dst.input = ip_local_deliver;
1816         rth->dst.output = ip_rt_bug;
1817 #ifdef CONFIG_IP_ROUTE_CLASSID
1818         rth->dst.tclassid = itag;
1819 #endif
1820 
1821         rth->rt_genid = rt_genid_ipv4(net);
1822         rth->rt_flags   = flags|RTCF_LOCAL;
1823         rth->rt_type    = res.type;
1824         rth->rt_is_input = 1;
1825         rth->rt_iif     = 0;
1826         rth->rt_pmtu    = 0;
1827         rth->rt_gateway = 0;
1828         rth->rt_uses_gateway = 0;
1829         INIT_LIST_HEAD(&rth->rt_uncached);
1830         RT_CACHE_STAT_INC(in_slow_tot);
1831         if (res.type == RTN_UNREACHABLE) {
1832                 rth->dst.input = ip_error;
1833                 rth->dst.error = -err;
1834                 rth->rt_flags   &= ~RTCF_LOCAL;
1835         }
1836         if (do_cache) {
1837                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1838                         rth->dst.flags |= DST_NOCACHE;
1839                         rt_add_uncached_list(rth);
1840                 }
1841         }
1842         skb_dst_set(skb, &rth->dst);
1843         err = 0;
1844         goto out;
1845 
1846 no_route:
1847         RT_CACHE_STAT_INC(in_no_route);
1848         res.type = RTN_UNREACHABLE;
1849         if (err == -ESRCH)
1850                 err = -ENETUNREACH;
1851         goto local_input;
1852 
1853         /*
1854          *      Do not cache martian addresses: they should be logged (RFC1812)
1855          */
1856 martian_destination:
1857         RT_CACHE_STAT_INC(in_martian_dst);
1858 #ifdef CONFIG_IP_ROUTE_VERBOSE
1859         if (IN_DEV_LOG_MARTIANS(in_dev))
1860                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1861                                      &daddr, &saddr, dev->name);
1862 #endif
1863 
1864 e_inval:
1865         err = -EINVAL;
1866         goto out;
1867 
1868 e_nobufs:
1869         err = -ENOBUFS;
1870         goto out;
1871 
1872 martian_source:
1873         err = -EINVAL;
1874 martian_source_keep_err:
1875         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1876         goto out;
1877 }
1878 
1879 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1880                          u8 tos, struct net_device *dev)
1881 {
1882         int res;
1883 
1884         tos &= IPTOS_RT_MASK;
1885         rcu_read_lock();
1886 
1887         /* Multicast recognition logic was moved from the route cache to
1888            here.  The problem was that too many Ethernet cards have
1889            broken/missing hardware multicast filters :-( As a result, a host
1890            on a multicast network acquires a lot of useless route cache
1891            entries, e.g. for SDR messages from all over the world.  Now we
1892            try to get rid of them.  Really, provided the software IP
1893            multicast filter is organized reasonably (at least, hashed), this
1894            does not result in a slowdown compared with route cache reject
1895            entries.  Note that multicast routers are not affected, because a
1896            route cache entry is created eventually.
1897          */
1898         if (ipv4_is_multicast(daddr)) {
1899                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1900 
1901                 if (in_dev) {
1902                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1903                                                   ip_hdr(skb)->protocol);
1904                         if (our
1905 #ifdef CONFIG_IP_MROUTE
1906                                 ||
1907                             (!ipv4_is_local_multicast(daddr) &&
1908                              IN_DEV_MFORWARD(in_dev))
1909 #endif
1910                            ) {
1911                                 int res = ip_route_input_mc(skb, daddr, saddr,
1912                                                             tos, dev, our);
1913                                 rcu_read_unlock();
1914                                 return res;
1915                         }
1916                 }
1917                 rcu_read_unlock();
1918                 return -EINVAL;
1919         }
1920         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1921         rcu_read_unlock();
1922         return res;
1923 }
1924 EXPORT_SYMBOL(ip_route_input_noref);
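
/* Usage sketch (hypothetical caller, not built): how a receive path such
 * as ip_rcv_finish() might use ip_route_input_noref().  The function
 * name example_rcv_finish is illustrative only.
 */
#if 0
static int example_rcv_finish(struct sk_buff *skb)
{
        const struct iphdr *iph = ip_hdr(skb);
        int err;

        /* Attach an input route to the skb; the dst may be a no-ref
         * pointer, so it must be consumed in this RCU/softirq context. */
        err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
                                   iph->tos, skb->dev);
        if (err)
                return err;     /* e.g. -EINVAL for martians */

        /* dst_input() dispatches to whatever handler the route installed:
         * ip_local_deliver(), ip_forward(), ip_mr_input() or ip_error(). */
        return dst_input(skb);
}
#endif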
1925 
1926 /* called with rcu_read_lock() */
1927 static struct rtable *__mkroute_output(const struct fib_result *res,
1928                                        const struct flowi4 *fl4, int orig_oif,
1929                                        struct net_device *dev_out,
1930                                        unsigned int flags)
1931 {
1932         struct fib_info *fi = res->fi;
1933         struct fib_nh_exception *fnhe;
1934         struct in_device *in_dev;
1935         u16 type = res->type;
1936         struct rtable *rth;
1937         bool do_cache;
1938 
1939         in_dev = __in_dev_get_rcu(dev_out);
1940         if (!in_dev)
1941                 return ERR_PTR(-EINVAL);
1942 
1943         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1944                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1945                         return ERR_PTR(-EINVAL);
1946 
1947         if (ipv4_is_lbcast(fl4->daddr))
1948                 type = RTN_BROADCAST;
1949         else if (ipv4_is_multicast(fl4->daddr))
1950                 type = RTN_MULTICAST;
1951         else if (ipv4_is_zeronet(fl4->daddr))
1952                 return ERR_PTR(-EINVAL);
1953 
1954         if (dev_out->flags & IFF_LOOPBACK)
1955                 flags |= RTCF_LOCAL;
1956 
1957         do_cache = true;
1958         if (type == RTN_BROADCAST) {
1959                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1960                 fi = NULL;
1961         } else if (type == RTN_MULTICAST) {
1962                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1963                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1964                                      fl4->flowi4_proto))
1965                         flags &= ~RTCF_LOCAL;
1966                 else
1967                         do_cache = false;
1968                 /* If a multicast route does not exist, use the
1969                  * default one, but do not use a gateway in this case.
1970                  * Yes, it is a hack.
1971                  */
1972                 if (fi && res->prefixlen < 4)
1973                         fi = NULL;
1974         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
1975                    (orig_oif != dev_out->ifindex)) {
1976                 /* For local routes that require a particular output interface
1977                  * we do not want to cache the result.  Caching the result
1978                  * causes incorrect behaviour when there are multiple source
1979                  * addresses on the interface; the end result is that if the
1980                  * intended recipient is waiting on that interface for the
1981                  * packet, it won't receive it, because it will be delivered on
1982                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
1983                  * be set to the loopback interface as well.
1984                  */
1985                 fi = NULL;
1986         }
1987 
1988         fnhe = NULL;
1989         do_cache &= fi != NULL;
1990         if (do_cache) {
1991                 struct rtable __rcu **prth;
1992                 struct fib_nh *nh = &FIB_RES_NH(*res);
1993 
1994                 fnhe = find_exception(nh, fl4->daddr);
1995                 if (fnhe) {
1996                         prth = &fnhe->fnhe_rth_output;
1997                         rth = rcu_dereference(*prth);
1998                         if (rth && rth->dst.expires &&
1999                             time_after(jiffies, rth->dst.expires)) {
2000                                 ip_del_fnhe(nh, fl4->daddr);
2001                                 fnhe = NULL;
2002                         } else {
2003                                 goto rt_cache;
2004                         }
2005                 }
2006 
2007                 if (unlikely(fl4->flowi4_flags &
2008                              FLOWI_FLAG_KNOWN_NH &&
2009                              !(nh->nh_gw &&
2010                                nh->nh_scope == RT_SCOPE_LINK))) {
2011                         do_cache = false;
2012                         goto add;
2013                 }
2014                 prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
2015                 rth = rcu_dereference(*prth);
2016 
2017 rt_cache:
2018                 if (rt_cache_valid(rth)) {
2019                         dst_hold(&rth->dst);
2020                         return rth;
2021                 }
2022         }
2023 
2024 add:
2025         rth = rt_dst_alloc(dev_out,
2026                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2027                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2028                            do_cache);
2029         if (!rth)
2030                 return ERR_PTR(-ENOBUFS);
2031 
2032         rth->dst.output = ip_output;
2033 
2034         rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
2035         rth->rt_flags   = flags;
2036         rth->rt_type    = type;
2037         rth->rt_is_input = 0;
2038         rth->rt_iif     = orig_oif ? : 0;
2039         rth->rt_pmtu    = 0;
2040         rth->rt_gateway = 0;
2041         rth->rt_uses_gateway = 0;
2042         INIT_LIST_HEAD(&rth->rt_uncached);
2043 
2044         RT_CACHE_STAT_INC(out_slow_tot);
2045 
2046         if (flags & RTCF_LOCAL)
2047                 rth->dst.input = ip_local_deliver;
2048         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2049                 if (flags & RTCF_LOCAL &&
2050                     !(dev_out->flags & IFF_LOOPBACK)) {
2051                         rth->dst.output = ip_mc_output;
2052                         RT_CACHE_STAT_INC(out_slow_mc);
2053                 }
2054 #ifdef CONFIG_IP_MROUTE
2055                 if (type == RTN_MULTICAST) {
2056                         if (IN_DEV_MFORWARD(in_dev) &&
2057                             !ipv4_is_local_multicast(fl4->daddr)) {
2058                                 rth->dst.input = ip_mr_input;
2059                                 rth->dst.output = ip_mc_output;
2060                         }
2061                 }
2062 #endif
2063         }
2064 
2065         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2066 
2067         return rth;
2068 }
2069 
2070 /*
2071  * Major route resolver routine.
2072  */
2073 
2074 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
2075 {
2076         struct net_device *dev_out = NULL;
2077         __u8 tos = RT_FL_TOS(fl4);
2078         unsigned int flags = 0;
2079         struct fib_result res;
2080         struct rtable *rth;
2081         int orig_oif;
2082 
2083         res.tclassid    = 0;
2084         res.fi          = NULL;
2085         res.table       = NULL;
2086 
2087         orig_oif = fl4->flowi4_oif;
2088 
2089         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2090         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2091         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2092                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2093 
2094         rcu_read_lock();
2095         if (fl4->saddr) {
2096                 rth = ERR_PTR(-EINVAL);
2097                 if (ipv4_is_multicast(fl4->saddr) ||
2098                     ipv4_is_lbcast(fl4->saddr) ||
2099                     ipv4_is_zeronet(fl4->saddr))
2100                         goto out;
2101 
2102                 /* I removed the check for oif == dev_out->oif here.
2103                    It was wrong for two reasons:
2104                    1. ip_dev_find(net, saddr) can return the wrong iface if
2105                       saddr is assigned to multiple interfaces.
2106                    2. Moreover, we are allowed to send packets with the saddr
2107                       of another iface. --ANK
2108                  */
2109 
2110                 if (fl4->flowi4_oif == 0 &&
2111                     (ipv4_is_multicast(fl4->daddr) ||
2112                      ipv4_is_lbcast(fl4->daddr))) {
2113                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2114                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2115                         if (dev_out == NULL)
2116                                 goto out;
2117 
2118                         /* Special hack: the user can direct multicasts
2119                            and limited broadcast via the necessary interface
2120                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2121                            This hack is not just for fun; it allows
2122                            vic, vat and friends to work.
2123                            They bind a socket to loopback, set ttl to zero
2124                            and expect that it will work.
2125                            From the viewpoint of the routing cache they are
2126                            broken, because we are not allowed to build a
2127                            multicast path with a loopback source addr (look, the
2128                            routing cache cannot know that ttl is zero, so the
2129                            packet will not leave this host and the route is valid).
2130                            Luckily, this hack is a good workaround.
2131                          */
2132 
2133                         fl4->flowi4_oif = dev_out->ifindex;
2134                         goto make_route;
2135                 }
2136 
2137                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2138                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2139                         if (!__ip_dev_find(net, fl4->saddr, false))
2140                                 goto out;
2141                 }
2142         }
2143 
2144 
2145         if (fl4->flowi4_oif) {
2146                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2147                 rth = ERR_PTR(-ENODEV);
2148                 if (dev_out == NULL)
2149                         goto out;
2150 
2151                 /* RACE: Check return value of inet_select_addr instead. */
2152                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2153                         rth = ERR_PTR(-ENETUNREACH);
2154                         goto out;
2155                 }
2156                 if (ipv4_is_local_multicast(fl4->daddr) ||
2157                     ipv4_is_lbcast(fl4->daddr)) {
2158                         if (!fl4->saddr)
2159                                 fl4->saddr = inet_select_addr(dev_out, 0,
2160                                                               RT_SCOPE_LINK);
2161                         goto make_route;
2162                 }
2163                 if (!fl4->saddr) {
2164                         if (ipv4_is_multicast(fl4->daddr))
2165                                 fl4->saddr = inet_select_addr(dev_out, 0,
2166                                                               fl4->flowi4_scope);
2167                         else if (!fl4->daddr)
2168                                 fl4->saddr = inet_select_addr(dev_out, 0,
2169                                                               RT_SCOPE_HOST);
2170                 }
2171         }
2172 
2173         if (!fl4->daddr) {
2174                 fl4->daddr = fl4->saddr;
2175                 if (!fl4->daddr)
2176                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2177                 dev_out = net->loopback_dev;
2178                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2179                 res.type = RTN_LOCAL;
2180                 flags |= RTCF_LOCAL;
2181                 goto make_route;
2182         }
2183 
2184         if (fib_lookup(net, fl4, &res)) {
2185                 res.fi = NULL;
2186                 res.table = NULL;
2187                 if (fl4->flowi4_oif) {
2188                         /* Apparently, the routing tables are wrong.  Assume
2189                            that the destination is on-link.
2190 
2191                            WHY? DW.
2192                            Because we are allowed to send to an iface
2193                            even if it has NO routes and NO assigned
2194                            addresses.  When oif is specified, routing
2195                            tables are looked up with only one purpose:
2196                            to catch whether the destination is gatewayed
2197                            rather than direct.  Moreover, if MSG_DONTROUTE
2198                            is set, we send the packet, ignoring both routing
2199                            tables and ifaddr state. --ANK
2200 
2201 
2202                            We could do this even if oif is unknown, as IPv6
2203                            likely does, but we do not.
2204                          */
2205 
2206                         if (fl4->saddr == 0)
2207                                 fl4->saddr = inet_select_addr(dev_out, 0,
2208                                                               RT_SCOPE_LINK);
2209                         res.type = RTN_UNICAST;
2210                         goto make_route;
2211                 }
2212                 rth = ERR_PTR(-ENETUNREACH);
2213                 goto out;
2214         }
2215 
2216         if (res.type == RTN_LOCAL) {
2217                 if (!fl4->saddr) {
2218                         if (res.fi->fib_prefsrc)
2219                                 fl4->saddr = res.fi->fib_prefsrc;
2220                         else
2221                                 fl4->saddr = fl4->daddr;
2222                 }
2223                 dev_out = net->loopback_dev;
2224                 fl4->flowi4_oif = dev_out->ifindex;
2225                 flags |= RTCF_LOCAL;
2226                 goto make_route;
2227         }
2228 
2229 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2230         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2231                 fib_select_multipath(&res);
2232         else
2233 #endif
2234         if (!res.prefixlen &&
2235             res.table->tb_num_default > 1 &&
2236             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2237                 fib_select_default(&res);
2238 
2239         if (!fl4->saddr)
2240                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2241 
2242         dev_out = FIB_RES_DEV(res);
2243         fl4->flowi4_oif = dev_out->ifindex;
2244 
2245 
2246 make_route:
2247         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2248 
2249 out:
2250         rcu_read_unlock();
2251         return rth;
2252 }
2253 EXPORT_SYMBOL_GPL(__ip_route_output_key);
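
/* Usage sketch (hypothetical caller, not built): a minimal key-based
 * output lookup.  example_output_lookup is an illustrative name; the
 * caller owns the returned route and drops it with ip_rt_put().
 */
#if 0
static int example_output_lookup(struct net *net, __be32 daddr)
{
        struct flowi4 fl4;
        struct rtable *rt;

        memset(&fl4, 0, sizeof(fl4));
        fl4.daddr = daddr;      /* zero saddr/oif mean "choose for me" */

        rt = __ip_route_output_key(net, &fl4);
        if (IS_ERR(rt))
                return PTR_ERR(rt);     /* e.g. -ENETUNREACH */

        /* On success the resolver has filled in fl4.saddr and
         * fl4.flowi4_oif with the chosen source address and ifindex. */
        ip_rt_put(rt);
        return 0;
}
#endif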
2254 
2255 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2256 {
2257         return NULL;
2258 }
2259 
2260 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2261 {
2262         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2263 
2264         return mtu ? : dst->dev->mtu;
2265 }
2266 
2267 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2268                                           struct sk_buff *skb, u32 mtu)
2269 {
2270 }
2271 
2272 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2273                                        struct sk_buff *skb)
2274 {
2275 }
2276 
2277 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2278                                           unsigned long old)
2279 {
2280         return NULL;
2281 }
2282 
2283 static struct dst_ops ipv4_dst_blackhole_ops = {
2284         .family                 =       AF_INET,
2285         .protocol               =       cpu_to_be16(ETH_P_IP),
2286         .check                  =       ipv4_blackhole_dst_check,
2287         .mtu                    =       ipv4_blackhole_mtu,
2288         .default_advmss         =       ipv4_default_advmss,
2289         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2290         .redirect               =       ipv4_rt_blackhole_redirect,
2291         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2292         .neigh_lookup           =       ipv4_neigh_lookup,
2293 };
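
/* Annotation: every operation above is a no-op or returns NULL, so a
 * blackhole dst silently swallows PMTU updates and redirects, and its
 * ->check() returning NULL forces any socket caching it to re-resolve
 * the route.  ipv4_blackhole_route() below builds such a dst; xfrm uses
 * it to hold traffic, e.g. while security associations are still being
 * negotiated.
 */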
2294 
2295 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2296 {
2297         struct rtable *ort = (struct rtable *) dst_orig;
2298         struct rtable *rt;
2299 
2300         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2301         if (rt) {
2302                 struct dst_entry *new = &rt->dst;
2303 
2304                 new->__use = 1;
2305                 new->input = dst_discard;
2306                 new->output = dst_discard;
2307 
2308                 new->dev = ort->dst.dev;
2309                 if (new->dev)
2310                         dev_hold(new->dev);
2311 
2312                 rt->rt_is_input = ort->rt_is_input;
2313                 rt->rt_iif = ort->rt_iif;
2314                 rt->rt_pmtu = ort->rt_pmtu;
2315 
2316                 rt->rt_genid = rt_genid_ipv4(net);
2317                 rt->rt_flags = ort->rt_flags;
2318                 rt->rt_type = ort->rt_type;
2319                 rt->rt_gateway = ort->rt_gateway;
2320                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2321 
2322                 INIT_LIST_HEAD(&rt->rt_uncached);
2323 
2324                 dst_free(new);
2325         }
2326 
2327         dst_release(dst_orig);
2328 
2329         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2330 }
2331 
2332 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2333                                     struct sock *sk)
2334 {
2335         struct rtable *rt = __ip_route_output_key(net, flp4);
2336 
2337         if (IS_ERR(rt))
2338                 return rt;
2339 
2340         if (flp4->flowi4_proto)
2341                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2342                                                         flowi4_to_flowi(flp4),
2343                                                         sk, 0);
2344 
2345         return rt;
2346 }
2347 EXPORT_SYMBOL_GPL(ip_route_output_flow);
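
/* Usage sketch (hypothetical connect path, not built): like the plain
 * key lookup, but a non-zero flowi4_proto additionally routes the
 * result through xfrm_lookup_route().  example_flow_lookup is an
 * illustrative name.
 */
#if 0
static struct rtable *example_flow_lookup(struct net *net, struct sock *sk,
                                          __be32 daddr, __be32 saddr)
{
        struct flowi4 fl4;

        memset(&fl4, 0, sizeof(fl4));
        fl4.daddr = daddr;
        fl4.saddr = saddr;
        fl4.flowi4_proto = IPPROTO_UDP; /* triggers the xfrm pass above */

        /* Returns the route, an xfrm bundle wrapping it, or an
         * ERR_PTR(); release with ip_rt_put() when done. */
        return ip_route_output_flow(net, &fl4, sk);
}
#endif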
2348 
2349 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2350                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2351                         u32 seq, int event, int nowait, unsigned int flags)
2352 {
2353         struct rtable *rt = skb_rtable(skb);
2354         struct rtmsg *r;
2355         struct nlmsghdr *nlh;
2356         unsigned long expires = 0;
2357         u32 error;
2358         u32 metrics[RTAX_MAX];
2359 
2360         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2361         if (nlh == NULL)
2362                 return -EMSGSIZE;
2363 
2364         r = nlmsg_data(nlh);
2365         r->rtm_family    = AF_INET;
2366         r->rtm_dst_len  = 32;
2367         r->rtm_src_len  = 0;
2368         r->rtm_tos      = fl4->flowi4_tos;
2369         r->rtm_table    = RT_TABLE_MAIN;
2370         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2371                 goto nla_put_failure;
2372         r->rtm_type     = rt->rt_type;
2373         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2374         r->rtm_protocol = RTPROT_UNSPEC;
2375         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2376         if (rt->rt_flags & RTCF_NOTIFY)
2377                 r->rtm_flags |= RTM_F_NOTIFY;
2378         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2379                 r->rtm_flags |= RTCF_DOREDIRECT;
2380 
2381         if (nla_put_be32(skb, RTA_DST, dst))
2382                 goto nla_put_failure;
2383         if (src) {
2384                 r->rtm_src_len = 32;
2385                 if (nla_put_be32(skb, RTA_SRC, src))
2386                         goto nla_put_failure;
2387         }
2388         if (rt->dst.dev &&
2389             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2390                 goto nla_put_failure;
2391 #ifdef CONFIG_IP_ROUTE_CLASSID
2392         if (rt->dst.tclassid &&
2393             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2394                 goto nla_put_failure;
2395 #endif
2396         if (!rt_is_input_route(rt) &&
2397             fl4->saddr != src) {
2398                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2399                         goto nla_put_failure;
2400         }
2401         if (rt->rt_uses_gateway &&
2402             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2403                 goto nla_put_failure;
2404 
2405         expires = rt->dst.expires;
2406         if (expires) {
2407                 unsigned long now = jiffies;
2408 
2409                 if (time_before(now, expires))
2410                         expires -= now;
2411                 else
2412                         expires = 0;
2413         }
2414 
2415         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2416         if (rt->rt_pmtu && expires)
2417                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2418         if (rtnetlink_put_metrics(skb, metrics) < 0)
2419                 goto nla_put_failure;
2420 
2421         if (fl4->flowi4_mark &&
2422             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2423                 goto nla_put_failure;
2424 
2425         error = rt->dst.error;
2426 
2427         if (rt_is_input_route(rt)) {
2428 #ifdef CONFIG_IP_MROUTE
2429                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2430                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2431                         int err = ipmr_get_route(net, skb,
2432                                                  fl4->saddr, fl4->daddr,
2433                                                  r, nowait, portid);
2434 
2435                         if (err <= 0) {
2436                                 if (!nowait) {
2437                                         if (err == 0)
2438                                                 return 0;
2439                                         goto nla_put_failure;
2440                                 } else {
2441                                         if (err == -EMSGSIZE)
2442                                                 goto nla_put_failure;
2443                                         error = err;
2444                                 }
2445                         }
2446                 } else
2447 #endif
2448                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2449                                 goto nla_put_failure;
2450         }
2451 
2452         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2453                 goto nla_put_failure;
2454 
2455         return nlmsg_end(skb, nlh);
2456 
2457 nla_put_failure:
2458         nlmsg_cancel(skb, nlh);
2459         return -EMSGSIZE;
2460 }
2461 
2462 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2463 {
2464         struct net *net = sock_net(in_skb->sk);
2465         struct rtmsg *rtm;
2466         struct nlattr *tb[RTA_MAX+1];
2467         struct rtable *rt = NULL;
2468         struct flowi4 fl4;
2469         __be32 dst = 0;
2470         __be32 src = 0;
2471         u32 iif;
2472         int err;
2473         int mark;
2474         struct sk_buff *skb;
2475 
2476         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2477         if (err < 0)
2478                 goto errout;
2479 
2480         rtm = nlmsg_data(nlh);
2481 
2482         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2483         if (skb == NULL) {
2484                 err = -ENOBUFS;
2485                 goto errout;
2486         }
2487 
2488         /* Reserve room for dummy headers; this skb can pass
2489            through a good chunk of the routing engine.
2490          */
2491         skb_reset_mac_header(skb);
2492         skb_reset_network_header(skb);
2493 
2494         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2495         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2496         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2497 
2498         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2499         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2500         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2501         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2502 
2503         memset(&fl4, 0, sizeof(fl4));
2504         fl4.daddr = dst;
2505         fl4.saddr = src;
2506         fl4.flowi4_tos = rtm->rtm_tos;
2507         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2508         fl4.flowi4_mark = mark;
2509 
2510         if (iif) {
2511                 struct net_device *dev;
2512 
2513                 dev = __dev_get_by_index(net, iif);
2514                 if (dev == NULL) {
2515                         err = -ENODEV;
2516                         goto errout_free;
2517                 }
2518 
2519                 skb->protocol   = htons(ETH_P_IP);
2520                 skb->dev        = dev;
2521                 skb->mark       = mark;
2522                 local_bh_disable();
2523                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2524                 local_bh_enable();
2525 
2526                 rt = skb_rtable(skb);
2527                 if (err == 0 && rt->dst.error)
2528                         err = -rt->dst.error;
2529         } else {
2530                 rt = ip_route_output_key(net, &fl4);
2531 
2532                 err = 0;
2533                 if (IS_ERR(rt))
2534                         err = PTR_ERR(rt);
2535         }
2536 
2537         if (err)
2538                 goto errout_free;
2539 
2540         skb_dst_set(skb, &rt->dst);
2541         if (rtm->rtm_flags & RTM_F_NOTIFY)
2542                 rt->rt_flags |= RTCF_NOTIFY;
2543 
2544         err = rt_fill_info(net, dst, src, &fl4, skb,
2545                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2546                            RTM_NEWROUTE, 0, 0);
2547         if (err <= 0)
2548                 goto errout_free;
2549 
2550         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2551 errout:
2552         return err;
2553 
2554 errout_free:
2555         kfree_skb(skb);
2556         goto errout;
2557 }
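
/* Usage note (annotation): this is the handler behind RTM_GETROUTE
 * requests, i.e. what "ip route get 203.0.113.7" sends.  When RTA_IIF
 * is present, the kernel simulates an *input* lookup on that device via
 * ip_route_input(); otherwise it performs an output lookup via
 * ip_route_output_key() and unicasts the RTM_NEWROUTE reply built by
 * rt_fill_info() back to the requester.
 */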
2558 
2559 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2560 {
2561         return skb->len;
2562 }
2563 
2564 void ip_rt_multicast_event(struct in_device *in_dev)
2565 {
2566         rt_cache_flush(dev_net(in_dev->dev));
2567 }
2568 
2569 #ifdef CONFIG_SYSCTL
2570 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2571 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2572 static int ip_rt_gc_elasticity __read_mostly    = 8;
2573 
2574 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2575                                         void __user *buffer,
2576                                         size_t *lenp, loff_t *ppos)
2577 {
2578         struct net *net = (struct net *)__ctl->extra1;
2579 
2580         if (write) {
2581                 rt_cache_flush(net);
2582                 fnhe_genid_bump(net);
2583                 return 0;
2584         }
2585 
2586         return -EINVAL;
2587 }
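
/* Usage sketch (annotation): the handler above rejects reads, so the
 * cache is flushed from userspace with a write such as
 *
 *   echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * The written value itself is ignored; any write flushes the cache and
 * bumps the per-net fnhe generation id.
 */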
2588 
2589 static struct ctl_table ipv4_route_table[] = {
2590         {
2591                 .procname       = "gc_thresh",
2592                 .data           = &ipv4_dst_ops.gc_thresh,
2593                 .maxlen         = sizeof(int),
2594                 .mode           = 0644,
2595                 .proc_handler   = proc_dointvec,
2596         },
2597         {
2598                 .procname       = "max_size",
2599                 .data           = &ip_rt_max_size,
2600                 .maxlen         = sizeof(int),
2601                 .mode           = 0644,
2602                 .proc_handler   = proc_dointvec,
2603         },
2604         {
2605                 /* Deprecated. Use gc_min_interval_ms. */
2606 
2607                 .procname       = "gc_min_interval",
2608                 .data           = &ip_rt_gc_min_interval,
2609                 .maxlen         = sizeof(int),
2610                 .mode           = 0644,
2611                 .proc_handler   = proc_dointvec_jiffies,
2612         },
2613         {
2614                 .procname       = "gc_min_interval_ms",
2615                 .data           = &ip_rt_gc_min_interval,
2616                 .maxlen         = sizeof(int),
2617                 .mode           = 0644,
2618                 .proc_handler   = proc_dointvec_ms_jiffies,
2619         },
2620         {
2621                 .procname       = "gc_timeout",
2622                 .data           = &ip_rt_gc_timeout,
2623                 .maxlen         = sizeof(int),
2624                 .mode           = 0644,
2625                 .proc_handler   = proc_dointvec_jiffies,
2626         },
2627         {
2628                 .procname       = "gc_interval",
2629                 .data           = &ip_rt_gc_interval,
2630                 .maxlen         = sizeof(int),
2631                 .mode           = 0644,
2632                 .proc_handler   = proc_dointvec_jiffies,
2633         },
2634         {
2635                 .procname       = "redirect_load",
2636                 .data           = &ip_rt_redirect_load,
2637                 .maxlen         = sizeof(int),
2638                 .mode           = 0644,
2639                 .proc_handler   = proc_dointvec,
2640         },
2641         {
2642                 .procname       = "redirect_number",
2643                 .data           = &ip_rt_redirect_number,
2644                 .maxlen         = sizeof(int),
2645                 .mode           = 0644,
2646                 .proc_handler   = proc_dointvec,
2647         },
2648         {
2649                 .procname       = "redirect_silence",
2650                 .data           = &ip_rt_redirect_silence,
2651                 .maxlen         = sizeof(int),
2652                 .mode           = 0644,
2653                 .proc_handler   = proc_dointvec,
2654         },
2655         {
2656                 .procname       = "error_cost",
2657                 .data           = &ip_rt_error_cost,
2658                 .maxlen         = sizeof(int),
2659                 .mode           = 0644,
2660                 .proc_handler   = proc_dointvec,
2661         },
2662         {
2663                 .procname       = "error_burst",
2664                 .data           = &ip_rt_error_burst,
2665                 .maxlen         = sizeof(int),
2666                 .mode           = 0644,
2667                 .proc_handler   = proc_dointvec,
2668         },
2669         {
2670                 .procname       = "gc_elasticity",
2671                 .data           = &ip_rt_gc_elasticity,
2672                 .maxlen         = sizeof(int),
2673                 .mode           = 0644,
2674                 .proc_handler   = proc_dointvec,
2675         },
2676         {
2677                 .procname       = "mtu_expires",
2678                 .data           = &ip_rt_mtu_expires,
2679                 .maxlen         = sizeof(int),
2680                 .mode           = 0644,
2681                 .proc_handler   = proc_dointvec_jiffies,
2682         },
2683         {
2684                 .procname       = "min_pmtu",
2685                 .data           = &ip_rt_min_pmtu,
2686                 .maxlen         = sizeof(int),
2687                 .mode           = 0644,
2688                 .proc_handler   = proc_dointvec,
2689         },
2690         {
2691                 .procname       = "min_adv_mss",
2692                 .data           = &ip_rt_min_advmss,
2693                 .maxlen         = sizeof(int),
2694                 .mode           = 0644,
2695                 .proc_handler   = proc_dointvec,
2696         },
2697         { }
2698 };
2699 
2700 static struct ctl_table ipv4_route_flush_table[] = {
2701         {
2702                 .procname       = "flush",
2703                 .maxlen         = sizeof(int),
2704                 .mode           = 0200,
2705                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2706         },
2707         { },
2708 };
2709 
2710 static __net_init int sysctl_route_net_init(struct net *net)
2711 {
2712         struct ctl_table *tbl;
2713 
2714         tbl = ipv4_route_flush_table;
2715         if (!net_eq(net, &init_net)) {
2716                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2717                 if (tbl == NULL)
2718                         goto err_dup;
2719 
2720                 /* Don't export sysctls to unprivileged users */
2721                 if (net->user_ns != &init_user_ns)
2722                         tbl[0].procname = NULL;
2723         }
2724         tbl[0].extra1 = net;
2725 
2726         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2727         if (net->ipv4.route_hdr == NULL)
2728                 goto err_reg;
2729         return 0;
2730 
2731 err_reg:
2732         if (tbl != ipv4_route_flush_table)
2733                 kfree(tbl);
2734 err_dup:
2735         return -ENOMEM;
2736 }
2737 
2738 static __net_exit void sysctl_route_net_exit(struct net *net)
2739 {
2740         struct ctl_table *tbl;
2741 
2742         tbl = net->ipv4.route_hdr->ctl_table_arg;
2743         unregister_net_sysctl_table(net->ipv4.route_hdr);
2744         BUG_ON(tbl == ipv4_route_flush_table);
2745         kfree(tbl);
2746 }
2747 
2748 static __net_initdata struct pernet_operations sysctl_route_ops = {
2749         .init = sysctl_route_net_init,
2750         .exit = sysctl_route_net_exit,
2751 };
2752 #endif
2753 
2754 static __net_init int rt_genid_init(struct net *net)
2755 {
2756         atomic_set(&net->ipv4.rt_genid, 0);
2757         atomic_set(&net->fnhe_genid, 0);
2758         get_random_bytes(&net->ipv4.dev_addr_genid,
2759                          sizeof(net->ipv4.dev_addr_genid));
2760         return 0;
2761 }
2762 
2763 static __net_initdata struct pernet_operations rt_genid_ops = {
2764         .init = rt_genid_init,
2765 };
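
/* Annotation: rt_genid is the lazy route-invalidation counter.  Each
 * cached rtable records rt_genid_ipv4(net) at creation (see e.g.
 * __mkroute_output() above), and rt_cache_flush() simply bumps the
 * counter, so every older entry fails its next validity check (via
 * rt_is_expired()) instead of being torn down eagerly.
 */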
2766 
2767 static int __net_init ipv4_inetpeer_init(struct net *net)
2768 {
2769         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2770 
2771         if (!bp)
2772                 return -ENOMEM;
2773         inet_peer_base_init(bp);
2774         net->ipv4.peers = bp;
2775         return 0;
2776 }
2777 
2778 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2779 {
2780         struct inet_peer_base *bp = net->ipv4.peers;
2781 
2782         net->ipv4.peers = NULL;
2783         inetpeer_invalidate_tree(bp);
2784         kfree(bp);
2785 }
2786 
2787 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2788         .init   =       ipv4_inetpeer_init,
2789         .exit   =       ipv4_inetpeer_exit,
2790 };
2791 
2792 #ifdef CONFIG_IP_ROUTE_CLASSID
2793 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2794 #endif /* CONFIG_IP_ROUTE_CLASSID */
2795 
2796 int __init ip_rt_init(void)
2797 {
2798         int rc = 0;
2799 
2800         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2801         if (!ip_idents)
2802                 panic("IP: failed to allocate ip_idents\n");
2803 
2804         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2805 
2806 #ifdef CONFIG_IP_ROUTE_CLASSID
2807         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2808         if (!ip_rt_acct)
2809                 panic("IP: failed to allocate ip_rt_acct\n");
2810 #endif
2811 
2812         ipv4_dst_ops.kmem_cachep =
2813                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2814                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2815 
2816         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2817 
2818         if (dst_entries_init(&ipv4_dst_ops) < 0)
2819                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2820 
2821         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2822                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2823 
2824         ipv4_dst_ops.gc_thresh = ~0;
2825         ip_rt_max_size = INT_MAX;
2826 
2827         devinet_init();
2828         ip_fib_init();
2829 
2830         if (ip_rt_proc_init())
2831                 pr_err("Unable to create route proc files\n");
2832 #ifdef CONFIG_XFRM
2833         xfrm_init();
2834         xfrm4_init();
2835 #endif
2836         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2837 
2838 #ifdef CONFIG_SYSCTL
2839         register_pernet_subsys(&sysctl_route_ops);
2840 #endif
2841         register_pernet_subsys(&rt_genid_ops);
2842         register_pernet_subsys(&ipv4_inetpeer_ops);
2843         return rc;
2844 }
2845 
2846 #ifdef CONFIG_SYSCTL
2847 /*
2848  * We really need to sanitize the damn ipv4 init order, then all
2849  * this nonsense will go away.
2850  */
2851 void __init ip_static_sysctl_init(void)
2852 {
2853         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2854 }
2855 #endif
2856 
