
TOMOYO Linux Cross Reference
Linux/net/ipv4/route.c


  1 /*
  2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
  3  *              operating system.  INET is implemented using the  BSD Socket
  4  *              interface as the means of communication with the user level.
  5  *
  6  *              ROUTE - implementation of the IP router.
  7  *
  8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
  9  *
 10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 15  *
 16  * Fixes:
 17  *              Alan Cox        :       Verify area fixes.
 18  *              Alan Cox        :       cli() protects routing changes
 19  *              Rui Oliveira    :       ICMP routing table updates
 20  *              (rco@di.uminho.pt)      Routing table insertion and update
 21  *              Linus Torvalds  :       Rewrote bits to be sensible
 22  *              Alan Cox        :       Added BSD route gw semantics
 23  *              Alan Cox        :       Super /proc >4K 
 24  *              Alan Cox        :       MTU in route table
 25  *              Alan Cox        :       MSS actually. Also added the window
 26  *                                      clamper.
 27  *              Sam Lantinga    :       Fixed route matching in rt_del()
 28  *              Alan Cox        :       Routing cache support.
 29  *              Alan Cox        :       Removed compatibility cruft.
 30  *              Alan Cox        :       RTF_REJECT support.
 31  *              Alan Cox        :       TCP irtt support.
 32  *              Jonathan Naylor :       Added Metric support.
 33  *      Miquel van Smoorenburg  :       BSD API fixes.
 34  *      Miquel van Smoorenburg  :       Metrics.
 35  *              Alan Cox        :       Use __u32 properly
 36  *              Alan Cox        :       Aligned routing errors more closely with BSD;
 37  *                                      our system is still very different.
 38  *              Alan Cox        :       Faster /proc handling
 39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 40  *                                      routing caches and better behaviour.
 41  *              
 42  *              Olaf Erb        :       irtt wasn't being copied right.
 43  *              Bjorn Ekwall    :       Kerneld route support.
 44  *              Alan Cox        :       Multicast fixed (I hope)
 45  *              Pavel Krauz     :       Limited broadcast fixed
 46  *              Mike McLagan    :       Routing by source
 47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 48  *                                      route.c and rewritten from scratch.
 49  *              Andi Kleen      :       Load-limit warning messages.
 50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 54  *              Marc Boucher    :       routing by fwmark
 55  *      Robert Olsson           :       Added rt_cache statistics
 56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 57  *
 58  *              This program is free software; you can redistribute it and/or
 59  *              modify it under the terms of the GNU General Public License
 60  *              as published by the Free Software Foundation; either version
 61  *              2 of the License, or (at your option) any later version.
 62  */
 63 
 64 #include <linux/config.h>
 65 #include <linux/module.h>
 66 #include <asm/uaccess.h>
 67 #include <asm/system.h>
 68 #include <asm/bitops.h>
 69 #include <linux/types.h>
 70 #include <linux/kernel.h>
 71 #include <linux/sched.h>
 72 #include <linux/mm.h>
 73 #include <linux/string.h>
 74 #include <linux/socket.h>
 75 #include <linux/sockios.h>
 76 #include <linux/errno.h>
 77 #include <linux/in.h>
 78 #include <linux/inet.h>
 79 #include <linux/netdevice.h>
 80 #include <linux/proc_fs.h>
 81 #include <linux/init.h>
 82 #include <linux/skbuff.h>
 83 #include <linux/rtnetlink.h>
 84 #include <linux/inetdevice.h>
 85 #include <linux/igmp.h>
 86 #include <linux/pkt_sched.h>
 87 #include <linux/mroute.h>
 88 #include <linux/netfilter_ipv4.h>
 89 #include <linux/random.h>
 90 #include <linux/jhash.h>
 91 #include <linux/rcupdate.h>
 92 #include <linux/times.h>
 93 #include <net/protocol.h>
 94 #include <net/ip.h>
 95 #include <net/route.h>
 96 #include <net/inetpeer.h>
 97 #include <net/sock.h>
 98 #include <net/ip_fib.h>
 99 #include <net/arp.h>
100 #include <net/tcp.h>
101 #include <net/icmp.h>
102 #include <net/xfrm.h>
103 #ifdef CONFIG_SYSCTL
104 #include <linux/sysctl.h>
105 #endif
106 
107 #define IP_MAX_MTU      0xFFF0
108 
109 #define RT_GC_TIMEOUT (300*HZ)
110 
111 int ip_rt_min_delay             = 2 * HZ;
112 int ip_rt_max_delay             = 10 * HZ;
113 int ip_rt_max_size;
114 int ip_rt_gc_timeout            = RT_GC_TIMEOUT;
115 int ip_rt_gc_interval           = 60 * HZ;
116 int ip_rt_gc_min_interval       = HZ / 2;
117 int ip_rt_redirect_number       = 9;
118 int ip_rt_redirect_load         = HZ / 50;
119 int ip_rt_redirect_silence      = ((HZ / 50) << (9 + 1));
120 int ip_rt_error_cost            = HZ;
121 int ip_rt_error_burst           = 5 * HZ;
122 int ip_rt_gc_elasticity         = 8;
123 int ip_rt_mtu_expires           = 10 * 60 * HZ;
124 int ip_rt_min_pmtu              = 512 + 20 + 20;
125 int ip_rt_min_advmss            = 256;
126 int ip_rt_secret_interval       = 10 * 60 * HZ;
127 static unsigned long rt_deadline;
128 
129 #define RTprint(a...)   printk(KERN_DEBUG a)
130 
131 static struct timer_list rt_flush_timer;
132 static struct timer_list rt_periodic_timer;
133 static struct timer_list rt_secret_timer;
134 
135 /*
136  *      Interface to generic destination cache.
137  */
138 
139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
140 static void              ipv4_dst_destroy(struct dst_entry *dst);
141 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
142 static void              ipv4_link_failure(struct sk_buff *skb);
143 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
144 static int rt_garbage_collect(void);
145 
146 
147 struct dst_ops ipv4_dst_ops = {
148         .family =               AF_INET,
149         .protocol =             __constant_htons(ETH_P_IP),
150         .gc =                   rt_garbage_collect,
151         .check =                ipv4_dst_check,
152         .destroy =              ipv4_dst_destroy,
153         .negative_advice =      ipv4_negative_advice,
154         .link_failure =         ipv4_link_failure,
155         .update_pmtu =          ip_rt_update_pmtu,
156         .entry_size =           sizeof(struct rtable),
157 };
158 
159 #define ECN_OR_COST(class)      TC_PRIO_##class
160 
161 __u8 ip_tos2prio[16] = {
162         TC_PRIO_BESTEFFORT,
163         ECN_OR_COST(FILLER),
164         TC_PRIO_BESTEFFORT,
165         ECN_OR_COST(BESTEFFORT),
166         TC_PRIO_BULK,
167         ECN_OR_COST(BULK),
168         TC_PRIO_BULK,
169         ECN_OR_COST(BULK),
170         TC_PRIO_INTERACTIVE,
171         ECN_OR_COST(INTERACTIVE),
172         TC_PRIO_INTERACTIVE,
173         ECN_OR_COST(INTERACTIVE),
174         TC_PRIO_INTERACTIVE_BULK,
175         ECN_OR_COST(INTERACTIVE_BULK),
176         TC_PRIO_INTERACTIVE_BULK,
177         ECN_OR_COST(INTERACTIVE_BULK)
178 };
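
/*
 * Worked example (illustrative, not from the original source): this
 * table is indexed by the TOS field shifted right one bit, as done by
 * the rt_tos2priority() helper in include/net/route.h, so an
 * IPTOS_LOWDELAY packet (tos 0x10) gives
 *
 *      ip_tos2prio[0x10 >> 1] == ip_tos2prio[8] == TC_PRIO_INTERACTIVE
 */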
179 
180 
181 /*
182  * Route cache.
183  */
184 
185 /* The locking scheme is rather straightforward:
186  *
187  * 1) Read-Copy Update protects the buckets of the central route hash.
188  * 2) Only writers remove entries, and they hold the lock
189  *    as they look at rtable reference counts.
190  * 3) Only readers acquire references to rtable entries;
191  *    they do so with atomic increments, without holding
192  *    the bucket lock (see the lookup sketch below).
193  */
194 
195 struct rt_hash_bucket {
196         struct rtable   *chain;
197         spinlock_t      lock;
198 } __attribute__((__aligned__(8)));
199 
200 static struct rt_hash_bucket    *rt_hash_table;
201 static unsigned                 rt_hash_mask;
202 static int                      rt_hash_log;
203 static unsigned int             rt_hash_rnd;
204 
205 struct rt_cache_stat *rt_cache_stat;
206 
207 static int rt_intern_hash(unsigned hash, struct rtable *rth,
208                                 struct rtable **res);
209 
210 static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
211 {
212         return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
213                 & rt_hash_mask);
214 }
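
/*
 * A minimal sketch of the reader side implied by the scheme above.
 * rt_lookup_sketch() is a hypothetical helper shown for illustration
 * only; the real lookup paths appear later in this file.  The point:
 * walk the chain under rcu_read_lock() and take a reference with an
 * atomic dst_hold() before leaving the RCU section.
 */
#if 0
static struct rtable *rt_lookup_sketch(u32 daddr, u32 saddr, u8 tos)
{
        unsigned hash = rt_hash_code(daddr, saddr, tos);
        struct rtable *rth;

        rcu_read_lock();
        for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
                smp_read_barrier_depends();
                if (rth->fl.fl4_dst == daddr &&
                    rth->fl.fl4_src == saddr &&
                    rth->fl.fl4_tos == tos) {
                        dst_hold(&rth->u.dst); /* atomic ref, no bucket lock */
                        break;
                }
        }
        rcu_read_unlock();
        return rth;
}
#endif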
215 
216 #ifdef CONFIG_PROC_FS
217 struct rt_cache_iter_state {
218         int bucket;
219 };
220 
221 static struct rtable *rt_cache_get_first(struct seq_file *seq)
222 {
223         struct rtable *r = NULL;
224         struct rt_cache_iter_state *st = seq->private;
225 
226         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
227                 rcu_read_lock();
228                 r = rt_hash_table[st->bucket].chain;
229                 if (r)
230                         break;
231                 rcu_read_unlock();
232         }
233         return r;
234 }
235 
236 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
237 {
238         struct rt_cache_iter_state *st = seq->private;
239 
240         smp_read_barrier_depends();
241         r = r->u.rt_next;
242         while (!r) {
243                 rcu_read_unlock();
244                 if (--st->bucket < 0)
245                         break;
246                 rcu_read_lock();
247                 r = rt_hash_table[st->bucket].chain;
248         }
249         return r;
250 }
251 
252 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
253 {
254         struct rtable *r = rt_cache_get_first(seq);
255 
256         if (r)
257                 while (pos && (r = rt_cache_get_next(seq, r)))
258                         --pos;
259         return pos ? NULL : r;
260 }
261 
262 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
263 {
264         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
265 }
266 
267 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
268 {
269         struct rtable *r = NULL;
270 
271         if (v == SEQ_START_TOKEN)
272                 r = rt_cache_get_first(seq);
273         else
274                 r = rt_cache_get_next(seq, v);
275         ++*pos;
276         return r;
277 }
278 
279 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
280 {
281         if (v && v != SEQ_START_TOKEN)
282                 rcu_read_unlock();
283 }
284 
285 static int rt_cache_seq_show(struct seq_file *seq, void *v)
286 {
287         if (v == SEQ_START_TOKEN)
288                 seq_printf(seq, "%-127s\n",
289                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
290                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
291                            "HHUptod\tSpecDst");
292         else {
293                 struct rtable *r = v;
294                 char temp[256];
295 
296                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
297                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
298                         r->u.dst.dev ? r->u.dst.dev->name : "*",
299                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
300                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
301                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
302                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
303                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
304                         dst_metric(&r->u.dst, RTAX_WINDOW),
305                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
306                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
307                         r->fl.fl4_tos,
308                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
309                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
310                                        dev_queue_xmit) : 0,
311                         r->rt_spec_dst);
312                 seq_printf(seq, "%-127s\n", temp);
313         }
314         return 0;
315 }
316 
317 static struct seq_operations rt_cache_seq_ops = {
318         .start  = rt_cache_seq_start,
319         .next   = rt_cache_seq_next,
320         .stop   = rt_cache_seq_stop,
321         .show   = rt_cache_seq_show,
322 };
323 
324 static int rt_cache_seq_open(struct inode *inode, struct file *file)
325 {
326         struct seq_file *seq;
327         int rc = -ENOMEM;
328         struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
329 
330         if (!s)
331                 goto out;
332         rc = seq_open(file, &rt_cache_seq_ops);
333         if (rc)
334                 goto out_kfree;
335         seq          = file->private_data;
336         seq->private = s;
337         memset(s, 0, sizeof(*s));
338 out:
339         return rc;
340 out_kfree:
341         kfree(s);
342         goto out;
343 }
344 
345 static struct file_operations rt_cache_seq_fops = {
346         .owner   = THIS_MODULE,
347         .open    = rt_cache_seq_open,
348         .read    = seq_read,
349         .llseek  = seq_lseek,
350         .release = seq_release_private,
351 };
352 
353 
354 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
355 {
356         int cpu;
357 
358         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
359                 if (!cpu_possible(cpu))
360                         continue;
361                 *pos = cpu;
362                 return per_cpu_ptr(rt_cache_stat, cpu);
363         }
364         return NULL;
365 }
366 
367 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
368 {
369         int cpu;
370 
371         for (cpu = *pos + 1; cpu < NR_CPUS; ++cpu) {
372                 if (!cpu_possible(cpu))
373                         continue;
374                 *pos = cpu;
375                 return per_cpu_ptr(rt_cache_stat, cpu);
376         }
377         return NULL;
378         
379 }
380 
381 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
382 {
383 
384 }
385 
386 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
387 {
388         struct rt_cache_stat *st = v;
389         
390         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
391                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
392                    atomic_read(&ipv4_dst_ops.entries),
393                    st->in_hit,
394                    st->in_slow_tot,
395                    st->in_slow_mc,
396                    st->in_no_route,
397                    st->in_brd,
398                    st->in_martian_dst,
399                    st->in_martian_src,
400 
401                    st->out_hit,
402                    st->out_slow_tot,
403                    st->out_slow_mc, 
404 
405                    st->gc_total,
406                    st->gc_ignored,
407                    st->gc_goal_miss,
408                    st->gc_dst_overflow,
409                    st->in_hlist_search,
410                    st->out_hlist_search
411                 );
412         return 0;
413 }
414 
415 static struct seq_operations rt_cpu_seq_ops = {
416         .start  = rt_cpu_seq_start,
417         .next   = rt_cpu_seq_next,
418         .stop   = rt_cpu_seq_stop,
419         .show   = rt_cpu_seq_show,
420 };
421 
422 
423 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
424 {
425         return seq_open(file, &rt_cpu_seq_ops);
426 }
427 
428 static struct file_operations rt_cpu_seq_fops = {
429         .owner   = THIS_MODULE,
430         .open    = rt_cpu_seq_open,
431         .read    = seq_read,
432         .llseek  = seq_lseek,
433         .release = seq_release_private,
434 };
435 
436 #endif /* CONFIG_PROC_FS */
437   
438 static __inline__ void rt_free(struct rtable *rt)
439 {
440         call_rcu(&rt->u.dst.rcu_head, (void (*)(void *))dst_free, &rt->u.dst);
441 }
442 
443 static __inline__ void rt_drop(struct rtable *rt)
444 {
445         ip_rt_put(rt);
446         call_rcu(&rt->u.dst.rcu_head, (void (*)(void *))dst_free, &rt->u.dst);
447 }
448 
449 static __inline__ int rt_fast_clean(struct rtable *rth)
450 {
451         /* Kill broadcast/multicast entries very aggressively if they
452            collide in the hash table with more useful entries */
453         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
454                 rth->fl.iif && rth->u.rt_next;
455 }
456 
457 static __inline__ int rt_valuable(struct rtable *rth)
458 {
459         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
460                 rth->u.dst.expires;
461 }
462 
463 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
464 {
465         unsigned long age;
466         int ret = 0;
467 
468         if (atomic_read(&rth->u.dst.__refcnt))
469                 goto out;
470 
471         ret = 1;
472         if (rth->u.dst.expires &&
473             time_after_eq(jiffies, rth->u.dst.expires))
474                 goto out;
475 
476         age = jiffies - rth->u.dst.lastuse;
477         ret = 0;
478         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
479             (age <= tmo2 && rt_valuable(rth)))
480                 goto out;
481         ret = 1;
482 out:    return ret;
483 }
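
/*
 * In short: a referenced entry (nonzero refcnt) never expires; an
 * unreferenced entry whose hard dst.expires stamp has passed always
 * may; otherwise it may expire once its age exceeds tmo1 (tmo2 for
 * "valuable" entries), and broadcast/multicast chain colliders
 * (rt_fast_clean) get no tmo1 grace at all.
 */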
484 
485 /* Bits of score are:
486  * 31: very valuable
487  * 30: not quite useless
488  * 29..0: usage counter
489  */
490 static inline u32 rt_score(struct rtable *rt)
491 {
492         u32 score = jiffies - rt->u.dst.lastuse;
493 
494         score = ~score & ~(3<<30);
495 
496         if (rt_valuable(rt))
497                 score |= (1<<31);
498 
499         if (!rt->fl.iif ||
500             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
501                 score |= (1<<30);
502 
503         return score;
504 }
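
/*
 * Worked example: an output-route entry (fl.iif == 0) last used 16
 * jiffies ago that is not "valuable" scores
 *
 *      (~16 & ~(3<<30)) | (1<<30) == 0x7FFFFFEF
 *
 * while a stale but valuable entry (age 1000 jiffies, say) scores
 *
 *      (~1000 & ~(3<<30)) | (1<<31) == 0xBFFFFC17
 *
 * so rt_intern_hash(), which evicts the minimum-score candidate,
 * drops the least recently used, non-valuable input routes first.
 */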
505 
506 /* This runs via a timer and thus is always in BH context. */
507 static void rt_check_expire(unsigned long dummy)
508 {
509         static int rover;
510         int i = rover, t;
511         struct rtable *rth, **rthp;
512         unsigned long now = jiffies;
513 
514         for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
515              t -= ip_rt_gc_timeout) {
516                 unsigned long tmo = ip_rt_gc_timeout;
517 
518                 i = (i + 1) & rt_hash_mask;
519                 rthp = &rt_hash_table[i].chain;
520 
521                 spin_lock(&rt_hash_table[i].lock);
522                 while ((rth = *rthp) != NULL) {
523                         if (rth->u.dst.expires) {
524                                 /* Entry is expired even if it is in use */
525                                 if (time_before_eq(now, rth->u.dst.expires)) {
526                                         tmo >>= 1;
527                                         rthp = &rth->u.rt_next;
528                                         continue;
529                                 }
530                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
531                                 tmo >>= 1;
532                                 rthp = &rth->u.rt_next;
533                                 continue;
534                         }
535 
536                         /* Cleanup aged off entries. */
537                         *rthp = rth->u.rt_next;
538                         rt_free(rth);
539                 }
540                 spin_unlock(&rt_hash_table[i].lock);
541 
542                 /* Fallback loop breaker. */
543                 if (time_after(jiffies, now))
544                         break;
545         }
546         rover = i;
547         mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
548 }
549 
550 /* This can run from both BH and non-BH contexts, the latter
551  * in the case of a forced flush event.
552  */
553 static void rt_run_flush(unsigned long dummy)
554 {
555         int i;
556         struct rtable *rth, *next;
557 
558         rt_deadline = 0;
559 
560         get_random_bytes(&rt_hash_rnd, 4);
561 
562         for (i = rt_hash_mask; i >= 0; i--) {
563                 spin_lock_bh(&rt_hash_table[i].lock);
564                 rth = rt_hash_table[i].chain;
565                 if (rth)
566                         rt_hash_table[i].chain = NULL;
567                 spin_unlock_bh(&rt_hash_table[i].lock);
568 
569                 for (; rth; rth = next) {
570                         next = rth->u.rt_next;
571                         rt_free(rth);
572                 }
573         }
574 }
575 
576 static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;
577 
578 void rt_cache_flush(int delay)
579 {
580         unsigned long now = jiffies;
581         int user_mode = !in_softirq();
582 
583         if (delay < 0)
584                 delay = ip_rt_min_delay;
585 
586         spin_lock_bh(&rt_flush_lock);
587 
588         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
589                 long tmo = (long)(rt_deadline - now);
590 
591                         /* If the flush timer is already running
592                            and the flush request is not immediate (delay > 0):
593 
594                            if the deadline has not been reached, extend the timer to "delay",
595                            otherwise fire it at the deadline.
596                          */
597 
598                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
599                         tmo = 0;
600                 
601                 if (delay > tmo)
602                         delay = tmo;
603         }
604 
605         if (delay <= 0) {
606                 spin_unlock_bh(&rt_flush_lock);
607                 rt_run_flush(0);
608                 return;
609         }
610 
611         if (rt_deadline == 0)
612                 rt_deadline = now + ip_rt_max_delay;
613 
614         mod_timer(&rt_flush_timer, now+delay);
615         spin_unlock_bh(&rt_flush_lock);
616 }
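
/*
 * Illustrative call patterns, given the defaults above:
 *
 *      rt_cache_flush(0);      flush synchronously, right now
 *      rt_cache_flush(-1);     flush after ip_rt_min_delay (2*HZ)
 *      rt_cache_flush(HZ);     flush within one second, but never later
 *                              than ip_rt_max_delay past the first
 *                              pending request
 */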
617 
618 static void rt_secret_rebuild(unsigned long dummy)
619 {
620         unsigned long now = jiffies;
621 
622         rt_cache_flush(0);
623         mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
624 }
625 
626 /*
627    Short description of GC goals.
628 
629    We want an algorithm that keeps the routing cache at an
630    equilibrium point, where the number of aged-off entries
631    approximately equals the number of newly generated ones.
632 
633    The current expiration strength is the variable "expire".
634    We try to adjust it dynamically: when the network is idle,
635    "expire" is large enough to keep plenty of warm entries, and
636    when load increases it shrinks to limit the cache size.
637  */
638 
639 static int rt_garbage_collect(void)
640 {
641         static unsigned long expire = RT_GC_TIMEOUT;
642         static unsigned long last_gc;
643         static int rover;
644         static int equilibrium;
645         struct rtable *rth, **rthp;
646         unsigned long now = jiffies;
647         int goal;
648 
649         /*
650          * Garbage collection is pretty expensive,
651          * so do not run it too frequently.
652          */
653 
654         RT_CACHE_STAT_INC(gc_total);
655 
656         if (now - last_gc < ip_rt_gc_min_interval &&
657             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
658                 RT_CACHE_STAT_INC(gc_ignored);
659                 goto out;
660         }
661 
662         /* Calculate the number of entries we want to expire now. */
663         goal = atomic_read(&ipv4_dst_ops.entries) -
664                 (ip_rt_gc_elasticity << rt_hash_log);
665         if (goal <= 0) {
666                 if (equilibrium < ipv4_dst_ops.gc_thresh)
667                         equilibrium = ipv4_dst_ops.gc_thresh;
668                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
669                 if (goal > 0) {
670                         equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
671                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
672                 }
673         } else {
674                 /* We are in a dangerous area. Try to reduce the cache
675                  * really aggressively.
676                  */
677                 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
678                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
679         }
680 
681         if (now - last_gc >= ip_rt_gc_min_interval)
682                 last_gc = now;
683 
684         if (goal <= 0) {
685                 equilibrium += goal;
686                 goto work_done;
687         }
688 
689         do {
690                 int i, k;
691 
692                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
693                         unsigned long tmo = expire;
694 
695                         k = (k + 1) & rt_hash_mask;
696                         rthp = &rt_hash_table[k].chain;
697                         spin_lock_bh(&rt_hash_table[k].lock);
698                         while ((rth = *rthp) != NULL) {
699                                 if (!rt_may_expire(rth, tmo, expire)) {
700                                         tmo >>= 1;
701                                         rthp = &rth->u.rt_next;
702                                         continue;
703                                 }
704                                 *rthp = rth->u.rt_next;
705                                 rt_free(rth);
706                                 goal--;
707                         }
708                         spin_unlock_bh(&rt_hash_table[k].lock);
709                         if (goal <= 0)
710                                 break;
711                 }
712                 rover = k;
713 
714                 if (goal <= 0)
715                         goto work_done;
716 
717                 /* The goal is not achieved. We stop the process if:
718 
719                    - "expire" has been reduced to zero (otherwise it is halved);
720                    - the table is not full;
721                    - we are called from interrupt context;
722                    - the jiffies check is just a fallback/debug loop breaker;
723                      we will not spin here for a long time in any case.
724                  */
725 
726                 RT_CACHE_STAT_INC(gc_goal_miss);
727 
728                 if (expire == 0)
729                         break;
730 
731                 expire >>= 1;
732 #if RT_CACHE_DEBUG >= 2
733                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
734                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
735 #endif
736 
737                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
738                         goto out;
739         } while (!in_softirq() && time_before_eq(jiffies, now));
740 
741         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
742                 goto out;
743         if (net_ratelimit())
744                 printk(KERN_WARNING "dst cache overflow\n");
745         RT_CACHE_STAT_INC(gc_dst_overflow);
746         return 1;
747 
748 work_done:
749         expire += ip_rt_gc_min_interval;
750         if (expire > ip_rt_gc_timeout ||
751             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
752                 expire = ip_rt_gc_timeout;
753 #if RT_CACHE_DEBUG >= 2
754         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
755                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
756 #endif
757 out:    return 0;
758 }
759 
760 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
761 {
762         return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
763                fl1->oif     == fl2->oif &&
764                fl1->iif     == fl2->iif;
765 }
766 
767 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
768 {
769         struct rtable   *rth, **rthp;
770         unsigned long   now;
771         struct rtable *cand, **candp;
772         u32             min_score;
773         int             chain_length;
774         int attempts = !in_softirq();
775 
776 restart:
777         chain_length = 0;
778         min_score = ~(u32)0;
779         cand = NULL;
780         candp = NULL;
781         now = jiffies;
782 
783         rthp = &rt_hash_table[hash].chain;
784 
785         spin_lock_bh(&rt_hash_table[hash].lock);
786         while ((rth = *rthp) != NULL) {
787                 if (compare_keys(&rth->fl, &rt->fl)) {
788                         /* Put it first */
789                         *rthp = rth->u.rt_next;
790                         /*
791                          * Since lookup is lockfree, the deletion
792                          * must be visible to another weakly ordered CPU before
793                          * the insertion at the start of the hash chain.
794                          */
795                         smp_wmb();
796                         rth->u.rt_next = rt_hash_table[hash].chain;
797                         /*
798                          * Since lookup is lockfree, the update writes
799                          * must be ordered for consistency on SMP.
800                          */
801                         smp_wmb();
802                         rt_hash_table[hash].chain = rth;
803 
804                         rth->u.dst.__use++;
805                         dst_hold(&rth->u.dst);
806                         rth->u.dst.lastuse = now;
807                         spin_unlock_bh(&rt_hash_table[hash].lock);
808 
809                         rt_drop(rt);
810                         *rp = rth;
811                         return 0;
812                 }
813 
814                 if (!atomic_read(&rth->u.dst.__refcnt)) {
815                         u32 score = rt_score(rth);
816 
817                         if (score <= min_score) {
818                                 cand = rth;
819                                 candp = rthp;
820                                 min_score = score;
821                         }
822                 }
823 
824                 chain_length++;
825 
826                 rthp = &rth->u.rt_next;
827         }
828 
829         if (cand) {
830                 /* ip_rt_gc_elasticity used to be the average chain length;
831                  * when exceeded, gc becomes really aggressive.
832                  *
833                  * The second limit is less certain. At the moment it allows
834                  * only 2 entries per bucket. We will see.
835                  */
836                 if (chain_length > ip_rt_gc_elasticity) {
837                         *candp = cand->u.rt_next;
838                         rt_free(cand);
839                 }
840         }
841 
842         /* Try to bind the route to an ARP neighbour only if it is an
843            output route or on the unicast forwarding path.
844          */
845         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
846                 int err = arp_bind_neighbour(&rt->u.dst);
847                 if (err) {
848                         spin_unlock_bh(&rt_hash_table[hash].lock);
849 
850                         if (err != -ENOBUFS) {
851                                 rt_drop(rt);
852                                 return err;
853                         }
854 
855                         /* Neighbour tables are full and nothing
856                            can be released. Try to shrink the route cache;
857                            most likely it holds some neighbour records.
858                          */
859                         if (attempts-- > 0) {
860                                 int saved_elasticity = ip_rt_gc_elasticity;
861                                 int saved_int = ip_rt_gc_min_interval;
862                                 ip_rt_gc_elasticity     = 1;
863                                 ip_rt_gc_min_interval   = 0;
864                                 rt_garbage_collect();
865                                 ip_rt_gc_min_interval   = saved_int;
866                                 ip_rt_gc_elasticity     = saved_elasticity;
867                                 goto restart;
868                         }
869 
870                         if (net_ratelimit())
871                                 printk(KERN_WARNING "Neighbour table overflow.\n");
872                         rt_drop(rt);
873                         return -ENOBUFS;
874                 }
875         }
876 
877         rt->u.rt_next = rt_hash_table[hash].chain;
878 #if RT_CACHE_DEBUG >= 2
879         if (rt->u.rt_next) {
880                 struct rtable *trt;
881                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
882                        NIPQUAD(rt->rt_dst));
883                 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
884                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
885                 printk("\n");
886         }
887 #endif
888         rt_hash_table[hash].chain = rt;
889         spin_unlock_bh(&rt_hash_table[hash].lock);
890         *rp = rt;
891         return 0;
892 }
893 
894 void rt_bind_peer(struct rtable *rt, int create)
895 {
896         static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED;
897         struct inet_peer *peer;
898 
899         peer = inet_getpeer(rt->rt_dst, create);
900 
901         spin_lock_bh(&rt_peer_lock);
902         if (rt->peer == NULL) {
903                 rt->peer = peer;
904                 peer = NULL;
905         }
906         spin_unlock_bh(&rt_peer_lock);
907         if (peer)
908                 inet_putpeer(peer);
909 }
910 
911 /*
912  * Peer allocation may fail only in serious out-of-memory conditions.  However,
913  * we can still generate some output.
914  * Random ID selection looks a bit dangerous because we have no chance of
915  * selecting an ID that is unique within a reasonable period of time.
916  * But a broken packet identifier may be better than no packet at all.
917  */
918 static void ip_select_fb_ident(struct iphdr *iph)
919 {
920         static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED;
921         static u32 ip_fallback_id;
922         u32 salt;
923 
924         spin_lock_bh(&ip_fb_id_lock);
925         salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
926         iph->id = htons(salt & 0xFFFF);
927         ip_fallback_id = salt;
928         spin_unlock_bh(&ip_fb_id_lock);
929 }
930 
931 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
932 {
933         struct rtable *rt = (struct rtable *) dst;
934 
935         if (rt) {
936                 if (rt->peer == NULL)
937                         rt_bind_peer(rt, 1);
938 
939                 /* Once a peer is attached to the destination, it is never
940                    detached, so we need not grab a lock to dereference it.
941                  */
942                 if (rt->peer) {
943                         iph->id = htons(inet_getid(rt->peer, more));
944                         return;
945                 }
946         } else
947                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));
948 
949         ip_select_fb_ident(iph);
950 }
951 
952 static void rt_del(unsigned hash, struct rtable *rt)
953 {
954         struct rtable **rthp;
955 
956         spin_lock_bh(&rt_hash_table[hash].lock);
957         ip_rt_put(rt);
958         for (rthp = &rt_hash_table[hash].chain; *rthp;
959              rthp = &(*rthp)->u.rt_next)
960                 if (*rthp == rt) {
961                         *rthp = rt->u.rt_next;
962                         rt_free(rt);
963                         break;
964                 }
965         spin_unlock_bh(&rt_hash_table[hash].lock);
966 }
967 
968 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
969                     u32 saddr, u8 tos, struct net_device *dev)
970 {
971         int i, k;
972         struct in_device *in_dev = in_dev_get(dev);
973         struct rtable *rth, **rthp;
974         u32  skeys[2] = { saddr, 0 };
975         int  ikeys[2] = { dev->ifindex, 0 };
976 
977         tos &= IPTOS_RT_MASK;
978 
979         if (!in_dev)
980                 return;
981 
982         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
983             || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
984                 goto reject_redirect;
985 
986         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
987                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
988                         goto reject_redirect;
989                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
990                         goto reject_redirect;
991         } else {
992                 if (inet_addr_type(new_gw) != RTN_UNICAST)
993                         goto reject_redirect;
994         }
995 
996         for (i = 0; i < 2; i++) {
997                 for (k = 0; k < 2; k++) {
998                         unsigned hash = rt_hash_code(daddr,
999                                                      skeys[i] ^ (ikeys[k] << 5),
1000                                                      tos);
1001 
1002                         rthp = &rt_hash_table[hash].chain;
1003 
1004                         rcu_read_lock();
1005                         while ((rth = *rthp) != NULL) {
1006                                 struct rtable *rt;
1007 
1008                                 smp_read_barrier_depends();
1009                                 if (rth->fl.fl4_dst != daddr ||
1010                                     rth->fl.fl4_src != skeys[i] ||
1011                                     rth->fl.fl4_tos != tos ||
1012                                     rth->fl.oif != ikeys[k] ||
1013                                     rth->fl.iif != 0) {
1014                                         rthp = &rth->u.rt_next;
1015                                         continue;
1016                                 }
1017 
1018                                 if (rth->rt_dst != daddr ||
1019                                     rth->rt_src != saddr ||
1020                                     rth->u.dst.error ||
1021                                     rth->rt_gateway != old_gw ||
1022                                     rth->u.dst.dev != dev)
1023                                         break;
1024 
1025                                 dst_hold(&rth->u.dst);
1026                                 rcu_read_unlock();
1027 
1028                                 rt = dst_alloc(&ipv4_dst_ops);
1029                                 if (rt == NULL) {
1030                                         ip_rt_put(rth);
1031                                         in_dev_put(in_dev);
1032                                         return;
1033                                 }
1034 
1035                                 /* Copy all the information. */
1036                                 *rt = *rth;
1037                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1038                                 rt->u.dst.__use         = 1;
1039                                 atomic_set(&rt->u.dst.__refcnt, 1);
1040                                 rt->u.dst.child         = NULL;
1041                                 if (rt->u.dst.dev)
1042                                         dev_hold(rt->u.dst.dev);
1043                                 rt->u.dst.obsolete      = 0;
1044                                 rt->u.dst.lastuse       = jiffies;
1045                                 rt->u.dst.path          = &rt->u.dst;
1046                                 rt->u.dst.neighbour     = NULL;
1047                                 rt->u.dst.hh            = NULL;
1048                                 rt->u.dst.xfrm          = NULL;
1049 
1050                                 rt->rt_flags            |= RTCF_REDIRECTED;
1051 
1052                                 /* Gateway is different ... */
1053                                 rt->rt_gateway          = new_gw;
1054 
1055                                 /* Redirect received -> path was valid */
1056                                 dst_confirm(&rth->u.dst);
1057 
1058                                 if (rt->peer)
1059                                         atomic_inc(&rt->peer->refcnt);
1060 
1061                                 if (arp_bind_neighbour(&rt->u.dst) ||
1062                                     !(rt->u.dst.neighbour->nud_state &
1063                                             NUD_VALID)) {
1064                                         if (rt->u.dst.neighbour)
1065                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1066                                         ip_rt_put(rth);
1067                                         rt_drop(rt);
1068                                         goto do_next;
1069                                 }
1070 
1071                                 rt_del(hash, rth);
1072                                 if (!rt_intern_hash(hash, rt, &rt))
1073                                         ip_rt_put(rt);
1074                                 goto do_next;
1075                         }
1076                         rcu_read_unlock();
1077                 do_next:
1078                         ;
1079                 }
1080         }
1081         in_dev_put(in_dev);
1082         return;
1083 
1084 reject_redirect:
1085 #ifdef CONFIG_IP_ROUTE_VERBOSE
1086         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1087                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1088                         "%u.%u.%u.%u ignored.\n"
1089                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
1090                         "tos %02x\n",
1091                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1092                        NIPQUAD(saddr), NIPQUAD(daddr), tos);
1093 #endif
1094         in_dev_put(in_dev);
1095 }
1096 
1097 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1098 {
1099         struct rtable *rt = (struct rtable*)dst;
1100         struct dst_entry *ret = dst;
1101 
1102         if (rt) {
1103                 if (dst->obsolete) {
1104                         ip_rt_put(rt);
1105                         ret = NULL;
1106                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1107                            rt->u.dst.expires) {
1108                         unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1109                                                      rt->fl.fl4_src ^
1110                                                         (rt->fl.oif << 5),
1111                                                      rt->fl.fl4_tos);
1112 #if RT_CACHE_DEBUG >= 1
1113                         printk(KERN_DEBUG "ip_rt_advice: redirect to "
1114                                           "%u.%u.%u.%u/%02x dropped\n",
1115                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1116 #endif
1117                         rt_del(hash, rt);
1118                         ret = NULL;
1119                 }
1120         }
1121         return ret;
1122 }
1123 
1124 /*
1125  * Algorithm:
1126  *      1. The first ip_rt_redirect_number redirects are sent
1127  *         with exponential backoff, then we stop sending them at all,
1128  *         assuming that the host ignores our redirects.
1129  *      2. If we did not see packets requiring redirects
1130  *         during ip_rt_redirect_silence, we assume that the host
1131  *         forgot the redirected route and start sending redirects again.
1132  *
1133  * This algorithm is much cheaper and more intelligent than dumb load limiting
1134  * in icmp.c.
1135  *
1136  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1137  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1138  */
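
/*
 * Worked numbers, assuming HZ == 1000 and the defaults above:
 * ip_rt_redirect_load == HZ/50 == 20 jiffies, so successive redirects
 * go out no sooner than 20ms, 40ms, 80ms, ... 20ms << 8 apart; after
 * ip_rt_redirect_number (9) of them we go silent, and resume only
 * after ip_rt_redirect_silence == (HZ/50) << 10 jiffies (about 20s)
 * without packets that require redirecting.
 */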
1139 
1140 void ip_rt_send_redirect(struct sk_buff *skb)
1141 {
1142         struct rtable *rt = (struct rtable*)skb->dst;
1143         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1144 
1145         if (!in_dev)
1146                 return;
1147 
1148         if (!IN_DEV_TX_REDIRECTS(in_dev))
1149                 goto out;
1150 
1151         /* No redirected packets during ip_rt_redirect_silence;
1152          * reset the algorithm.
1153          */
1154         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1155                 rt->u.dst.rate_tokens = 0;
1156 
1157         /* Too many ignored redirects; do not send anything.
1158          * Set u.dst.rate_last to the last seen redirected packet.
1159          */
1160         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1161                 rt->u.dst.rate_last = jiffies;
1162                 goto out;
1163         }
1164 
1165         /* Check for load limit; set rate_last to the latest sent
1166          * redirect.
1167          */
1168         if (time_after(jiffies,
1169                        (rt->u.dst.rate_last +
1170                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1171                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1172                 rt->u.dst.rate_last = jiffies;
1173                 ++rt->u.dst.rate_tokens;
1174 #ifdef CONFIG_IP_ROUTE_VERBOSE
1175                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1176                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1177                     net_ratelimit())
1178                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1179                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1180                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1181                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1182 #endif
1183         }
1184 out:
1185         in_dev_put(in_dev);
1186 }
1187 
1188 static int ip_error(struct sk_buff *skb)
1189 {
1190         struct rtable *rt = (struct rtable*)skb->dst;
1191         unsigned long now;
1192         int code;
1193 
1194         switch (rt->u.dst.error) {
1195                 case EINVAL:
1196                 default:
1197                         goto out;
1198                 case EHOSTUNREACH:
1199                         code = ICMP_HOST_UNREACH;
1200                         break;
1201                 case ENETUNREACH:
1202                         code = ICMP_NET_UNREACH;
1203                         break;
1204                 case EACCES:
1205                         code = ICMP_PKT_FILTERED;
1206                         break;
1207         }
1208 
1209         now = jiffies;
1210         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1211         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1212                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1213         rt->u.dst.rate_last = now;
1214         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1215                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1216                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1217         }
1218 
1219 out:    kfree_skb(skb);
1220         return 0;
1221 } 
1222 
1223 /*
1224  *      The last two values are not from the RFC but
1225  *      are needed for AMPRnet AX.25 paths.
1226  */
1227 
1228 static unsigned short mtu_plateau[] =
1229 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1230 
1231 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1232 {
1233         int i;
1234         
1235         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1236                 if (old_mtu > mtu_plateau[i])
1237                         return mtu_plateau[i];
1238         return 68;
1239 }
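
/*
 * Examples: guess_mtu(1500) returns 1492 (the first plateau strictly
 * below the old MTU), guess_mtu(576) returns 296, and anything at or
 * below the smallest plateau (128) falls back to 68, the minimum
 * IPv4 MTU.
 */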
1240 
1241 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1242 {
1243         int i;
1244         unsigned short old_mtu = ntohs(iph->tot_len);
1245         struct rtable *rth;
1246         u32  skeys[2] = { iph->saddr, 0, };
1247         u32  daddr = iph->daddr;
1248         u8   tos = iph->tos & IPTOS_RT_MASK;
1249         unsigned short est_mtu = 0;
1250 
1251         if (ipv4_config.no_pmtu_disc)
1252                 return 0;
1253 
1254         for (i = 0; i < 2; i++) {
1255                 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1256 
1257                 rcu_read_lock();
1258                 for (rth = rt_hash_table[hash].chain; rth;
1259                      rth = rth->u.rt_next) {
1260                         smp_read_barrier_depends();
1261                         if (rth->fl.fl4_dst == daddr &&
1262                             rth->fl.fl4_src == skeys[i] &&
1263                             rth->rt_dst  == daddr &&
1264                             rth->rt_src  == iph->saddr &&
1265                             rth->fl.fl4_tos == tos &&
1266                             rth->fl.iif == 0 &&
1267                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1268                                 unsigned short mtu = new_mtu;
1269 
1270                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1271 
1272                                         /* BSD 4.2 compatibility hack :-( */
1273                                         if (mtu == 0 &&
1274                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1275                                             old_mtu >= 68 + (iph->ihl << 2))
1276                                                 old_mtu -= iph->ihl << 2;
1277 
1278                                         mtu = guess_mtu(old_mtu);
1279                                 }
1280                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1281                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) { 
1282                                                 dst_confirm(&rth->u.dst);
1283                                                 if (mtu < ip_rt_min_pmtu) {
1284                                                         mtu = ip_rt_min_pmtu;
1285                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1286                                                                 (1 << RTAX_MTU);
1287                                                 }
1288                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1289                                                 dst_set_expires(&rth->u.dst,
1290                                                         ip_rt_mtu_expires);
1291                                         }
1292                                         est_mtu = mtu;
1293                                 }
1294                         }
1295                 }
1296                 rcu_read_unlock();
1297         }
1298         return est_mtu ? : new_mtu;
1299 }
1300 
1301 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1302 {
1303         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1304             !(dst_metric_locked(dst, RTAX_MTU))) {
1305                 if (mtu < ip_rt_min_pmtu) {
1306                         mtu = ip_rt_min_pmtu;
1307                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1308                 }
1309                 dst->metrics[RTAX_MTU-1] = mtu;
1310                 dst_set_expires(dst, ip_rt_mtu_expires);
1311         }
1312 }
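
/*
 * Example, with the defaults above: if a route's MTU metric is 1500
 * and an ICMP "fragmentation needed" reports 1400, the metric drops
 * to 1400 and expires after ip_rt_mtu_expires (10 minutes).  A bogus
 * report of 300 is clamped to ip_rt_min_pmtu (552) and RTAX_MTU is
 * locked against further updates.
 */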
1313 
1314 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1315 {
1316         dst_release(dst);
1317         return NULL;
1318 }
1319 
1320 static void ipv4_dst_destroy(struct dst_entry *dst)
1321 {
1322         struct rtable *rt = (struct rtable *) dst;
1323         struct inet_peer *peer = rt->peer;
1324 
1325         if (peer) {
1326                 rt->peer = NULL;
1327                 inet_putpeer(peer);
1328         }
1329 }
1330 
1331 static void ipv4_link_failure(struct sk_buff *skb)
1332 {
1333         struct rtable *rt;
1334 
1335         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1336 
1337         rt = (struct rtable *) skb->dst;
1338         if (rt)
1339                 dst_set_expires(&rt->u.dst, 0);
1340 }
1341 
1342 static int ip_rt_bug(struct sk_buff *skb)
1343 {
1344         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1345                 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1346                 skb->dev ? skb->dev->name : "?");
1347         kfree_skb(skb);
1348         return 0;
1349 }
1350 
1351 /*
1352    We do not cache the source address of the outgoing interface,
1353    because it is used only by the IP RR, TS and SRR options,
1354    so it is out of the fast path.
1355 
1356    BTW remember: "addr" is allowed to be unaligned
1357    in IP options!
1358  */
1359 
1360 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1361 {
1362         u32 src;
1363         struct fib_result res;
1364 
1365         if (rt->fl.iif == 0)
1366                 src = rt->rt_src;
1367         else if (fib_lookup(&rt->fl, &res) == 0) {
1368 #ifdef CONFIG_IP_ROUTE_NAT
1369                 if (res.type == RTN_NAT)
1370                         src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1371                                                 RT_SCOPE_UNIVERSE);
1372                 else
1373 #endif
1374                         src = FIB_RES_PREFSRC(res);
1375                 fib_res_put(&res);
1376         } else
1377                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1378                                         RT_SCOPE_UNIVERSE);
1379         memcpy(addr, &src, 4);
1380 }
1381 
1382 #ifdef CONFIG_NET_CLS_ROUTE
1383 static void set_class_tag(struct rtable *rt, u32 tag)
1384 {
1385         if (!(rt->u.dst.tclassid & 0xFFFF))
1386                 rt->u.dst.tclassid |= tag & 0xFFFF;
1387         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1388                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1389 }
1390 #endif
1391 
1392 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1393 {
1394         struct fib_info *fi = res->fi;
1395 
1396         if (fi) {
1397                 if (FIB_RES_GW(*res) &&
1398                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1399                         rt->rt_gateway = FIB_RES_GW(*res);
1400                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1401                        sizeof(rt->u.dst.metrics));
1402                 if (fi->fib_mtu == 0) {
1403                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1404                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1405                             rt->rt_gateway != rt->rt_dst &&
1406                             rt->u.dst.dev->mtu > 576)
1407                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1408                 }
1409 #ifdef CONFIG_NET_CLS_ROUTE
1410                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1411 #endif
1412         } else
1413                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1414 
1415         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1416                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1417         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1418                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1419         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1420                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1421                                        ip_rt_min_advmss);
1422         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1423                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1424 
1425 #ifdef CONFIG_NET_CLS_ROUTE
1426 #ifdef CONFIG_IP_MULTIPLE_TABLES
1427         set_class_tag(rt, fib_rules_tclass(res));
1428 #endif
1429         set_class_tag(rt, itag);
1430 #endif
1431         rt->rt_type = res->type;
1432 }
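
/* A worked example of the defaults chosen above, assuming the FIB entry
 * supplies no metrics and dev->mtu == 1500:
 *
 *      RTAX_MTU      = 1500                  (inherited from the device)
 *      RTAX_HOPLIMIT = sysctl_ip_default_ttl (usually 64)
 *      RTAX_ADVMSS   = max(1500 - 40, ip_rt_min_advmss) = 1460
 *
 * The 40 bytes leave room for the IPv4 and TCP headers; the 576-byte
 * clamp above applies only to gatewayed routes whose MTU metric is
 * locked.
 */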
1433 
1434 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1435                                 u8 tos, struct net_device *dev, int our)
1436 {
1437         unsigned hash;
1438         struct rtable *rth;
1439         u32 spec_dst;
1440         struct in_device *in_dev = in_dev_get(dev);
1441         u32 itag = 0;
1442 
1443         /* Primary sanity checks. */
1444 
1445         if (in_dev == NULL)
1446                 return -EINVAL;
1447 
1448         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1449             skb->protocol != htons(ETH_P_IP))
1450                 goto e_inval;
1451 
1452         if (ZERONET(saddr)) {
1453                 if (!LOCAL_MCAST(daddr))
1454                         goto e_inval;
1455                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1456         } else if (fib_validate_source(saddr, 0, tos, 0,
1457                                         dev, &spec_dst, &itag) < 0)
1458                 goto e_inval;
1459 
1460         rth = dst_alloc(&ipv4_dst_ops);
1461         if (!rth)
1462                 goto e_nobufs;
1463 
1464         rth->u.dst.output= ip_rt_bug;
1465 
1466         atomic_set(&rth->u.dst.__refcnt, 1);
1467         rth->u.dst.flags= DST_HOST;
1468         if (in_dev->cnf.no_policy)
1469                 rth->u.dst.flags |= DST_NOPOLICY;
1470         rth->fl.fl4_dst = daddr;
1471         rth->rt_dst     = daddr;
1472         rth->fl.fl4_tos = tos;
1473 #ifdef CONFIG_IP_ROUTE_FWMARK
1474         rth->fl.fl4_fwmark= skb->nfmark;
1475 #endif
1476         rth->fl.fl4_src = saddr;
1477         rth->rt_src     = saddr;
1478 #ifdef CONFIG_IP_ROUTE_NAT
1479         rth->rt_dst_map = daddr;
1480         rth->rt_src_map = saddr;
1481 #endif
1482 #ifdef CONFIG_NET_CLS_ROUTE
1483         rth->u.dst.tclassid = itag;
1484 #endif
1485         rth->rt_iif     =
1486         rth->fl.iif     = dev->ifindex;
1487         rth->u.dst.dev  = &loopback_dev;
1488         dev_hold(rth->u.dst.dev);
1489         rth->fl.oif     = 0;
1490         rth->rt_gateway = daddr;
1491         rth->rt_spec_dst= spec_dst;
1492         rth->rt_type    = RTN_MULTICAST;
1493         rth->rt_flags   = RTCF_MULTICAST;
1494         if (our) {
1495                 rth->u.dst.input= ip_local_deliver;
1496                 rth->rt_flags |= RTCF_LOCAL;
1497         }
1498 
1499 #ifdef CONFIG_IP_MROUTE
1500         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1501                 rth->u.dst.input = ip_mr_input;
1502 #endif
1503         RT_CACHE_STAT_INC(in_slow_mc);
1504 
1505         in_dev_put(in_dev);
1506         hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1507         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1508 
1509 e_nobufs:
1510         in_dev_put(in_dev);
1511         return -ENOBUFS;
1512 
1513 e_inval:
1514         in_dev_put(in_dev);
1515         return -EINVAL;
1516 }
1517 
1518 /*
1519  *      NOTE. We drop all packets that have a local source
1520  *      address, because every properly looped-back packet
1521  *      must have the correct destination already attached by the output routine.
1522  *
1523  *      This approach solves two big problems:
1524  *      1. Non-simplex devices are handled properly.
1525  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1526  */
1527 
1528 int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1529                         u8 tos, struct net_device *dev)
1530 {
1531         struct fib_result res;
1532         struct in_device *in_dev = in_dev_get(dev);
1533         struct in_device *out_dev = NULL;
1534         struct flowi fl = { .nl_u = { .ip4_u =
1535                                       { .daddr = daddr,
1536                                         .saddr = saddr,
1537                                         .tos = tos,
1538                                         .scope = RT_SCOPE_UNIVERSE,
1539 #ifdef CONFIG_IP_ROUTE_FWMARK
1540                                         .fwmark = skb->nfmark
1541 #endif
1542                                       } },
1543                             .iif = dev->ifindex };
1544         unsigned        flags = 0;
1545         u32             itag = 0;
1546         struct rtable * rth;
1547         unsigned        hash;
1548         u32             spec_dst;
1549         int             err = -EINVAL;
1550         int             free_res = 0;
1551 
1552         /* IP on this device is disabled. */
1553 
1554         if (!in_dev)
1555                 goto out;
1556 
1557         hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
1558 
1559         /* Check for the most weird martians, which cannot be detected
1560            by fib_lookup.
1561          */
1562 
1563         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1564                 goto martian_source;
1565 
1566         if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1567                 goto brd_input;
1568 
1569         /* Accept zero addresses only for limited broadcast;
1570          * I do not even know whether to fix this or not. Waiting for complaints :-)
1571          */
1572         if (ZERONET(saddr))
1573                 goto martian_source;
1574 
1575         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1576                 goto martian_destination;
1577 
1578         /*
1579          *      Now we are ready to route the packet.
1580          */
1581         if ((err = fib_lookup(&fl, &res)) != 0) {
1582                 if (!IN_DEV_FORWARD(in_dev))
1583                         goto e_inval;
1584                 goto no_route;
1585         }
1586         free_res = 1;
1587 
1588         RT_CACHE_STAT_INC(in_slow_tot);
1589 
1590 #ifdef CONFIG_IP_ROUTE_NAT
1591         /* Policy is applied before mapping the destination,
1592            but rerouting after the map must be done with the old source.
1593          */
1594 
1595         if (1) {
1596                 u32 src_map = saddr;
1597                 if (res.r)
1598                         src_map = fib_rules_policy(saddr, &res, &flags);
1599 
1600                 if (res.type == RTN_NAT) {
1601                         fl.fl4_dst = fib_rules_map_destination(daddr, &res);
1602                         fib_res_put(&res);
1603                         free_res = 0;
1604                         if (fib_lookup(&fl, &res))
1605                                 goto e_inval;
1606                         free_res = 1;
1607                         if (res.type != RTN_UNICAST)
1608                                 goto e_inval;
1609                         flags |= RTCF_DNAT;
1610                 }
1611                 fl.fl4_src = src_map;
1612         }
1613 #endif
1614 
1615         if (res.type == RTN_BROADCAST)
1616                 goto brd_input;
1617 
1618         if (res.type == RTN_LOCAL) {
1619                 int result;
1620                 result = fib_validate_source(saddr, daddr, tos,
1621                                              loopback_dev.ifindex,
1622                                              dev, &spec_dst, &itag);
1623                 if (result < 0)
1624                         goto martian_source;
1625                 if (result)
1626                         flags |= RTCF_DIRECTSRC;
1627                 spec_dst = daddr;
1628                 goto local_input;
1629         }
1630 
1631         if (!IN_DEV_FORWARD(in_dev))
1632                 goto e_inval;
1633         if (res.type != RTN_UNICAST)
1634                 goto martian_destination;
1635 
1636 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1637         if (res.fi->fib_nhs > 1 && fl.oif == 0)
1638                 fib_select_multipath(&fl, &res);
1639 #endif
1640         out_dev = in_dev_get(FIB_RES_DEV(res));
1641         if (out_dev == NULL) {
1642                 if (net_ratelimit())
1643                         printk(KERN_CRIT "Bug in ip_route_input_slow(). "
1644                                          "Please, report\n");
1645                 goto e_inval;
1646         }
1647 
1648         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev,
1649                                   &spec_dst, &itag);
1650         if (err < 0)
1651                 goto martian_source;
1652 
1653         if (err)
1654                 flags |= RTCF_DIRECTSRC;
1655 
1656         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1657             (IN_DEV_SHARED_MEDIA(out_dev) ||
1658              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
1659                 flags |= RTCF_DOREDIRECT;
1660 
1661         if (skb->protocol != htons(ETH_P_IP)) {
1662                 /* Not IP (i.e. ARP). Do not create a route if it is
1663                  * invalid for proxy ARP. DNAT routes are always valid.
1664                  */
1665                 if (out_dev == in_dev && !(flags & RTCF_DNAT))
1666                         goto e_inval;
1667         }
1668 
1669         rth = dst_alloc(&ipv4_dst_ops);
1670         if (!rth)
1671                 goto e_nobufs;
1672 
1673         atomic_set(&rth->u.dst.__refcnt, 1);
1674         rth->u.dst.flags= DST_HOST;
1675         if (in_dev->cnf.no_policy)
1676                 rth->u.dst.flags |= DST_NOPOLICY;
1677         if (in_dev->cnf.no_xfrm)
1678                 rth->u.dst.flags |= DST_NOXFRM;
1679         rth->fl.fl4_dst = daddr;
1680         rth->rt_dst     = daddr;
1681         rth->fl.fl4_tos = tos;
1682 #ifdef CONFIG_IP_ROUTE_FWMARK
1683         rth->fl.fl4_fwmark= skb->nfmark;
1684 #endif
1685         rth->fl.fl4_src = saddr;
1686         rth->rt_src     = saddr;
1687         rth->rt_gateway = daddr;
1688 #ifdef CONFIG_IP_ROUTE_NAT
1689         rth->rt_src_map = fl.fl4_src;
1690         rth->rt_dst_map = fl.fl4_dst;
1691         if (flags&RTCF_DNAT)
1692                 rth->rt_gateway = fl.fl4_dst;
1693 #endif
1694         rth->rt_iif     =
1695         rth->fl.iif     = dev->ifindex;
1696         rth->u.dst.dev  = out_dev->dev;
1697         dev_hold(rth->u.dst.dev);
1698         rth->fl.oif     = 0;
1699         rth->rt_spec_dst= spec_dst;
1700 
1701         rth->u.dst.input = ip_forward;
1702         rth->u.dst.output = ip_output;
1703 
1704         rt_set_nexthop(rth, &res, itag);
1705 
1706         rth->rt_flags = flags;
1707 
1708 #ifdef CONFIG_NET_FASTROUTE
1709         if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
1710                 struct net_device *odev = rth->u.dst.dev;
1711                 if (odev != dev &&
1712                     dev->accept_fastpath &&
1713                     odev->mtu >= dev->mtu &&
1714                     dev->accept_fastpath(dev, &rth->u.dst) == 0)
1715                         rth->rt_flags |= RTCF_FAST;
1716         }
1717 #endif
1718 
1719 intern:
1720         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1721 done:
1722         in_dev_put(in_dev);
1723         if (out_dev)
1724                 in_dev_put(out_dev);
1725         if (free_res)
1726                 fib_res_put(&res);
1727 out:    return err;
1728 
1729 brd_input:
1730         if (skb->protocol != htons(ETH_P_IP))
1731                 goto e_inval;
1732 
1733         if (ZERONET(saddr))
1734                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1735         else {
1736                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1737                                           &itag);
1738                 if (err < 0)
1739                         goto martian_source;
1740                 if (err)
1741                         flags |= RTCF_DIRECTSRC;
1742         }
1743         flags |= RTCF_BROADCAST;
1744         res.type = RTN_BROADCAST;
1745         RT_CACHE_STAT_INC(in_brd);
1746 
1747 local_input:
1748         rth = dst_alloc(&ipv4_dst_ops);
1749         if (!rth)
1750                 goto e_nobufs;
1751 
1752         rth->u.dst.output= ip_rt_bug;
1753 
1754         atomic_set(&rth->u.dst.__refcnt, 1);
1755         rth->u.dst.flags= DST_HOST;
1756         if (in_dev->cnf.no_policy)
1757                 rth->u.dst.flags |= DST_NOPOLICY;
1758         rth->fl.fl4_dst = daddr;
1759         rth->rt_dst     = daddr;
1760         rth->fl.fl4_tos = tos;
1761 #ifdef CONFIG_IP_ROUTE_FWMARK
1762         rth->fl.fl4_fwmark= skb->nfmark;
1763 #endif
1764         rth->fl.fl4_src = saddr;
1765         rth->rt_src     = saddr;
1766 #ifdef CONFIG_IP_ROUTE_NAT
1767         rth->rt_dst_map = fl.fl4_dst;
1768         rth->rt_src_map = fl.fl4_src;
1769 #endif
1770 #ifdef CONFIG_NET_CLS_ROUTE
1771         rth->u.dst.tclassid = itag;
1772 #endif
1773         rth->rt_iif     =
1774         rth->fl.iif     = dev->ifindex;
1775         rth->u.dst.dev  = &loopback_dev;
1776         dev_hold(rth->u.dst.dev);
1777         rth->rt_gateway = daddr;
1778         rth->rt_spec_dst= spec_dst;
1779         rth->u.dst.input= ip_local_deliver;
1780         rth->rt_flags   = flags|RTCF_LOCAL;
1781         if (res.type == RTN_UNREACHABLE) {
1782                 rth->u.dst.input= ip_error;
1783                 rth->u.dst.error= -err;
1784                 rth->rt_flags   &= ~RTCF_LOCAL;
1785         }
1786         rth->rt_type    = res.type;
1787         goto intern;
1788 
1789 no_route:
1790         RT_CACHE_STAT_INC(in_no_route);
1791         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1792         res.type = RTN_UNREACHABLE;
1793         goto local_input;
1794 
1795         /*
1796          *      Do not cache martian addresses: they should be logged (RFC1812)
1797          */
1798 martian_destination:
1799         RT_CACHE_STAT_INC(in_martian_dst);
1800 #ifdef CONFIG_IP_ROUTE_VERBOSE
1801         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1802                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1803                         "%u.%u.%u.%u, dev %s\n",
1804                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1805 #endif
1806 e_inval:
1807         err = -EINVAL;
1808         goto done;
1809 
1810 e_nobufs:
1811         err = -ENOBUFS;
1812         goto done;
1813 
1814 martian_source:
1815 
1816         RT_CACHE_STAT_INC(in_martian_src);
1817 #ifdef CONFIG_IP_ROUTE_VERBOSE
1818         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1819                 /*
1820                  *      RFC1812 recommendation: if the source is martian,
1821                  *      the only hint is the MAC header.
1822                  */
1823                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1824                         "%u.%u.%u.%u, on dev %s\n",
1825                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1826                 if (dev->hard_header_len) {
1827                         int i;
1828                         unsigned char *p = skb->mac.raw;
1829                         printk(KERN_WARNING "ll header: ");
1830                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1831                                 printk("%02x", *p);
1832                                 if (i < (dev->hard_header_len - 1))
1833                                         printk(":");
1834                         }
1835                         printk("\n");
1836                 }
1837         }
1838 #endif
1839         goto e_inval;
1840 }
1841 
1842 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
1843                    u8 tos, struct net_device *dev)
1844 {
1845         struct rtable * rth;
1846         unsigned        hash;
1847         int iif = dev->ifindex;
1848 
1849         tos &= IPTOS_RT_MASK;
1850         hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
1851 
1852         rcu_read_lock();
1853         for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
1854                 smp_read_barrier_depends();
1855                 if (rth->fl.fl4_dst == daddr &&
1856                     rth->fl.fl4_src == saddr &&
1857                     rth->fl.iif == iif &&
1858                     rth->fl.oif == 0 &&
1859 #ifdef CONFIG_IP_ROUTE_FWMARK
1860                     rth->fl.fl4_fwmark == skb->nfmark &&
1861 #endif
1862                     rth->fl.fl4_tos == tos) {
1863                         rth->u.dst.lastuse = jiffies;
1864                         dst_hold(&rth->u.dst);
1865                         rth->u.dst.__use++;
1866                         RT_CACHE_STAT_INC(in_hit);
1867                         rcu_read_unlock();
1868                         skb->dst = (struct dst_entry*)rth;
1869                         return 0;
1870                 }
1871                 RT_CACHE_STAT_INC(in_hlist_search);
1872         }
1873         rcu_read_unlock();
1874 
1875         /* Multicast recognition logic is moved from the route cache to here.
1876            The problem was that too many Ethernet cards have broken/missing
1877            hardware multicast filters :-( As a result, a host on a multicast
1878            network acquires a lot of useless route cache entries, e.g. for
1879            SDR messages from all over the world. Now we try to get rid of them.
1880            Really, provided the software IP multicast filter is organized
1881            reasonably (at least, hashed), it does not result in a slowdown
1882            compared with route cache reject entries.
1883            Note that multicast routers are not affected, because a
1884            route cache entry is created eventually.
1885          */
1886         if (MULTICAST(daddr)) {
1887                 struct in_device *in_dev;
1888 
1889                 read_lock(&inetdev_lock);
1890                 if ((in_dev = __in_dev_get(dev)) != NULL) {
1891                         int our = ip_check_mc(in_dev, daddr, saddr,
1892                                 skb->nh.iph->protocol);
1893                         if (our
1894 #ifdef CONFIG_IP_MROUTE
1895                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1896 #endif
1897                             ) {
1898                                 read_unlock(&inetdev_lock);
1899                                 return ip_route_input_mc(skb, daddr, saddr,
1900                                                          tos, dev, our);
1901                         }
1902                 }
1903                 read_unlock(&inetdev_lock);
1904                 return -EINVAL;
1905         }
1906         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1907 }
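
/* For illustration (a sketch, cf. ip_rcv_finish() in net/ipv4/ip_input.c),
 * the receive path resolves every incoming packet through the entry
 * point above before delivering or forwarding it:
 *
 *      if (!skb->dst &&
 *          ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))
 *              goto drop;
 *      return dst_input(skb);     (invokes ip_local_deliver or ip_forward)
 */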
1908 
1909 /*
1910  * Major route resolver routine.
1911  */
1912 
1913 int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
1914 {
1915         u32 tos = oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK);
1916         struct flowi fl = { .nl_u = { .ip4_u =
1917                                       { .daddr = oldflp->fl4_dst,
1918                                         .saddr = oldflp->fl4_src,
1919                                         .tos = tos & IPTOS_RT_MASK,
1920                                         .scope = ((tos & RTO_ONLINK) ?
1921                                                   RT_SCOPE_LINK :
1922                                                   RT_SCOPE_UNIVERSE),
1923 #ifdef CONFIG_IP_ROUTE_FWMARK
1924                                         .fwmark = oldflp->fl4_fwmark
1925 #endif
1926                                       } },
1927                             .iif = loopback_dev.ifindex,
1928                             .oif = oldflp->oif };
1929         struct fib_result res;
1930         unsigned flags = 0;
1931         struct rtable *rth;
1932         struct net_device *dev_out = NULL;
1933         struct in_device *in_dev = NULL;
1934         unsigned hash;
1935         int free_res = 0;
1936         int err;
1937 
1938         res.fi          = NULL;
1939 #ifdef CONFIG_IP_MULTIPLE_TABLES
1940         res.r           = NULL;
1941 #endif
1942 
1943         if (oldflp->fl4_src) {
1944                 err = -EINVAL;
1945                 if (MULTICAST(oldflp->fl4_src) ||
1946                     BADCLASS(oldflp->fl4_src) ||
1947                     ZERONET(oldflp->fl4_src))
1948                         goto out;
1949 
1950                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1951                 dev_out = ip_dev_find(oldflp->fl4_src);
1952                 if (dev_out == NULL)
1953                         goto out;
1954 
1955                 /* I removed the check for oif == dev_out->oif here.
1956                    It was wrong for two reasons:
1957                    1. ip_dev_find(saddr) can return the wrong iface, if saddr
1958                       is assigned to multiple interfaces.
1959                    2. Moreover, we are allowed to send packets with the saddr
1960                       of another iface. --ANK
1961                  */
1962 
1963                 if (oldflp->oif == 0
1964                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
1965                         /* Special hack: user can direct multicasts
1966                            and limited broadcast via the necessary interface
1967                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1968                            This hack is not just for fun, it allows
1969                            vic, vat and friends to work.
1970                            They bind the socket to loopback, set the ttl to zero
1971                            and expect that it will work.
1972                            From the viewpoint of the routing cache they are broken,
1973                            because we are not allowed to build a multicast path
1974                            with a loopback source addr (look: the routing cache
1975                            cannot know that the ttl is zero, so the packet
1976                            will not leave this host and the route is valid).
1977                            Luckily, this hack is a good workaround.
1978                          */
1979 
1980                         fl.oif = dev_out->ifindex;
1981                         goto make_route;
1982                 }
1983                 if (dev_out)
1984                         dev_put(dev_out);
1985                 dev_out = NULL;
1986         }
1987         if (oldflp->oif) {
1988                 dev_out = dev_get_by_index(oldflp->oif);
1989                 err = -ENODEV;
1990                 if (dev_out == NULL)
1991                         goto out;
1992                 if (__in_dev_get(dev_out) == NULL) {
1993                         dev_put(dev_out);
1994                         goto out;       /* Wrong error code */
1995                 }
1996 
1997                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
1998                         if (!fl.fl4_src)
1999                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2000                                                               RT_SCOPE_LINK);
2001                         goto make_route;
2002                 }
2003                 if (!fl.fl4_src) {
2004                         if (MULTICAST(oldflp->fl4_dst))
2005                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2006                                                               fl.fl4_scope);
2007                         else if (!oldflp->fl4_dst)
2008                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2009                                                               RT_SCOPE_HOST);
2010                 }
2011         }
2012 
2013         if (!fl.fl4_dst) {
2014                 fl.fl4_dst = fl.fl4_src;
2015                 if (!fl.fl4_dst)
2016                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2017                 if (dev_out)
2018                         dev_put(dev_out);
2019                 dev_out = &loopback_dev;
2020                 dev_hold(dev_out);
2021                 fl.oif = loopback_dev.ifindex;
2022                 res.type = RTN_LOCAL;
2023                 flags |= RTCF_LOCAL;
2024                 goto make_route;
2025         }
2026 
2027         if (fib_lookup(&fl, &res)) {
2028                 res.fi = NULL;
2029                 if (oldflp->oif) {
2030                         /* Apparently, the routing tables are wrong. Assume
2031                            that the destination is on-link.
2032 
2033                            WHY? DW.
2034                            Because we are allowed to send to an iface
2035                            even if it has NO routes and NO assigned
2036                            addresses. When oif is specified, the routing
2037                            tables are looked up with only one purpose:
2038                            to catch whether the destination is gatewayed,
2039                            rather than direct. Moreover, if MSG_DONTROUTE is
2040                            set, we send the packet, ignoring both the routing
2041                            tables and the ifaddr state. --ANK
2042 
2043 
2044                            We could do this even if oif is unknown,
2045                            likely as IPv6 does, but we do not.
2046                          */
2047 
2048                         if (fl.fl4_src == 0)
2049                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2050                                                               RT_SCOPE_LINK);
2051                         res.type = RTN_UNICAST;
2052                         goto make_route;
2053                 }
2054                 if (dev_out)
2055                         dev_put(dev_out);
2056                 err = -ENETUNREACH;
2057                 goto out;
2058         }
2059         free_res = 1;
2060 
2061         if (res.type == RTN_NAT)
2062                 goto e_inval;
2063 
2064         if (res.type == RTN_LOCAL) {
2065                 if (!fl.fl4_src)
2066                         fl.fl4_src = fl.fl4_dst;
2067                 if (dev_out)
2068                         dev_put(dev_out);
2069                 dev_out = &loopback_dev;
2070                 dev_hold(dev_out);
2071                 fl.oif = dev_out->ifindex;
2072                 if (res.fi)
2073                         fib_info_put(res.fi);
2074                 res.fi = NULL;
2075                 flags |= RTCF_LOCAL;
2076                 goto make_route;
2077         }
2078 
2079 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2080         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2081                 fib_select_multipath(&fl, &res);
2082         else
2083 #endif
2084         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2085                 fib_select_default(&fl, &res);
2086 
2087         if (!fl.fl4_src)
2088                 fl.fl4_src = FIB_RES_PREFSRC(res);
2089 
2090         if (dev_out)
2091                 dev_put(dev_out);
2092         dev_out = FIB_RES_DEV(res);
2093         dev_hold(dev_out);
2094         fl.oif = dev_out->ifindex;
2095 
2096 make_route:
2097         if (LOOPBACK(fl.fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2098                 goto e_inval;
2099 
2100         if (fl.fl4_dst == 0xFFFFFFFF)
2101                 res.type = RTN_BROADCAST;
2102         else if (MULTICAST(fl.fl4_dst))
2103                 res.type = RTN_MULTICAST;
2104         else if (BADCLASS(fl.fl4_dst) || ZERONET(fl.fl4_dst))
2105                 goto e_inval;
2106 
2107         if (dev_out->flags & IFF_LOOPBACK)
2108                 flags |= RTCF_LOCAL;
2109 
2110         in_dev = in_dev_get(dev_out);
2111         if (!in_dev)
2112                 goto e_inval;
2113 
2114         if (res.type == RTN_BROADCAST) {
2115                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2116                 if (res.fi) {
2117                         fib_info_put(res.fi);
2118                         res.fi = NULL;
2119                 }
2120         } else if (res.type == RTN_MULTICAST) {
2121                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2122                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, oldflp->proto))
2123                         flags &= ~RTCF_LOCAL;
2124                 /* If a multicast route does not exist, use the
2125                    default one, but do not gateway in this case.
2126                    Yes, it is a hack.
2127                  */
2128                 if (res.fi && res.prefixlen < 4) {
2129                         fib_info_put(res.fi);
2130                         res.fi = NULL;
2131                 }
2132         }
2133 
2134         rth = dst_alloc(&ipv4_dst_ops);
2135         if (!rth)
2136                 goto e_nobufs;
2137 
2138         atomic_set(&rth->u.dst.__refcnt, 1);
2139         rth->u.dst.flags= DST_HOST;
2140         if (in_dev->cnf.no_xfrm)
2141                 rth->u.dst.flags |= DST_NOXFRM;
2142         if (in_dev->cnf.no_policy)
2143                 rth->u.dst.flags |= DST_NOPOLICY;
2144         rth->fl.fl4_dst = oldflp->fl4_dst;
2145         rth->fl.fl4_tos = tos;
2146         rth->fl.fl4_src = oldflp->fl4_src;
2147         rth->fl.oif     = oldflp->oif;
2148 #ifdef CONFIG_IP_ROUTE_FWMARK
2149         rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2150 #endif
2151         rth->rt_dst     = fl.fl4_dst;
2152         rth->rt_src     = fl.fl4_src;
2153 #ifdef CONFIG_IP_ROUTE_NAT
2154         rth->rt_dst_map = fl.fl4_dst;
2155         rth->rt_src_map = fl.fl4_src;
2156 #endif
2157         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2158         rth->u.dst.dev  = dev_out;
2159         dev_hold(dev_out);
2160         rth->rt_gateway = fl.fl4_dst;
2161         rth->rt_spec_dst= fl.fl4_src;
2162 
2163         rth->u.dst.output=ip_output;
2164 
2165         RT_CACHE_STAT_INC(out_slow_tot);
2166 
2167         if (flags & RTCF_LOCAL) {
2168                 rth->u.dst.input = ip_local_deliver;
2169                 rth->rt_spec_dst = fl.fl4_dst;
2170         }
2171         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2172                 rth->rt_spec_dst = fl.fl4_src;
2173                 if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) {
2174                         rth->u.dst.output = ip_mc_output;
2175                         RT_CACHE_STAT_INC(out_slow_mc);
2176                 }
2177 #ifdef CONFIG_IP_MROUTE
2178                 if (res.type == RTN_MULTICAST) {
2179                         if (IN_DEV_MFORWARD(in_dev) &&
2180                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2181                                 rth->u.dst.input = ip_mr_input;
2182                                 rth->u.dst.output = ip_mc_output;
2183                         }
2184                 }
2185 #endif
2186         }
2187 
2188         rt_set_nexthop(rth, &res, 0);
2189         
2190 
2191         rth->rt_flags = flags;
2192 
2193         hash = rt_hash_code(oldflp->fl4_dst, oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2194         err = rt_intern_hash(hash, rth, rp);
2195 done:
2196         if (free_res)
2197                 fib_res_put(&res);
2198         if (dev_out)
2199                 dev_put(dev_out);
2200         if (in_dev)
2201                 in_dev_put(in_dev);
2202 out:    return err;
2203 
2204 e_inval:
2205         err = -EINVAL;
2206         goto done;
2207 e_nobufs:
2208         err = -ENOBUFS;
2209         goto done;
2210 }
2211 
2212 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2213 {
2214         unsigned hash;
2215         struct rtable *rth;
2216 
2217         hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2218 
2219         rcu_read_lock();
2220         for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
2221                 smp_read_barrier_depends();
2222                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2223                     rth->fl.fl4_src == flp->fl4_src &&
2224                     rth->fl.iif == 0 &&
2225                     rth->fl.oif == flp->oif &&
2226 #ifdef CONFIG_IP_ROUTE_FWMARK
2227                     rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2228 #endif
2229                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2230                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2231                         rth->u.dst.lastuse = jiffies;
2232                         dst_hold(&rth->u.dst);
2233                         rth->u.dst.__use++;
2234                         RT_CACHE_STAT_INC(out_hit);
2235                         rcu_read_unlock();
2236                         *rp = rth;
2237                         return 0;
2238                 }
2239                 RT_CACHE_STAT_INC(out_hlist_search);
2240         }
2241         rcu_read_unlock();
2242 
2243         return ip_route_output_slow(rp, flp);
2244 }
2245 
2246 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2247 {
2248         int err;
2249 
2250         if ((err = __ip_route_output_key(rp, flp)) != 0)
2251                 return err;
2252         return flp->proto ? xfrm_lookup((struct dst_entry**)rp, flp, NULL, 0) : 0;
2253 }
2254 
2255 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2256 {
2257         int err;
2258 
2259         if ((err = __ip_route_output_key(rp, flp)) != 0)
2260                 return err;
2261         return flp->proto ? xfrm_lookup((struct dst_entry**)rp, flp, sk, flags) : 0;
2262 }
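
/* A usage sketch (not a verbatim caller): protocols resolve an output
 * route by filling a flow key and calling the helper above, e.g. for a
 * connected socket sk with chosen daddr/saddr:
 *
 *      struct rtable *rt;
 *      struct flowi fl = { .oif = sk->sk_bound_dev_if,
 *                          .nl_u = { .ip4_u = { .daddr = daddr,
 *                                               .saddr = saddr,
 *                                               .tos = tos } },
 *                          .proto = sk->sk_protocol };
 *      int err = ip_route_output_flow(&rt, &fl, sk, 0);
 *      ...
 *      ip_rt_put(rt);          (drop the reference when done)
 *
 * Setting .proto also routes the result through xfrm_lookup(), as the
 * wrapper shows.
 */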
2263 
2264 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2265                         int nowait)
2266 {
2267         struct rtable *rt = (struct rtable*)skb->dst;
2268         struct rtmsg *r;
2269         struct nlmsghdr  *nlh;
2270         unsigned char    *b = skb->tail;
2271         struct rta_cacheinfo ci;
2272 #ifdef CONFIG_IP_MROUTE
2273         struct rtattr *eptr;
2274 #endif
2275         nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
2276         r = NLMSG_DATA(nlh);
2277         nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
2278         r->rtm_family    = AF_INET;
2279         r->rtm_dst_len  = 32;
2280         r->rtm_src_len  = 0;
2281         r->rtm_tos      = rt->fl.fl4_tos;
2282         r->rtm_table    = RT_TABLE_MAIN;
2283         r->rtm_type     = rt->rt_type;
2284         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2285         r->rtm_protocol = RTPROT_UNSPEC;
2286         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2287         if (rt->rt_flags & RTCF_NOTIFY)
2288                 r->rtm_flags |= RTM_F_NOTIFY;
2289         RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2290         if (rt->fl.fl4_src) {
2291                 r->rtm_src_len = 32;
2292                 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2293         }
2294         if (rt->u.dst.dev)
2295                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2296 #ifdef CONFIG_NET_CLS_ROUTE
2297         if (rt->u.dst.tclassid)
2298                 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2299 #endif
2300         if (rt->fl.iif)
2301                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2302         else if (rt->rt_src != rt->fl.fl4_src)
2303                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2304         if (rt->rt_dst != rt->rt_gateway)
2305                 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2306         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2307                 goto rtattr_failure;
2308         ci.rta_lastuse  = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2309         ci.rta_used     = rt->u.dst.__use;
2310         ci.rta_clntref  = atomic_read(&rt->u.dst.__refcnt);
2311         if (rt->u.dst.expires)
2312                 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2313         else
2314                 ci.rta_expires = 0;
2315         ci.rta_error    = rt->u.dst.error;
2316         ci.rta_id       = ci.rta_ts = ci.rta_tsage = 0;
2317         if (rt->peer) {
2318                 ci.rta_id = rt->peer->ip_id_count;
2319                 if (rt->peer->tcp_ts_stamp) {
2320                         ci.rta_ts = rt->peer->tcp_ts;
2321                         ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2322                 }
2323         }
2324 #ifdef CONFIG_IP_MROUTE
2325         eptr = (struct rtattr*)skb->tail;
2326 #endif
2327         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2328         if (rt->fl.iif) {
2329 #ifdef CONFIG_IP_MROUTE
2330                 u32 dst = rt->rt_dst;
2331 
2332                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2333                     ipv4_devconf.mc_forwarding) {
2334                         int err = ipmr_get_route(skb, r, nowait);
2335                         if (err <= 0) {
2336                                 if (!nowait) {
2337                                         if (err == 0)
2338                                                 return 0;
2339                                         goto nlmsg_failure;
2340                                 } else {
2341                                         if (err == -EMSGSIZE)
2342                                                 goto nlmsg_failure;
2343                                         ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2344                                 }
2345                         }
2346                 } else
2347 #endif
2348                         RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2349         }
2350 
2351         nlh->nlmsg_len = skb->tail - b;
2352         return skb->len;
2353 
2354 nlmsg_failure:
2355 rtattr_failure:
2356         skb_trim(skb, b - skb->data);
2357         return -1;
2358 }
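
/* Message layout produced by rt_fill_info() above: a struct rtmsg
 * header followed by a TLV list -- RTA_DST always, then optional
 * RTA_SRC, RTA_OIF, RTA_FLOW, RTA_PREFSRC and RTA_GATEWAY, the metrics,
 * an RTA_CACHEINFO carrying use counts and expiry, and RTA_IIF (or the
 * multicast resolution) for input routes.
 */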
2359 
2360 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2361 {
2362         struct rtattr **rta = arg;
2363         struct rtmsg *rtm = NLMSG_DATA(nlh);
2364         struct rtable *rt = NULL;
2365         u32 dst = 0;
2366         u32 src = 0;
2367         int iif = 0;
2368         int err = -ENOBUFS;
2369         struct sk_buff *skb;
2370 
2371         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2372         if (!skb)
2373                 goto out;
2374 
2375         /* Reserve room for dummy headers; this skb can pass
2376            through a good chunk of the routing engine.
2377          */
2378         skb->mac.raw = skb->data;
2379         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2380 
2381         if (rta[RTA_SRC - 1])
2382                 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2383         if (rta[RTA_DST - 1])
2384                 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2385         if (rta[RTA_IIF - 1])
2386                 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2387 
2388         if (iif) {
2389                 struct net_device *dev = __dev_get_by_index(iif);
2390                 err = -ENODEV;
2391                 if (!dev)
2392                         goto out_free;
2393                 skb->protocol   = htons(ETH_P_IP);
2394                 skb->dev        = dev;
2395                 local_bh_disable();
2396                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2397                 local_bh_enable();
2398                 rt = (struct rtable*)skb->dst;
2399                 if (!err && rt->u.dst.error)
2400                         err = -rt->u.dst.error;
2401         } else {
2402                 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2403                                                          .saddr = src,
2404                                                          .tos = rtm->rtm_tos } } };
2405                 int oif = 0;
2406                 if (rta[RTA_OIF - 1])
2407                         memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2408                 fl.oif = oif;
2409                 err = ip_route_output_key(&rt, &fl);
2410         }
2411         if (err)
2412                 goto out_free;
2413 
2414         skb->dst = &rt->u.dst;
2415         if (rtm->rtm_flags & RTM_F_NOTIFY)
2416                 rt->rt_flags |= RTCF_NOTIFY;
2417 
2418         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2419 
2420         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2421                                 RTM_NEWROUTE, 0);
2422         if (!err)
2423                 goto out_free;
2424         if (err < 0) {
2425                 err = -EMSGSIZE;
2426                 goto out_free;
2427         }
2428 
2429         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2430         if (err > 0)
2431                 err = 0;
2432 out:    return err;
2433 
2434 out_free:
2435         kfree_skb(skb);
2436         goto out;
2437 }
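
/* A quick way to exercise inet_rtm_getroute() above from userspace
 * (assuming the iproute2 tools are installed):
 *
 *      $ ip route get 10.0.0.1
 *
 * which sends an RTM_GETROUTE request and prints the RTM_NEWROUTE reply
 * assembled by rt_fill_info().
 */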
2438 
2439 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2440 {
2441         struct rtable *rt;
2442         int h, s_h;
2443         int idx, s_idx;
2444 
2445         s_h = cb->args[0];
2446         s_idx = idx = cb->args[1];
2447         for (h = 0; h <= rt_hash_mask; h++) {
2448                 if (h < s_h) continue;
2449                 if (h > s_h)
2450                         s_idx = 0;
2451                 rcu_read_lock();
2452                 for (rt = rt_hash_table[h].chain, idx = 0; rt;
2453                      rt = rt->u.rt_next, idx++) {
2454                         smp_read_barrier_depends();
2455                         if (idx < s_idx)
2456                                 continue;
2457                         skb->dst = dst_clone(&rt->u.dst);
2458                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2459                                          cb->nlh->nlmsg_seq,
2460                                          RTM_NEWROUTE, 1) <= 0) {
2461                                 dst_release(xchg(&skb->dst, NULL));
2462                                 rcu_read_unlock();
2463                                 goto done;
2464                         }
2465                         dst_release(xchg(&skb->dst, NULL));
2466                 }
2467                 rcu_read_unlock();
2468         }
2469 
2470 done:
2471         cb->args[0] = h;
2472         cb->args[1] = idx;
2473         return skb->len;
2474 }
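
/* Netlink dumps are delivered in chunks, so ip_rt_dump() above records
 * its position in cb->args[]: args[0] is the hash bucket and args[1]
 * the index within its chain.  The next invocation skips everything
 * below (s_h, s_idx) and resumes where the previous chunk stopped.
 */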
2475 
2476 void ip_rt_multicast_event(struct in_device *in_dev)
2477 {
2478         rt_cache_flush(0);
2479 }
2480 
2481 #ifdef CONFIG_SYSCTL
2482 static int flush_delay;
2483 
2484 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2485                                         struct file *filp, void *buffer,
2486                                         size_t *lenp)
2487 {
2488         if (write) {
2489                 proc_dointvec(ctl, write, filp, buffer, lenp);
2490                 rt_cache_flush(flush_delay);
2491                 return 0;
2492         } 
2493 
2494         return -EINVAL;
2495 }
2496 
2497 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, int *name,
2498                                                 int nlen, void *oldval,
2499                                                 size_t *oldlenp, void *newval,
2500                                                 size_t newlen, void **context)
2501 {
2502         int delay;
2503         if (newlen != sizeof(int))
2504                 return -EINVAL;
2505         if (get_user(delay, (int *)newval))
2506                 return -EFAULT; 
2507         rt_cache_flush(delay); 
2508         return 0;
2509 }
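
/* Example (assuming procfs is mounted): writing to the "flush" entry
 * defined below invokes rt_cache_flush() with the written delay, so
 *
 *      $ echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * flushes the routing cache immediately.
 */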
2510 
2511 ctl_table ipv4_route_table[] = {
2512         {
2513                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2514                 .procname       = "flush",
2515                 .data           = &flush_delay,
2516                 .maxlen         = sizeof(int),
2517                 .mode           = 0644,
2518                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2519                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2520         },
2521         {
2522                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2523                 .procname       = "min_delay",
2524                 .data           = &ip_rt_min_delay,
2525                 .maxlen         = sizeof(int),
2526                 .mode           = 0644,
2527                 .proc_handler   = &proc_dointvec_jiffies,
2528                 .strategy       = &sysctl_jiffies,
2529         },
2530         {
2531                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2532                 .procname       = "max_delay",
2533                 .data           = &ip_rt_max_delay,
2534                 .maxlen         = sizeof(int),
2535                 .mode           = 0644,
2536                 .proc_handler   = &proc_dointvec_jiffies,
2537                 .strategy       = &sysctl_jiffies,
2538         },
2539         {
2540                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2541                 .procname       = "gc_thresh",
2542                 .data           = &ipv4_dst_ops.gc_thresh,
2543                 .maxlen         = sizeof(int),
2544                 .mode           = 0644,
2545                 .proc_handler   = &proc_dointvec,
2546         },
2547         {
2548                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2549                 .procname       = "max_size",
2550                 .data           = &ip_rt_max_size,
2551                 .maxlen         = sizeof(int),
2552                 .mode           = 0644,
2553                 .proc_handler   = &proc_dointvec,
2554         },
2555         {
2556                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2557                 .procname       = "gc_min_interval",
2558                 .data           = &ip_rt_gc_min_interval,
2559                 .maxlen         = sizeof(int),
2560                 .mode           = 0644,
2561                 .proc_handler   = &proc_dointvec_jiffies,
2562                 .strategy       = &sysctl_jiffies,
2563         },
2564         {
2565                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2566                 .procname       = "gc_timeout",
2567                 .data           = &ip_rt_gc_timeout,
2568                 .maxlen         = sizeof(int),
2569                 .mode           = 0644,
2570                 .proc_handler   = &proc_dointvec_jiffies,
2571                 .strategy       = &sysctl_jiffies,
2572         },
2573         {
2574                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2575                 .procname       = "gc_interval",
2576                 .data           = &ip_rt_gc_interval,
2577                 .maxlen         = sizeof(int),
2578                 .mode           = 0644,
2579                 .proc_handler   = &proc_dointvec_jiffies,
2580                 .strategy       = &sysctl_jiffies,
2581         },
2582         {
2583                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2584                 .procname       = "redirect_load",
2585                 .data           = &ip_rt_redirect_load,
2586                 .maxlen         = sizeof(int),
2587                 .mode           = 0644,
2588                 .proc_handler   = &proc_dointvec,
2589         },
2590         {
2591                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2592                 .procname       = "redirect_number",
2593                 .data           = &ip_rt_redirect_number,
2594                 .maxlen         = sizeof(int),
2595                 .mode           = 0644,
2596                 .proc_handler   = &proc_dointvec,
2597         },
2598         {
2599                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2600                 .procname       = "redirect_silence",
2601                 .data           = &ip_rt_redirect_silence,
2602                 .maxlen         = sizeof(int),
2603                 .mode           = 0644,
2604                 .proc_handler   = &proc_dointvec,
2605         },
2606         {
2607                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2608                 .procname       = "error_cost",
2609                 .data           = &ip_rt_error_cost,
2610                 .maxlen         = sizeof(int),
2611                 .mode           = 0644,
2612                 .proc_handler   = &proc_dointvec,
2613         },
2614         {
2615                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2616                 .procname       = "error_burst",
2617                 .data           = &ip_rt_error_burst,
2618                 .maxlen         = sizeof(int),
2619                 .mode           = 0644,
2620                 .proc_handler   = &proc_dointvec,
2621         },
2622         {
2623                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2624                 .procname       = "gc_elasticity",
2625                 .data           = &ip_rt_gc_elasticity,
2626                 .maxlen         = sizeof(int),
2627                 .mode           = 0644,
2628                 .proc_handler   = &proc_dointvec,
2629         },
2630         {
2631                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2632                 .procname       = "mtu_expires",
2633                 .data           = &ip_rt_mtu_expires,
2634                 .maxlen         = sizeof(int),
2635                 .mode           = 0644,
2636                 .proc_handler   = &proc_dointvec_jiffies,
2637                 .strategy       = &sysctl_jiffies,
2638         },
2639         {
2640                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2641                 .procname       = "min_pmtu",
2642                 .data           = &ip_rt_min_pmtu,
2643                 .maxlen         = sizeof(int),
2644                 .mode           = 0644,
2645                 .proc_handler   = &proc_dointvec,
2646         },
2647         {
2648                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2649                 .procname       = "min_adv_mss",
2650                 .data           = &ip_rt_min_advmss,
2651                 .maxlen         = sizeof(int),
2652                 .mode           = 0644,
2653                 .proc_handler   = &proc_dointvec,
2654         },
2655         {
2656                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2657                 .procname       = "secret_interval",
2658                 .data           = &ip_rt_secret_interval,
2659                 .maxlen         = sizeof(int),
2660                 .mode           = 0644,
2661                 .proc_handler   = &proc_dointvec_jiffies,
2662                 .strategy       = &sysctl_jiffies,
2663         },
2664         { .ctl_name = 0 }
2665 };
2666 #endif
2667 
2668 #ifdef CONFIG_NET_CLS_ROUTE
2669 struct ip_rt_acct *ip_rt_acct;
2670 
2671 /* This code sucks.  But you should have seen it before! --RR */
2672 
2673 /* IP route accounting ptr for this logical cpu number. */
2674 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
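
/* The accounting area is NR_CPUS consecutive slices of 256 entries
 * (one struct ip_rt_acct per routing realm), so the macro above selects
 * cpu i's private slice by plain pointer arithmetic; readers such as
 * ip_rt_acct_read() below sum the per-cpu slices word by word.
 */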
2675 
2676 #ifdef CONFIG_PROC_FS
2677 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2678                            int length, int *eof, void *data)
2679 {
2680         unsigned int i;
2681 
2682         if ((offset & 3) || (length & 3))
2683                 return -EIO;
2684 
2685         if (offset >= sizeof(struct ip_rt_acct) * 256) {
2686                 *eof = 1;
2687                 return 0;
2688         }
2689 
2690         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2691                 length = sizeof(struct ip_rt_acct) * 256 - offset;
2692                 *eof = 1;
2693         }
2694 
2695         offset /= sizeof(u32);
2696 
2697         if (length > 0) {
2698                 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
2699                 u32 *dst = (u32 *) buffer;
2700 
2701                 /* Copy first cpu. */
2702                 *start = buffer;
2703                 memcpy(dst, src, length);
2704 
2705                 /* Add the other cpus in, one int at a time */
2706                 for (i = 1; i < NR_CPUS; i++) {
2707                         unsigned int j;
2708 
2709                         if (!cpu_online(i))
2710                                 continue;
2711 
2712                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2713 
2714                         for (j = 0; j < length/4; j++)
2715                                 dst[j] += src[j];
2716                 }
2717         }
2718         return length;
2719 }
2720 #endif /* CONFIG_PROC_FS */
2721 #endif /* CONFIG_NET_CLS_ROUTE */
2722 
2723 int __init ip_rt_init(void)
2724 {
2725         int i, order, goal, rc = 0;
2726 
2727         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2728                              (jiffies ^ (jiffies >> 7)));
2729 
2730 #ifdef CONFIG_NET_CLS_ROUTE
2731         for (order = 0;
2732              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
2733                 /* NOTHING */;
2734         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2735         if (!ip_rt_acct)
2736                 panic("IP: failed to allocate ip_rt_acct\n");
2737         memset(ip_rt_acct, 0, PAGE_SIZE << order);
2738 #endif
2739 
2740         ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
2741                                                      sizeof(struct rtable),
2742                                                      0, SLAB_HWCACHE_ALIGN,
2743                                                      NULL, NULL);
2744 
2745         if (!ipv4_dst_ops.kmem_cachep)
2746                 panic("IP: failed to allocate ip_dst_cache\n");
2747 
2748         goal = num_physpages >> (26 - PAGE_SHIFT);
2749 
2750         for (order = 0; (1UL << order) < goal; order++)
2751                 /* NOTHING */;
2752 
2753         do {
2754                 rt_hash_mask = (1UL << order) * PAGE_SIZE /
2755                         sizeof(struct rt_hash_bucket);
2756                 while (rt_hash_mask & (rt_hash_mask - 1))
2757                         rt_hash_mask--;
2758                 rt_hash_table = (struct rt_hash_bucket *)
2759                         __get_free_pages(GFP_ATOMIC, order);
2760         } while (rt_hash_table == NULL && --order > 0);
2761 
2762         if (!rt_hash_table)
2763                 panic("Failed to allocate IP route cache hash table\n");
2764 
2765         printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
2766                rt_hash_mask,
2767                (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
2768 
2769         for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
2770                 /* NOTHING */;
2771 
2772         rt_hash_mask--;
2773         for (i = 0; i <= rt_hash_mask; i++) {
2774                 rt_hash_table[i].lock = SPIN_LOCK_UNLOCKED;
2775                 rt_hash_table[i].chain = NULL;
2776         }
2777 
2778         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2779         ip_rt_max_size = (rt_hash_mask + 1) * 16;
2780 
2781         rt_cache_stat = alloc_percpu(struct rt_cache_stat);
2782         if (!rt_cache_stat)
2783                 return -ENOMEM;
2784 
2785         devinet_init();
2786         ip_fib_init();
2787 
2788         init_timer(&rt_flush_timer);
2789         rt_flush_timer.function = rt_run_flush;
2790         init_timer(&rt_periodic_timer);
2791         rt_periodic_timer.function = rt_check_expire;
2792         init_timer(&rt_secret_timer);
2793         rt_secret_timer.function = rt_secret_rebuild;
2794 
2795         /* All the timers started at system startup tend
2796            to synchronize. Perturb them a bit.
2797          */
2798         rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
2799                                         ip_rt_gc_interval;
2800         add_timer(&rt_periodic_timer);
2801 
2802         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
2803                 ip_rt_secret_interval;
2804         add_timer(&rt_secret_timer);
2805 
2806 #ifdef CONFIG_PROC_FS
2807         if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
2808             !proc_net_fops_create("rt_cache_stat", S_IRUGO, &rt_cpu_seq_fops)) {
2809                 free_percpu(rt_cache_stat);
2810                 return -ENOMEM;
2811         }
2812 
2813 #ifdef CONFIG_NET_CLS_ROUTE
2814         create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
2815 #endif
2816 #endif
2817 #ifdef CONFIG_XFRM
2818         xfrm_init();
2819         xfrm4_init();
2820 #endif
2821         return rc;
2822 }
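
/* A sizing sketch for ip_rt_init() above (a worked example, assuming
 * 256MB of RAM and 4KB pages): goal = num_physpages >> (26 - PAGE_SHIFT)
 * allocates roughly one page of hash buckets per 64MB of memory, i.e.
 * 65536 >> 14 = 4 pages, so order = 2.  The bucket count is then rounded
 * down to a power of two, and gc_thresh and ip_rt_max_size scale with it
 * (one bucket and 16 cache entries per bucket, respectively).
 */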
2823 
2824 EXPORT_SYMBOL(__ip_select_ident);
2825 EXPORT_SYMBOL(ip_route_input);
2826 EXPORT_SYMBOL(ip_route_output_key);
2827 
