TOMOYO Linux Cross Reference
Linux/net/ipv4/fib_semantics.c

  1 /*
  2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
  3  *              operating system.  INET is implemented using the  BSD Socket
  4  *              interface as the means of communication with the user level.
  5  *
  6  *              IPv4 Forwarding Information Base: semantics.
  7  *
  8  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  9  *
 10  *              This program is free software; you can redistribute it and/or
 11  *              modify it under the terms of the GNU General Public License
 12  *              as published by the Free Software Foundation; either version
 13  *              2 of the License, or (at your option) any later version.
 14  */
 15 
 16 #include <linux/uaccess.h>
 17 #include <linux/bitops.h>
 18 #include <linux/types.h>
 19 #include <linux/kernel.h>
 20 #include <linux/jiffies.h>
 21 #include <linux/mm.h>
 22 #include <linux/string.h>
 23 #include <linux/socket.h>
 24 #include <linux/sockios.h>
 25 #include <linux/errno.h>
 26 #include <linux/in.h>
 27 #include <linux/inet.h>
 28 #include <linux/inetdevice.h>
 29 #include <linux/netdevice.h>
 30 #include <linux/if_arp.h>
 31 #include <linux/proc_fs.h>
 32 #include <linux/skbuff.h>
 33 #include <linux/init.h>
 34 #include <linux/slab.h>
 35 
 36 #include <net/arp.h>
 37 #include <net/ip.h>
 38 #include <net/protocol.h>
 39 #include <net/route.h>
 40 #include <net/tcp.h>
 41 #include <net/sock.h>
 42 #include <net/ip_fib.h>
 43 #include <net/netlink.h>
 44 #include <net/nexthop.h>
 45 #include <net/lwtunnel.h>
 46 
 47 #include "fib_lookup.h"
 48 
 49 static DEFINE_SPINLOCK(fib_info_lock);
 50 static struct hlist_head *fib_info_hash;
 51 static struct hlist_head *fib_info_laddrhash;
 52 static unsigned int fib_info_hash_size;
 53 static unsigned int fib_info_cnt;
 54 
 55 #define DEVINDEX_HASHBITS 8
 56 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
 57 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
 58 
 59 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 60 u32 fib_multipath_secret __read_mostly;
 61 
 62 #define for_nexthops(fi) {                                              \
 63         int nhsel; const struct fib_nh *nh;                             \
 64         for (nhsel = 0, nh = (fi)->fib_nh;                              \
 65              nhsel < (fi)->fib_nhs;                                     \
 66              nh++, nhsel++)
 67 
 68 #define change_nexthops(fi) {                                           \
 69         int nhsel; struct fib_nh *nexthop_nh;                           \
 70         for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh);   \
 71              nhsel < (fi)->fib_nhs;                                     \
 72              nexthop_nh++, nhsel++)
 73 
 74 #else /* CONFIG_IP_ROUTE_MULTIPATH */
 75 
 76 /* Hope that gcc will optimize this to get rid of the dummy loop */
 77 
 78 #define for_nexthops(fi) {                                              \
 79         int nhsel; const struct fib_nh *nh = (fi)->fib_nh;              \
 80         for (nhsel = 0; nhsel < 1; nhsel++)
 81 
 82 #define change_nexthops(fi) {                                           \
 83         int nhsel;                                                      \
 84         struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh);    \
 85         for (nhsel = 0; nhsel < 1; nhsel++)
 86 
 87 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
 88 
 89 #define endfor_nexthops(fi) }
 90 
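
For orientation, the macros above are used as paired block constructs throughout this file: for_nexthops()/change_nexthops() open a block that declares the loop variables (nhsel plus nh or nexthop_nh), and endfor_nexthops() closes it. A minimal editorial sketch of the idiom follows; count_live_nexthops() is a hypothetical helper that is not part of the original file, while struct fib_info and RTNH_F_DEAD are the real symbols used elsewhere here.

/* Editorial sketch: walk the nexthops of a fib_info with the
 * for_nexthops() idiom; the matching endfor_nexthops() closes the
 * block the macro opened.
 */
static int count_live_nexthops(const struct fib_info *fi)
{
        int alive = 0;

        for_nexthops(fi) {
                if (!(nh->nh_flags & RTNH_F_DEAD))
                        alive++;
        } endfor_nexthops(fi);

        return alive;
}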
 91 
 92 const struct fib_prop fib_props[RTN_MAX + 1] = {
 93         [RTN_UNSPEC] = {
 94                 .error  = 0,
 95                 .scope  = RT_SCOPE_NOWHERE,
 96         },
 97         [RTN_UNICAST] = {
 98                 .error  = 0,
 99                 .scope  = RT_SCOPE_UNIVERSE,
100         },
101         [RTN_LOCAL] = {
102                 .error  = 0,
103                 .scope  = RT_SCOPE_HOST,
104         },
105         [RTN_BROADCAST] = {
106                 .error  = 0,
107                 .scope  = RT_SCOPE_LINK,
108         },
109         [RTN_ANYCAST] = {
110                 .error  = 0,
111                 .scope  = RT_SCOPE_LINK,
112         },
113         [RTN_MULTICAST] = {
114                 .error  = 0,
115                 .scope  = RT_SCOPE_UNIVERSE,
116         },
117         [RTN_BLACKHOLE] = {
118                 .error  = -EINVAL,
119                 .scope  = RT_SCOPE_UNIVERSE,
120         },
121         [RTN_UNREACHABLE] = {
122                 .error  = -EHOSTUNREACH,
123                 .scope  = RT_SCOPE_UNIVERSE,
124         },
125         [RTN_PROHIBIT] = {
126                 .error  = -EACCES,
127                 .scope  = RT_SCOPE_UNIVERSE,
128         },
129         [RTN_THROW] = {
130                 .error  = -EAGAIN,
131                 .scope  = RT_SCOPE_UNIVERSE,
132         },
133         [RTN_NAT] = {
134                 .error  = -EINVAL,
135                 .scope  = RT_SCOPE_NOWHERE,
136         },
137         [RTN_XRESOLVE] = {
138                 .error  = -EINVAL,
139                 .scope  = RT_SCOPE_NOWHERE,
140         },
141 };
142 
143 static void rt_fibinfo_free(struct rtable __rcu **rtp)
144 {
145         struct rtable *rt = rcu_dereference_protected(*rtp, 1);
146 
147         if (!rt)
148                 return;
149 
150         /* Not even needed: RCU_INIT_POINTER(*rtp, NULL);
151          * because we waited an RCU grace period before calling
152          * free_fib_info_rcu()
153          */
154 
155         dst_free(&rt->dst);
156 }
157 
158 static void free_nh_exceptions(struct fib_nh *nh)
159 {
160         struct fnhe_hash_bucket *hash;
161         int i;
162 
163         hash = rcu_dereference_protected(nh->nh_exceptions, 1);
164         if (!hash)
165                 return;
166         for (i = 0; i < FNHE_HASH_SIZE; i++) {
167                 struct fib_nh_exception *fnhe;
168 
169                 fnhe = rcu_dereference_protected(hash[i].chain, 1);
170                 while (fnhe) {
171                         struct fib_nh_exception *next;
172
173                         next = rcu_dereference_protected(fnhe->fnhe_next, 1);
174 
175                         rt_fibinfo_free(&fnhe->fnhe_rth_input);
176                         rt_fibinfo_free(&fnhe->fnhe_rth_output);
177 
178                         kfree(fnhe);
179 
180                         fnhe = next;
181                 }
182         }
183         kfree(hash);
184 }
185 
186 static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
187 {
188         int cpu;
189 
190         if (!rtp)
191                 return;
192 
193         for_each_possible_cpu(cpu) {
194                 struct rtable *rt;
195 
196                 rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
197                 if (rt)
198                         dst_free(&rt->dst);
199         }
200         free_percpu(rtp);
201 }
202 
203 /* Release a nexthop info record */
204 static void free_fib_info_rcu(struct rcu_head *head)
205 {
206         struct fib_info *fi = container_of(head, struct fib_info, rcu);
207         struct dst_metrics *m;
208 
209         change_nexthops(fi) {
210                 if (nexthop_nh->nh_dev)
211                         dev_put(nexthop_nh->nh_dev);
212                 lwtstate_put(nexthop_nh->nh_lwtstate);
213                 free_nh_exceptions(nexthop_nh);
214                 rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
215                 rt_fibinfo_free(&nexthop_nh->nh_rth_input);
216         } endfor_nexthops(fi);
217 
218         m = fi->fib_metrics;
219         if (m != &dst_default_metrics && atomic_dec_and_test(&m->refcnt))
220                 kfree(m);
221         kfree(fi);
222 }
223 
224 void free_fib_info(struct fib_info *fi)
225 {
226         if (fi->fib_dead == 0) {
227                 pr_warn("Freeing alive fib_info %p\n", fi);
228                 return;
229         }
230         fib_info_cnt--;
231 #ifdef CONFIG_IP_ROUTE_CLASSID
232         change_nexthops(fi) {
233                 if (nexthop_nh->nh_tclassid)
234                         fi->fib_net->ipv4.fib_num_tclassid_users--;
235         } endfor_nexthops(fi);
236 #endif
237         call_rcu(&fi->rcu, free_fib_info_rcu);
238 }
239 EXPORT_SYMBOL_GPL(free_fib_info);
240 
241 void fib_release_info(struct fib_info *fi)
242 {
243         spin_lock_bh(&fib_info_lock);
244         if (fi && --fi->fib_treeref == 0) {
245                 hlist_del(&fi->fib_hash);
246                 if (fi->fib_prefsrc)
247                         hlist_del(&fi->fib_lhash);
248                 change_nexthops(fi) {
249                         if (!nexthop_nh->nh_dev)
250                                 continue;
251                         hlist_del(&nexthop_nh->nh_hash);
252                 } endfor_nexthops(fi)
253                 fi->fib_dead = 1;
254                 fib_info_put(fi);
255         }
256         spin_unlock_bh(&fib_info_lock);
257 }
258 
259 static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
260 {
261         const struct fib_nh *onh = ofi->fib_nh;
262 
263         for_nexthops(fi) {
264                 if (nh->nh_oif != onh->nh_oif ||
265                     nh->nh_gw  != onh->nh_gw ||
266                     nh->nh_scope != onh->nh_scope ||
267 #ifdef CONFIG_IP_ROUTE_MULTIPATH
268                     nh->nh_weight != onh->nh_weight ||
269 #endif
270 #ifdef CONFIG_IP_ROUTE_CLASSID
271                     nh->nh_tclassid != onh->nh_tclassid ||
272 #endif
273                     lwtunnel_cmp_encap(nh->nh_lwtstate, onh->nh_lwtstate) ||
274                     ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_COMPARE_MASK))
275                         return -1;
276                 onh++;
277         } endfor_nexthops(fi);
278         return 0;
279 }
280 
281 static inline unsigned int fib_devindex_hashfn(unsigned int val)
282 {
283         unsigned int mask = DEVINDEX_HASHSIZE - 1;
284 
285         return (val ^
286                 (val >> DEVINDEX_HASHBITS) ^
287                 (val >> (DEVINDEX_HASHBITS * 2))) & mask;
288 }
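
A quick editorial worked example of the fold above: with DEVINDEX_HASHBITS = 8 the mask is 0xff, so an ifindex is reduced to a bucket index by XOR-ing together its shifted byte groups. The ifindex values below are arbitrary illustrations.

/* Editorial worked example:
 *   fib_devindex_hashfn(2)     = (0x002 ^ 0x000 ^ 0x000) & 0xff = 2
 *   fib_devindex_hashfn(0x102) = (0x102 ^ 0x001 ^ 0x000) & 0xff = 3
 * i.e. ifindex 2 lands in bucket 2 and ifindex 258 in bucket 3.
 */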
289 
290 static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
291 {
292         unsigned int mask = (fib_info_hash_size - 1);
293         unsigned int val = fi->fib_nhs;
294 
295         val ^= (fi->fib_protocol << 8) | fi->fib_scope;
296         val ^= (__force u32)fi->fib_prefsrc;
297         val ^= fi->fib_priority;
298         for_nexthops(fi) {
299                 val ^= fib_devindex_hashfn(nh->nh_oif);
300         } endfor_nexthops(fi)
301 
302         return (val ^ (val >> 7) ^ (val >> 12)) & mask;
303 }
304 
305 static struct fib_info *fib_find_info(const struct fib_info *nfi)
306 {
307         struct hlist_head *head;
308         struct fib_info *fi;
309         unsigned int hash;
310 
311         hash = fib_info_hashfn(nfi);
312         head = &fib_info_hash[hash];
313 
314         hlist_for_each_entry(fi, head, fib_hash) {
315                 if (!net_eq(fi->fib_net, nfi->fib_net))
316                         continue;
317                 if (fi->fib_nhs != nfi->fib_nhs)
318                         continue;
319                 if (nfi->fib_protocol == fi->fib_protocol &&
320                     nfi->fib_scope == fi->fib_scope &&
321                     nfi->fib_prefsrc == fi->fib_prefsrc &&
322                     nfi->fib_priority == fi->fib_priority &&
323                     nfi->fib_type == fi->fib_type &&
324                     memcmp(nfi->fib_metrics, fi->fib_metrics,
325                            sizeof(u32) * RTAX_MAX) == 0 &&
326                     !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) &&
327                     (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
328                         return fi;
329         }
330 
331         return NULL;
332 }
333 
334 /* Check that the gateway is already configured.
335  * Used only by the redirect acceptance routine.
336  */
337 int ip_fib_check_default(__be32 gw, struct net_device *dev)
338 {
339         struct hlist_head *head;
340         struct fib_nh *nh;
341         unsigned int hash;
342 
343         spin_lock(&fib_info_lock);
344 
345         hash = fib_devindex_hashfn(dev->ifindex);
346         head = &fib_info_devhash[hash];
347         hlist_for_each_entry(nh, head, nh_hash) {
348                 if (nh->nh_dev == dev &&
349                     nh->nh_gw == gw &&
350                     !(nh->nh_flags & RTNH_F_DEAD)) {
351                         spin_unlock(&fib_info_lock);
352                         return 0;
353                 }
354         }
355 
356         spin_unlock(&fib_info_lock);
357 
358         return -1;
359 }
360 
361 static inline size_t fib_nlmsg_size(struct fib_info *fi)
362 {
363         size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
364                          + nla_total_size(4) /* RTA_TABLE */
365                          + nla_total_size(4) /* RTA_DST */
366                          + nla_total_size(4) /* RTA_PRIORITY */
367                          + nla_total_size(4) /* RTA_PREFSRC */
368                          + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
369 
370         /* space for nested metrics */
371         payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
372 
373         if (fi->fib_nhs) {
374                 size_t nh_encapsize = 0;
375                 /* Also handles the special case fib_nhs == 1 */
376 
377                 /* each nexthop is packed in an attribute */
378                 size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
379 
380                 /* may contain flow and gateway attribute */
381                 nhsize += 2 * nla_total_size(4);
382 
383                 /* grab encap info */
384                 for_nexthops(fi) {
385                         if (nh->nh_lwtstate) {
386                                 /* RTA_ENCAP_TYPE */
387                                 nh_encapsize += lwtunnel_get_encap_size(
388                                                 nh->nh_lwtstate);
389                                 /* RTA_ENCAP */
390                                 nh_encapsize +=  nla_total_size(2);
391                         }
392                 } endfor_nexthops(fi);
393 
394                 /* all nexthops are packed in a nested attribute */
395                 payload += nla_total_size((fi->fib_nhs * nhsize) +
396                                           nh_encapsize);
397 
398         }
399 
400         return payload;
401 }
402 
403 void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
404                int dst_len, u32 tb_id, const struct nl_info *info,
405                unsigned int nlm_flags)
406 {
407         struct sk_buff *skb;
408         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
409         int err = -ENOBUFS;
410 
411         skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
412         if (!skb)
413                 goto errout;
414 
415         err = fib_dump_info(skb, info->portid, seq, event, tb_id,
416                             fa->fa_type, key, dst_len,
417                             fa->fa_tos, fa->fa_info, nlm_flags);
418         if (err < 0) {
419                 /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
420                 WARN_ON(err == -EMSGSIZE);
421                 kfree_skb(skb);
422                 goto errout;
423         }
424         rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE,
425                     info->nlh, GFP_KERNEL);
426         return;
427 errout:
428         if (err < 0)
429                 rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
430 }
431 
432 static int fib_detect_death(struct fib_info *fi, int order,
433                             struct fib_info **last_resort, int *last_idx,
434                             int dflt)
435 {
436         struct neighbour *n;
437         int state = NUD_NONE;
438 
439         n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
440         if (n) {
441                 state = n->nud_state;
442                 neigh_release(n);
443         } else {
444                 return 0;
445         }
446         if (state == NUD_REACHABLE)
447                 return 0;
448         if ((state & NUD_VALID) && order != dflt)
449                 return 0;
450         if ((state & NUD_VALID) ||
451             (*last_idx < 0 && order > dflt && state != NUD_INCOMPLETE)) {
452                 *last_resort = fi;
453                 *last_idx = order;
454         }
455         return 1;
456 }
457 
458 #ifdef CONFIG_IP_ROUTE_MULTIPATH
459 
460 static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
461 {
462         int nhs = 0;
463 
464         while (rtnh_ok(rtnh, remaining)) {
465                 nhs++;
466                 rtnh = rtnh_next(rtnh, &remaining);
467         }
468 
469         /* leftover implies invalid nexthop configuration, discard it */
470         return remaining > 0 ? 0 : nhs;
471 }
472 
473 static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
474                        int remaining, struct fib_config *cfg)
475 {
476         int ret;
477 
478         change_nexthops(fi) {
479                 int attrlen;
480 
481                 if (!rtnh_ok(rtnh, remaining))
482                         return -EINVAL;
483 
484                 if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
485                         return -EINVAL;
486 
487                 nexthop_nh->nh_flags =
488                         (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
489                 nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
490                 nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;
491 
492                 attrlen = rtnh_attrlen(rtnh);
493                 if (attrlen > 0) {
494                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
495 
496                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
497                         nexthop_nh->nh_gw = nla ? nla_get_in_addr(nla) : 0;
498 #ifdef CONFIG_IP_ROUTE_CLASSID
499                         nla = nla_find(attrs, attrlen, RTA_FLOW);
500                         nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
501                         if (nexthop_nh->nh_tclassid)
502                                 fi->fib_net->ipv4.fib_num_tclassid_users++;
503 #endif
504                         nla = nla_find(attrs, attrlen, RTA_ENCAP);
505                         if (nla) {
506                                 struct lwtunnel_state *lwtstate;
507                                 struct nlattr *nla_entype;
508 
509                                 nla_entype = nla_find(attrs, attrlen,
510                                                       RTA_ENCAP_TYPE);
511                                 if (!nla_entype)
512                                         goto err_inval;
513 
514                                 ret = lwtunnel_build_state(nla_get_u16(
515                                                            nla_entype),
516                                                            nla,  AF_INET, cfg,
517                                                            &lwtstate);
518                                 if (ret)
519                                         goto errout;
520                                 nexthop_nh->nh_lwtstate =
521                                         lwtstate_get(lwtstate);
522                         }
523                 }
524 
525                 rtnh = rtnh_next(rtnh, &remaining);
526         } endfor_nexthops(fi);
527 
528         return 0;
529 
530 err_inval:
531         ret = -EINVAL;
532 
533 errout:
534         return ret;
535 }
536 
537 static void fib_rebalance(struct fib_info *fi)
538 {
539         int total;
540         int w;
541         struct in_device *in_dev;
542 
543         if (fi->fib_nhs < 2)
544                 return;
545 
546         total = 0;
547         for_nexthops(fi) {
548                 if (nh->nh_flags & RTNH_F_DEAD)
549                         continue;
550 
551                 in_dev = __in_dev_get_rtnl(nh->nh_dev);
552 
553                 if (in_dev &&
554                     IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
555                     nh->nh_flags & RTNH_F_LINKDOWN)
556                         continue;
557 
558                 total += nh->nh_weight;
559         } endfor_nexthops(fi);
560 
561         w = 0;
562         change_nexthops(fi) {
563                 int upper_bound;
564 
565                 in_dev = __in_dev_get_rtnl(nexthop_nh->nh_dev);
566 
567                 if (nexthop_nh->nh_flags & RTNH_F_DEAD) {
568                         upper_bound = -1;
569                 } else if (in_dev &&
570                            IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
571                            nexthop_nh->nh_flags & RTNH_F_LINKDOWN) {
572                         upper_bound = -1;
573                 } else {
574                         w += nexthop_nh->nh_weight;
575                         upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31,
576                                                             total) - 1;
577                 }
578 
579                 atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
580         } endfor_nexthops(fi);
581 
582         net_get_random_once(&fib_multipath_secret,
583                             sizeof(fib_multipath_secret));
584 }
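
A worked example of the upper bounds computed above may help (editorial sketch; the two weights are hypothetical):

/* Two live nexthops with weights 1 and 2 give total = 3:
 *   nexthop 0: w = 1, upper_bound = DIV_ROUND_CLOSEST_ULL(1ULL << 31, 3) - 1
 *                                 = 715827882
 *   nexthop 1: w = 3, upper_bound = DIV_ROUND_CLOSEST_ULL(3ULL << 31, 3) - 1
 *                                 = 2147483647
 * A hash value in [0, 2^31) compared against these bounds then selects
 * nexthop 0 roughly 1/3 of the time and nexthop 1 roughly 2/3.
 */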
585 
586 static inline void fib_add_weight(struct fib_info *fi,
587                                   const struct fib_nh *nh)
588 {
589         fi->fib_weight += nh->nh_weight;
590 }
591 
592 #else /* CONFIG_IP_ROUTE_MULTIPATH */
593 
594 #define fib_rebalance(fi) do { } while (0)
595 #define fib_add_weight(fi, nh) do { } while (0)
596 
597 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
598 
599 static int fib_encap_match(u16 encap_type,
600                            struct nlattr *encap,
601                            const struct fib_nh *nh,
602                            const struct fib_config *cfg)
603 {
604         struct lwtunnel_state *lwtstate;
605         int ret, result = 0;
606 
607         if (encap_type == LWTUNNEL_ENCAP_NONE)
608                 return 0;
609 
610         ret = lwtunnel_build_state(encap_type, encap,
611                                    AF_INET, cfg, &lwtstate);
612         if (!ret) {
613                 result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate);
614                 lwtstate_free(lwtstate);
615         }
616 
617         return result;
618 }
619 
620 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
621 {
622 #ifdef CONFIG_IP_ROUTE_MULTIPATH
623         struct rtnexthop *rtnh;
624         int remaining;
625 #endif
626 
627         if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
628                 return 1;
629 
630         if (cfg->fc_oif || cfg->fc_gw) {
631                 if (cfg->fc_encap) {
632                         if (fib_encap_match(cfg->fc_encap_type,
633                                             cfg->fc_encap, fi->fib_nh, cfg))
634                             return 1;
635                 }
636                 if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
637                     (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
638                         return 0;
639                 return 1;
640         }
641 
642 #ifdef CONFIG_IP_ROUTE_MULTIPATH
643         if (!cfg->fc_mp)
644                 return 0;
645 
646         rtnh = cfg->fc_mp;
647         remaining = cfg->fc_mp_len;
648 
649         for_nexthops(fi) {
650                 int attrlen;
651 
652                 if (!rtnh_ok(rtnh, remaining))
653                         return -EINVAL;
654 
655                 if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
656                         return 1;
657 
658                 attrlen = rtnh_attrlen(rtnh);
659                 if (attrlen > 0) {
660                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
661 
662                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
663                         if (nla && nla_get_in_addr(nla) != nh->nh_gw)
664                                 return 1;
665 #ifdef CONFIG_IP_ROUTE_CLASSID
666                         nla = nla_find(attrs, attrlen, RTA_FLOW);
667                         if (nla && nla_get_u32(nla) != nh->nh_tclassid)
668                                 return 1;
669 #endif
670                 }
671 
672                 rtnh = rtnh_next(rtnh, &remaining);
673         } endfor_nexthops(fi);
674 #endif
675         return 0;
676 }
677 
678 
679 /*
680  * Picture
681  * -------
682  *
683  * Nexthop semantics are very messy for historical reasons.
684  * We have to take into account that:
685  * a) the gateway can actually be a local interface address,
686  *    so that a gatewayed route is direct.
687  * b) the gateway must be an on-link address, possibly
688  *    described not by an ifaddr but by a direct route.
689  * c) if both gateway and interface are specified, they must not
690  *    contradict each other.
691  * d) with tunnel routes, the gateway may not be on-link.
692  *
693  * Attempting to reconcile all of these (alas, self-contradictory)
694  * conditions results in pretty ugly and hairy code with obscure logic.
695  *
696  * I chose to generalize it instead, so that the amount of code
697  * barely grows, but the result becomes
698  * much more general.
699  * Every prefix is assigned a "scope" value: "host" is a local address,
700  * "link" is a direct route,
701  * [ ... "site" ... "interior" ... ]
702  * and "universe" is a true gateway route with global meaning.
703  *
704  * Every prefix refers to a set of "nexthop"s (gw, oif),
705  * where the gw must have a narrower scope. This recursion stops
706  * when the gw has LOCAL scope or when the "nexthop" is declared ONLINK,
707  * which forces the gw to be on-link.
708  *
709  * The code is still hairy, but now it is apparently logically
710  * consistent and very flexible. E.g. as a by-product it allows
711  * independent exterior and interior routing processes to
712  * coexist in peace.
713  *
714  * Normally it looks like this:
715  *
716  * {universe prefix}  -> (gw, oif) [scope link]
717  *                |
718  *                |-> {link prefix} -> (gw, oif) [scope local]
719  *                                      |
720  *                                      |-> {local prefix} (terminal node)
721  */
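
A concrete illustration of the scope recursion described above (editorial note; the addresses and device name are hypothetical):

/* Example resolution chain, following the picture above:
 *
 *   10.0.0.0/8 via 192.168.1.1            {universe prefix} -> (gw, oif)
 *     |-> 192.168.1.0/24 dev eth0         {link prefix}, scope link
 *           |-> local 192.168.1.10        {local prefix}, scope host (terminal)
 *
 * fib_check_nh() below enforces this: the gateway of a new route is
 * looked up with a scope one step narrower than the route itself
 * (clamped to at least RT_SCOPE_LINK), unless RTNH_F_ONLINK forces the
 * gateway to be treated as directly reachable on the given device.
 */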
722 static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
723                         struct fib_nh *nh)
724 {
725         int err = 0;
726         struct net *net;
727         struct net_device *dev;
728 
729         net = cfg->fc_nlinfo.nl_net;
730         if (nh->nh_gw) {
731                 struct fib_result res;
732 
733                 if (nh->nh_flags & RTNH_F_ONLINK) {
734                         unsigned int addr_type;
735 
736                         if (cfg->fc_scope >= RT_SCOPE_LINK)
737                                 return -EINVAL;
738                         dev = __dev_get_by_index(net, nh->nh_oif);
739                         if (!dev)
740                                 return -ENODEV;
741                         if (!(dev->flags & IFF_UP))
742                                 return -ENETDOWN;
743                         addr_type = inet_addr_type_dev_table(net, dev, nh->nh_gw);
744                         if (addr_type != RTN_UNICAST)
745                                 return -EINVAL;
746                         if (!netif_carrier_ok(dev))
747                                 nh->nh_flags |= RTNH_F_LINKDOWN;
748                         nh->nh_dev = dev;
749                         dev_hold(dev);
750                         nh->nh_scope = RT_SCOPE_LINK;
751                         return 0;
752                 }
753                 rcu_read_lock();
754                 {
755                         struct fib_table *tbl = NULL;
756                         struct flowi4 fl4 = {
757                                 .daddr = nh->nh_gw,
758                                 .flowi4_scope = cfg->fc_scope + 1,
759                                 .flowi4_oif = nh->nh_oif,
760                                 .flowi4_iif = LOOPBACK_IFINDEX,
761                         };
762 
763                         /* This is not strictly necessary, but it takes a bit of thinking to see why */
764                         if (fl4.flowi4_scope < RT_SCOPE_LINK)
765                                 fl4.flowi4_scope = RT_SCOPE_LINK;
766 
767                         if (cfg->fc_table)
768                                 tbl = fib_get_table(net, cfg->fc_table);
769 
770                         if (tbl)
771                                 err = fib_table_lookup(tbl, &fl4, &res,
772                                                        FIB_LOOKUP_IGNORE_LINKSTATE |
773                                                        FIB_LOOKUP_NOREF);
774 
775                         /* On error, or if no table was given, do a full lookup.
776                          * This is needed, for example, when nexthops are in the
777                          * local table rather than the given table.
778                          */
779                         if (!tbl || err) {
780                                 err = fib_lookup(net, &fl4, &res,
781                                                  FIB_LOOKUP_IGNORE_LINKSTATE);
782                         }
783 
784                         if (err) {
785                                 rcu_read_unlock();
786                                 return err;
787                         }
788                 }
789                 err = -EINVAL;
790                 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
791                         goto out;
792                 nh->nh_scope = res.scope;
793                 nh->nh_oif = FIB_RES_OIF(res);
794                 nh->nh_dev = dev = FIB_RES_DEV(res);
795                 if (!dev)
796                         goto out;
797                 dev_hold(dev);
798                 if (!netif_carrier_ok(dev))
799                         nh->nh_flags |= RTNH_F_LINKDOWN;
800                 err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
801         } else {
802                 struct in_device *in_dev;
803 
804                 if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
805                         return -EINVAL;
806 
807                 rcu_read_lock();
808                 err = -ENODEV;
809                 in_dev = inetdev_by_index(net, nh->nh_oif);
810                 if (!in_dev)
811                         goto out;
812                 err = -ENETDOWN;
813                 if (!(in_dev->dev->flags & IFF_UP))
814                         goto out;
815                 nh->nh_dev = in_dev->dev;
816                 dev_hold(nh->nh_dev);
817                 nh->nh_scope = RT_SCOPE_HOST;
818                 if (!netif_carrier_ok(nh->nh_dev))
819                         nh->nh_flags |= RTNH_F_LINKDOWN;
820                 err = 0;
821         }
822 out:
823         rcu_read_unlock();
824         return err;
825 }
826 
827 static inline unsigned int fib_laddr_hashfn(__be32 val)
828 {
829         unsigned int mask = (fib_info_hash_size - 1);
830 
831         return ((__force u32)val ^
832                 ((__force u32)val >> 7) ^
833                 ((__force u32)val >> 14)) & mask;
834 }
835 
836 static struct hlist_head *fib_info_hash_alloc(int bytes)
837 {
838         if (bytes <= PAGE_SIZE)
839                 return kzalloc(bytes, GFP_KERNEL);
840         else
841                 return (struct hlist_head *)
842                         __get_free_pages(GFP_KERNEL | __GFP_ZERO,
843                                          get_order(bytes));
844 }
845 
846 static void fib_info_hash_free(struct hlist_head *hash, int bytes)
847 {
848         if (!hash)
849                 return;
850 
851         if (bytes <= PAGE_SIZE)
852                 kfree(hash);
853         else
854                 free_pages((unsigned long) hash, get_order(bytes));
855 }
856 
857 static void fib_info_hash_move(struct hlist_head *new_info_hash,
858                                struct hlist_head *new_laddrhash,
859                                unsigned int new_size)
860 {
861         struct hlist_head *old_info_hash, *old_laddrhash;
862         unsigned int old_size = fib_info_hash_size;
863         unsigned int i, bytes;
864 
865         spin_lock_bh(&fib_info_lock);
866         old_info_hash = fib_info_hash;
867         old_laddrhash = fib_info_laddrhash;
868         fib_info_hash_size = new_size;
869 
870         for (i = 0; i < old_size; i++) {
871                 struct hlist_head *head = &fib_info_hash[i];
872                 struct hlist_node *n;
873                 struct fib_info *fi;
874 
875                 hlist_for_each_entry_safe(fi, n, head, fib_hash) {
876                         struct hlist_head *dest;
877                         unsigned int new_hash;
878 
879                         new_hash = fib_info_hashfn(fi);
880                         dest = &new_info_hash[new_hash];
881                         hlist_add_head(&fi->fib_hash, dest);
882                 }
883         }
884         fib_info_hash = new_info_hash;
885 
886         for (i = 0; i < old_size; i++) {
887                 struct hlist_head *lhead = &fib_info_laddrhash[i];
888                 struct hlist_node *n;
889                 struct fib_info *fi;
890 
891                 hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) {
892                         struct hlist_head *ldest;
893                         unsigned int new_hash;
894 
895                         new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
896                         ldest = &new_laddrhash[new_hash];
897                         hlist_add_head(&fi->fib_lhash, ldest);
898                 }
899         }
900         fib_info_laddrhash = new_laddrhash;
901 
902         spin_unlock_bh(&fib_info_lock);
903 
904         bytes = old_size * sizeof(struct hlist_head *);
905         fib_info_hash_free(old_info_hash, bytes);
906         fib_info_hash_free(old_laddrhash, bytes);
907 }
908 
909 __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
910 {
911         nh->nh_saddr = inet_select_addr(nh->nh_dev,
912                                         nh->nh_gw,
913                                         nh->nh_parent->fib_scope);
914         nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);
915 
916         return nh->nh_saddr;
917 }
918 
919 static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
920 {
921         if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
922             fib_prefsrc != cfg->fc_dst) {
923                 u32 tb_id = cfg->fc_table;
924                 int rc;
925 
926                 if (tb_id == RT_TABLE_MAIN)
927                         tb_id = RT_TABLE_LOCAL;
928 
929                 rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
930                                           fib_prefsrc, tb_id);
931 
932                 if (rc != RTN_LOCAL && tb_id != RT_TABLE_LOCAL) {
933                         rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
934                                                   fib_prefsrc, RT_TABLE_LOCAL);
935                 }
936 
937                 if (rc != RTN_LOCAL)
938                         return false;
939         }
940         return true;
941 }
942 
943 static int
944 fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
945 {
946         bool ecn_ca = false;
947         struct nlattr *nla;
948         int remaining;
949 
950         if (!cfg->fc_mx)
951                 return 0;
952 
953         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
954                 int type = nla_type(nla);
955                 u32 val;
956 
957                 if (!type)
958                         continue;
959                 if (type > RTAX_MAX)
960                         return -EINVAL;
961 
962                 if (type == RTAX_CC_ALGO) {
963                         char tmp[TCP_CA_NAME_MAX];
964 
965                         nla_strlcpy(tmp, nla, sizeof(tmp));
966                         val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
967                         if (val == TCP_CA_UNSPEC)
968                                 return -EINVAL;
969                 } else {
970                         val = nla_get_u32(nla);
971                 }
972                 if (type == RTAX_ADVMSS && val > 65535 - 40)
973                         val = 65535 - 40;
974                 if (type == RTAX_MTU && val > 65535 - 15)
975                         val = 65535 - 15;
976                 if (type == RTAX_HOPLIMIT && val > 255)
977                         val = 255;
978                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
979                         return -EINVAL;
980                 fi->fib_metrics->metrics[type - 1] = val;
981         }
982 
983         if (ecn_ca)
984                 fi->fib_metrics->metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
985 
986         return 0;
987 }
988 
989 struct fib_info *fib_create_info(struct fib_config *cfg)
990 {
991         int err;
992         struct fib_info *fi = NULL;
993         struct fib_info *ofi;
994         int nhs = 1;
995         struct net *net = cfg->fc_nlinfo.nl_net;
996 
997         if (cfg->fc_type > RTN_MAX)
998                 goto err_inval;
999 
1000         /* Fast check to catch the weirdest cases */
1001         if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
1002                 goto err_inval;
1003 
1004         if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
1005                 goto err_inval;
1006 
1007 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1008         if (cfg->fc_mp) {
1009                 nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
1010                 if (nhs == 0)
1011                         goto err_inval;
1012         }
1013 #endif
1014 
1015         err = -ENOBUFS;
1016         if (fib_info_cnt >= fib_info_hash_size) {
1017                 unsigned int new_size = fib_info_hash_size << 1;
1018                 struct hlist_head *new_info_hash;
1019                 struct hlist_head *new_laddrhash;
1020                 unsigned int bytes;
1021 
1022                 if (!new_size)
1023                         new_size = 16;
1024                 bytes = new_size * sizeof(struct hlist_head *);
1025                 new_info_hash = fib_info_hash_alloc(bytes);
1026                 new_laddrhash = fib_info_hash_alloc(bytes);
1027                 if (!new_info_hash || !new_laddrhash) {
1028                         fib_info_hash_free(new_info_hash, bytes);
1029                         fib_info_hash_free(new_laddrhash, bytes);
1030                 } else
1031                         fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
1032 
1033                 if (!fib_info_hash_size)
1034                         goto failure;
1035         }
1036 
1037         fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
1038         if (!fi)
1039                 goto failure;
1040         fib_info_cnt++;
1041         if (cfg->fc_mx) {
1042                 fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL);
1043                 if (!fi->fib_metrics)
1044                         goto failure;
1045                 atomic_set(&fi->fib_metrics->refcnt, 1);
1046         } else
1047                 fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics;
1048 
1049         fi->fib_net = net;
1050         fi->fib_protocol = cfg->fc_protocol;
1051         fi->fib_scope = cfg->fc_scope;
1052         fi->fib_flags = cfg->fc_flags;
1053         fi->fib_priority = cfg->fc_priority;
1054         fi->fib_prefsrc = cfg->fc_prefsrc;
1055         fi->fib_type = cfg->fc_type;
1056         fi->fib_tb_id = cfg->fc_table;
1057 
1058         fi->fib_nhs = nhs;
1059         change_nexthops(fi) {
1060                 nexthop_nh->nh_parent = fi;
1061                 nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
1062                 if (!nexthop_nh->nh_pcpu_rth_output)
1063                         goto failure;
1064         } endfor_nexthops(fi)
1065 
1066         err = fib_convert_metrics(fi, cfg);
1067         if (err)
1068                 goto failure;
1069 
1070         if (cfg->fc_mp) {
1071 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1072                 err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
1073                 if (err != 0)
1074                         goto failure;
1075                 if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
1076                         goto err_inval;
1077                 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
1078                         goto err_inval;
1079 #ifdef CONFIG_IP_ROUTE_CLASSID
1080                 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
1081                         goto err_inval;
1082 #endif
1083 #else
1084                 goto err_inval;
1085 #endif
1086         } else {
1087                 struct fib_nh *nh = fi->fib_nh;
1088 
1089                 if (cfg->fc_encap) {
1090                         struct lwtunnel_state *lwtstate;
1091 
1092                         if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE)
1093                                 goto err_inval;
1094                         err = lwtunnel_build_state(cfg->fc_encap_type,
1095                                                    cfg->fc_encap, AF_INET, cfg,
1096                                                    &lwtstate);
1097                         if (err)
1098                                 goto failure;
1099 
1100                         nh->nh_lwtstate = lwtstate_get(lwtstate);
1101                 }
1102                 nh->nh_oif = cfg->fc_oif;
1103                 nh->nh_gw = cfg->fc_gw;
1104                 nh->nh_flags = cfg->fc_flags;
1105 #ifdef CONFIG_IP_ROUTE_CLASSID
1106                 nh->nh_tclassid = cfg->fc_flow;
1107                 if (nh->nh_tclassid)
1108                         fi->fib_net->ipv4.fib_num_tclassid_users++;
1109 #endif
1110 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1111                 nh->nh_weight = 1;
1112 #endif
1113         }
1114 
1115         if (fib_props[cfg->fc_type].error) {
1116                 if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
1117                         goto err_inval;
1118                 goto link_it;
1119         } else {
1120                 switch (cfg->fc_type) {
1121                 case RTN_UNICAST:
1122                 case RTN_LOCAL:
1123                 case RTN_BROADCAST:
1124                 case RTN_ANYCAST:
1125                 case RTN_MULTICAST:
1126                         break;
1127                 default:
1128                         goto err_inval;
1129                 }
1130         }
1131 
1132         if (cfg->fc_scope > RT_SCOPE_HOST)
1133                 goto err_inval;
1134 
1135         if (cfg->fc_scope == RT_SCOPE_HOST) {
1136                 struct fib_nh *nh = fi->fib_nh;
1137 
1138                 /* A local address is being added. */
1139                 if (nhs != 1 || nh->nh_gw)
1140                         goto err_inval;
1141                 nh->nh_scope = RT_SCOPE_NOWHERE;
1142                 nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
1143                 err = -ENODEV;
1144                 if (!nh->nh_dev)
1145                         goto failure;
1146         } else {
1147                 int linkdown = 0;
1148 
1149                 change_nexthops(fi) {
1150                         err = fib_check_nh(cfg, fi, nexthop_nh);
1151                         if (err != 0)
1152                                 goto failure;
1153                         if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN)
1154                                 linkdown++;
1155                 } endfor_nexthops(fi)
1156                 if (linkdown == fi->fib_nhs)
1157                         fi->fib_flags |= RTNH_F_LINKDOWN;
1158         }
1159 
1160         if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc))
1161                 goto err_inval;
1162 
1163         change_nexthops(fi) {
1164                 fib_info_update_nh_saddr(net, nexthop_nh);
1165                 fib_add_weight(fi, nexthop_nh);
1166         } endfor_nexthops(fi)
1167 
1168         fib_rebalance(fi);
1169 
1170 link_it:
1171         ofi = fib_find_info(fi);
1172         if (ofi) {
1173                 fi->fib_dead = 1;
1174                 free_fib_info(fi);
1175                 ofi->fib_treeref++;
1176                 return ofi;
1177         }
1178 
1179         fi->fib_treeref++;
1180         atomic_inc(&fi->fib_clntref);
1181         spin_lock_bh(&fib_info_lock);
1182         hlist_add_head(&fi->fib_hash,
1183                        &fib_info_hash[fib_info_hashfn(fi)]);
1184         if (fi->fib_prefsrc) {
1185                 struct hlist_head *head;
1186 
1187                 head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
1188                 hlist_add_head(&fi->fib_lhash, head);
1189         }
1190         change_nexthops(fi) {
1191                 struct hlist_head *head;
1192                 unsigned int hash;
1193 
1194                 if (!nexthop_nh->nh_dev)
1195                         continue;
1196                 hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
1197                 head = &fib_info_devhash[hash];
1198                 hlist_add_head(&nexthop_nh->nh_hash, head);
1199         } endfor_nexthops(fi)
1200         spin_unlock_bh(&fib_info_lock);
1201         return fi;
1202 
1203 err_inval:
1204         err = -EINVAL;
1205 
1206 failure:
1207         if (fi) {
1208                 fi->fib_dead = 1;
1209                 free_fib_info(fi);
1210         }
1211 
1212         return ERR_PTR(err);
1213 }
1214 
1215 int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
1216                   u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
1217                   struct fib_info *fi, unsigned int flags)
1218 {
1219         struct nlmsghdr *nlh;
1220         struct rtmsg *rtm;
1221 
1222         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags);
1223         if (!nlh)
1224                 return -EMSGSIZE;
1225 
1226         rtm = nlmsg_data(nlh);
1227         rtm->rtm_family = AF_INET;
1228         rtm->rtm_dst_len = dst_len;
1229         rtm->rtm_src_len = 0;
1230         rtm->rtm_tos = tos;
1231         if (tb_id < 256)
1232                 rtm->rtm_table = tb_id;
1233         else
1234                 rtm->rtm_table = RT_TABLE_COMPAT;
1235         if (nla_put_u32(skb, RTA_TABLE, tb_id))
1236                 goto nla_put_failure;
1237         rtm->rtm_type = type;
1238         rtm->rtm_flags = fi->fib_flags;
1239         rtm->rtm_scope = fi->fib_scope;
1240         rtm->rtm_protocol = fi->fib_protocol;
1241 
1242         if (rtm->rtm_dst_len &&
1243             nla_put_in_addr(skb, RTA_DST, dst))
1244                 goto nla_put_failure;
1245         if (fi->fib_priority &&
1246             nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
1247                 goto nla_put_failure;
1248         if (rtnetlink_put_metrics(skb, fi->fib_metrics->metrics) < 0)
1249                 goto nla_put_failure;
1250 
1251         if (fi->fib_prefsrc &&
1252             nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
1253                 goto nla_put_failure;
1254         if (fi->fib_nhs == 1) {
1255                 struct in_device *in_dev;
1256 
1257                 if (fi->fib_nh->nh_gw &&
1258                     nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw))
1259                         goto nla_put_failure;
1260                 if (fi->fib_nh->nh_oif &&
1261                     nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif))
1262                         goto nla_put_failure;
1263                 if (fi->fib_nh->nh_flags & RTNH_F_LINKDOWN) {
1264                         in_dev = __in_dev_get_rtnl(fi->fib_nh->nh_dev);
1265                         if (in_dev &&
1266                             IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
1267                                 rtm->rtm_flags |= RTNH_F_DEAD;
1268                 }
1269 #ifdef CONFIG_IP_ROUTE_CLASSID
1270                 if (fi->fib_nh[0].nh_tclassid &&
1271                     nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
1272                         goto nla_put_failure;
1273 #endif
1274                 if (fi->fib_nh->nh_lwtstate &&
1275                     lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate) < 0)
1276                         goto nla_put_failure;
1277         }
1278 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1279         if (fi->fib_nhs > 1) {
1280                 struct rtnexthop *rtnh;
1281                 struct nlattr *mp;
1282 
1283                 mp = nla_nest_start(skb, RTA_MULTIPATH);
1284                 if (!mp)
1285                         goto nla_put_failure;
1286 
1287                 for_nexthops(fi) {
1288                         struct in_device *in_dev;
1289 
1290                         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
1291                         if (!rtnh)
1292                                 goto nla_put_failure;
1293 
1294                         rtnh->rtnh_flags = nh->nh_flags & 0xFF;
1295                         if (nh->nh_flags & RTNH_F_LINKDOWN) {
1296                                 in_dev = __in_dev_get_rtnl(nh->nh_dev);
1297                                 if (in_dev &&
1298                                     IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
1299                                         rtnh->rtnh_flags |= RTNH_F_DEAD;
1300                         }
1301                         rtnh->rtnh_hops = nh->nh_weight - 1;
1302                         rtnh->rtnh_ifindex = nh->nh_oif;
1303 
1304                         if (nh->nh_gw &&
1305                             nla_put_in_addr(skb, RTA_GATEWAY, nh->nh_gw))
1306                                 goto nla_put_failure;
1307 #ifdef CONFIG_IP_ROUTE_CLASSID
1308                         if (nh->nh_tclassid &&
1309                             nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
1310                                 goto nla_put_failure;
1311 #endif
1312                         if (nh->nh_lwtstate &&
1313                             lwtunnel_fill_encap(skb, nh->nh_lwtstate) < 0)
1314                                 goto nla_put_failure;
1315 
1316                         /* length of rtnetlink header + attributes */
1317                         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
1318                 } endfor_nexthops(fi);
1319 
1320                 nla_nest_end(skb, mp);
1321         }
1322 #endif
1323         nlmsg_end(skb, nlh);
1324         return 0;
1325 
1326 nla_put_failure:
1327         nlmsg_cancel(skb, nlh);
1328         return -EMSGSIZE;
1329 }
1330 
1331 /*
1332  * Update the FIB if:
1333  * - a local address disappeared -> we must delete all the entries
1334  *   referring to it.
1335  * - a device went down -> we must shut down all nexthops going via it.
1336  */
1337 int fib_sync_down_addr(struct net_device *dev, __be32 local)
1338 {
1339         int ret = 0;
1340         unsigned int hash = fib_laddr_hashfn(local);
1341         struct hlist_head *head = &fib_info_laddrhash[hash];
1342         struct net *net = dev_net(dev);
1343         int tb_id = l3mdev_fib_table(dev);
1344         struct fib_info *fi;
1345 
1346         if (!fib_info_laddrhash || local == 0)
1347                 return 0;
1348 
1349         hlist_for_each_entry(fi, head, fib_lhash) {
1350                 if (!net_eq(fi->fib_net, net) ||
1351                     fi->fib_tb_id != tb_id)
1352                         continue;
1353                 if (fi->fib_prefsrc == local) {
1354                         fi->fib_flags |= RTNH_F_DEAD;
1355                         ret++;
1356                 }
1357         }
1358         return ret;
1359 }
1360 
1361 static int call_fib_nh_notifiers(struct fib_nh *fib_nh,
1362                                  enum fib_event_type event_type)
1363 {
1364         struct in_device *in_dev = __in_dev_get_rtnl(fib_nh->nh_dev);
1365         struct fib_nh_notifier_info info = {
1366                 .fib_nh = fib_nh,
1367         };
1368 
1369         switch (event_type) {
1370         case FIB_EVENT_NH_ADD:
1371                 if (fib_nh->nh_flags & RTNH_F_DEAD)
1372                         break;
1373                 if (IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
1374                     fib_nh->nh_flags & RTNH_F_LINKDOWN)
1375                         break;
1376                 return call_fib_notifiers(dev_net(fib_nh->nh_dev), event_type,
1377                                           &info.info);
1378         case FIB_EVENT_NH_DEL:
1379                 if ((IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
1380                      fib_nh->nh_flags & RTNH_F_LINKDOWN) ||
1381                     (fib_nh->nh_flags & RTNH_F_DEAD))
1382                         return call_fib_notifiers(dev_net(fib_nh->nh_dev),
1383                                                   event_type, &info.info);
1384         default:
1385                 break;
1386         }
1387 
1388         return NOTIFY_DONE;
1389 }
1390 
1391 /* Event              force Flags           Description
1392  * NETDEV_CHANGE      0     LINKDOWN        Carrier OFF, not for scope host
1393  * NETDEV_DOWN        0     LINKDOWN|DEAD   Link down, not for scope host
1394  * NETDEV_DOWN        1     LINKDOWN|DEAD   Last address removed
1395  * NETDEV_UNREGISTER  1     LINKDOWN|DEAD   Device removed
1396  */
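
As a short editorial illustration of the table above: carrier loss is delivered as NETDEV_CHANGE with force == false, so matching nexthops gain RTNH_F_LINKDOWN only, and host-scope nexthops (nh_scope == RT_SCOPE_NOWHERE) are skipped; NETDEV_UNREGISTER arrives with force == true, which also sets RTNH_F_DEAD and, because the scope filter becomes -1, tears down even the host-scope nexthops.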
1397 int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
1398 {
1399         int ret = 0;
1400         int scope = RT_SCOPE_NOWHERE;
1401         struct fib_info *prev_fi = NULL;
1402         unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1403         struct hlist_head *head = &fib_info_devhash[hash];
1404         struct fib_nh *nh;
1405 
1406         if (force)
1407                 scope = -1;
1408 
1409         hlist_for_each_entry(nh, head, nh_hash) {
1410                 struct fib_info *fi = nh->nh_parent;
1411                 int dead;
1412 
1413                 BUG_ON(!fi->fib_nhs);
1414                 if (nh->nh_dev != dev || fi == prev_fi)
1415                         continue;
1416                 prev_fi = fi;
1417                 dead = 0;
1418                 change_nexthops(fi) {
1419                         if (nexthop_nh->nh_flags & RTNH_F_DEAD)
1420                                 dead++;
1421                         else if (nexthop_nh->nh_dev == dev &&
1422                                  nexthop_nh->nh_scope != scope) {
1423                                 switch (event) {
1424                                 case NETDEV_DOWN:
1425                                 case NETDEV_UNREGISTER:
1426                                         nexthop_nh->nh_flags |= RTNH_F_DEAD;
1427                                         /* fall through */
1428                                 case NETDEV_CHANGE:
1429                                         nexthop_nh->nh_flags |= RTNH_F_LINKDOWN;
1430                                         break;
1431                                 }
1432                                 call_fib_nh_notifiers(nexthop_nh,
1433                                                       FIB_EVENT_NH_DEL);
1434                                 dead++;
1435                         }
1436 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1437                         if (event == NETDEV_UNREGISTER &&
1438                             nexthop_nh->nh_dev == dev) {
1439                                 dead = fi->fib_nhs;
1440                                 break;
1441                         }
1442 #endif
1443                 } endfor_nexthops(fi)
1444                 if (dead == fi->fib_nhs) {
1445                         switch (event) {
1446                         case NETDEV_DOWN:
1447                         case NETDEV_UNREGISTER:
1448                                 fi->fib_flags |= RTNH_F_DEAD;
1449                                 /* fall through */
1450                         case NETDEV_CHANGE:
1451                                 fi->fib_flags |= RTNH_F_LINKDOWN;
1452                                 break;
1453                         }
1454                         ret++;
1455                 }
1456 
1457                 fib_rebalance(fi);
1458         }
1459 
1460         return ret;
1461 }
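
/* Editorial example (not part of fib_semantics.c): how a netdevice event
 * handler is expected to drive fib_sync_down_dev() according to the table
 * above.  This is an illustrative sketch only; the in-tree caller is
 * fib_netdev_event() in net/ipv4/fib_frontend.c and differs in detail
 * (e.g. the force=true case is also used when the last address is removed).
 */
static void example_handle_netdev_down(struct net_device *dev,
                                       unsigned long event)
{
        switch (event) {
        case NETDEV_UNREGISTER:
                /* Device is going away: force, nexthops get LINKDOWN|DEAD. */
                fib_sync_down_dev(dev, event, true);
                break;
        case NETDEV_DOWN:
                /* Administrative down: LINKDOWN|DEAD, not forced. */
                fib_sync_down_dev(dev, event, false);
                break;
        case NETDEV_CHANGE:
                /* Carrier lost: nexthops are only marked LINKDOWN. */
                fib_sync_down_dev(dev, event, false);
                break;
        }
}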
1462 
1463 /* Must be invoked inside an RCU-protected region. */
1464 static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
1465 {
1466         struct fib_info *fi = NULL, *last_resort = NULL;
1467         struct hlist_head *fa_head = res->fa_head;
1468         struct fib_table *tb = res->table;
1469         u8 slen = 32 - res->prefixlen;
1470         int order = -1, last_idx = -1;
1471         struct fib_alias *fa, *fa1 = NULL;
1472         u32 last_prio = res->fi->fib_priority;
1473         u8 last_tos = 0;
1474 
1475         hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
1476                 struct fib_info *next_fi = fa->fa_info;
1477 
1478                 if (fa->fa_slen != slen)
1479                         continue;
1480                 if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1481                         continue;
1482                 if (fa->tb_id != tb->tb_id)
1483                         continue;
1484                 if (next_fi->fib_priority > last_prio &&
1485                     fa->fa_tos == last_tos) {
1486                         if (last_tos)
1487                                 continue;
1488                         break;
1489                 }
1490                 if (next_fi->fib_flags & RTNH_F_DEAD)
1491                         continue;
1492                 last_tos = fa->fa_tos;
1493                 last_prio = next_fi->fib_priority;
1494 
1495                 if (next_fi->fib_scope != res->scope ||
1496                     fa->fa_type != RTN_UNICAST)
1497                         continue;
1498                 if (!next_fi->fib_nh[0].nh_gw ||
1499                     next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1500                         continue;
1501 
1502                 fib_alias_accessed(fa);
1503 
1504                 if (!fi) {
1505                         if (next_fi != res->fi)
1506                                 break;
1507                         fa1 = fa;
1508                 } else if (!fib_detect_death(fi, order, &last_resort,
1509                                              &last_idx, fa1->fa_default)) {
1510                         fib_result_assign(res, fi);
1511                         fa1->fa_default = order;
1512                         goto out;
1513                 }
1514                 fi = next_fi;
1515                 order++;
1516         }
1517 
1518         if (order <= 0 || !fi) {
1519                 if (fa1)
1520                         fa1->fa_default = -1;
1521                 goto out;
1522         }
1523 
1524         if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1525                               fa1->fa_default)) {
1526                 fib_result_assign(res, fi);
1527                 fa1->fa_default = order;
1528                 goto out;
1529         }
1530 
1531         if (last_idx >= 0)
1532                 fib_result_assign(res, last_resort);
1533         fa1->fa_default = last_idx;
1534 out:
1535         return;
1536 }
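
/* Editorial note (not part of fib_semantics.c): fib_select_default() only
 * matters when several default routes (prefix length 0) share the best
 * priority within one table, e.g. (192.0.2.x are RFC 5737 documentation
 * addresses):
 *
 *     ip route add default via 192.0.2.1 dev eth0
 *     ip route append default via 192.0.2.2 dev eth0
 *
 * Roughly: the aliases are walked in order, fib_detect_death() consults the
 * neighbour (ARP) state of each candidate gateway, and the index of the
 * chosen alternative is remembered in fa1->fa_default so that later lookups
 * can move past a gateway that looks dead, falling back to last_resort when
 * every candidate does.
 */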
1537 
1538 /*
1539  * A dead device goes up: wake up its dead nexthops.
1540  * This makes sense only for multipath routes.
1541  */
1542 int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
1543 {
1544         struct fib_info *prev_fi;
1545         unsigned int hash;
1546         struct hlist_head *head;
1547         struct fib_nh *nh;
1548         int ret;
1549 
1550         if (!(dev->flags & IFF_UP))
1551                 return 0;
1552 
1553         if (nh_flags & RTNH_F_DEAD) {
1554                 unsigned int flags = dev_get_flags(dev);
1555 
1556                 if (flags & (IFF_RUNNING | IFF_LOWER_UP))
1557                         nh_flags |= RTNH_F_LINKDOWN;
1558         }
1559 
1560         prev_fi = NULL;
1561         hash = fib_devindex_hashfn(dev->ifindex);
1562         head = &fib_info_devhash[hash];
1563         ret = 0;
1564 
1565         hlist_for_each_entry(nh, head, nh_hash) {
1566                 struct fib_info *fi = nh->nh_parent;
1567                 int alive;
1568 
1569                 BUG_ON(!fi->fib_nhs);
1570                 if (nh->nh_dev != dev || fi == prev_fi)
1571                         continue;
1572 
1573                 prev_fi = fi;
1574                 alive = 0;
1575                 change_nexthops(fi) {
1576                         if (!(nexthop_nh->nh_flags & nh_flags)) {
1577                                 alive++;
1578                                 continue;
1579                         }
1580                         if (!nexthop_nh->nh_dev ||
1581                             !(nexthop_nh->nh_dev->flags & IFF_UP))
1582                                 continue;
1583                         if (nexthop_nh->nh_dev != dev ||
1584                             !__in_dev_get_rtnl(dev))
1585                                 continue;
1586                         alive++;
1587                         nexthop_nh->nh_flags &= ~nh_flags;
1588                         call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_ADD);
1589                 } endfor_nexthops(fi)
1590 
1591                 if (alive > 0) {
1592                         fi->fib_flags &= ~nh_flags;
1593                         ret++;
1594                 }
1595 
1596                 fib_rebalance(fi);
1597         }
1598 
1599         return ret;
1600 }
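
/* Editorial example (not part of fib_semantics.c): a sketch of how "up"
 * transitions are expected to call fib_sync_up().  RTNH_F_DEAD is cleared
 * when the device comes back up, RTNH_F_LINKDOWN when the carrier returns.
 * This mirrors, but is not a copy of, the handling in
 * net/ipv4/fib_frontend.c.
 */
static void example_handle_netdev_up(struct net_device *dev,
                                     unsigned long event)
{
        if (event == NETDEV_UP) {
                /* Administrative up: nexthops on this device are not dead. */
                fib_sync_up(dev, RTNH_F_DEAD);
        } else if (event == NETDEV_CHANGE &&
                   (dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP))) {
                /* Carrier is back: clear only the linkdown flag. */
                fib_sync_up(dev, RTNH_F_LINKDOWN);
        }
}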
1601 
1602 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1603 static bool fib_good_nh(const struct fib_nh *nh)
1604 {
1605         int state = NUD_REACHABLE;
1606 
1607         if (nh->nh_scope == RT_SCOPE_LINK) {
1608                 struct neighbour *n;
1609 
1610                 rcu_read_lock_bh();
1611 
1612                 n = __ipv4_neigh_lookup_noref(nh->nh_dev,
1613                                               (__force u32)nh->nh_gw);
1614                 if (n)
1615                         state = n->nud_state;
1616 
1617                 rcu_read_unlock_bh();
1618         }
1619 
1620         return !!(state & NUD_VALID);
1621 }
1622 
1623 void fib_select_multipath(struct fib_result *res, int hash)
1624 {
1625         struct fib_info *fi = res->fi;
1626         struct net *net = fi->fib_net;
1627         bool first = false;
1628 
1629         for_nexthops(fi) {
1630                 if (hash > atomic_read(&nh->nh_upper_bound))
1631                         continue;
1632 
1633                 if (!net->ipv4.sysctl_fib_multipath_use_neigh ||
1634                     fib_good_nh(nh)) {
1635                         res->nh_sel = nhsel;
1636                         return;
1637                 }
1638                 if (!first) {
1639                         res->nh_sel = nhsel;
1640                         first = true;
1641                 }
1642         } endfor_nexthops(fi);
1643 }
1644 #endif
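
/* Editorial note (not part of fib_semantics.c): fib_select_multipath()
 * implements hash-threshold nexthop selection.  fib_rebalance() precomputes
 * nh_upper_bound so that the 31-bit flow-hash space is split in proportion
 * to the nexthop weights, and the first nexthop whose upper bound is >= the
 * hash wins: with two nexthops of weights 1 and 3, roughly 25% of flows pick
 * the first one.  When the net.ipv4.fib_multipath_use_neigh sysctl is set, a
 * nexthop whose gateway has no valid neighbour entry (fib_good_nh()) is
 * skipped, and the first in-range nexthop is kept as a fallback if none
 * qualifies.
 */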
1645 
1646 void fib_select_path(struct net *net, struct fib_result *res,
1647                      struct flowi4 *fl4, int mp_hash)
1648 {
1649         bool oif_check;
1650 
1651         oif_check = (fl4->flowi4_oif == 0 ||
1652                      fl4->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF);
1653 
1654 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1655         if (res->fi->fib_nhs > 1 && oif_check) {
1656                 if (mp_hash < 0)
1657                         mp_hash = get_hash_from_flowi4(fl4) >> 1;
1658 
1659                 fib_select_multipath(res, mp_hash);
1660         }
1661         else
1662 #endif
1663         if (!res->prefixlen &&
1664             res->table->tb_num_default > 1 &&
1665             res->type == RTN_UNICAST && oif_check)
1666                 fib_select_default(fl4, res);
1667 
1668         if (!fl4->saddr)
1669                 fl4->saddr = FIB_RES_PREFSRC(net, *res);
1670 }
1671 EXPORT_SYMBOL_GPL(fib_select_path);
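
/* Editorial example (not part of fib_semantics.c): a minimal sketch of an
 * in-kernel caller combining fib_lookup() and fib_select_path(), in the
 * spirit of the output-route path in net/ipv4/route.c.  The function name is
 * an assumption; fl4 is assumed to be pre-filled (daddr, tos, oif, ...), and
 * a real caller must consume the fib_result before dropping the RCU read
 * lock (it is dropped here only to keep the sketch short).
 */
static int example_resolve_output_route(struct net *net, struct flowi4 *fl4,
                                        struct fib_result *res)
{
        int err;

        rcu_read_lock();
        err = fib_lookup(net, fl4, res, 0);
        if (!err)
                /* Pick a nexthop (multipath/default handling) and fill in a
                 * preferred source address if none was given.
                 */
                fib_select_path(net, res, fl4, -1);
        rcu_read_unlock();

        return err;
}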
1672 
