~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/net/ipv6/ip6_output.c

Version: ~ [ linux-6.0-rc6 ] ~ [ linux-5.19.10 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.69 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.144 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.214 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.259 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.294 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.329 ] ~ [ linux-4.8.17 ] ~ [ linux-4.7.10 ] ~ [ linux-4.6.7 ] ~ [ linux-4.5.7 ] ~ [ linux-4.4.302 ] ~ [ linux-4.3.6 ] ~ [ linux-4.2.8 ] ~ [ linux-4.1.52 ] ~ [ linux-4.0.9 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.9 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /*
  2  *      IPv6 output functions
  3  *      Linux INET6 implementation
  4  *
  5  *      Authors:
  6  *      Pedro Roque             <roque@di.fc.ul.pt>
  7  *
  8  *      Based on linux/net/ipv4/ip_output.c
  9  *
 10  *      This program is free software; you can redistribute it and/or
 11  *      modify it under the terms of the GNU General Public License
 12  *      as published by the Free Software Foundation; either version
 13  *      2 of the License, or (at your option) any later version.
 14  *
 15  *      Changes:
 16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
 17  *                              extension headers are implemented.
 18  *                              route changes now work.
 19  *                              ip6_forward does not confuse sniffers.
 20  *                              etc.
 21  *
 22  *      H. von Brand    :       Added missing #include <linux/string.h>
 23  *      Imran Patel     :       frag id should be in NBO
 24  *      Kazunori MIYAZAWA @USAGI
 25  *                      :       add ip6_append_data and related functions
 26  *                              for datagram xmit
 27  */
 28 
 29 #include <linux/errno.h>
 30 #include <linux/kernel.h>
 31 #include <linux/string.h>
 32 #include <linux/socket.h>
 33 #include <linux/net.h>
 34 #include <linux/netdevice.h>
 35 #include <linux/if_arp.h>
 36 #include <linux/in6.h>
 37 #include <linux/tcp.h>
 38 #include <linux/route.h>
 39 #include <linux/module.h>
 40 #include <linux/slab.h>
 41 
 42 #include <linux/bpf-cgroup.h>
 43 #include <linux/netfilter.h>
 44 #include <linux/netfilter_ipv6.h>
 45 
 46 #include <net/sock.h>
 47 #include <net/snmp.h>
 48 
 49 #include <net/ipv6.h>
 50 #include <net/ndisc.h>
 51 #include <net/protocol.h>
 52 #include <net/ip6_route.h>
 53 #include <net/addrconf.h>
 54 #include <net/rawv6.h>
 55 #include <net/icmp.h>
 56 #include <net/xfrm.h>
 57 #include <net/checksum.h>
 58 #include <linux/mroute6.h>
 59 #include <net/l3mdev.h>
 60 #include <net/lwtunnel.h>
 61 
/* Final IPv6 transmit step: resolve the L2 neighbour for the route's next
 * hop and hand the skb to it.  Multicast destinations get special handling
 * first (local loopback copy, scope filtering) before transmission.
 * Returns 0 on success/consumed, negative errno on failure; the skb is
 * always consumed.
 */
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		/* Loop a copy back to local group members when multicast
		 * loopback applies: either a multicast-router socket exists
		 * and this skb was not already forwarded, or a local
		 * interface has joined the destination group.
		 */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			/* hop_limit 0: the packet must not leave this host;
			 * the loopback copy above is all that gets delivered.
			 */
			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		/* Node-local scoped multicast must never hit the wire. */
		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	/* A lightweight tunnel attached to the route may take over (or
	 * fully complete) the transmission.
	 */
	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	/* Lockless neighbour lookup under rcu_read_lock_bh(); create the
	 * neighbour entry on demand if none exists yet.
	 */
	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	/* Neighbour creation failed: no way to reach the next hop. */
	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
130 
/* Post-routing finish step: run the egress cgroup-BPF filter, re-route
 * through dst_output() if SNAT attached a new xfrm policy, then either
 * fragment or transmit directly via ip6_finish_output2().
 */
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	/* Give the egress cgroup-BPF program a chance to reject the packet. */
	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		/* NOTE(review): IPCB/IPSKB_REROUTED on an IPv6 skb looks
		 * odd (IP6CB/IP6SKB_REROUTED would be expected) — confirm
		 * against upstream history before changing.
		 */
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	/* Fragment when the packet exceeds the path MTU (and is not GSO),
	 * when the route demands fragmenting everything (allfrag), or when
	 * conntrack defrag recorded a smaller original fragment size.
	 */
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}
156 
157 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
158 {
159         struct net_device *dev = skb_dst(skb)->dev;
160         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161 
162         skb->protocol = htons(ETH_P_IPV6);
163         skb->dev = dev;
164 
165         if (unlikely(idev->cnf.disable_ipv6)) {
166                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
167                 kfree_skb(skb);
168                 return 0;
169         }
170 
171         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
172                             net, sk, skb, NULL, dev,
173                             ip6_finish_output,
174                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
175 }
176 
177 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
178 {
179         if (!np->autoflowlabel_set)
180                 return ip6_default_np_autolabel(net);
181         else
182                 return np->autoflowlabel;
183 }
184 
/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 *
 * Pushes extension headers (if any) and the IPv6 header onto the skb,
 * then sends it through NF_INET_LOCAL_OUT / dst_output().  Packets larger
 * than the path MTU that may not be fragmented get -EMSGSIZE and a local
 * error report instead.  The skb is always consumed.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		/* Re-allocate headroom if the caller's skb cannot hold the
		 * extension headers plus IPv6 and link-layer headers.
		 */
		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (!skb2) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			consume_skb(skb);
			skb = skb2;
			/* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
			 * it is safe to call in our context (socket lock not held)
			 */
			skb_set_owner_w(skb, (struct sock *)sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		/* Non-fragmentable headers may rewrite first_hop (e.g. via a
		 * routing header).
		 */
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	/* Per-socket hop limit unset (-1): take the route's default. */
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	/* Packet exceeds the MTU and may not be fragmented here: report
	 * EMSGSIZE to the owning socket and drop.
	 */
	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
293 
294 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
295 {
296         struct ip6_ra_chain *ra;
297         struct sock *last = NULL;
298 
299         read_lock(&ip6_ra_lock);
300         for (ra = ip6_ra_chain; ra; ra = ra->next) {
301                 struct sock *sk = ra->sk;
302                 if (sk && ra->sel == sel &&
303                     (!sk->sk_bound_dev_if ||
304                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
305                         if (last) {
306                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
307                                 if (skb2)
308                                         rawv6_rcv(last, skb2);
309                         }
310                         last = sk;
311                 }
312         }
313 
314         if (last) {
315                 rawv6_rcv(last, skb);
316                 read_unlock(&ip6_ra_lock);
317                 return 1;
318         }
319         read_unlock(&ip6_ra_lock);
320         return 0;
321 }
322 
/* Decide how a packet destined to an NDP-proxied address is handled:
 *   1  -> deliver to local input (unicast neighbour discovery message),
 *   0  -> forward normally,
 *  -1  -> discard (link-local destination cannot be proxied).
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	/* Locate the upper-layer header behind any extension headers. */
	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Ensure at least the ICMPv6 type byte is in the linear
		 * area before dereferencing it.
		 */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
374 
375 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
376                                      struct sk_buff *skb)
377 {
378         struct dst_entry *dst = skb_dst(skb);
379 
380         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
381         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
382 
383         return dst_output(net, sk, skb);
384 }
385 
386 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
387 {
388         unsigned int mtu;
389         struct inet6_dev *idev;
390 
391         if (dst_metric_locked(dst, RTAX_MTU)) {
392                 mtu = dst_metric_raw(dst, RTAX_MTU);
393                 if (mtu)
394                         return mtu;
395         }
396 
397         mtu = IPV6_MIN_MTU;
398         rcu_read_lock();
399         idev = __in6_dev_get(dst->dev);
400         if (idev)
401                 mtu = idev->cnf.mtu6;
402         rcu_read_unlock();
403 
404         return mtu;
405 }
406 
407 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
408 {
409         if (skb->len <= mtu)
410                 return false;
411 
412         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
413         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
414                 return true;
415 
416         if (skb->ignore_df)
417                 return false;
418 
419         if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
420                 return false;
421 
422         return true;
423 }
424 
/* Forward a received IPv6 packet (this host is not the destination).
 * Performs forwarding-enabled, hop-limit, xfrm policy, NDP-proxy,
 * redirect and MTU checks before handing the packet to the
 * NF_INET_FORWARD hook and then to ip6_forward_finish().
 * The skb is consumed on every path.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	/* Only packets addressed to us at the link layer are forwarded. */
	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	/* Locally generated packets must never enter the forward path. */
	if (unlikely(skb->sk))
		goto drop;

	/* LRO-merged packets cannot be forwarded faithfully. */
	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, ip6_dst_idev(dst),
					IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	/* xfrm6_route_forward() may have replaced the route. */
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/* Make the header private/writable before modifying hop_limit. */
	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
586 
587 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
588 {
589         to->pkt_type = from->pkt_type;
590         to->priority = from->priority;
591         to->protocol = from->protocol;
592         skb_dst_drop(to);
593         skb_dst_set(to, dst_clone(skb_dst(from)));
594         to->dev = from->dev;
595         to->mark = from->mark;
596 
597 #ifdef CONFIG_NET_SCHED
598         to->tc_index = from->tc_index;
599 #endif
600         nf_copy(to, from);
601         skb_copy_secmark(to, from);
602 }
603 
604 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
605                  int (*output)(struct net *, struct sock *, struct sk_buff *))
606 {
607         struct sk_buff *frag;
608         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
609         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
610                                 inet6_sk(skb->sk) : NULL;
611         struct ipv6hdr *tmp_hdr;
612         struct frag_hdr *fh;
613         unsigned int mtu, hlen, left, len;
614         int hroom, troom;
615         __be32 frag_id;
616         int ptr, offset = 0, err = 0;
617         u8 *prevhdr, nexthdr = 0;
618 
619         err = ip6_find_1stfragopt(skb, &prevhdr);
620         if (err < 0)
621                 goto fail;
622         hlen = err;
623         nexthdr = *prevhdr;
624 
625         mtu = ip6_skb_dst_mtu(skb);
626 
627         /* We must not fragment if the socket is set to force MTU discovery
628          * or if the skb it not generated by a local socket.
629          */
630         if (unlikely(!skb->ignore_df && skb->len > mtu))
631                 goto fail_toobig;
632 
633         if (IP6CB(skb)->frag_max_size) {
634                 if (IP6CB(skb)->frag_max_size > mtu)
635                         goto fail_toobig;
636 
637                 /* don't send fragments larger than what we received */
638                 mtu = IP6CB(skb)->frag_max_size;
639                 if (mtu < IPV6_MIN_MTU)
640                         mtu = IPV6_MIN_MTU;
641         }
642 
643         if (np && np->frag_size < mtu) {
644                 if (np->frag_size)
645                         mtu = np->frag_size;
646         }
647         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
648                 goto fail_toobig;
649         mtu -= hlen + sizeof(struct frag_hdr);
650 
651         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
652                                     &ipv6_hdr(skb)->saddr);
653 
654         if (skb->ip_summed == CHECKSUM_PARTIAL &&
655             (err = skb_checksum_help(skb)))
656                 goto fail;
657 
658         hroom = LL_RESERVED_SPACE(rt->dst.dev);
659         if (skb_has_frag_list(skb)) {
660                 unsigned int first_len = skb_pagelen(skb);
661                 struct sk_buff *frag2;
662 
663                 if (first_len - hlen > mtu ||
664                     ((first_len - hlen) & 7) ||
665                     skb_cloned(skb) ||
666                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
667                         goto slow_path;
668 
669                 skb_walk_frags(skb, frag) {
670                         /* Correct geometry. */
671                         if (frag->len > mtu ||
672                             ((frag->len & 7) && frag->next) ||
673                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
674                                 goto slow_path_clean;
675 
676                         /* Partially cloned skb? */
677                         if (skb_shared(frag))
678                                 goto slow_path_clean;
679 
680                         BUG_ON(frag->sk);
681                         if (skb->sk) {
682                                 frag->sk = skb->sk;
683                                 frag->destructor = sock_wfree;
684                         }
685                         skb->truesize -= frag->truesize;
686                 }
687 
688                 err = 0;
689                 offset = 0;
690                 /* BUILD HEADER */
691 
692                 *prevhdr = NEXTHDR_FRAGMENT;
693                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
694                 if (!tmp_hdr) {
695                         err = -ENOMEM;
696                         goto fail;
697                 }
698                 frag = skb_shinfo(skb)->frag_list;
699                 skb_frag_list_init(skb);
700 
701                 __skb_pull(skb, hlen);
702                 fh = __skb_push(skb, sizeof(struct frag_hdr));
703                 __skb_push(skb, hlen);
704                 skb_reset_network_header(skb);
705                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
706 
707                 fh->nexthdr = nexthdr;
708                 fh->reserved = 0;
709                 fh->frag_off = htons(IP6_MF);
710                 fh->identification = frag_id;
711 
712                 first_len = skb_pagelen(skb);
713                 skb->data_len = first_len - skb_headlen(skb);
714                 skb->len = first_len;
715                 ipv6_hdr(skb)->payload_len = htons(first_len -
716                                                    sizeof(struct ipv6hdr));
717 
718                 for (;;) {
719                         /* Prepare header of the next frame,
720                          * before previous one went down. */
721                         if (frag) {
722                                 frag->ip_summed = CHECKSUM_NONE;
723                                 skb_reset_transport_header(frag);
724                                 fh = __skb_push(frag, sizeof(struct frag_hdr));
725                                 __skb_push(frag, hlen);
726                                 skb_reset_network_header(frag);
727                                 memcpy(skb_network_header(frag), tmp_hdr,
728                                        hlen);
729                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
730                                 fh->nexthdr = nexthdr;
731                                 fh->reserved = 0;
732                                 fh->frag_off = htons(offset);
733                                 if (frag->next)
734                                         fh->frag_off |= htons(IP6_MF);
735                                 fh->identification = frag_id;
736                                 ipv6_hdr(frag)->payload_len =
737                                                 htons(frag->len -
738                                                       sizeof(struct ipv6hdr));
739                                 ip6_copy_metadata(frag, skb);
740                         }
741 
742                         err = output(net, sk, skb);
743                         if (!err)
744                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
745                                               IPSTATS_MIB_FRAGCREATES);
746 
747                         if (err || !frag)
748                                 break;
749 
750                         skb = frag;
751                         frag = skb->next;
752                         skb->next = NULL;
753                 }
754 
755                 kfree(tmp_hdr);
756 
757                 if (err == 0) {
758                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
759                                       IPSTATS_MIB_FRAGOKS);
760                         return 0;
761                 }
762 
763                 kfree_skb_list(frag);
764 
765                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
766                               IPSTATS_MIB_FRAGFAILS);
767                 return err;
768 
769 slow_path_clean:
770                 skb_walk_frags(skb, frag2) {
771                         if (frag2 == frag)
772                                 break;
773                         frag2->sk = NULL;
774                         frag2->destructor = NULL;
775                         skb->truesize += frag2->truesize;
776                 }
777         }
778 
779 slow_path:
780         left = skb->len - hlen;         /* Space per frame */
781         ptr = hlen;                     /* Where to start from */
782 
783         /*
784          *      Fragment the datagram.
785          */
786 
787         troom = rt->dst.dev->needed_tailroom;
788 
789         /*
790          *      Keep copying data until we run out.
791          */
792         while (left > 0)        {
793                 u8 *fragnexthdr_offset;
794 
795                 len = left;
796                 /* IF: it doesn't fit, use 'mtu' - the data space left */
797                 if (len > mtu)
798                         len = mtu;
799                 /* IF: we are not sending up to and including the packet end
800                    then align the next start on an eight byte boundary */
801                 if (len < left) {
802                         len &= ~7;
803                 }
804 
805                 /* Allocate buffer */
806                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
807                                  hroom + troom, GFP_ATOMIC);
808                 if (!frag) {
809                         err = -ENOMEM;
810                         goto fail;
811                 }
812 
813                 /*
814                  *      Set up data on packet
815                  */
816 
817                 ip6_copy_metadata(frag, skb);
818                 skb_reserve(frag, hroom);
819                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
820                 skb_reset_network_header(frag);
821                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
822                 frag->transport_header = (frag->network_header + hlen +
823                                           sizeof(struct frag_hdr));
824 
825                 /*
826                  *      Charge the memory for the fragment to any owner
827                  *      it might possess
828                  */
829                 if (skb->sk)
830                         skb_set_owner_w(frag, skb->sk);
831 
832                 /*
833                  *      Copy the packet header into the new buffer.
834                  */
835                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
836 
837                 fragnexthdr_offset = skb_network_header(frag);
838                 fragnexthdr_offset += prevhdr - skb_network_header(skb);
839                 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
840 
841                 /*
842                  *      Build fragment header.
843                  */
844                 fh->nexthdr = nexthdr;
845                 fh->reserved = 0;
846                 fh->identification = frag_id;
847 
848                 /*
849                  *      Copy a block of the IP datagram.
850                  */
851                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
852                                      len));
853                 left -= len;
854 
855                 fh->frag_off = htons(offset);
856                 if (left > 0)
857                         fh->frag_off |= htons(IP6_MF);
858                 ipv6_hdr(frag)->payload_len = htons(frag->len -
859                                                     sizeof(struct ipv6hdr));
860 
861                 ptr += len;
862                 offset += len;
863 
864                 /*
865                  *      Put this fragment into the sending queue.
866                  */
867                 err = output(net, sk, frag);
868                 if (err)
869                         goto fail;
870 
871                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
872                               IPSTATS_MIB_FRAGCREATES);
873         }
874         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
875                       IPSTATS_MIB_FRAGOKS);
876         consume_skb(skb);
877         return err;
878 
879 fail_toobig:
880         if (skb->sk && dst_allfrag(skb_dst(skb)))
881                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
882 
883         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
884         err = -EMSGSIZE;
885 
886 fail:
887         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
888                       IPSTATS_MIB_FRAGFAILS);
889         kfree_skb(skb);
890         return err;
891 }
892 
893 static inline int ip6_rt_check(const struct rt6key *rt_key,
894                                const struct in6_addr *fl_addr,
895                                const struct in6_addr *addr_cache)
896 {
897         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
898                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
899 }
900 
/*
 * ip6_sk_dst_check - validate a socket's cached dst against a new flow.
 * @sk:  socket owning the cached route
 * @dst: cached dst entry; may be NULL
 * @fl6: flow the caller is about to transmit on
 *
 * Returns @dst if it is still usable for @fl6; otherwise releases the
 * reference and returns NULL so the caller performs a fresh lookup.
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	/* A cached non-IPv6 dst (e.g. from a v4-mapped path) is unusable here. */
	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
947 
/*
 * ip6_dst_lookup_tail - common back end for the ip6_dst_lookup*() family.
 * @net: network namespace for the lookup
 * @sk:  socket providing route hints; may be NULL
 * @dst: in/out dst pointer; on success *dst holds a referenced route,
 *       on failure it is released and set to NULL
 * @fl6: flow to look up; saddr may be filled in here when it was 'any'
 *
 * Returns 0 on success or a negative errno.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	/* Retry the lookup now that fl6->saddr may be populated. */
	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	/* A v4-mapped source with a non-v4-mapped destination cannot work. */
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}
1059 
/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace to perform the lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	/* Start from scratch: the tail helper fills in *dst on success. */
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1077 
1078 /**
1079  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1080  *      @sk: socket which provides route info
1081  *      @fl6: flow to lookup
1082  *      @final_dst: final destination address for ipsec lookup
1083  *
1084  *      This function performs a route lookup on the given flow.
1085  *
1086  *      It returns a valid dst pointer on success, or a pointer encoded
1087  *      error code.
1088  */
1089 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1090                                       const struct in6_addr *final_dst)
1091 {
1092         struct dst_entry *dst = NULL;
1093         int err;
1094 
1095         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1096         if (err)
1097                 return ERR_PTR(err);
1098         if (final_dst)
1099                 fl6->daddr = *final_dst;
1100 
1101         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1102 }
1103 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1104 
1105 /**
1106  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1107  *      @sk: socket which provides the dst cache and route info
1108  *      @fl6: flow to lookup
1109  *      @final_dst: final destination address for ipsec lookup
1110  *
1111  *      This function performs a route lookup on the given flow with the
1112  *      possibility of using the cached route in the socket if it is valid.
1113  *      It will take the socket dst lock when operating on the dst cache.
1114  *      As a result, this function can only be used in process context.
1115  *
1116  *      It returns a valid dst pointer on success, or a pointer encoded
1117  *      error code.
1118  */
1119 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1120                                          const struct in6_addr *final_dst)
1121 {
1122         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1123 
1124         dst = ip6_sk_dst_check(sk, dst, fl6);
1125         if (!dst)
1126                 dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1127 
1128         return dst;
1129 }
1130 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1131 
1132 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1133                                                gfp_t gfp)
1134 {
1135         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1136 }
1137 
1138 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1139                                                 gfp_t gfp)
1140 {
1141         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1142 }
1143 
1144 static void ip6_append_data_mtu(unsigned int *mtu,
1145                                 int *maxfraglen,
1146                                 unsigned int fragheaderlen,
1147                                 struct sk_buff *skb,
1148                                 struct rt6_info *rt,
1149                                 unsigned int orig_mtu)
1150 {
1151         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1152                 if (!skb) {
1153                         /* first fragment, reserve header_len */
1154                         *mtu = orig_mtu - rt->dst.header_len;
1155 
1156                 } else {
1157                         /*
1158                          * this fragment is not first, the headers
1159                          * space is regarded as data space.
1160                          */
1161                         *mtu = orig_mtu;
1162                 }
1163                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1164                               + fragheaderlen - sizeof(struct frag_hdr);
1165         }
1166 }
1167 
1168 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1169                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1170                           struct rt6_info *rt, struct flowi6 *fl6)
1171 {
1172         struct ipv6_pinfo *np = inet6_sk(sk);
1173         unsigned int mtu;
1174         struct ipv6_txoptions *opt = ipc6->opt;
1175 
1176         /*
1177          * setup for corking
1178          */
1179         if (opt) {
1180                 if (WARN_ON(v6_cork->opt))
1181                         return -EINVAL;
1182 
1183                 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1184                 if (unlikely(!v6_cork->opt))
1185                         return -ENOBUFS;
1186 
1187                 v6_cork->opt->tot_len = sizeof(*opt);
1188                 v6_cork->opt->opt_flen = opt->opt_flen;
1189                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1190 
1191                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1192                                                     sk->sk_allocation);
1193                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1194                         return -ENOBUFS;
1195 
1196                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1197                                                     sk->sk_allocation);
1198                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1199                         return -ENOBUFS;
1200 
1201                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1202                                                    sk->sk_allocation);
1203                 if (opt->hopopt && !v6_cork->opt->hopopt)
1204                         return -ENOBUFS;
1205 
1206                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1207                                                     sk->sk_allocation);
1208                 if (opt->srcrt && !v6_cork->opt->srcrt)
1209                         return -ENOBUFS;
1210 
1211                 /* need source address above miyazawa*/
1212         }
1213         dst_hold(&rt->dst);
1214         cork->base.dst = &rt->dst;
1215         cork->fl.u.ip6 = *fl6;
1216         v6_cork->hop_limit = ipc6->hlimit;
1217         v6_cork->tclass = ipc6->tclass;
1218         if (rt->dst.flags & DST_XFRM_TUNNEL)
1219                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1220                       READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1221         else
1222                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1223                       READ_ONCE(rt->dst.dev->mtu) : dst_mtu(rt->dst.path);
1224         if (np->frag_size < mtu) {
1225                 if (np->frag_size)
1226                         mtu = np->frag_size;
1227         }
1228         if (mtu < IPV6_MIN_MTU)
1229                 return -EINVAL;
1230         cork->base.fragsize = mtu;
1231         if (dst_allfrag(rt->dst.path))
1232                 cork->base.flags |= IPCORK_ALLFRAG;
1233         cork->base.length = 0;
1234 
1235         return 0;
1236 }
1237 
/*
 * __ip6_append_data - core of ip6_append_data()/ip6_make_skb().
 *
 * Appends @length bytes, fetched via @getfrag, to the pending queue
 * @queue, allocating new skbs sized to the corked MTU as needed and
 * leaving headroom for the IPv6/fragment/IPsec headers that are filled
 * in later when the queue is flushed.
 *
 * Returns 0 on success or a negative errno; on error the bytes that
 * could not be queued are subtracted from cork->length again.
 */
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6,
			     const struct sockcm_cookie *sockc)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;

	/* Empty queue means this call creates the first skb, which must
	 * also carry the extension headers and dst (IPsec) header space.
	 */
	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	mtu = cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	/* Per-fragment header overhead: IPv6 header, non-fragmentable
	 * dst headers, and non-fragmentable extension headers.
	 */
	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	/* IPV6_DONTFRAG: report the path MTU to the app instead of
	 * fragmenting (UDP/RAW only).
	 */
	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    !(flags & MSG_MORE) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	/* Capture tx timestamp flags/key; consumed by the first skb only. */
	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
		sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
			tskey = sk->sk_tskey++;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail of
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			/* Guard against datalen shrinking below the
			 * transport header + fraggap we must carry over.
			 */
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = tx_flags;
			tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			/* Move the 8-byte-alignment overhang from the
			 * previous skb into this one, fixing checksums.
			 */
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		/* Device cannot do scatter/gather: copy into linear data. */
		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* Scatter/gather: append via the socket page frag. */
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			refcount_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	/* Undo the optimistic accounting for the bytes we failed to queue. */
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1545 
1546 int ip6_append_data(struct sock *sk,
1547                     int getfrag(void *from, char *to, int offset, int len,
1548                                 int odd, struct sk_buff *skb),
1549                     void *from, int length, int transhdrlen,
1550                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1551                     struct rt6_info *rt, unsigned int flags,
1552                     const struct sockcm_cookie *sockc)
1553 {
1554         struct inet_sock *inet = inet_sk(sk);
1555         struct ipv6_pinfo *np = inet6_sk(sk);
1556         int exthdrlen;
1557         int err;
1558 
1559         if (flags&MSG_PROBE)
1560                 return 0;
1561         if (skb_queue_empty(&sk->sk_write_queue)) {
1562                 /*
1563                  * setup for corking
1564                  */
1565                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1566                                      ipc6, rt, fl6);
1567                 if (err)
1568                         return err;
1569 
1570                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1571                 length += exthdrlen;
1572                 transhdrlen += exthdrlen;
1573         } else {
1574                 fl6 = &inet->cork.fl.u.ip6;
1575                 transhdrlen = 0;
1576         }
1577 
1578         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1579                                  &np->cork, sk_page_frag(sk), getfrag,
1580                                  from, length, transhdrlen, flags, ipc6, sockc);
1581 }
1582 EXPORT_SYMBOL_GPL(ip6_append_data);
1583 
1584 static void ip6_cork_release(struct inet_cork_full *cork,
1585                              struct inet6_cork *v6_cork)
1586 {
1587         if (v6_cork->opt) {
1588                 kfree(v6_cork->opt->dst0opt);
1589                 kfree(v6_cork->opt->dst1opt);
1590                 kfree(v6_cork->opt->hopopt);
1591                 kfree(v6_cork->opt->srcrt);
1592                 kfree(v6_cork->opt);
1593                 v6_cork->opt = NULL;
1594         }
1595 
1596         if (cork->base.dst) {
1597                 dst_release(cork->base.dst);
1598                 cork->base.dst = NULL;
1599                 cork->base.flags &= ~IPCORK_ALLFRAG;
1600         }
1601         memset(&cork->fl, 0, sizeof(cork->fl));
1602 }
1603 
/*
 * __ip6_make_skb - coalesce the corked queue into one skb and build its
 * IPv6 header.
 *
 * Dequeues every pending fragment from @queue, chains the tail skbs onto
 * the head's frag_list, pushes any corked extension headers plus the
 * IPv6 header, stamps priority/mark, attaches the corked route and
 * updates output statistics.  Releases the cork state on success.
 *
 * Returns the finished skb, or NULL if the queue was empty (in which
 * case the cork is deliberately left intact).
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Fold the remaining queued fragments into the head skb's
	 * frag_list, accumulating length/truesize and detaching their
	 * socket ownership (the head skb carries the accounting).
	 */
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	/* Copy daddr first: pushing a routing header below may rewrite
	 * the destination used in the IPv6 header.
	 */
	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	/* Prepend and populate the fixed IPv6 header. */
	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* The skb takes its own reference on the corked route. */
	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		/* ICMPv6 output is also counted per message type. */
		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}
1677 
1678 int ip6_send_skb(struct sk_buff *skb)
1679 {
1680         struct net *net = sock_net(skb->sk);
1681         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1682         int err;
1683 
1684         err = ip6_local_out(net, skb->sk, skb);
1685         if (err) {
1686                 if (err > 0)
1687                         err = net_xmit_errno(err);
1688                 if (err)
1689                         IP6_INC_STATS(net, rt->rt6i_idev,
1690                                       IPSTATS_MIB_OUTDISCARDS);
1691         }
1692 
1693         return err;
1694 }
1695 
/*
 * ip6_push_pending_frames - finalize and transmit the corked data.
 *
 * An empty write queue is not an error: there is simply nothing to
 * send.  Returns 0 or a negative errno from the output path.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1707 
1708 static void __ip6_flush_pending_frames(struct sock *sk,
1709                                        struct sk_buff_head *queue,
1710                                        struct inet_cork_full *cork,
1711                                        struct inet6_cork *v6_cork)
1712 {
1713         struct sk_buff *skb;
1714 
1715         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1716                 if (skb_dst(skb))
1717                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1718                                       IPSTATS_MIB_OUTDISCARDS);
1719                 kfree_skb(skb);
1720         }
1721 
1722         ip6_cork_release(cork, v6_cork);
1723 }
1724 
1725 void ip6_flush_pending_frames(struct sock *sk)
1726 {
1727         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1728                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1729 }
1730 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1731 
1732 struct sk_buff *ip6_make_skb(struct sock *sk,
1733                              int getfrag(void *from, char *to, int offset,
1734                                          int len, int odd, struct sk_buff *skb),
1735                              void *from, int length, int transhdrlen,
1736                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1737                              struct rt6_info *rt, unsigned int flags,
1738                              const struct sockcm_cookie *sockc)
1739 {
1740         struct inet_cork_full cork;
1741         struct inet6_cork v6_cork;
1742         struct sk_buff_head queue;
1743         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1744         int err;
1745 
1746         if (flags & MSG_PROBE)
1747                 return NULL;
1748 
1749         __skb_queue_head_init(&queue);
1750 
1751         cork.base.flags = 0;
1752         cork.base.addr = 0;
1753         cork.base.opt = NULL;
1754         cork.base.dst = NULL;
1755         v6_cork.opt = NULL;
1756         err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1757         if (err) {
1758                 ip6_cork_release(&cork, &v6_cork);
1759                 return ERR_PTR(err);
1760         }
1761         if (ipc6->dontfrag < 0)
1762                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1763 
1764         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1765                                 &current->task_frag, getfrag, from,
1766                                 length + exthdrlen, transhdrlen + exthdrlen,
1767                                 flags, ipc6, sockc);
1768         if (err) {
1769                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1770                 return ERR_PTR(err);
1771         }
1772 
1773         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1774 }
1775 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | Wiki (Japanese) | Wiki (English) | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

osdn.jp