1 /* 2 * IPv6 output functions 3 * Linux INET6 implementation 4 * 5 * Authors: 6 * Pedro Roque <roque@di.fc.ul.pt> 7 * 8 * Based on linux/net/ipv4/ip_output.c 9 * 10 * This program is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU General Public License 12 * as published by the Free Software Foundation; either version 13 * 2 of the License, or (at your option) any later version. 14 * 15 * Changes: 16 * A.N.Kuznetsov : airthmetics in fragmentation. 17 * extension headers are implemented. 18 * route changes now work. 19 * ip6_forward does not confuse sniffers. 20 * etc. 21 * 22 * H. von Brand : Added missing #include <linux/string.h> 23 * Imran Patel : frag id should be in NBO 24 * Kazunori MIYAZAWA @USAGI 25 * : add ip6_append_data and related functions 26 * for datagram xmit 27 */ 28 29 #include <linux/errno.h> 30 #include <linux/kernel.h> 31 #include <linux/string.h> 32 #include <linux/socket.h> 33 #include <linux/net.h> 34 #include <linux/netdevice.h> 35 #include <linux/if_arp.h> 36 #include <linux/in6.h> 37 #include <linux/tcp.h> 38 #include <linux/route.h> 39 #include <linux/module.h> 40 #include <linux/slab.h> 41 42 #include <linux/netfilter.h> 43 #include <linux/netfilter_ipv6.h> 44 45 #include <net/sock.h> 46 #include <net/snmp.h> 47 48 #include <net/ipv6.h> 49 #include <net/ndisc.h> 50 #include <net/protocol.h> 51 #include <net/ip6_route.h> 52 #include <net/addrconf.h> 53 #include <net/rawv6.h> 54 #include <net/icmp.h> 55 #include <net/xfrm.h> 56 #include <net/checksum.h> 57 #include <linux/mroute6.h> 58 #include <net/l3mdev.h> 59 60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) 61 { 62 struct dst_entry *dst = skb_dst(skb); 63 struct net_device *dev = dst->dev; 64 struct neighbour *neigh; 65 struct in6_addr *nexthop; 66 int ret; 67 68 skb->protocol = htons(ETH_P_IPV6); 69 skb->dev = dev; 70 71 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) { 72 struct inet6_dev *idev = 
ip6_dst_idev(skb_dst(skb)); 73 74 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) && 75 ((mroute6_socket(net, skb) && 76 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) || 77 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr, 78 &ipv6_hdr(skb)->saddr))) { 79 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); 80 81 /* Do not check for IFF_ALLMULTI; multicast routing 82 is not supported in any case. 83 */ 84 if (newskb) 85 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, 86 net, sk, newskb, NULL, newskb->dev, 87 dev_loopback_xmit); 88 89 if (ipv6_hdr(skb)->hop_limit == 0) { 90 IP6_INC_STATS(net, idev, 91 IPSTATS_MIB_OUTDISCARDS); 92 kfree_skb(skb); 93 return 0; 94 } 95 } 96 97 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len); 98 99 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <= 100 IPV6_ADDR_SCOPE_NODELOCAL && 101 !(dev->flags & IFF_LOOPBACK)) { 102 kfree_skb(skb); 103 return 0; 104 } 105 } 106 107 rcu_read_lock_bh(); 108 nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr); 109 neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); 110 if (unlikely(!neigh)) 111 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); 112 if (!IS_ERR(neigh)) { 113 ret = dst_neigh_output(dst, neigh, skb); 114 rcu_read_unlock_bh(); 115 return ret; 116 } 117 rcu_read_unlock_bh(); 118 119 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); 120 kfree_skb(skb); 121 return -EINVAL; 122 } 123 124 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 125 { 126 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || 127 dst_allfrag(skb_dst(skb)) || 128 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) 129 return ip6_fragment(net, sk, skb, ip6_finish_output2); 130 else 131 return ip6_finish_output2(net, sk, skb); 132 } 133 134 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) 135 { 136 struct net_device *dev = skb_dst(skb)->dev; 137 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); 138 
139 if (unlikely(idev->cnf.disable_ipv6)) { 140 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); 141 kfree_skb(skb); 142 return 0; 143 } 144 145 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, 146 net, sk, skb, NULL, dev, 147 ip6_finish_output, 148 !(IP6CB(skb)->flags & IP6SKB_REROUTED)); 149 } 150 151 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) 152 { 153 if (!np->autoflowlabel_set) 154 return ip6_default_np_autolabel(net); 155 else 156 return np->autoflowlabel; 157 } 158 159 /* 160 * xmit an sk_buff (used by TCP, SCTP and DCCP) 161 * Note : socket lock is not held for SYNACK packets, but might be modified 162 * by calls to skb_set_owner_w() and ipv6_local_error(), 163 * which are using proper atomic operations or spinlocks. 164 */ 165 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, 166 struct ipv6_txoptions *opt, int tclass) 167 { 168 struct net *net = sock_net(sk); 169 const struct ipv6_pinfo *np = inet6_sk(sk); 170 struct in6_addr *first_hop = &fl6->daddr; 171 struct dst_entry *dst = skb_dst(skb); 172 unsigned int head_room; 173 struct ipv6hdr *hdr; 174 u8 proto = fl6->flowi6_proto; 175 int seg_len = skb->len; 176 int hlimit = -1; 177 u32 mtu; 178 179 head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev); 180 if (opt) 181 head_room += opt->opt_nflen + opt->opt_flen; 182 183 if (unlikely(skb_headroom(skb) < head_room)) { 184 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); 185 if (!skb2) { 186 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 187 IPSTATS_MIB_OUTDISCARDS); 188 kfree_skb(skb); 189 return -ENOBUFS; 190 } 191 if (skb->sk) 192 skb_set_owner_w(skb2, skb->sk); 193 consume_skb(skb); 194 skb = skb2; 195 } 196 197 if (opt) { 198 seg_len += opt->opt_nflen + opt->opt_flen; 199 200 if (opt->opt_flen) 201 ipv6_push_frag_opts(skb, opt, &proto); 202 203 if (opt->opt_nflen) 204 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop); 205 } 206 207 skb_push(skb, sizeof(struct ipv6hdr)); 
208 skb_reset_network_header(skb); 209 hdr = ipv6_hdr(skb); 210 211 /* 212 * Fill in the IPv6 header 213 */ 214 if (np) 215 hlimit = np->hop_limit; 216 if (hlimit < 0) 217 hlimit = ip6_dst_hoplimit(dst); 218 219 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, 220 ip6_autoflowlabel(net, np), fl6)); 221 222 hdr->payload_len = htons(seg_len); 223 hdr->nexthdr = proto; 224 hdr->hop_limit = hlimit; 225 226 hdr->saddr = fl6->saddr; 227 hdr->daddr = *first_hop; 228 229 skb->protocol = htons(ETH_P_IPV6); 230 skb->priority = sk->sk_priority; 231 skb->mark = sk->sk_mark; 232 233 mtu = dst_mtu(dst); 234 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { 235 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)), 236 IPSTATS_MIB_OUT, skb->len); 237 /* hooks should never assume socket lock is held. 238 * we promote our socket to non const 239 */ 240 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, 241 net, (struct sock *)sk, skb, NULL, dst->dev, 242 dst_output); 243 } 244 245 skb->dev = dst->dev; 246 /* ipv6_local_error() does not require socket lock, 247 * we promote our socket to non const 248 */ 249 ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu); 250 251 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); 252 kfree_skb(skb); 253 return -EMSGSIZE; 254 } 255 EXPORT_SYMBOL(ip6_xmit); 256 257 static int ip6_call_ra_chain(struct sk_buff *skb, int sel) 258 { 259 struct ip6_ra_chain *ra; 260 struct sock *last = NULL; 261 262 read_lock(&ip6_ra_lock); 263 for (ra = ip6_ra_chain; ra; ra = ra->next) { 264 struct sock *sk = ra->sk; 265 if (sk && ra->sel == sel && 266 (!sk->sk_bound_dev_if || 267 sk->sk_bound_dev_if == skb->dev->ifindex)) { 268 if (last) { 269 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 270 if (skb2) 271 rawv6_rcv(last, skb2); 272 } 273 last = sk; 274 } 275 } 276 277 if (last) { 278 rawv6_rcv(last, skb); 279 read_unlock(&ip6_ra_lock); 280 return 1; 281 } 282 read_unlock(&ip6_ra_lock); 283 return 0; 284 } 285 
286 static int ip6_forward_proxy_check(struct sk_buff *skb) 287 { 288 struct ipv6hdr *hdr = ipv6_hdr(skb); 289 u8 nexthdr = hdr->nexthdr; 290 __be16 frag_off; 291 int offset; 292 293 if (ipv6_ext_hdr(nexthdr)) { 294 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off); 295 if (offset < 0) 296 return 0; 297 } else 298 offset = sizeof(struct ipv6hdr); 299 300 if (nexthdr == IPPROTO_ICMPV6) { 301 struct icmp6hdr *icmp6; 302 303 if (!pskb_may_pull(skb, (skb_network_header(skb) + 304 offset + 1 - skb->data))) 305 return 0; 306 307 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset); 308 309 switch (icmp6->icmp6_type) { 310 case NDISC_ROUTER_SOLICITATION: 311 case NDISC_ROUTER_ADVERTISEMENT: 312 case NDISC_NEIGHBOUR_SOLICITATION: 313 case NDISC_NEIGHBOUR_ADVERTISEMENT: 314 case NDISC_REDIRECT: 315 /* For reaction involving unicast neighbor discovery 316 * message destined to the proxied address, pass it to 317 * input function. 318 */ 319 return 1; 320 default: 321 break; 322 } 323 } 324 325 /* 326 * The proxying router can't forward traffic sent to a link-local 327 * address, so signal the sender and discard the packet. This 328 * behavior is clarified by the MIPv6 specification. 
329 */ 330 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) { 331 dst_link_failure(skb); 332 return -1; 333 } 334 335 return 0; 336 } 337 338 static inline int ip6_forward_finish(struct net *net, struct sock *sk, 339 struct sk_buff *skb) 340 { 341 struct dst_entry *dst = skb_dst(skb); 342 343 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); 344 IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len); 345 skb_sender_cpu_clear(skb); 346 return dst_output(net, sk, skb); 347 } 348 349 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) 350 { 351 unsigned int mtu; 352 struct inet6_dev *idev; 353 354 if (dst_metric_locked(dst, RTAX_MTU)) { 355 mtu = dst_metric_raw(dst, RTAX_MTU); 356 if (mtu) 357 return mtu; 358 } 359 360 mtu = IPV6_MIN_MTU; 361 rcu_read_lock(); 362 idev = __in6_dev_get(dst->dev); 363 if (idev) 364 mtu = idev->cnf.mtu6; 365 rcu_read_unlock(); 366 367 return mtu; 368 } 369 370 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) 371 { 372 if (skb->len <= mtu) 373 return false; 374 375 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */ 376 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu) 377 return true; 378 379 if (skb->ignore_df) 380 return false; 381 382 if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) 383 return false; 384 385 return true; 386 } 387 388 int ip6_forward(struct sk_buff *skb) 389 { 390 struct dst_entry *dst = skb_dst(skb); 391 struct ipv6hdr *hdr = ipv6_hdr(skb); 392 struct inet6_skb_parm *opt = IP6CB(skb); 393 struct net *net = dev_net(dst->dev); 394 u32 mtu; 395 396 if (net->ipv6.devconf_all->forwarding == 0) 397 goto error; 398 399 if (skb->pkt_type != PACKET_HOST) 400 goto drop; 401 402 if (unlikely(skb->sk)) 403 goto drop; 404 405 if (skb_warn_if_lro(skb)) 406 goto drop; 407 408 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { 409 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), 410 IPSTATS_MIB_INDISCARDS); 411 goto 
drop; 412 } 413 414 skb_forward_csum(skb); 415 416 /* 417 * We DO NOT make any processing on 418 * RA packets, pushing them to user level AS IS 419 * without ane WARRANTY that application will be able 420 * to interpret them. The reason is that we 421 * cannot make anything clever here. 422 * 423 * We are not end-node, so that if packet contains 424 * AH/ESP, we cannot make anything. 425 * Defragmentation also would be mistake, RA packets 426 * cannot be fragmented, because there is no warranty 427 * that different fragments will go along one path. --ANK 428 */ 429 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) { 430 if (ip6_call_ra_chain(skb, ntohs(opt->ra))) 431 return 0; 432 } 433 434 /* 435 * check and decrement ttl 436 */ 437 if (hdr->hop_limit <= 1) { 438 /* Force OUTPUT device used as source address */ 439 skb->dev = dst->dev; 440 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); 441 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), 442 IPSTATS_MIB_INHDRERRORS); 443 444 kfree_skb(skb); 445 return -ETIMEDOUT; 446 } 447 448 /* XXX: idev->cnf.proxy_ndp? */ 449 if (net->ipv6.devconf_all->proxy_ndp && 450 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) { 451 int proxied = ip6_forward_proxy_check(skb); 452 if (proxied > 0) 453 return ip6_input(skb); 454 else if (proxied < 0) { 455 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), 456 IPSTATS_MIB_INDISCARDS); 457 goto drop; 458 } 459 } 460 461 if (!xfrm6_route_forward(skb)) { 462 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), 463 IPSTATS_MIB_INDISCARDS); 464 goto drop; 465 } 466 dst = skb_dst(skb); 467 468 /* IPv6 specs say nothing about it, but it is clear that we cannot 469 send redirects to source routed frames. 470 We don't send redirects to frames decapsulated from IPsec. 
471 */ 472 if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) { 473 struct in6_addr *target = NULL; 474 struct inet_peer *peer; 475 struct rt6_info *rt; 476 477 /* 478 * incoming and outgoing devices are the same 479 * send a redirect. 480 */ 481 482 rt = (struct rt6_info *) dst; 483 if (rt->rt6i_flags & RTF_GATEWAY) 484 target = &rt->rt6i_gateway; 485 else 486 target = &hdr->daddr; 487 488 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1); 489 490 /* Limit redirects both by destination (here) 491 and by source (inside ndisc_send_redirect) 492 */ 493 if (inet_peer_xrlim_allow(peer, 1*HZ)) 494 ndisc_send_redirect(skb, target); 495 if (peer) 496 inet_putpeer(peer); 497 } else { 498 int addrtype = ipv6_addr_type(&hdr->saddr); 499 500 /* This check is security critical. */ 501 if (addrtype == IPV6_ADDR_ANY || 502 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK)) 503 goto error; 504 if (addrtype & IPV6_ADDR_LINKLOCAL) { 505 icmpv6_send(skb, ICMPV6_DEST_UNREACH, 506 ICMPV6_NOT_NEIGHBOUR, 0); 507 goto error; 508 } 509 } 510 511 mtu = ip6_dst_mtu_forward(dst); 512 if (mtu < IPV6_MIN_MTU) 513 mtu = IPV6_MIN_MTU; 514 515 if (ip6_pkt_too_big(skb, mtu)) { 516 /* Again, force OUTPUT device used as source address */ 517 skb->dev = dst->dev; 518 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 519 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), 520 IPSTATS_MIB_INTOOBIGERRORS); 521 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), 522 IPSTATS_MIB_FRAGFAILS); 523 kfree_skb(skb); 524 return -EMSGSIZE; 525 } 526 527 if (skb_cow(skb, dst->dev->hard_header_len)) { 528 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), 529 IPSTATS_MIB_OUTDISCARDS); 530 goto drop; 531 } 532 533 hdr = ipv6_hdr(skb); 534 535 /* Mangling hops number delayed to point after skb COW */ 536 537 hdr->hop_limit--; 538 539 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, 540 net, NULL, skb, skb->dev, dst->dev, 541 ip6_forward_finish); 542 543 error: 544 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS); 
545 drop: 546 kfree_skb(skb); 547 return -EINVAL; 548 } 549 550 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) 551 { 552 to->pkt_type = from->pkt_type; 553 to->priority = from->priority; 554 to->protocol = from->protocol; 555 skb_dst_drop(to); 556 skb_dst_set(to, dst_clone(skb_dst(from))); 557 to->dev = from->dev; 558 to->mark = from->mark; 559 560 skb_copy_hash(to, from); 561 562 #ifdef CONFIG_NET_SCHED 563 to->tc_index = from->tc_index; 564 #endif 565 nf_copy(to, from); 566 skb_copy_secmark(to, from); 567 } 568 569 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, 570 int (*output)(struct net *, struct sock *, struct sk_buff *)) 571 { 572 struct sk_buff *frag; 573 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); 574 struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? 575 inet6_sk(skb->sk) : NULL; 576 struct ipv6hdr *tmp_hdr; 577 struct frag_hdr *fh; 578 unsigned int mtu, hlen, left, len, nexthdr_offset; 579 int hroom, troom; 580 __be32 frag_id; 581 int ptr, offset = 0, err = 0; 582 u8 *prevhdr, nexthdr = 0; 583 584 err = ip6_find_1stfragopt(skb, &prevhdr); 585 if (err < 0) 586 goto fail; 587 hlen = err; 588 nexthdr = *prevhdr; 589 nexthdr_offset = prevhdr - skb_network_header(skb); 590 591 mtu = ip6_skb_dst_mtu(skb); 592 593 /* We must not fragment if the socket is set to force MTU discovery 594 * or if the skb it not generated by a local socket. 
595 */ 596 if (unlikely(!skb->ignore_df && skb->len > mtu)) 597 goto fail_toobig; 598 599 if (IP6CB(skb)->frag_max_size) { 600 if (IP6CB(skb)->frag_max_size > mtu) 601 goto fail_toobig; 602 603 /* don't send fragments larger than what we received */ 604 mtu = IP6CB(skb)->frag_max_size; 605 if (mtu < IPV6_MIN_MTU) 606 mtu = IPV6_MIN_MTU; 607 } 608 609 if (np && np->frag_size < mtu) { 610 if (np->frag_size) 611 mtu = np->frag_size; 612 } 613 if (mtu < hlen + sizeof(struct frag_hdr) + 8) 614 goto fail_toobig; 615 mtu -= hlen + sizeof(struct frag_hdr); 616 617 frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr, 618 &ipv6_hdr(skb)->saddr); 619 620 if (skb->ip_summed == CHECKSUM_PARTIAL && 621 (err = skb_checksum_help(skb))) 622 goto fail; 623 624 prevhdr = skb_network_header(skb) + nexthdr_offset; 625 hroom = LL_RESERVED_SPACE(rt->dst.dev); 626 if (skb_has_frag_list(skb)) { 627 int first_len = skb_pagelen(skb); 628 struct sk_buff *frag2; 629 630 if (first_len - hlen > mtu || 631 ((first_len - hlen) & 7) || 632 skb_cloned(skb) || 633 skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) 634 goto slow_path; 635 636 skb_walk_frags(skb, frag) { 637 /* Correct geometry. */ 638 if (frag->len > mtu || 639 ((frag->len & 7) && frag->next) || 640 skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr))) 641 goto slow_path_clean; 642 643 /* Partially cloned skb? 
*/ 644 if (skb_shared(frag)) 645 goto slow_path_clean; 646 647 BUG_ON(frag->sk); 648 if (skb->sk) { 649 frag->sk = skb->sk; 650 frag->destructor = sock_wfree; 651 } 652 skb->truesize -= frag->truesize; 653 } 654 655 err = 0; 656 offset = 0; 657 /* BUILD HEADER */ 658 659 *prevhdr = NEXTHDR_FRAGMENT; 660 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); 661 if (!tmp_hdr) { 662 err = -ENOMEM; 663 goto fail; 664 } 665 frag = skb_shinfo(skb)->frag_list; 666 skb_frag_list_init(skb); 667 668 __skb_pull(skb, hlen); 669 fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr)); 670 __skb_push(skb, hlen); 671 skb_reset_network_header(skb); 672 memcpy(skb_network_header(skb), tmp_hdr, hlen); 673 674 fh->nexthdr = nexthdr; 675 fh->reserved = 0; 676 fh->frag_off = htons(IP6_MF); 677 fh->identification = frag_id; 678 679 first_len = skb_pagelen(skb); 680 skb->data_len = first_len - skb_headlen(skb); 681 skb->len = first_len; 682 ipv6_hdr(skb)->payload_len = htons(first_len - 683 sizeof(struct ipv6hdr)); 684 685 dst_hold(&rt->dst); 686 687 for (;;) { 688 /* Prepare header of the next frame, 689 * before previous one went down. 
*/ 690 if (frag) { 691 frag->ip_summed = CHECKSUM_NONE; 692 skb_reset_transport_header(frag); 693 fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr)); 694 __skb_push(frag, hlen); 695 skb_reset_network_header(frag); 696 memcpy(skb_network_header(frag), tmp_hdr, 697 hlen); 698 offset += skb->len - hlen - sizeof(struct frag_hdr); 699 fh->nexthdr = nexthdr; 700 fh->reserved = 0; 701 fh->frag_off = htons(offset); 702 if (frag->next) 703 fh->frag_off |= htons(IP6_MF); 704 fh->identification = frag_id; 705 ipv6_hdr(frag)->payload_len = 706 htons(frag->len - 707 sizeof(struct ipv6hdr)); 708 ip6_copy_metadata(frag, skb); 709 } 710 711 err = output(net, sk, skb); 712 if (!err) 713 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), 714 IPSTATS_MIB_FRAGCREATES); 715 716 if (err || !frag) 717 break; 718 719 skb = frag; 720 frag = skb->next; 721 skb->next = NULL; 722 } 723 724 kfree(tmp_hdr); 725 726 if (err == 0) { 727 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), 728 IPSTATS_MIB_FRAGOKS); 729 ip6_rt_put(rt); 730 return 0; 731 } 732 733 kfree_skb_list(frag); 734 735 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), 736 IPSTATS_MIB_FRAGFAILS); 737 ip6_rt_put(rt); 738 return err; 739 740 slow_path_clean: 741 skb_walk_frags(skb, frag2) { 742 if (frag2 == frag) 743 break; 744 frag2->sk = NULL; 745 frag2->destructor = NULL; 746 skb->truesize += frag2->truesize; 747 } 748 } 749 750 slow_path: 751 left = skb->len - hlen; /* Space per frame */ 752 ptr = hlen; /* Where to start from */ 753 754 /* 755 * Fragment the datagram. 756 */ 757 758 troom = rt->dst.dev->needed_tailroom; 759 760 /* 761 * Keep copying data until we run out. 
762 */ 763 while (left > 0) { 764 u8 *fragnexthdr_offset; 765 766 len = left; 767 /* IF: it doesn't fit, use 'mtu' - the data space left */ 768 if (len > mtu) 769 len = mtu; 770 /* IF: we are not sending up to and including the packet end 771 then align the next start on an eight byte boundary */ 772 if (len < left) { 773 len &= ~7; 774 } 775 776 /* Allocate buffer */ 777 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + 778 hroom + troom, GFP_ATOMIC); 779 if (!frag) { 780 err = -ENOMEM; 781 goto fail; 782 } 783 784 /* 785 * Set up data on packet 786 */ 787 788 ip6_copy_metadata(frag, skb); 789 skb_reserve(frag, hroom); 790 skb_put(frag, len + hlen + sizeof(struct frag_hdr)); 791 skb_reset_network_header(frag); 792 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen); 793 frag->transport_header = (frag->network_header + hlen + 794 sizeof(struct frag_hdr)); 795 796 /* 797 * Charge the memory for the fragment to any owner 798 * it might possess 799 */ 800 if (skb->sk) 801 skb_set_owner_w(frag, skb->sk); 802 803 /* 804 * Copy the packet header into the new buffer. 805 */ 806 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen); 807 808 fragnexthdr_offset = skb_network_header(frag); 809 fragnexthdr_offset += prevhdr - skb_network_header(skb); 810 *fragnexthdr_offset = NEXTHDR_FRAGMENT; 811 812 /* 813 * Build fragment header. 814 */ 815 fh->nexthdr = nexthdr; 816 fh->reserved = 0; 817 fh->identification = frag_id; 818 819 /* 820 * Copy a block of the IP datagram. 821 */ 822 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag), 823 len)); 824 left -= len; 825 826 fh->frag_off = htons(offset); 827 if (left > 0) 828 fh->frag_off |= htons(IP6_MF); 829 ipv6_hdr(frag)->payload_len = htons(frag->len - 830 sizeof(struct ipv6hdr)); 831 832 ptr += len; 833 offset += len; 834 835 /* 836 * Put this fragment into the sending queue. 
837 */ 838 err = output(net, sk, frag); 839 if (err) 840 goto fail; 841 842 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 843 IPSTATS_MIB_FRAGCREATES); 844 } 845 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 846 IPSTATS_MIB_FRAGOKS); 847 consume_skb(skb); 848 return err; 849 850 fail_toobig: 851 if (skb->sk && dst_allfrag(skb_dst(skb))) 852 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK); 853 854 skb->dev = skb_dst(skb)->dev; 855 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 856 err = -EMSGSIZE; 857 858 fail: 859 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 860 IPSTATS_MIB_FRAGFAILS); 861 kfree_skb(skb); 862 return err; 863 } 864 865 static inline int ip6_rt_check(const struct rt6key *rt_key, 866 const struct in6_addr *fl_addr, 867 const struct in6_addr *addr_cache) 868 { 869 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) && 870 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache)); 871 } 872 873 static struct dst_entry *ip6_sk_dst_check(struct sock *sk, 874 struct dst_entry *dst, 875 const struct flowi6 *fl6) 876 { 877 struct ipv6_pinfo *np = inet6_sk(sk); 878 struct rt6_info *rt; 879 880 if (!dst) 881 goto out; 882 883 if (dst->ops->family != AF_INET6) { 884 dst_release(dst); 885 return NULL; 886 } 887 888 rt = (struct rt6_info *)dst; 889 /* Yes, checking route validity in not connected 890 * case is not very simple. Take into account, 891 * that we do not support routing by source, TOS, 892 * and MSG_DONTROUTE --ANK (980726) 893 * 894 * 1. ip6_rt_check(): If route was host route, 895 * check that cached destination is current. 896 * If it is network route, we still may 897 * check its validity using saved pointer 898 * to the last used address: daddr_cache. 899 * We do not want to save whole address now, 900 * (because main consumer of this service 901 * is tcp, which has not this problem), 902 * so that the last trick works only on connected 903 * sockets. 904 * 2. oif also should be the same. 
905 */ 906 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || 907 #ifdef CONFIG_IPV6_SUBTREES 908 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || 909 #endif 910 (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) && 911 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) { 912 dst_release(dst); 913 dst = NULL; 914 } 915 916 out: 917 return dst; 918 } 919 920 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, 921 struct dst_entry **dst, struct flowi6 *fl6) 922 { 923 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD 924 struct neighbour *n; 925 struct rt6_info *rt; 926 #endif 927 int err; 928 int flags = 0; 929 930 /* The correct way to handle this would be to do 931 * ip6_route_get_saddr, and then ip6_route_output; however, 932 * the route-specific preferred source forces the 933 * ip6_route_output call _before_ ip6_route_get_saddr. 934 * 935 * In source specific routing (no src=any default route), 936 * ip6_route_output will fail given src=any saddr, though, so 937 * that's why we try it again later. 938 */ 939 if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) { 940 struct rt6_info *rt; 941 bool had_dst = *dst != NULL; 942 943 if (!had_dst) 944 *dst = ip6_route_output(net, sk, fl6); 945 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst; 946 err = ip6_route_get_saddr(net, rt, &fl6->daddr, 947 sk ? inet6_sk(sk)->srcprefs : 0, 948 &fl6->saddr); 949 if (err) 950 goto out_err_release; 951 952 /* If we had an erroneous initial result, pretend it 953 * never existed and let the SA-enabled version take 954 * over. 
955 */ 956 if (!had_dst && (*dst)->error) { 957 dst_release(*dst); 958 *dst = NULL; 959 } 960 961 if (fl6->flowi6_oif) 962 flags |= RT6_LOOKUP_F_IFACE; 963 } 964 965 if (!*dst) 966 *dst = ip6_route_output_flags(net, sk, fl6, flags); 967 968 err = (*dst)->error; 969 if (err) 970 goto out_err_release; 971 972 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD 973 /* 974 * Here if the dst entry we've looked up 975 * has a neighbour entry that is in the INCOMPLETE 976 * state and the src address from the flow is 977 * marked as OPTIMISTIC, we release the found 978 * dst entry and replace it instead with the 979 * dst entry of the nexthop router 980 */ 981 rt = (struct rt6_info *) *dst; 982 rcu_read_lock_bh(); 983 n = __ipv6_neigh_lookup_noref(rt->dst.dev, 984 rt6_nexthop(rt, &fl6->daddr)); 985 err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0; 986 rcu_read_unlock_bh(); 987 988 if (err) { 989 struct inet6_ifaddr *ifp; 990 struct flowi6 fl_gw6; 991 int redirect; 992 993 ifp = ipv6_get_ifaddr(net, &fl6->saddr, 994 (*dst)->dev, 1); 995 996 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC); 997 if (ifp) 998 in6_ifa_put(ifp); 999 1000 if (redirect) { 1001 /* 1002 * We need to get the dst entry for the 1003 * default router instead 1004 */ 1005 dst_release(*dst); 1006 memcpy(&fl_gw6, fl6, sizeof(struct flowi6)); 1007 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr)); 1008 *dst = ip6_route_output(net, sk, &fl_gw6); 1009 err = (*dst)->error; 1010 if (err) 1011 goto out_err_release; 1012 } 1013 } 1014 #endif 1015 if (ipv6_addr_v4mapped(&fl6->saddr) && 1016 !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) { 1017 err = -EAFNOSUPPORT; 1018 goto out_err_release; 1019 } 1020 1021 return 0; 1022 1023 out_err_release: 1024 if (err == -ENETUNREACH) 1025 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES); 1026 dst_release(*dst); 1027 *dst = NULL; 1028 return err; 1029 } 1030 1031 /** 1032 * ip6_dst_lookup - perform route lookup on flow 1033 * @sk: socket which provides route info 1034 
* @dst: pointer to dst_entry * for result 1035 * @fl6: flow to lookup 1036 * 1037 * This function performs a route lookup on the given flow. 1038 * 1039 * It returns zero on success, or a standard errno code on error. 1040 */ 1041 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, 1042 struct flowi6 *fl6) 1043 { 1044 *dst = NULL; 1045 return ip6_dst_lookup_tail(net, sk, dst, fl6); 1046 } 1047 EXPORT_SYMBOL_GPL(ip6_dst_lookup); 1048 1049 /** 1050 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec 1051 * @sk: socket which provides route info 1052 * @fl6: flow to lookup 1053 * @final_dst: final destination address for ipsec lookup 1054 * 1055 * This function performs a route lookup on the given flow. 1056 * 1057 * It returns a valid dst pointer on success, or a pointer encoded 1058 * error code. 1059 */ 1060 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, 1061 const struct in6_addr *final_dst) 1062 { 1063 struct dst_entry *dst = NULL; 1064 int err; 1065 1066 err = ip6_dst_lookup_tail(net, sk, &dst, fl6); 1067 if (err) 1068 return ERR_PTR(err); 1069 if (final_dst) 1070 fl6->daddr = *final_dst; 1071 if (!fl6->flowi6_oif) 1072 fl6->flowi6_oif = l3mdev_fib_oif(dst->dev); 1073 1074 return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0); 1075 } 1076 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); 1077 1078 /** 1079 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow 1080 * @sk: socket which provides the dst cache and route info 1081 * @fl6: flow to lookup 1082 * @final_dst: final destination address for ipsec lookup 1083 * 1084 * This function performs a route lookup on the given flow with the 1085 * possibility of using the cached route in the socket if it is valid. 1086 * It will take the socket dst lock when operating on the dst cache. 1087 * As a result, this function can only be used in process context. 
 *
 * It returns a valid dst pointer on success, or a pointer encoded
 * error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst)
{
	/* Reuse the socket's cached dst when still valid for this flow;
	 * otherwise fall back to a full route lookup.
	 */
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (!dst)
		dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

/* Append datagram data as a single large (UFO) skb so the device can
 * segment it, instead of building one skb per fragment in software.
 * Called from __ip6_append_data() when the UFO conditions hold.
 */
static inline int ip6_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int exthdrlen, int transhdrlen, int mtu,
			unsigned int flags, const struct flowi6 *fl6)

{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	skb = skb_peek_tail(queue);
	if (!skb) {
		/* First call for this datagram: allocate the head skb that
		 * will carry the headers; payload goes into page frags.
		 */
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (!skb)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_set_network_header(skb, exthdrlen);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->protocol = htons(ETH_P_IPV6);
		skb->csum = 0;

		__skb_queue_tail(queue, skb);
	} else if (skb_is_gso(skb)) {
		/* Head skb already set up as GSO on a previous call;
		 * just keep appending payload.
		 */
		goto append;
	}

	skb->ip_summed = CHECKSUM_PARTIAL;
	/* Specify the length of each IPv6 datagram fragment.
	 * It has to be a multiple of 8.
	 */
	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
				     sizeof(struct frag_hdr)) & ~7;
	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	/* Pre-pick the fragment ID now; the device/GSO layer will stamp
	 * it into the fragment headers it emits.
	 */
	skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
							 &fl6->daddr,
							 &fl6->saddr);

append:
	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}

/* Duplicate an IPv6 options header for the cork. Per RFC 8200, hdrlen
 * counts 8-octet units beyond the first 8 octets, hence (hdrlen + 1) * 8.
 */
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

/* Same as ip6_opt_dup() but for a routing header. */
static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

/* Recompute *mtu and *maxfraglen when moving from the first fragment
 * (which must reserve dst->header_len, e.g. for IPsec) to subsequent
 * fragments. No-op for DST_XFRM_TUNNEL routes, where the tunnel MTU
 * already accounts for the outer headers.
 */
static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		/* Payload of every non-final fragment must be a multiple
		 * of 8 (RFC 8200), and maxfraglen leaves room for the
		 * fragment header itself.
		 */
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

/* Initialize the cork state (copied txoptions, held dst, flow, mtu) for
 * a corked send. Returns 0 or a negative errno. On a partial-allocation
 * failure the already-duplicated option headers stay in v6_cork->opt;
 * NOTE(review): they appear to be reclaimed only via ip6_cork_release()
 * — confirm every error path of the callers eventually releases the cork.
 */
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork,
			  int hlimit, int tclass, struct ipv6_txoptions *opt,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = hlimit;
	v6_cork->tclass = tclass;
	/* Pick the MTU: probe mode uses the raw device MTU, otherwise the
	 * path MTU (inner dst for XFRM tunnels, dst.path otherwise).
	 */
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(rt->dst.path);
	/* A non-zero IPV6_MTU socket option may shrink the MTU further. */
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	if (dst_allfrag(rt->dst.path))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	return 0;
}

/* Core of ip6_append_data()/ip6_make_skb(): copy user data (via getfrag)
 * into the pending queue, splitting it into MTU-sized skbs with room
 * reserved for fragment/extension headers. Returns 0 or a negative errno;
 * on error the queued data is left for the caller to flush.
 */
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, int dontfrag)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;

	skb = skb_peek_tail(queue);
	if (!skb) {
		/* First chunk of this datagram: account for fragmentable
		 * extension headers and any dst (e.g. IPsec) header space.
		 */
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	mtu = cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	/* Per-fragment header overhead: base header, non-fragmentable
	 * route headers, and non-fragmentable extension headers.
	 */
	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		/* IPV6_DONTFRAG: report the path MTU to the app instead of
		 * fragmenting.
		 */
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				  sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length < mtu - headersize &&
	    !(flags & MSG_MORE) &&
	    rt->dst.dev->features & NETIF_F_V6_CSUM)
		csummode = CHECKSUM_PARTIAL;

	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
		sock_tx_timestamp(sk, &tx_flags);
		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
			tskey = sk->sk_tskey++;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail of
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if ((skb && skb_is_gso(skb)) ||
	    (((length + (skb ? skb->len : headersize)) > mtu) &&
	    (skb_queue_len(queue) <= 1) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) &&
	    (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk))) {
		/* Oversized UDP on a UFO-capable device: build one large
		 * GSO skb and let the device segment it.
		 */
		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
					  hh_len, fragheaderlen, exthdrlen,
					  transhdrlen, mtu, flags, fl6);
		if (err)
			goto error;
		return 0;
	}

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				/* First skb of the datagram: may block per
				 * MSG_DONTWAIT.
				 */
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				/* Follow-up fragments: allow up to double
				 * the send buffer before giving up.
				 */
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = tx_flags;
			tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the 8-byte-alignment overhang from the
				 * previous skb into this one, keeping its
				 * checksum consistent.
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			/* transport header and ext-header space only exist
			 * in the first skb of the datagram.
			 */
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			/* No scatter-gather: copy into the skb's linear
			 * tailroom.
			 */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* Scatter-gather path: copy into (possibly shared)
			 * page fragments attached to the skb.
			 */
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

/* Public corked-append entry point: cork on first use, then append data
 * to the socket's write queue via __ip6_append_data(). Returns 0 or a
 * negative errno.
 */
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen, int hlimit,
		    int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit,
				     tclass, opt, rt, fl6);
		if (err)
			return err;

		exthdrlen = (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		/* Already corked: keep using the flow captured at cork
		 * time, and only the first chunk carries headers.
		 */
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, dontfrag);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

/* Free the duplicated txoptions and drop the dst reference taken by
 * ip6_setup_cork(), resetting the cork for reuse.
 */
static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

/* Collapse the pending queue into one skb (extra skbs become the frag
 * list), push extension headers and the IPv6 header, and release the
 * cork. Returns the finished skb, or NULL if the queue was empty.
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		/* Ownership moves to the head skb; avoid double accounting
		 * of socket write memory.
		 */
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		/* May rewrite final_dst when a routing header is present. */
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

/* Hand a finished skb to the output path, translating the qdisc return
 * code and bumping OUTDISCARDS on real errors.
 */
int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

/* Finish the corked datagram on sk's write queue and transmit it. */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

/* Drop everything queued for a corked send and release the cork. */
static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

/* One-shot (uncorked) variant of ip6_append_data() + ip6_finish_skb():
 * builds the whole datagram on a private queue with stack-local cork
 * state. Returns the skb, an ERR_PTR, or NULL for MSG_PROBE.
 */
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     int hlimit, int tclass,
			     struct ipv6_txoptions *opt, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     int dontfrag)
{
	struct inet_cork_full cork;
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (opt ? opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.base.flags = 0;
	cork.base.addr = 0;
	cork.base.opt = NULL;
	cork.base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
	if (err) {
		ip6_cork_release(&cork, &v6_cork);
		return ERR_PTR(err);
	}

	if (dontfrag < 0)
		dontfrag = inet6_sk(sk)->dontfrag;

	/* NOTE(review): "&current" below was garbled to "&curren;t" (HTML
	 * entity) in the extracted source; restored to the per-task page
	 * frag used upstream.
	 */
	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, dontfrag);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
}
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.