1 /* 2 * IPVS An implementation of the IP virtual server support for the 3 * LINUX operating system. IPVS is now implemented as a module 4 * over the Netfilter framework. IPVS can be used to build a 5 * high-performance and highly available server based on a 6 * cluster of servers. 7 * 8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 9 * Peter Kese <peter.kese@ijs.si> 10 * Julian Anastasov <ja@ssi.bg> 11 * 12 * This program is free software; you can redistribute it and/or 13 * modify it under the terms of the GNU General Public License 14 * as published by the Free Software Foundation; either version 15 * 2 of the License, or (at your option) any later version. 16 * 17 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, 18 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms 19 * and others. 20 * 21 * Changes: 22 * Paul `Rusty' Russell properly handle non-linear skbs 23 * Harald Welte don't use nfcache 24 * 25 */ 26 27 #define KMSG_COMPONENT "IPVS" 28 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 29 30 #include <linux/module.h> 31 #include <linux/kernel.h> 32 #include <linux/ip.h> 33 #include <linux/tcp.h> 34 #include <linux/sctp.h> 35 #include <linux/icmp.h> 36 #include <linux/slab.h> 37 38 #include <net/ip.h> 39 #include <net/tcp.h> 40 #include <net/udp.h> 41 #include <net/icmp.h> /* for icmp_send */ 42 #include <net/route.h> 43 #include <net/ip6_checksum.h> 44 #include <net/netns/generic.h> /* net_generic() */ 45 46 #include <linux/netfilter.h> 47 #include <linux/netfilter_ipv4.h> 48 49 #ifdef CONFIG_IP_VS_IPV6 50 #include <net/ipv6.h> 51 #include <linux/netfilter_ipv6.h> 52 #include <net/ip6_route.h> 53 #endif 54 55 #include <net/ip_vs.h> 56 57 58 EXPORT_SYMBOL(register_ip_vs_scheduler); 59 EXPORT_SYMBOL(unregister_ip_vs_scheduler); 60 EXPORT_SYMBOL(ip_vs_proto_name); 61 EXPORT_SYMBOL(ip_vs_conn_new); 62 EXPORT_SYMBOL(ip_vs_conn_in_get); 63 EXPORT_SYMBOL(ip_vs_conn_out_get); 64 #ifdef CONFIG_IP_VS_PROTO_TCP 65 
EXPORT_SYMBOL(ip_vs_tcp_conn_listen); 66 #endif 67 EXPORT_SYMBOL(ip_vs_conn_put); 68 #ifdef CONFIG_IP_VS_DEBUG 69 EXPORT_SYMBOL(ip_vs_get_debug_level); 70 #endif 71 72 static int ip_vs_net_id __read_mostly; 73 /* netns cnt used for uniqueness */ 74 static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0); 75 76 /* ID used in ICMP lookups */ 77 #define icmp_id(icmph) (((icmph)->un).echo.id) 78 #define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier) 79 80 const char *ip_vs_proto_name(unsigned int proto) 81 { 82 static char buf[20]; 83 84 switch (proto) { 85 case IPPROTO_IP: 86 return "IP"; 87 case IPPROTO_UDP: 88 return "UDP"; 89 case IPPROTO_TCP: 90 return "TCP"; 91 case IPPROTO_SCTP: 92 return "SCTP"; 93 case IPPROTO_ICMP: 94 return "ICMP"; 95 #ifdef CONFIG_IP_VS_IPV6 96 case IPPROTO_ICMPV6: 97 return "ICMPv6"; 98 #endif 99 default: 100 sprintf(buf, "IP_%u", proto); 101 return buf; 102 } 103 } 104 105 void ip_vs_init_hash_table(struct list_head *table, int rows) 106 { 107 while (--rows >= 0) 108 INIT_LIST_HEAD(&table[rows]); 109 } 110 111 static inline void 112 ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) 113 { 114 struct ip_vs_dest *dest = cp->dest; 115 struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); 116 117 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { 118 struct ip_vs_cpu_stats *s; 119 struct ip_vs_service *svc; 120 121 s = this_cpu_ptr(dest->stats.cpustats); 122 s->ustats.inpkts++; 123 u64_stats_update_begin(&s->syncp); 124 s->ustats.inbytes += skb->len; 125 u64_stats_update_end(&s->syncp); 126 127 rcu_read_lock(); 128 svc = rcu_dereference(dest->svc); 129 s = this_cpu_ptr(svc->stats.cpustats); 130 s->ustats.inpkts++; 131 u64_stats_update_begin(&s->syncp); 132 s->ustats.inbytes += skb->len; 133 u64_stats_update_end(&s->syncp); 134 rcu_read_unlock(); 135 136 s = this_cpu_ptr(ipvs->tot_stats.cpustats); 137 s->ustats.inpkts++; 138 u64_stats_update_begin(&s->syncp); 139 s->ustats.inbytes += skb->len; 140 u64_stats_update_end(&s->syncp); 141 
} 142 } 143 144 145 static inline void 146 ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) 147 { 148 struct ip_vs_dest *dest = cp->dest; 149 struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); 150 151 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { 152 struct ip_vs_cpu_stats *s; 153 struct ip_vs_service *svc; 154 155 s = this_cpu_ptr(dest->stats.cpustats); 156 s->ustats.outpkts++; 157 u64_stats_update_begin(&s->syncp); 158 s->ustats.outbytes += skb->len; 159 u64_stats_update_end(&s->syncp); 160 161 rcu_read_lock(); 162 svc = rcu_dereference(dest->svc); 163 s = this_cpu_ptr(svc->stats.cpustats); 164 s->ustats.outpkts++; 165 u64_stats_update_begin(&s->syncp); 166 s->ustats.outbytes += skb->len; 167 u64_stats_update_end(&s->syncp); 168 rcu_read_unlock(); 169 170 s = this_cpu_ptr(ipvs->tot_stats.cpustats); 171 s->ustats.outpkts++; 172 u64_stats_update_begin(&s->syncp); 173 s->ustats.outbytes += skb->len; 174 u64_stats_update_end(&s->syncp); 175 } 176 } 177 178 179 static inline void 180 ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) 181 { 182 struct netns_ipvs *ipvs = net_ipvs(svc->net); 183 struct ip_vs_cpu_stats *s; 184 185 s = this_cpu_ptr(cp->dest->stats.cpustats); 186 s->ustats.conns++; 187 188 s = this_cpu_ptr(svc->stats.cpustats); 189 s->ustats.conns++; 190 191 s = this_cpu_ptr(ipvs->tot_stats.cpustats); 192 s->ustats.conns++; 193 } 194 195 196 static inline void 197 ip_vs_set_state(struct ip_vs_conn *cp, int direction, 198 const struct sk_buff *skb, 199 struct ip_vs_proto_data *pd) 200 { 201 if (likely(pd->pp->state_transition)) 202 pd->pp->state_transition(cp, direction, skb, pd); 203 } 204 205 static inline int 206 ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, 207 struct sk_buff *skb, int protocol, 208 const union nf_inet_addr *caddr, __be16 cport, 209 const union nf_inet_addr *vaddr, __be16 vport, 210 struct ip_vs_conn_param *p) 211 { 212 ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, 
vaddr, 213 vport, p); 214 p->pe = rcu_dereference(svc->pe); 215 if (p->pe && p->pe->fill_param) 216 return p->pe->fill_param(p, skb); 217 218 return 0; 219 } 220 221 /* 222 * IPVS persistent scheduling function 223 * It creates a connection entry according to its template if exists, 224 * or selects a server and creates a connection entry plus a template. 225 * Locking: we are svc user (svc->refcnt), so we hold all dests too 226 * Protocols supported: TCP, UDP 227 */ 228 static struct ip_vs_conn * 229 ip_vs_sched_persist(struct ip_vs_service *svc, 230 struct sk_buff *skb, __be16 src_port, __be16 dst_port, 231 int *ignored, struct ip_vs_iphdr *iph) 232 { 233 struct ip_vs_conn *cp = NULL; 234 struct ip_vs_dest *dest; 235 struct ip_vs_conn *ct; 236 __be16 dport = 0; /* destination port to forward */ 237 unsigned int flags; 238 struct ip_vs_conn_param param; 239 const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; 240 union nf_inet_addr snet; /* source network of the client, 241 after masking */ 242 243 /* Mask saddr with the netmask to adjust template granularity */ 244 #ifdef CONFIG_IP_VS_IPV6 245 if (svc->af == AF_INET6) 246 ipv6_addr_prefix(&snet.in6, &iph->saddr.in6, 247 (__force __u32) svc->netmask); 248 else 249 #endif 250 snet.ip = iph->saddr.ip & svc->netmask; 251 252 IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " 253 "mnet %s\n", 254 IP_VS_DBG_ADDR(svc->af, &iph->saddr), ntohs(src_port), 255 IP_VS_DBG_ADDR(svc->af, &iph->daddr), ntohs(dst_port), 256 IP_VS_DBG_ADDR(svc->af, &snet)); 257 258 /* 259 * As far as we know, FTP is a very complicated network protocol, and 260 * it uses control connection and data connections. For active FTP, 261 * FTP server initialize data connection to the client, its source port 262 * is often 20. For passive FTP, FTP server tells the clients the port 263 * that it passively listens to, and the client issues the data 264 * connection. 
In the tunneling or direct routing mode, the load 265 * balancer is on the client-to-server half of connection, the port 266 * number is unknown to the load balancer. So, a conn template like 267 * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP 268 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport> 269 * is created for other persistent services. 270 */ 271 { 272 int protocol = iph->protocol; 273 const union nf_inet_addr *vaddr = &iph->daddr; 274 __be16 vport = 0; 275 276 if (dst_port == svc->port) { 277 /* non-FTP template: 278 * <protocol, caddr, 0, vaddr, vport, daddr, dport> 279 * FTP template: 280 * <protocol, caddr, 0, vaddr, 0, daddr, 0> 281 */ 282 if (svc->port != FTPPORT) 283 vport = dst_port; 284 } else { 285 /* Note: persistent fwmark-based services and 286 * persistent port zero service are handled here. 287 * fwmark template: 288 * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0> 289 * port zero template: 290 * <protocol,caddr,0,vaddr,0,daddr,0> 291 */ 292 if (svc->fwmark) { 293 protocol = IPPROTO_IP; 294 vaddr = &fwmark; 295 } 296 } 297 /* return *ignored = -1 so NF_DROP can be used */ 298 if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0, 299 vaddr, vport, ¶m) < 0) { 300 *ignored = -1; 301 return NULL; 302 } 303 } 304 305 /* Check if a template already exists */ 306 ct = ip_vs_ct_in_get(¶m); 307 if (!ct || !ip_vs_check_template(ct)) { 308 struct ip_vs_scheduler *sched; 309 310 /* 311 * No template found or the dest of the connection 312 * template is not available. 313 * return *ignored=0 i.e. 
ICMP and NF_DROP 314 */ 315 sched = rcu_dereference(svc->scheduler); 316 if (sched) { 317 /* read svc->sched_data after svc->scheduler */ 318 smp_rmb(); 319 dest = sched->schedule(svc, skb, iph); 320 } else { 321 dest = NULL; 322 } 323 if (!dest) { 324 IP_VS_DBG(1, "p-schedule: no dest found.\n"); 325 kfree(param.pe_data); 326 *ignored = 0; 327 return NULL; 328 } 329 330 if (dst_port == svc->port && svc->port != FTPPORT) 331 dport = dest->port; 332 333 /* Create a template 334 * This adds param.pe_data to the template, 335 * and thus param.pe_data will be destroyed 336 * when the template expires */ 337 ct = ip_vs_conn_new(¶m, dest->af, &dest->addr, dport, 338 IP_VS_CONN_F_TEMPLATE, dest, skb->mark); 339 if (ct == NULL) { 340 kfree(param.pe_data); 341 *ignored = -1; 342 return NULL; 343 } 344 345 ct->timeout = svc->timeout; 346 } else { 347 /* set destination with the found template */ 348 dest = ct->dest; 349 kfree(param.pe_data); 350 } 351 352 dport = dst_port; 353 if (dport == svc->port && dest->port) 354 dport = dest->port; 355 356 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET 357 && iph->protocol == IPPROTO_UDP) ? 358 IP_VS_CONN_F_ONE_PACKET : 0; 359 360 /* 361 * Create a new connection according to the template 362 */ 363 ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr, 364 src_port, &iph->daddr, dst_port, ¶m); 365 366 cp = ip_vs_conn_new(¶m, dest->af, &dest->addr, dport, flags, dest, 367 skb->mark); 368 if (cp == NULL) { 369 ip_vs_conn_put(ct); 370 *ignored = -1; 371 return NULL; 372 } 373 374 /* 375 * Add its control 376 */ 377 ip_vs_control_add(cp, ct); 378 ip_vs_conn_put(ct); 379 380 ip_vs_conn_stats(cp, svc); 381 return cp; 382 } 383 384 385 /* 386 * IPVS main scheduling function 387 * It selects a server according to the virtual service, and 388 * creates a connection entry. 389 * Protocols supported: TCP, UDP 390 * 391 * Usage of *ignored 392 * 393 * 1 : protocol tried to schedule (eg. 
on SYN), found svc but the
 *       svc/scheduler decides that this packet should be accepted with
 *       NF_ACCEPT because it must not be scheduled.
 *
 * 0 :   scheduler can not find destination, so try bypass or
 *       return ICMP and then NF_DROP (ip_vs_leave).
 *
 * -1 :  scheduler tried to schedule but fatal error occurred, eg.
 *       ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
 *       failure such as missing Call-ID, ENOMEM on skb_linearize
 *       or pe_data. In this case we should return NF_DROP without
 *       any attempts to send ICMP with ip_vs_leave.
 *
 *  Returns the newly created connection, or NULL with *ignored set
 *  as described above.  Persistent services are handed over to
 *  ip_vs_sched_persist(), which sets *ignored itself.
 */
struct ip_vs_conn *
ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
	       struct ip_vs_proto_data *pd, int *ignored,
	       struct ip_vs_iphdr *iph)
{
	struct ip_vs_protocol *pp = pd->pp;
	struct ip_vs_conn *cp = NULL;
	struct ip_vs_scheduler *sched;
	struct ip_vs_dest *dest;
	__be16 _ports[2], *pptr;
	unsigned int flags;

	*ignored = 1;
	/*
	 * IPv6 frags, only the first hit here.
	 */
	pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
	if (pptr == NULL)
		return NULL;

	/*
	 * FTPDATA needs this check when using local real server.
	 * Never schedule Active FTPDATA connections from real server.
	 * For LVS-NAT they must be already created. For other methods
	 * with persistence the connection is created on SYN+ACK.
	 */
	if (pptr[0] == FTPDATA) {
		IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
			      "Not scheduling FTPDATA");
		return NULL;
	}

	/*
	 *    Do not schedule replies from local real server.
	 *    The lookup is only done for packets without a device or
	 *    coming from a loopback device.
	 */
	if (!skb->dev || (skb->dev->flags & IFF_LOOPBACK)) {
		cp = pp->conn_in_get(svc->af, skb, iph, 1);
		if (cp) {
			IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
				      "Not scheduling reply for existing connection");
			__ip_vs_conn_put(cp);
			return NULL;
		}
	}

	/*
	 *    Persistent service
	 */
	if (svc->flags & IP_VS_SVC_F_PERSISTENT)
		return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored,
					   iph);

	*ignored = 0;

	/*
	 *    Non-persistent service
	 */
	if (!svc->fwmark && pptr[1] != svc->port) {
		/* This runs once per unscheduled packet, so rate limit it:
		 * a misconfigured service must not let traffic flood the log.
		 */
		if (!svc->port)
			pr_err_ratelimited("Schedule: port zero only supported "
					   "in persistent services, "
					   "check your ipvs configuration\n");
		return NULL;
	}

	sched = rcu_dereference(svc->scheduler);
	if (sched) {
		/* read svc->sched_data after svc->scheduler */
		smp_rmb();
		dest = sched->schedule(svc, skb, iph);
	} else {
		dest = NULL;
	}
	if (dest == NULL) {
		IP_VS_DBG(1, "Schedule: no dest found.\n");
		return NULL;
	}

	flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
		 && iph->protocol == IPPROTO_UDP) ?
		IP_VS_CONN_F_ONE_PACKET : 0;

	/*
	 *    Create a connection entry.
	 */
	{
		struct ip_vs_conn_param p;

		ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol,
				      &iph->saddr, pptr[0], &iph->daddr,
				      pptr[1], &p);
		/* a dest port of zero means: keep the port the client used */
		cp = ip_vs_conn_new(&p, dest->af, &dest->addr,
				    dest->port ? dest->port : pptr[1],
				    flags, dest, skb->mark);
		if (!cp) {
			*ignored = -1;
			return NULL;
		}
	}

	IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
		      "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
		      ip_vs_fwd_tag(cp),
		      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
		      IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport),
		      cp->flags, atomic_read(&cp->refcnt));

	ip_vs_conn_stats(cp, svc);
	return cp;
}


/*
 *  Pass or drop the packet.
519 * Called by ip_vs_in, when the virtual service is available but 520 * no destination is available for a new connection. 521 */ 522 int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, 523 struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph) 524 { 525 __be16 _ports[2], *pptr; 526 #ifdef CONFIG_SYSCTL 527 struct net *net; 528 struct netns_ipvs *ipvs; 529 int unicast; 530 #endif 531 532 pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); 533 if (pptr == NULL) { 534 return NF_DROP; 535 } 536 537 #ifdef CONFIG_SYSCTL 538 net = skb_net(skb); 539 540 #ifdef CONFIG_IP_VS_IPV6 541 if (svc->af == AF_INET6) 542 unicast = ipv6_addr_type(&iph->daddr.in6) & IPV6_ADDR_UNICAST; 543 else 544 #endif 545 unicast = (inet_addr_type(net, iph->daddr.ip) == RTN_UNICAST); 546 547 /* if it is fwmark-based service, the cache_bypass sysctl is up 548 and the destination is a non-local unicast, then create 549 a cache_bypass connection entry */ 550 ipvs = net_ipvs(net); 551 if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) { 552 int ret; 553 struct ip_vs_conn *cp; 554 unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && 555 iph->protocol == IPPROTO_UDP) ? 
556 IP_VS_CONN_F_ONE_PACKET : 0; 557 union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; 558 559 /* create a new connection entry */ 560 IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); 561 { 562 struct ip_vs_conn_param p; 563 ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, 564 &iph->saddr, pptr[0], 565 &iph->daddr, pptr[1], &p); 566 cp = ip_vs_conn_new(&p, svc->af, &daddr, 0, 567 IP_VS_CONN_F_BYPASS | flags, 568 NULL, skb->mark); 569 if (!cp) 570 return NF_DROP; 571 } 572 573 /* statistics */ 574 ip_vs_in_stats(cp, skb); 575 576 /* set state */ 577 ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); 578 579 /* transmit the first SYN packet */ 580 ret = cp->packet_xmit(skb, cp, pd->pp, iph); 581 /* do not touch skb anymore */ 582 583 atomic_inc(&cp->in_pkts); 584 ip_vs_conn_put(cp); 585 return ret; 586 } 587 #endif 588 589 /* 590 * When the virtual ftp service is presented, packets destined 591 * for other services on the VIP may get here (except services 592 * listed in the ipvs table), pass the packets, because it is 593 * not ipvs job to decide to drop the packets. 594 */ 595 if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) 596 return NF_ACCEPT; 597 598 /* 599 * Notify the client that the destination is unreachable, and 600 * release the socket buffer. 601 * Since it is in IP layer, the TCP socket is not actually 602 * created, the TCP RST packet cannot be sent, instead that 603 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. 
--WZ 604 */ 605 #ifdef CONFIG_IP_VS_IPV6 606 if (svc->af == AF_INET6) { 607 if (!skb->dev) { 608 struct net *net_ = dev_net(skb_dst(skb)->dev); 609 610 skb->dev = net_->loopback_dev; 611 } 612 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); 613 } else 614 #endif 615 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); 616 617 return NF_DROP; 618 } 619 620 #ifdef CONFIG_SYSCTL 621 622 static int sysctl_snat_reroute(struct sk_buff *skb) 623 { 624 struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); 625 return ipvs->sysctl_snat_reroute; 626 } 627 628 static int sysctl_nat_icmp_send(struct net *net) 629 { 630 struct netns_ipvs *ipvs = net_ipvs(net); 631 return ipvs->sysctl_nat_icmp_send; 632 } 633 634 static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) 635 { 636 return ipvs->sysctl_expire_nodest_conn; 637 } 638 639 #else 640 641 static int sysctl_snat_reroute(struct sk_buff *skb) { return 0; } 642 static int sysctl_nat_icmp_send(struct net *net) { return 0; } 643 static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; } 644 645 #endif 646 647 __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) 648 { 649 return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); 650 } 651 652 static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum) 653 { 654 if (NF_INET_LOCAL_IN == hooknum) 655 return IP_DEFRAG_VS_IN; 656 if (NF_INET_FORWARD == hooknum) 657 return IP_DEFRAG_VS_FWD; 658 return IP_DEFRAG_VS_OUT; 659 } 660 661 static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) 662 { 663 int err; 664 665 local_bh_disable(); 666 err = ip_defrag(skb, user); 667 local_bh_enable(); 668 if (!err) 669 ip_send_check(ip_hdr(skb)); 670 671 return err; 672 } 673 674 static int ip_vs_route_me_harder(int af, struct sk_buff *skb) 675 { 676 #ifdef CONFIG_IP_VS_IPV6 677 if (af == AF_INET6) { 678 if (sysctl_snat_reroute(skb) && ip6_route_me_harder(skb) != 0) 679 return 1; 680 } else 681 #endif 682 if 
((sysctl_snat_reroute(skb) || 683 skb_rtable(skb)->rt_flags & RTCF_LOCAL) && 684 ip_route_me_harder(skb, RTN_LOCAL) != 0) 685 return 1; 686 687 return 0; 688 } 689 690 /* 691 * Packet has been made sufficiently writable in caller 692 * - inout: 1=in->out, 0=out->in 693 */ 694 void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, 695 struct ip_vs_conn *cp, int inout) 696 { 697 struct iphdr *iph = ip_hdr(skb); 698 unsigned int icmp_offset = iph->ihl*4; 699 struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) + 700 icmp_offset); 701 struct iphdr *ciph = (struct iphdr *)(icmph + 1); 702 703 if (inout) { 704 iph->saddr = cp->vaddr.ip; 705 ip_send_check(iph); 706 ciph->daddr = cp->vaddr.ip; 707 ip_send_check(ciph); 708 } else { 709 iph->daddr = cp->daddr.ip; 710 ip_send_check(iph); 711 ciph->saddr = cp->daddr.ip; 712 ip_send_check(ciph); 713 } 714 715 /* the TCP/UDP/SCTP port */ 716 if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol || 717 IPPROTO_SCTP == ciph->protocol) { 718 __be16 *ports = (void *)ciph + ciph->ihl*4; 719 720 if (inout) 721 ports[1] = cp->vport; 722 else 723 ports[0] = cp->dport; 724 } 725 726 /* And finally the ICMP checksum */ 727 icmph->checksum = 0; 728 icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset); 729 skb->ip_summed = CHECKSUM_UNNECESSARY; 730 731 if (inout) 732 IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, 733 "Forwarding altered outgoing ICMP"); 734 else 735 IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, 736 "Forwarding altered incoming ICMP"); 737 } 738 739 #ifdef CONFIG_IP_VS_IPV6 740 void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, 741 struct ip_vs_conn *cp, int inout) 742 { 743 struct ipv6hdr *iph = ipv6_hdr(skb); 744 unsigned int icmp_offset = 0; 745 unsigned int offs = 0; /* header offset*/ 746 int protocol; 747 struct icmp6hdr *icmph; 748 struct ipv6hdr *ciph; 749 unsigned short fragoffs; 750 751 ipv6_find_hdr(skb, 
&icmp_offset, IPPROTO_ICMPV6, &fragoffs, NULL); 752 icmph = (struct icmp6hdr *)(skb_network_header(skb) + icmp_offset); 753 offs = icmp_offset + sizeof(struct icmp6hdr); 754 ciph = (struct ipv6hdr *)(skb_network_header(skb) + offs); 755 756 protocol = ipv6_find_hdr(skb, &offs, -1, &fragoffs, NULL); 757 758 if (inout) { 759 iph->saddr = cp->vaddr.in6; 760 ciph->daddr = cp->vaddr.in6; 761 } else { 762 iph->daddr = cp->daddr.in6; 763 ciph->saddr = cp->daddr.in6; 764 } 765 766 /* the TCP/UDP/SCTP port */ 767 if (!fragoffs && (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || 768 IPPROTO_SCTP == protocol)) { 769 __be16 *ports = (void *)(skb_network_header(skb) + offs); 770 771 IP_VS_DBG(11, "%s() changed port %d to %d\n", __func__, 772 ntohs(inout ? ports[1] : ports[0]), 773 ntohs(inout ? cp->vport : cp->dport)); 774 if (inout) 775 ports[1] = cp->vport; 776 else 777 ports[0] = cp->dport; 778 } 779 780 /* And finally the ICMP checksum */ 781 icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, 782 skb->len - icmp_offset, 783 IPPROTO_ICMPV6, 0); 784 skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset; 785 skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum); 786 skb->ip_summed = CHECKSUM_PARTIAL; 787 788 if (inout) 789 IP_VS_DBG_PKT(11, AF_INET6, pp, skb, 790 (void *)ciph - (void *)iph, 791 "Forwarding altered outgoing ICMPv6"); 792 else 793 IP_VS_DBG_PKT(11, AF_INET6, pp, skb, 794 (void *)ciph - (void *)iph, 795 "Forwarding altered incoming ICMPv6"); 796 } 797 #endif 798 799 /* Handle relevant response ICMP messages - forward to the right 800 * destination host. 
801 */ 802 static int handle_response_icmp(int af, struct sk_buff *skb, 803 union nf_inet_addr *snet, 804 __u8 protocol, struct ip_vs_conn *cp, 805 struct ip_vs_protocol *pp, 806 unsigned int offset, unsigned int ihl) 807 { 808 unsigned int verdict = NF_DROP; 809 810 if (IP_VS_FWD_METHOD(cp) != 0) { 811 pr_err("shouldn't reach here, because the box is on the " 812 "half connection in the tun/dr module.\n"); 813 } 814 815 /* Ensure the checksum is correct */ 816 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { 817 /* Failed checksum! */ 818 IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n", 819 IP_VS_DBG_ADDR(af, snet)); 820 goto out; 821 } 822 823 if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || 824 IPPROTO_SCTP == protocol) 825 offset += 2 * sizeof(__u16); 826 if (!skb_make_writable(skb, offset)) 827 goto out; 828 829 #ifdef CONFIG_IP_VS_IPV6 830 if (af == AF_INET6) 831 ip_vs_nat_icmp_v6(skb, pp, cp, 1); 832 else 833 #endif 834 ip_vs_nat_icmp(skb, pp, cp, 1); 835 836 if (ip_vs_route_me_harder(af, skb)) 837 goto out; 838 839 /* do the statistics and put it back */ 840 ip_vs_out_stats(cp, skb); 841 842 skb->ipvs_property = 1; 843 if (!(cp->flags & IP_VS_CONN_F_NFCT)) 844 ip_vs_notrack(skb); 845 else 846 ip_vs_update_conntrack(skb, cp, 0); 847 verdict = NF_ACCEPT; 848 849 out: 850 __ip_vs_conn_put(cp); 851 852 return verdict; 853 } 854 855 /* 856 * Handle ICMP messages in the inside-to-outside direction (outgoing). 857 * Find any that might be relevant, check against existing connections. 858 * Currently handles error types - unreachable, quench, ttl exceeded. 
859 */ 860 static int ip_vs_out_icmp(struct sk_buff *skb, int *related, 861 unsigned int hooknum) 862 { 863 struct iphdr *iph; 864 struct icmphdr _icmph, *ic; 865 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ 866 struct ip_vs_iphdr ciph; 867 struct ip_vs_conn *cp; 868 struct ip_vs_protocol *pp; 869 unsigned int offset, ihl; 870 union nf_inet_addr snet; 871 872 *related = 1; 873 874 /* reassemble IP fragments */ 875 if (ip_is_fragment(ip_hdr(skb))) { 876 if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum))) 877 return NF_STOLEN; 878 } 879 880 iph = ip_hdr(skb); 881 offset = ihl = iph->ihl * 4; 882 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); 883 if (ic == NULL) 884 return NF_DROP; 885 886 IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n", 887 ic->type, ntohs(icmp_id(ic)), 888 &iph->saddr, &iph->daddr); 889 890 /* 891 * Work through seeing if this is for us. 892 * These checks are supposed to be in an order that means easy 893 * things are checked first to speed up processing.... however 894 * this means that some packets will manage to get a long way 895 * down this stack and then be rejected, but that's life. 896 */ 897 if ((ic->type != ICMP_DEST_UNREACH) && 898 (ic->type != ICMP_SOURCE_QUENCH) && 899 (ic->type != ICMP_TIME_EXCEEDED)) { 900 *related = 0; 901 return NF_ACCEPT; 902 } 903 904 /* Now find the contained IP header */ 905 offset += sizeof(_icmph); 906 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); 907 if (cih == NULL) 908 return NF_ACCEPT; /* The packet looks wrong, ignore */ 909 910 pp = ip_vs_proto_get(cih->protocol); 911 if (!pp) 912 return NF_ACCEPT; 913 914 /* Is the embedded protocol header present? 
*/ 915 if (unlikely(cih->frag_off & htons(IP_OFFSET) && 916 pp->dont_defrag)) 917 return NF_ACCEPT; 918 919 IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, 920 "Checking outgoing ICMP for"); 921 922 ip_vs_fill_ip4hdr(cih, &ciph); 923 ciph.len += offset; 924 /* The embedded headers contain source and dest in reverse order */ 925 cp = pp->conn_out_get(AF_INET, skb, &ciph, 1); 926 if (!cp) 927 return NF_ACCEPT; 928 929 snet.ip = iph->saddr; 930 return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, 931 pp, ciph.len, ihl); 932 } 933 934 #ifdef CONFIG_IP_VS_IPV6 935 static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, 936 unsigned int hooknum, struct ip_vs_iphdr *ipvsh) 937 { 938 struct icmp6hdr _icmph, *ic; 939 struct ipv6hdr _ip6h, *ip6h; /* The ip header contained within ICMP */ 940 struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ 941 struct ip_vs_conn *cp; 942 struct ip_vs_protocol *pp; 943 union nf_inet_addr snet; 944 unsigned int writable; 945 946 *related = 1; 947 ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph, ipvsh); 948 if (ic == NULL) 949 return NF_DROP; 950 951 /* 952 * Work through seeing if this is for us. 953 * These checks are supposed to be in an order that means easy 954 * things are checked first to speed up processing.... however 955 * this means that some packets will manage to get a long way 956 * down this stack and then be rejected, but that's life. 957 */ 958 if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) { 959 *related = 0; 960 return NF_ACCEPT; 961 } 962 /* Fragment header that is before ICMP header tells us that: 963 * it's not an error message since they can't be fragmented. 
964 */ 965 if (ipvsh->flags & IP6_FH_F_FRAG) 966 return NF_DROP; 967 968 IP_VS_DBG(8, "Outgoing ICMPv6 (%d,%d) %pI6c->%pI6c\n", 969 ic->icmp6_type, ntohs(icmpv6_id(ic)), 970 &ipvsh->saddr, &ipvsh->daddr); 971 972 /* Now find the contained IP header */ 973 ciph.len = ipvsh->len + sizeof(_icmph); 974 ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h); 975 if (ip6h == NULL) 976 return NF_ACCEPT; /* The packet looks wrong, ignore */ 977 ciph.saddr.in6 = ip6h->saddr; /* conn_out_get() handles reverse order */ 978 ciph.daddr.in6 = ip6h->daddr; 979 /* skip possible IPv6 exthdrs of contained IPv6 packet */ 980 ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL); 981 if (ciph.protocol < 0) 982 return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */ 983 984 pp = ip_vs_proto_get(ciph.protocol); 985 if (!pp) 986 return NF_ACCEPT; 987 988 /* The embedded headers contain source and dest in reverse order */ 989 cp = pp->conn_out_get(AF_INET6, skb, &ciph, 1); 990 if (!cp) 991 return NF_ACCEPT; 992 993 snet.in6 = ciph.saddr.in6; 994 writable = ciph.len; 995 return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp, 996 pp, writable, sizeof(struct ipv6hdr)); 997 } 998 #endif 999 1000 /* 1001 * Check if sctp chunc is ABORT chunk 1002 */ 1003 static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len) 1004 { 1005 sctp_chunkhdr_t *sch, schunk; 1006 sch = skb_header_pointer(skb, nh_len + sizeof(sctp_sctphdr_t), 1007 sizeof(schunk), &schunk); 1008 if (sch == NULL) 1009 return 0; 1010 if (sch->type == SCTP_CID_ABORT) 1011 return 1; 1012 return 0; 1013 } 1014 1015 static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) 1016 { 1017 struct tcphdr _tcph, *th; 1018 1019 th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph); 1020 if (th == NULL) 1021 return 0; 1022 return th->rst; 1023 } 1024 1025 static inline bool is_new_conn(const struct sk_buff *skb, 1026 struct ip_vs_iphdr *iph) 1027 { 1028 switch 
(iph->protocol) { 1029 case IPPROTO_TCP: { 1030 struct tcphdr _tcph, *th; 1031 1032 th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); 1033 if (th == NULL) 1034 return false; 1035 return th->syn; 1036 } 1037 case IPPROTO_SCTP: { 1038 sctp_chunkhdr_t *sch, schunk; 1039 1040 sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t), 1041 sizeof(schunk), &schunk); 1042 if (sch == NULL) 1043 return false; 1044 return sch->type == SCTP_CID_INIT; 1045 } 1046 default: 1047 return false; 1048 } 1049 } 1050 1051 /* Handle response packets: rewrite addresses and send away... 1052 */ 1053 static unsigned int 1054 handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, 1055 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) 1056 { 1057 struct ip_vs_protocol *pp = pd->pp; 1058 1059 IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet"); 1060 1061 if (!skb_make_writable(skb, iph->len)) 1062 goto drop; 1063 1064 /* mangle the packet */ 1065 if (pp->snat_handler && !pp->snat_handler(skb, pp, cp, iph)) 1066 goto drop; 1067 1068 #ifdef CONFIG_IP_VS_IPV6 1069 if (af == AF_INET6) 1070 ipv6_hdr(skb)->saddr = cp->vaddr.in6; 1071 else 1072 #endif 1073 { 1074 ip_hdr(skb)->saddr = cp->vaddr.ip; 1075 ip_send_check(ip_hdr(skb)); 1076 } 1077 1078 /* 1079 * nf_iterate does not expect change in the skb->dst->dev. 1080 * It looks like it is not fatal to enable this code for hooks 1081 * where our handlers are at the end of the chain list and 1082 * when all next handlers use skb->dst->dev and not outdev. 1083 * It will definitely route properly the inout NAT traffic 1084 * when multiple paths are used. 1085 */ 1086 1087 /* For policy routing, packets originating from this 1088 * machine itself may be routed differently to packets 1089 * passing through. We want this packet to be routed as 1090 * if it came from this machine itself. So re-compute 1091 * the routing information. 
1092 */ 1093 if (ip_vs_route_me_harder(af, skb)) 1094 goto drop; 1095 1096 IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT"); 1097 1098 ip_vs_out_stats(cp, skb); 1099 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd); 1100 skb->ipvs_property = 1; 1101 if (!(cp->flags & IP_VS_CONN_F_NFCT)) 1102 ip_vs_notrack(skb); 1103 else 1104 ip_vs_update_conntrack(skb, cp, 0); 1105 ip_vs_conn_put(cp); 1106 1107 LeaveFunction(11); 1108 return NF_ACCEPT; 1109 1110 drop: 1111 ip_vs_conn_put(cp); 1112 kfree_skb(skb); 1113 LeaveFunction(11); 1114 return NF_STOLEN; 1115 } 1116 1117 /* 1118 * Check if outgoing packet belongs to the established ip_vs_conn. 1119 */ 1120 static unsigned int 1121 ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) 1122 { 1123 struct net *net = NULL; 1124 struct ip_vs_iphdr iph; 1125 struct ip_vs_protocol *pp; 1126 struct ip_vs_proto_data *pd; 1127 struct ip_vs_conn *cp; 1128 1129 EnterFunction(11); 1130 1131 /* Already marked as IPVS request or reply? */ 1132 if (skb->ipvs_property) 1133 return NF_ACCEPT; 1134 1135 /* Bad... 
Do not break raw sockets */ 1136 if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT && 1137 af == AF_INET)) { 1138 struct sock *sk = skb->sk; 1139 struct inet_sock *inet = inet_sk(skb->sk); 1140 1141 if (inet && sk->sk_family == PF_INET && inet->nodefrag) 1142 return NF_ACCEPT; 1143 } 1144 1145 if (unlikely(!skb_dst(skb))) 1146 return NF_ACCEPT; 1147 1148 net = skb_net(skb); 1149 if (!net_ipvs(net)->enable) 1150 return NF_ACCEPT; 1151 1152 ip_vs_fill_iph_skb(af, skb, &iph); 1153 #ifdef CONFIG_IP_VS_IPV6 1154 if (af == AF_INET6) { 1155 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { 1156 int related; 1157 int verdict = ip_vs_out_icmp_v6(skb, &related, 1158 hooknum, &iph); 1159 1160 if (related) 1161 return verdict; 1162 } 1163 } else 1164 #endif 1165 if (unlikely(iph.protocol == IPPROTO_ICMP)) { 1166 int related; 1167 int verdict = ip_vs_out_icmp(skb, &related, hooknum); 1168 1169 if (related) 1170 return verdict; 1171 } 1172 1173 pd = ip_vs_proto_data_get(net, iph.protocol); 1174 if (unlikely(!pd)) 1175 return NF_ACCEPT; 1176 pp = pd->pp; 1177 1178 /* reassemble IP fragments */ 1179 #ifdef CONFIG_IP_VS_IPV6 1180 if (af == AF_INET) 1181 #endif 1182 if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) { 1183 if (ip_vs_gather_frags(skb, 1184 ip_vs_defrag_user(hooknum))) 1185 return NF_STOLEN; 1186 1187 ip_vs_fill_ip4hdr(skb_network_header(skb), &iph); 1188 } 1189 1190 /* 1191 * Check if the packet belongs to an existing entry 1192 */ 1193 cp = pp->conn_out_get(af, skb, &iph, 0); 1194 1195 if (likely(cp)) 1196 return handle_response(af, skb, pd, cp, &iph); 1197 if (sysctl_nat_icmp_send(net) && 1198 (pp->protocol == IPPROTO_TCP || 1199 pp->protocol == IPPROTO_UDP || 1200 pp->protocol == IPPROTO_SCTP)) { 1201 __be16 _ports[2], *pptr; 1202 1203 pptr = frag_safe_skb_hp(skb, iph.len, 1204 sizeof(_ports), _ports, &iph); 1205 if (pptr == NULL) 1206 return NF_ACCEPT; /* Not for me */ 1207 if (ip_vs_has_real_service(net, af, iph.protocol, &iph.saddr, 1208 
pptr[0])) { 1209 /* 1210 * Notify the real server: there is no 1211 * existing entry if it is not RST 1212 * packet or not TCP packet. 1213 */ 1214 if ((iph.protocol != IPPROTO_TCP && 1215 iph.protocol != IPPROTO_SCTP) 1216 || ((iph.protocol == IPPROTO_TCP 1217 && !is_tcp_reset(skb, iph.len)) 1218 || (iph.protocol == IPPROTO_SCTP 1219 && !is_sctp_abort(skb, 1220 iph.len)))) { 1221 #ifdef CONFIG_IP_VS_IPV6 1222 if (af == AF_INET6) { 1223 if (!skb->dev) 1224 skb->dev = net->loopback_dev; 1225 icmpv6_send(skb, 1226 ICMPV6_DEST_UNREACH, 1227 ICMPV6_PORT_UNREACH, 1228 0); 1229 } else 1230 #endif 1231 icmp_send(skb, 1232 ICMP_DEST_UNREACH, 1233 ICMP_PORT_UNREACH, 0); 1234 return NF_DROP; 1235 } 1236 } 1237 } 1238 IP_VS_DBG_PKT(12, af, pp, skb, 0, 1239 "ip_vs_out: packet continues traversal as normal"); 1240 return NF_ACCEPT; 1241 } 1242 1243 /* 1244 * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, 1245 * used only for VS/NAT. 1246 * Check if packet is reply for established ip_vs_conn. 1247 */ 1248 static unsigned int 1249 ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb, 1250 const struct net_device *in, const struct net_device *out, 1251 int (*okfn)(struct sk_buff *)) 1252 { 1253 return ip_vs_out(ops->hooknum, skb, AF_INET); 1254 } 1255 1256 /* 1257 * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. 1258 * Check if packet is reply for established ip_vs_conn. 1259 */ 1260 static unsigned int 1261 ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb, 1262 const struct net_device *in, const struct net_device *out, 1263 int (*okfn)(struct sk_buff *)) 1264 { 1265 return ip_vs_out(ops->hooknum, skb, AF_INET); 1266 } 1267 1268 #ifdef CONFIG_IP_VS_IPV6 1269 1270 /* 1271 * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, 1272 * used only for VS/NAT. 1273 * Check if packet is reply for established ip_vs_conn. 
1274 */ 1275 static unsigned int 1276 ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb, 1277 const struct net_device *in, const struct net_device *out, 1278 int (*okfn)(struct sk_buff *)) 1279 { 1280 return ip_vs_out(ops->hooknum, skb, AF_INET6); 1281 } 1282 1283 /* 1284 * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. 1285 * Check if packet is reply for established ip_vs_conn. 1286 */ 1287 static unsigned int 1288 ip_vs_local_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb, 1289 const struct net_device *in, const struct net_device *out, 1290 int (*okfn)(struct sk_buff *)) 1291 { 1292 return ip_vs_out(ops->hooknum, skb, AF_INET6); 1293 } 1294 1295 #endif 1296 1297 /* 1298 * Handle ICMP messages in the outside-to-inside direction (incoming). 1299 * Find any that might be relevant, check against existing connections, 1300 * forward to the right destination host if relevant. 1301 * Currently handles error types - unreachable, quench, ttl exceeded. 
1302 */ 1303 static int 1304 ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) 1305 { 1306 struct net *net = NULL; 1307 struct iphdr *iph; 1308 struct icmphdr _icmph, *ic; 1309 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ 1310 struct ip_vs_iphdr ciph; 1311 struct ip_vs_conn *cp; 1312 struct ip_vs_protocol *pp; 1313 struct ip_vs_proto_data *pd; 1314 unsigned int offset, offset2, ihl, verdict; 1315 bool ipip; 1316 1317 *related = 1; 1318 1319 /* reassemble IP fragments */ 1320 if (ip_is_fragment(ip_hdr(skb))) { 1321 if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum))) 1322 return NF_STOLEN; 1323 } 1324 1325 iph = ip_hdr(skb); 1326 offset = ihl = iph->ihl * 4; 1327 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); 1328 if (ic == NULL) 1329 return NF_DROP; 1330 1331 IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n", 1332 ic->type, ntohs(icmp_id(ic)), 1333 &iph->saddr, &iph->daddr); 1334 1335 /* 1336 * Work through seeing if this is for us. 1337 * These checks are supposed to be in an order that means easy 1338 * things are checked first to speed up processing.... however 1339 * this means that some packets will manage to get a long way 1340 * down this stack and then be rejected, but that's life. 
1341 */ 1342 if ((ic->type != ICMP_DEST_UNREACH) && 1343 (ic->type != ICMP_SOURCE_QUENCH) && 1344 (ic->type != ICMP_TIME_EXCEEDED)) { 1345 *related = 0; 1346 return NF_ACCEPT; 1347 } 1348 1349 /* Now find the contained IP header */ 1350 offset += sizeof(_icmph); 1351 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); 1352 if (cih == NULL) 1353 return NF_ACCEPT; /* The packet looks wrong, ignore */ 1354 1355 net = skb_net(skb); 1356 1357 /* Special case for errors for IPIP packets */ 1358 ipip = false; 1359 if (cih->protocol == IPPROTO_IPIP) { 1360 if (unlikely(cih->frag_off & htons(IP_OFFSET))) 1361 return NF_ACCEPT; 1362 /* Error for our IPIP must arrive at LOCAL_IN */ 1363 if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL)) 1364 return NF_ACCEPT; 1365 offset += cih->ihl * 4; 1366 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); 1367 if (cih == NULL) 1368 return NF_ACCEPT; /* The packet looks wrong, ignore */ 1369 ipip = true; 1370 } 1371 1372 pd = ip_vs_proto_data_get(net, cih->protocol); 1373 if (!pd) 1374 return NF_ACCEPT; 1375 pp = pd->pp; 1376 1377 /* Is the embedded protocol header present? */ 1378 if (unlikely(cih->frag_off & htons(IP_OFFSET) && 1379 pp->dont_defrag)) 1380 return NF_ACCEPT; 1381 1382 IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, 1383 "Checking incoming ICMP for"); 1384 1385 offset2 = offset; 1386 ip_vs_fill_ip4hdr(cih, &ciph); 1387 ciph.len += offset; 1388 offset = ciph.len; 1389 /* The embedded headers contain source and dest in reverse order. 1390 * For IPIP this is error for request, not for reply. 1391 */ 1392 cp = pp->conn_in_get(AF_INET, skb, &ciph, ipip ? 0 : 1); 1393 if (!cp) 1394 return NF_ACCEPT; 1395 1396 verdict = NF_DROP; 1397 1398 /* Ensure the checksum is correct */ 1399 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { 1400 /* Failed checksum! 
*/ 1401 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n", 1402 &iph->saddr); 1403 goto out; 1404 } 1405 1406 if (ipip) { 1407 __be32 info = ic->un.gateway; 1408 __u8 type = ic->type; 1409 __u8 code = ic->code; 1410 1411 /* Update the MTU */ 1412 if (ic->type == ICMP_DEST_UNREACH && 1413 ic->code == ICMP_FRAG_NEEDED) { 1414 struct ip_vs_dest *dest = cp->dest; 1415 u32 mtu = ntohs(ic->un.frag.mtu); 1416 __be16 frag_off = cih->frag_off; 1417 1418 /* Strip outer IP and ICMP, go to IPIP header */ 1419 if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL) 1420 goto ignore_ipip; 1421 offset2 -= ihl + sizeof(_icmph); 1422 skb_reset_network_header(skb); 1423 IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n", 1424 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu); 1425 ipv4_update_pmtu(skb, dev_net(skb->dev), 1426 mtu, 0, 0, 0, 0); 1427 /* Client uses PMTUD? */ 1428 if (!(frag_off & htons(IP_DF))) 1429 goto ignore_ipip; 1430 /* Prefer the resulting PMTU */ 1431 if (dest) { 1432 struct ip_vs_dest_dst *dest_dst; 1433 1434 rcu_read_lock(); 1435 dest_dst = rcu_dereference(dest->dest_dst); 1436 if (dest_dst) 1437 mtu = dst_mtu(dest_dst->dst_cache); 1438 rcu_read_unlock(); 1439 } 1440 if (mtu > 68 + sizeof(struct iphdr)) 1441 mtu -= sizeof(struct iphdr); 1442 info = htonl(mtu); 1443 } 1444 /* Strip outer IP, ICMP and IPIP, go to IP header of 1445 * original request. 
1446 */ 1447 if (pskb_pull(skb, offset2) == NULL) 1448 goto ignore_ipip; 1449 skb_reset_network_header(skb); 1450 IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n", 1451 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, 1452 type, code, ntohl(info)); 1453 icmp_send(skb, type, code, info); 1454 /* ICMP can be shorter but anyways, account it */ 1455 ip_vs_out_stats(cp, skb); 1456 1457 ignore_ipip: 1458 consume_skb(skb); 1459 verdict = NF_STOLEN; 1460 goto out; 1461 } 1462 1463 /* do the statistics and put it back */ 1464 ip_vs_in_stats(cp, skb); 1465 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol || 1466 IPPROTO_SCTP == cih->protocol) 1467 offset += 2 * sizeof(__u16); 1468 verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph); 1469 1470 out: 1471 __ip_vs_conn_put(cp); 1472 1473 return verdict; 1474 } 1475 1476 #ifdef CONFIG_IP_VS_IPV6 1477 static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, 1478 unsigned int hooknum, struct ip_vs_iphdr *iph) 1479 { 1480 struct net *net = NULL; 1481 struct ipv6hdr _ip6h, *ip6h; 1482 struct icmp6hdr _icmph, *ic; 1483 struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ 1484 struct ip_vs_conn *cp; 1485 struct ip_vs_protocol *pp; 1486 struct ip_vs_proto_data *pd; 1487 unsigned int offs_ciph, writable, verdict; 1488 1489 *related = 1; 1490 1491 ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph, iph); 1492 if (ic == NULL) 1493 return NF_DROP; 1494 1495 /* 1496 * Work through seeing if this is for us. 1497 * These checks are supposed to be in an order that means easy 1498 * things are checked first to speed up processing.... however 1499 * this means that some packets will manage to get a long way 1500 * down this stack and then be rejected, but that's life. 
1501 */ 1502 if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) { 1503 *related = 0; 1504 return NF_ACCEPT; 1505 } 1506 /* Fragment header that is before ICMP header tells us that: 1507 * it's not an error message since they can't be fragmented. 1508 */ 1509 if (iph->flags & IP6_FH_F_FRAG) 1510 return NF_DROP; 1511 1512 IP_VS_DBG(8, "Incoming ICMPv6 (%d,%d) %pI6c->%pI6c\n", 1513 ic->icmp6_type, ntohs(icmpv6_id(ic)), 1514 &iph->saddr, &iph->daddr); 1515 1516 /* Now find the contained IP header */ 1517 ciph.len = iph->len + sizeof(_icmph); 1518 offs_ciph = ciph.len; /* Save ip header offset */ 1519 ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h); 1520 if (ip6h == NULL) 1521 return NF_ACCEPT; /* The packet looks wrong, ignore */ 1522 ciph.saddr.in6 = ip6h->saddr; /* conn_in_get() handles reverse order */ 1523 ciph.daddr.in6 = ip6h->daddr; 1524 /* skip possible IPv6 exthdrs of contained IPv6 packet */ 1525 ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL); 1526 if (ciph.protocol < 0) 1527 return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */ 1528 1529 net = skb_net(skb); 1530 pd = ip_vs_proto_data_get(net, ciph.protocol); 1531 if (!pd) 1532 return NF_ACCEPT; 1533 pp = pd->pp; 1534 1535 /* Cannot handle fragmented embedded protocol */ 1536 if (ciph.fragoffs) 1537 return NF_ACCEPT; 1538 1539 IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offs_ciph, 1540 "Checking incoming ICMPv6 for"); 1541 1542 /* The embedded headers contain source and dest in reverse order 1543 * if not from localhost 1544 */ 1545 cp = pp->conn_in_get(AF_INET6, skb, &ciph, 1546 (hooknum == NF_INET_LOCAL_OUT) ? 
0 : 1); 1547 1548 if (!cp) 1549 return NF_ACCEPT; 1550 /* VS/TUN, VS/DR and LOCALNODE just let it go */ 1551 if ((hooknum == NF_INET_LOCAL_OUT) && 1552 (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) { 1553 __ip_vs_conn_put(cp); 1554 return NF_ACCEPT; 1555 } 1556 1557 /* do the statistics and put it back */ 1558 ip_vs_in_stats(cp, skb); 1559 1560 /* Need to mangle contained IPv6 header in ICMPv6 packet */ 1561 writable = ciph.len; 1562 if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol || 1563 IPPROTO_SCTP == ciph.protocol) 1564 writable += 2 * sizeof(__u16); /* Also mangle ports */ 1565 1566 verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, writable, hooknum, &ciph); 1567 1568 __ip_vs_conn_put(cp); 1569 1570 return verdict; 1571 } 1572 #endif 1573 1574 1575 /* 1576 * Check if it's for virtual services, look it up, 1577 * and send it on its way... 1578 */ 1579 static unsigned int 1580 ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) 1581 { 1582 struct net *net; 1583 struct ip_vs_iphdr iph; 1584 struct ip_vs_protocol *pp; 1585 struct ip_vs_proto_data *pd; 1586 struct ip_vs_conn *cp; 1587 int ret, pkts; 1588 struct netns_ipvs *ipvs; 1589 1590 /* Already marked as IPVS request or reply? */ 1591 if (skb->ipvs_property) 1592 return NF_ACCEPT; 1593 1594 /* 1595 * Big tappo: 1596 * - remote client: only PACKET_HOST 1597 * - route: used for struct net when skb->dev is unset 1598 */ 1599 if (unlikely((skb->pkt_type != PACKET_HOST && 1600 hooknum != NF_INET_LOCAL_OUT) || 1601 !skb_dst(skb))) { 1602 ip_vs_fill_iph_skb(af, skb, &iph); 1603 IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s" 1604 " ignored in hook %u\n", 1605 skb->pkt_type, iph.protocol, 1606 IP_VS_DBG_ADDR(af, &iph.daddr), hooknum); 1607 return NF_ACCEPT; 1608 } 1609 /* ipvs enabled in this netns ? 
*/ 1610 net = skb_net(skb); 1611 ipvs = net_ipvs(net); 1612 if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) 1613 return NF_ACCEPT; 1614 1615 ip_vs_fill_iph_skb(af, skb, &iph); 1616 1617 /* Bad... Do not break raw sockets */ 1618 if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT && 1619 af == AF_INET)) { 1620 struct sock *sk = skb->sk; 1621 struct inet_sock *inet = inet_sk(skb->sk); 1622 1623 if (inet && sk->sk_family == PF_INET && inet->nodefrag) 1624 return NF_ACCEPT; 1625 } 1626 1627 #ifdef CONFIG_IP_VS_IPV6 1628 if (af == AF_INET6) { 1629 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { 1630 int related; 1631 int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum, 1632 &iph); 1633 1634 if (related) 1635 return verdict; 1636 } 1637 } else 1638 #endif 1639 if (unlikely(iph.protocol == IPPROTO_ICMP)) { 1640 int related; 1641 int verdict = ip_vs_in_icmp(skb, &related, hooknum); 1642 1643 if (related) 1644 return verdict; 1645 } 1646 1647 /* Protocol supported? */ 1648 pd = ip_vs_proto_data_get(net, iph.protocol); 1649 if (unlikely(!pd)) 1650 return NF_ACCEPT; 1651 pp = pd->pp; 1652 /* 1653 * Check if the packet belongs to an existing connection entry 1654 */ 1655 cp = pp->conn_in_get(af, skb, &iph, 0); 1656 1657 if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp && cp->dest && 1658 unlikely(!atomic_read(&cp->dest->weight)) && !iph.fragoffs && 1659 is_new_conn(skb, &iph)) { 1660 ip_vs_conn_expire_now(cp); 1661 __ip_vs_conn_put(cp); 1662 cp = NULL; 1663 } 1664 1665 if (unlikely(!cp) && !iph.fragoffs) { 1666 /* No (second) fragments need to enter here, as nf_defrag_ipv6 1667 * replayed fragment zero will already have created the cp 1668 */ 1669 int v; 1670 1671 /* Schedule and create new connection entry into &cp */ 1672 if (!pp->conn_schedule(af, skb, pd, &v, &cp, &iph)) 1673 return v; 1674 } 1675 1676 if (unlikely(!cp)) { 1677 /* sorry, all this trouble for a no-hit :) */ 1678 IP_VS_DBG_PKT(12, af, pp, skb, 0, 1679 "ip_vs_in: packet continues 
traversal as normal"); 1680 if (iph.fragoffs) { 1681 /* Fragment that couldn't be mapped to a conn entry 1682 * is missing module nf_defrag_ipv6 1683 */ 1684 IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n"); 1685 IP_VS_DBG_PKT(7, af, pp, skb, 0, "unhandled fragment"); 1686 } 1687 return NF_ACCEPT; 1688 } 1689 1690 IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet"); 1691 /* Check the server status */ 1692 if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { 1693 /* the destination server is not available */ 1694 1695 __u32 flags = cp->flags; 1696 1697 /* when timer already started, silently drop the packet.*/ 1698 if (timer_pending(&cp->timer)) 1699 __ip_vs_conn_put(cp); 1700 else 1701 ip_vs_conn_put(cp); 1702 1703 if (sysctl_expire_nodest_conn(ipvs) && 1704 !(flags & IP_VS_CONN_F_ONE_PACKET)) { 1705 /* try to expire the connection immediately */ 1706 ip_vs_conn_expire_now(cp); 1707 } 1708 1709 return NF_DROP; 1710 } 1711 1712 ip_vs_in_stats(cp, skb); 1713 ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); 1714 if (cp->packet_xmit) 1715 ret = cp->packet_xmit(skb, cp, pp, &iph); 1716 /* do not touch skb anymore */ 1717 else { 1718 IP_VS_DBG_RL("warning: packet_xmit is null"); 1719 ret = NF_ACCEPT; 1720 } 1721 1722 /* Increase its packet counter and check if it is needed 1723 * to be synchronized 1724 * 1725 * Sync connection if it is about to close to 1726 * encorage the standby servers to update the connections timeout 1727 * 1728 * For ONE_PKT let ip_vs_sync_conn() do the filter work. 
1729 */ 1730 1731 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 1732 pkts = sysctl_sync_threshold(ipvs); 1733 else 1734 pkts = atomic_add_return(1, &cp->in_pkts); 1735 1736 if (ipvs->sync_state & IP_VS_STATE_MASTER) 1737 ip_vs_sync_conn(net, cp, pkts); 1738 1739 ip_vs_conn_put(cp); 1740 return ret; 1741 } 1742 1743 /* 1744 * AF_INET handler in NF_INET_LOCAL_IN chain 1745 * Schedule and forward packets from remote clients 1746 */ 1747 static unsigned int 1748 ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb, 1749 const struct net_device *in, 1750 const struct net_device *out, 1751 int (*okfn)(struct sk_buff *)) 1752 { 1753 return ip_vs_in(ops->hooknum, skb, AF_INET); 1754 } 1755 1756 /* 1757 * AF_INET handler in NF_INET_LOCAL_OUT chain 1758 * Schedule and forward packets from local clients 1759 */ 1760 static unsigned int 1761 ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb, 1762 const struct net_device *in, const struct net_device *out, 1763 int (*okfn)(struct sk_buff *)) 1764 { 1765 return ip_vs_in(ops->hooknum, skb, AF_INET); 1766 } 1767 1768 #ifdef CONFIG_IP_VS_IPV6 1769 1770 /* 1771 * AF_INET6 handler in NF_INET_LOCAL_IN chain 1772 * Schedule and forward packets from remote clients 1773 */ 1774 static unsigned int 1775 ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb, 1776 const struct net_device *in, 1777 const struct net_device *out, 1778 int (*okfn)(struct sk_buff *)) 1779 { 1780 return ip_vs_in(ops->hooknum, skb, AF_INET6); 1781 } 1782 1783 /* 1784 * AF_INET6 handler in NF_INET_LOCAL_OUT chain 1785 * Schedule and forward packets from local clients 1786 */ 1787 static unsigned int 1788 ip_vs_local_request6(const struct nf_hook_ops *ops, struct sk_buff *skb, 1789 const struct net_device *in, const struct net_device *out, 1790 int (*okfn)(struct sk_buff *)) 1791 { 1792 return ip_vs_in(ops->hooknum, skb, AF_INET6); 1793 } 1794 1795 #endif 1796 1797 1798 /* 1799 * It is hooked at the 
NF_INET_FORWARD chain, in order to catch ICMP 1800 * related packets destined for 0.0.0.0/0. 1801 * When fwmark-based virtual service is used, such as transparent 1802 * cache cluster, TCP packets can be marked and routed to ip_vs_in, 1803 * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and 1804 * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain 1805 * and send them to ip_vs_in_icmp. 1806 */ 1807 static unsigned int 1808 ip_vs_forward_icmp(const struct nf_hook_ops *ops, struct sk_buff *skb, 1809 const struct net_device *in, const struct net_device *out, 1810 int (*okfn)(struct sk_buff *)) 1811 { 1812 int r; 1813 struct net *net; 1814 struct netns_ipvs *ipvs; 1815 1816 if (ip_hdr(skb)->protocol != IPPROTO_ICMP) 1817 return NF_ACCEPT; 1818 1819 /* ipvs enabled in this netns ? */ 1820 net = skb_net(skb); 1821 ipvs = net_ipvs(net); 1822 if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) 1823 return NF_ACCEPT; 1824 1825 return ip_vs_in_icmp(skb, &r, ops->hooknum); 1826 } 1827 1828 #ifdef CONFIG_IP_VS_IPV6 1829 static unsigned int 1830 ip_vs_forward_icmp_v6(const struct nf_hook_ops *ops, struct sk_buff *skb, 1831 const struct net_device *in, const struct net_device *out, 1832 int (*okfn)(struct sk_buff *)) 1833 { 1834 int r; 1835 struct net *net; 1836 struct netns_ipvs *ipvs; 1837 struct ip_vs_iphdr iphdr; 1838 1839 ip_vs_fill_iph_skb(AF_INET6, skb, &iphdr); 1840 if (iphdr.protocol != IPPROTO_ICMPV6) 1841 return NF_ACCEPT; 1842 1843 /* ipvs enabled in this netns ? 
*/ 1844 net = skb_net(skb); 1845 ipvs = net_ipvs(net); 1846 if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) 1847 return NF_ACCEPT; 1848 1849 return ip_vs_in_icmp_v6(skb, &r, ops->hooknum, &iphdr); 1850 } 1851 #endif 1852 1853 1854 static struct nf_hook_ops ip_vs_ops[] __read_mostly = { 1855 /* After packet filtering, change source only for VS/NAT */ 1856 { 1857 .hook = ip_vs_reply4, 1858 .owner = THIS_MODULE, 1859 .pf = NFPROTO_IPV4, 1860 .hooknum = NF_INET_LOCAL_IN, 1861 .priority = NF_IP_PRI_NAT_SRC - 2, 1862 }, 1863 /* After packet filtering, forward packet through VS/DR, VS/TUN, 1864 * or VS/NAT(change destination), so that filtering rules can be 1865 * applied to IPVS. */ 1866 { 1867 .hook = ip_vs_remote_request4, 1868 .owner = THIS_MODULE, 1869 .pf = NFPROTO_IPV4, 1870 .hooknum = NF_INET_LOCAL_IN, 1871 .priority = NF_IP_PRI_NAT_SRC - 1, 1872 }, 1873 /* Before ip_vs_in, change source only for VS/NAT */ 1874 { 1875 .hook = ip_vs_local_reply4, 1876 .owner = THIS_MODULE, 1877 .pf = NFPROTO_IPV4, 1878 .hooknum = NF_INET_LOCAL_OUT, 1879 .priority = NF_IP_PRI_NAT_DST + 1, 1880 }, 1881 /* After mangle, schedule and forward local requests */ 1882 { 1883 .hook = ip_vs_local_request4, 1884 .owner = THIS_MODULE, 1885 .pf = NFPROTO_IPV4, 1886 .hooknum = NF_INET_LOCAL_OUT, 1887 .priority = NF_IP_PRI_NAT_DST + 2, 1888 }, 1889 /* After packet filtering (but before ip_vs_out_icmp), catch icmp 1890 * destined for 0.0.0.0/0, which is for incoming IPVS connections */ 1891 { 1892 .hook = ip_vs_forward_icmp, 1893 .owner = THIS_MODULE, 1894 .pf = NFPROTO_IPV4, 1895 .hooknum = NF_INET_FORWARD, 1896 .priority = 99, 1897 }, 1898 /* After packet filtering, change source only for VS/NAT */ 1899 { 1900 .hook = ip_vs_reply4, 1901 .owner = THIS_MODULE, 1902 .pf = NFPROTO_IPV4, 1903 .hooknum = NF_INET_FORWARD, 1904 .priority = 100, 1905 }, 1906 #ifdef CONFIG_IP_VS_IPV6 1907 /* After packet filtering, change source only for VS/NAT */ 1908 { 1909 .hook = ip_vs_reply6, 1910 .owner = 
THIS_MODULE, 1911 .pf = NFPROTO_IPV6, 1912 .hooknum = NF_INET_LOCAL_IN, 1913 .priority = NF_IP6_PRI_NAT_SRC - 2, 1914 }, 1915 /* After packet filtering, forward packet through VS/DR, VS/TUN, 1916 * or VS/NAT(change destination), so that filtering rules can be 1917 * applied to IPVS. */ 1918 { 1919 .hook = ip_vs_remote_request6, 1920 .owner = THIS_MODULE, 1921 .pf = NFPROTO_IPV6, 1922 .hooknum = NF_INET_LOCAL_IN, 1923 .priority = NF_IP6_PRI_NAT_SRC - 1, 1924 }, 1925 /* Before ip_vs_in, change source only for VS/NAT */ 1926 { 1927 .hook = ip_vs_local_reply6, 1928 .owner = THIS_MODULE, 1929 .pf = NFPROTO_IPV6, 1930 .hooknum = NF_INET_LOCAL_OUT, 1931 .priority = NF_IP6_PRI_NAT_DST + 1, 1932 }, 1933 /* After mangle, schedule and forward local requests */ 1934 { 1935 .hook = ip_vs_local_request6, 1936 .owner = THIS_MODULE, 1937 .pf = NFPROTO_IPV6, 1938 .hooknum = NF_INET_LOCAL_OUT, 1939 .priority = NF_IP6_PRI_NAT_DST + 2, 1940 }, 1941 /* After packet filtering (but before ip_vs_out_icmp), catch icmp 1942 * destined for 0.0.0.0/0, which is for incoming IPVS connections */ 1943 { 1944 .hook = ip_vs_forward_icmp_v6, 1945 .owner = THIS_MODULE, 1946 .pf = NFPROTO_IPV6, 1947 .hooknum = NF_INET_FORWARD, 1948 .priority = 99, 1949 }, 1950 /* After packet filtering, change source only for VS/NAT */ 1951 { 1952 .hook = ip_vs_reply6, 1953 .owner = THIS_MODULE, 1954 .pf = NFPROTO_IPV6, 1955 .hooknum = NF_INET_FORWARD, 1956 .priority = 100, 1957 }, 1958 #endif 1959 }; 1960 /* 1961 * Initialize IP Virtual Server netns mem. 
1962 */ 1963 static int __net_init __ip_vs_init(struct net *net) 1964 { 1965 struct netns_ipvs *ipvs; 1966 1967 ipvs = net_generic(net, ip_vs_net_id); 1968 if (ipvs == NULL) 1969 return -ENOMEM; 1970 1971 /* Hold the beast until a service is registerd */ 1972 ipvs->enable = 0; 1973 ipvs->net = net; 1974 /* Counters used for creating unique names */ 1975 ipvs->gen = atomic_read(&ipvs_netns_cnt); 1976 atomic_inc(&ipvs_netns_cnt); 1977 net->ipvs = ipvs; 1978 1979 if (ip_vs_estimator_net_init(net) < 0) 1980 goto estimator_fail; 1981 1982 if (ip_vs_control_net_init(net) < 0) 1983 goto control_fail; 1984 1985 if (ip_vs_protocol_net_init(net) < 0) 1986 goto protocol_fail; 1987 1988 if (ip_vs_app_net_init(net) < 0) 1989 goto app_fail; 1990 1991 if (ip_vs_conn_net_init(net) < 0) 1992 goto conn_fail; 1993 1994 if (ip_vs_sync_net_init(net) < 0) 1995 goto sync_fail; 1996 1997 printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n", 1998 sizeof(struct netns_ipvs), ipvs->gen); 1999 return 0; 2000 /* 2001 * Error handling 2002 */ 2003 2004 sync_fail: 2005 ip_vs_conn_net_cleanup(net); 2006 conn_fail: 2007 ip_vs_app_net_cleanup(net); 2008 app_fail: 2009 ip_vs_protocol_net_cleanup(net); 2010 protocol_fail: 2011 ip_vs_control_net_cleanup(net); 2012 control_fail: 2013 ip_vs_estimator_net_cleanup(net); 2014 estimator_fail: 2015 net->ipvs = NULL; 2016 return -ENOMEM; 2017 } 2018 2019 static void __net_exit __ip_vs_cleanup(struct net *net) 2020 { 2021 ip_vs_service_net_cleanup(net); /* ip_vs_flush() with locks */ 2022 ip_vs_conn_net_cleanup(net); 2023 ip_vs_app_net_cleanup(net); 2024 ip_vs_protocol_net_cleanup(net); 2025 ip_vs_control_net_cleanup(net); 2026 ip_vs_estimator_net_cleanup(net); 2027 IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen); 2028 net->ipvs = NULL; 2029 } 2030 2031 static void __net_exit __ip_vs_dev_cleanup(struct net *net) 2032 { 2033 EnterFunction(2); 2034 net_ipvs(net)->enable = 0; /* Disable packet reception */ 2035 smp_wmb(); 2036 
ip_vs_sync_net_cleanup(net); 2037 LeaveFunction(2); 2038 } 2039 2040 static struct pernet_operations ipvs_core_ops = { 2041 .init = __ip_vs_init, 2042 .exit = __ip_vs_cleanup, 2043 .id = &ip_vs_net_id, 2044 .size = sizeof(struct netns_ipvs), 2045 }; 2046 2047 static struct pernet_operations ipvs_core_dev_ops = { 2048 .exit = __ip_vs_dev_cleanup, 2049 }; 2050 2051 /* 2052 * Initialize IP Virtual Server 2053 */ 2054 static int __init ip_vs_init(void) 2055 { 2056 int ret; 2057 2058 ret = ip_vs_control_init(); 2059 if (ret < 0) { 2060 pr_err("can't setup control.\n"); 2061 goto exit; 2062 } 2063 2064 ip_vs_protocol_init(); 2065 2066 ret = ip_vs_conn_init(); 2067 if (ret < 0) { 2068 pr_err("can't setup connection table.\n"); 2069 goto cleanup_protocol; 2070 } 2071 2072 ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */ 2073 if (ret < 0) 2074 goto cleanup_conn; 2075 2076 ret = register_pernet_device(&ipvs_core_dev_ops); 2077 if (ret < 0) 2078 goto cleanup_sub; 2079 2080 ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); 2081 if (ret < 0) { 2082 pr_err("can't register hooks.\n"); 2083 goto cleanup_dev; 2084 } 2085 2086 ret = ip_vs_register_nl_ioctl(); 2087 if (ret < 0) { 2088 pr_err("can't register netlink/ioctl.\n"); 2089 goto cleanup_hooks; 2090 } 2091 2092 pr_info("ipvs loaded.\n"); 2093 2094 return ret; 2095 2096 cleanup_hooks: 2097 nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); 2098 cleanup_dev: 2099 unregister_pernet_device(&ipvs_core_dev_ops); 2100 cleanup_sub: 2101 unregister_pernet_subsys(&ipvs_core_ops); 2102 cleanup_conn: 2103 ip_vs_conn_cleanup(); 2104 cleanup_protocol: 2105 ip_vs_protocol_cleanup(); 2106 ip_vs_control_cleanup(); 2107 exit: 2108 return ret; 2109 } 2110 2111 static void __exit ip_vs_cleanup(void) 2112 { 2113 ip_vs_unregister_nl_ioctl(); 2114 nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); 2115 unregister_pernet_device(&ipvs_core_dev_ops); 2116 unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs 
struct */ 2117 ip_vs_conn_cleanup(); 2118 ip_vs_protocol_cleanup(); 2119 ip_vs_control_cleanup(); 2120 pr_info("ipvs unloaded.\n"); 2121 } 2122 2123 module_init(ip_vs_init); 2124 module_exit(ip_vs_cleanup); 2125 MODULE_LICENSE("GPL"); 2126
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.