1 /* 2 * IPVS An implementation of the IP virtual server support for the 3 * LINUX operating system. IPVS is now implemented as a module 4 * over the NetFilter framework. IPVS can be used to build a 5 * high-performance and highly available server based on a 6 * cluster of servers. 7 * 8 * Version 1, is capable of handling both version 0 and 1 messages. 9 * Version 0 is the plain old format. 10 * Note Version 0 receivers will just drop Ver 1 messages. 11 * Version 1 is capable of handle IPv6, Persistence data, 12 * time-outs, and firewall marks. 13 * In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order. 14 * Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0 15 * 16 * Definitions Message: is a complete datagram 17 * Sync_conn: is a part of a Message 18 * Param Data is an option to a Sync_conn. 19 * 20 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 21 * 22 * ip_vs_sync: sync connection info from master load balancer to backups 23 * through multicast 24 * 25 * Changes: 26 * Alexandre Cassen : Added master & backup support at a time. 27 * Alexandre Cassen : Added SyncID support for incoming sync 28 * messages filtering. 29 * Justin Ossevoort : Fix endian problem on sync message size. 30 * Hans Schillstrom : Added Version 1: i.e. IPv6, 31 * Persistence support, fwmark and time-out. 32 */ 33 34 #define KMSG_COMPONENT "IPVS" 35 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 36 37 #include <linux/module.h> 38 #include <linux/slab.h> 39 #include <linux/inetdevice.h> 40 #include <linux/net.h> 41 #include <linux/completion.h> 42 #include <linux/delay.h> 43 #include <linux/skbuff.h> 44 #include <linux/in.h> 45 #include <linux/igmp.h> /* for ip_mc_join_group */ 46 #include <linux/udp.h> 47 #include <linux/err.h> 48 #include <linux/kthread.h> 49 #include <linux/wait.h> 50 #include <linux/kernel.h> 51 52 #include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */ 53 54 #include <net/ip.h> 55 #include <net/sock.h> 56 57 #include <net/ip_vs.h> 58 59 #define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ 60 #define IP_VS_SYNC_PORT 8848 /* multicast port */ 61 62 #define SYNC_PROTO_VER 1 /* Protocol version in header */ 63 64 static struct lock_class_key __ipvs_sync_key; 65 /* 66 * IPVS sync connection entry 67 * Version 0, i.e. original version. 68 */ 69 struct ip_vs_sync_conn_v0 { 70 __u8 reserved; 71 72 /* Protocol, addresses and port numbers */ 73 __u8 protocol; /* Which protocol (TCP/UDP) */ 74 __be16 cport; 75 __be16 vport; 76 __be16 dport; 77 __be32 caddr; /* client address */ 78 __be32 vaddr; /* virtual address */ 79 __be32 daddr; /* destination address */ 80 81 /* Flags and state transition */ 82 __be16 flags; /* status flags */ 83 __be16 state; /* state info */ 84 85 /* The sequence options start here */ 86 }; 87 88 struct ip_vs_sync_conn_options { 89 struct ip_vs_seq in_seq; /* incoming seq. struct */ 90 struct ip_vs_seq out_seq; /* outgoing seq. struct */ 91 }; 92 93 /* 94 Sync Connection format (sync_conn) 95 96 0 1 2 3 97 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 98 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 99 | Type | Protocol | Ver. | Size | 100 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 101 | Flags | 102 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 103 | State | cport | 104 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 105 | vport | dport | 106 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 107 | fwmark | 108 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 109 | timeout (in sec.) | 110 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 111 | ... | 112 | IP-Addresses (v4 or v6) | 113 | ... | 114 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 115 Optional Parameters. 116 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 117 | Param. Type | Param. Length | Param. data | 118 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 119 | ... | 120 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 121 | | Param Type | Param. Length | 122 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 123 | Param data | 124 | Last Param data should be padded for 32 bit alignment | 125 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 126 */ 127 128 /* 129 * Type 0, IPv4 sync connection format 130 */ 131 struct ip_vs_sync_v4 { 132 __u8 type; 133 __u8 protocol; /* Which protocol (TCP/UDP) */ 134 __be16 ver_size; /* Version msb 4 bits */ 135 /* Flags and state transition */ 136 __be32 flags; /* status flags */ 137 __be16 state; /* state info */ 138 /* Protocol, addresses and port numbers */ 139 __be16 cport; 140 __be16 vport; 141 __be16 dport; 142 __be32 fwmark; /* Firewall mark from skb */ 143 __be32 timeout; /* cp timeout */ 144 __be32 caddr; /* client address */ 145 __be32 vaddr; /* virtual address */ 146 __be32 daddr; /* destination address */ 147 /* The sequence options start here */ 148 /* PE data padded to 32bit alignment after seq. options */ 149 }; 150 /* 151 * Type 2 messages IPv6 152 */ 153 struct ip_vs_sync_v6 { 154 __u8 type; 155 __u8 protocol; /* Which protocol (TCP/UDP) */ 156 __be16 ver_size; /* Version msb 4 bits */ 157 /* Flags and state transition */ 158 __be32 flags; /* status flags */ 159 __be16 state; /* state info */ 160 /* Protocol, addresses and port numbers */ 161 __be16 cport; 162 __be16 vport; 163 __be16 dport; 164 __be32 fwmark; /* Firewall mark from skb */ 165 __be32 timeout; /* cp timeout */ 166 struct in6_addr caddr; /* client address */ 167 struct in6_addr vaddr; /* virtual address */ 168 struct in6_addr daddr; /* destination address */ 169 /* The sequence options start here */ 170 /* PE data padded to 32bit alignment after seq. options */ 171 }; 172 173 union ip_vs_sync_conn { 174 struct ip_vs_sync_v4 v4; 175 struct ip_vs_sync_v6 v6; 176 }; 177 178 /* Bits in Type field in above */ 179 #define STYPE_INET6 0 180 #define STYPE_F_INET6 (1 << STYPE_INET6) 181 182 #define SVER_SHIFT 12 /* Shift to get version */ 183 #define SVER_MASK 0x0fff /* Mask to strip version */ 184 185 #define IPVS_OPT_SEQ_DATA 1 186 #define IPVS_OPT_PE_DATA 2 187 #define IPVS_OPT_PE_NAME 3 188 #define IPVS_OPT_PARAM 7 189 190 #define IPVS_OPT_F_SEQ_DATA (1 << (IPVS_OPT_SEQ_DATA-1)) 191 #define IPVS_OPT_F_PE_DATA (1 << (IPVS_OPT_PE_DATA-1)) 192 #define IPVS_OPT_F_PE_NAME (1 << (IPVS_OPT_PE_NAME-1)) 193 #define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1)) 194 195 struct ip_vs_sync_thread_data { 196 struct netns_ipvs *ipvs; 197 struct socket *sock; 198 char *buf; 199 int id; 200 }; 201 202 /* Version 0 definition of packet sizes */ 203 #define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn_v0)) 204 #define FULL_CONN_SIZE \ 205 (sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options)) 206 207 208 /* 209 The master mulitcasts messages (Datagrams) to the backup load balancers 210 in the following format. 211 212 Version 1: 213 Note, first byte should be Zero, so ver 0 receivers will drop the packet. 214 215 0 1 2 3 216 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 217 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 218 | 0 | SyncID | Size | 219 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 220 | Count Conns | Version | Reserved, set to Zero | 221 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 222 | | 223 | IPVS Sync Connection (1) | 224 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 225 | . | 226 ~ . ~ 227 | . | 228 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 229 | | 230 | IPVS Sync Connection (n) | 231 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 232 233 Version 0 Header 234 0 1 2 3 235 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 236 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 237 | Count Conns | SyncID | Size | 238 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 239 | IPVS Sync Connection (1) | 240 */ 241 242 #define SYNC_MESG_HEADER_LEN 4 243 #define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */ 244 245 /* Version 0 header */ 246 struct ip_vs_sync_mesg_v0 { 247 __u8 nr_conns; 248 __u8 syncid; 249 __be16 size; 250 251 /* ip_vs_sync_conn entries start here */ 252 }; 253 254 /* Version 1 header */ 255 struct ip_vs_sync_mesg { 256 __u8 reserved; /* must be zero */ 257 __u8 syncid; 258 __be16 size; 259 __u8 nr_conns; 260 __s8 version; /* SYNC_PROTO_VER */ 261 __u16 spare; 262 /* ip_vs_sync_conn entries start here */ 263 }; 264 265 union ipvs_sockaddr { 266 struct sockaddr_in in; 267 struct sockaddr_in6 in6; 268 }; 269 270 struct ip_vs_sync_buff { 271 struct list_head list; 272 unsigned long firstuse; 273 274 /* pointers for the message data */ 275 struct ip_vs_sync_mesg *mesg; 276 unsigned char *head; 277 unsigned char *end; 278 }; 279 280 /* 281 * Copy of struct ip_vs_seq 282 * From unaligned network order to aligned host order 283 */ 284 static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho) 285 { 286 memset(ho, 0, sizeof(*ho)); 287 ho->init_seq = get_unaligned_be32(&no->init_seq); 288 ho->delta = get_unaligned_be32(&no->delta); 289 ho->previous_delta = get_unaligned_be32(&no->previous_delta); 290 } 291 292 /* 293 * Copy of struct ip_vs_seq 294 * From Aligned host order to unaligned network order 295 */ 296 static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no) 297 { 298 put_unaligned_be32(ho->init_seq, &no->init_seq); 299 put_unaligned_be32(ho->delta, &no->delta); 300 put_unaligned_be32(ho->previous_delta, &no->previous_delta); 301 } 302 303 static inline struct ip_vs_sync_buff * 304 sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) 305 { 306 struct ip_vs_sync_buff *sb; 307 308 spin_lock_bh(&ipvs->sync_lock); 309 if (list_empty(&ms->sync_queue)) { 310 sb = NULL; 311 __set_current_state(TASK_INTERRUPTIBLE); 312 } else { 313 sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff, 314 list); 315 list_del(&sb->list); 316 ms->sync_queue_len--; 317 if (!ms->sync_queue_len) 318 ms->sync_queue_delay = 0; 319 } 320 spin_unlock_bh(&ipvs->sync_lock); 321 322 return sb; 323 } 324 325 /* 326 * Create a new sync buffer for Version 1 proto. 327 */ 328 static inline struct ip_vs_sync_buff * 329 ip_vs_sync_buff_create(struct netns_ipvs *ipvs, unsigned int len) 330 { 331 struct ip_vs_sync_buff *sb; 332 333 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) 334 return NULL; 335 336 len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg), 337 ipvs->mcfg.sync_maxlen); 338 sb->mesg = kmalloc(len, GFP_ATOMIC); 339 if (!sb->mesg) { 340 kfree(sb); 341 return NULL; 342 } 343 sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */ 344 sb->mesg->version = SYNC_PROTO_VER; 345 sb->mesg->syncid = ipvs->mcfg.syncid; 346 sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg)); 347 sb->mesg->nr_conns = 0; 348 sb->mesg->spare = 0; 349 sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg); 350 sb->end = (unsigned char *)sb->mesg + len; 351 352 sb->firstuse = jiffies; 353 return sb; 354 } 355 356 static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) 357 { 358 kfree(sb->mesg); 359 kfree(sb); 360 } 361 362 static inline void sb_queue_tail(struct netns_ipvs *ipvs, 363 struct ipvs_master_sync_state *ms) 364 { 365 struct ip_vs_sync_buff *sb = ms->sync_buff; 366 367 spin_lock(&ipvs->sync_lock); 368 if (ipvs->sync_state & IP_VS_STATE_MASTER && 369 ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) { 370 if (!ms->sync_queue_len) 371 schedule_delayed_work(&ms->master_wakeup_work, 372 max(IPVS_SYNC_SEND_DELAY, 1)); 373 ms->sync_queue_len++; 374 list_add_tail(&sb->list, &ms->sync_queue); 375 if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) 376 wake_up_process(ms->master_thread); 377 } else 378 ip_vs_sync_buff_release(sb); 379 spin_unlock(&ipvs->sync_lock); 380 } 381 382 /* 383 * Get the current sync buffer if it has been created for more 384 * than the specified time or the specified time is zero. 385 */ 386 static inline struct ip_vs_sync_buff * 387 get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms, 388 unsigned long time) 389 { 390 struct ip_vs_sync_buff *sb; 391 392 spin_lock_bh(&ipvs->sync_buff_lock); 393 sb = ms->sync_buff; 394 if (sb && time_after_eq(jiffies - sb->firstuse, time)) { 395 ms->sync_buff = NULL; 396 __set_current_state(TASK_RUNNING); 397 } else 398 sb = NULL; 399 spin_unlock_bh(&ipvs->sync_buff_lock); 400 return sb; 401 } 402 403 static inline int 404 select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp) 405 { 406 return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask; 407 } 408 409 /* 410 * Create a new sync buffer for Version 0 proto. 411 */ 412 static inline struct ip_vs_sync_buff * 413 ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs, unsigned int len) 414 { 415 struct ip_vs_sync_buff *sb; 416 struct ip_vs_sync_mesg_v0 *mesg; 417 418 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) 419 return NULL; 420 421 len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg_v0), 422 ipvs->mcfg.sync_maxlen); 423 sb->mesg = kmalloc(len, GFP_ATOMIC); 424 if (!sb->mesg) { 425 kfree(sb); 426 return NULL; 427 } 428 mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg; 429 mesg->nr_conns = 0; 430 mesg->syncid = ipvs->mcfg.syncid; 431 mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0)); 432 sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0); 433 sb->end = (unsigned char *)mesg + len; 434 sb->firstuse = jiffies; 435 return sb; 436 } 437 438 /* Check if connection is controlled by persistence */ 439 static inline bool in_persistence(struct ip_vs_conn *cp) 440 { 441 for (cp = cp->control; cp; cp = cp->control) { 442 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 443 return true; 444 } 445 return false; 446 } 447 448 /* Check if conn should be synced. 449 * pkts: conn packets, use sysctl_sync_threshold to avoid packet check 450 * - (1) sync_refresh_period: reduce sync rate. Additionally, retry 451 * sync_retries times with period of sync_refresh_period/8 452 * - (2) if both sync_refresh_period and sync_period are 0 send sync only 453 * for state changes or only once when pkts matches sync_threshold 454 * - (3) templates: rate can be reduced only with sync_refresh_period or 455 * with (2) 456 */ 457 static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs, 458 struct ip_vs_conn *cp, int pkts) 459 { 460 unsigned long orig = ACCESS_ONCE(cp->sync_endtime); 461 unsigned long now = jiffies; 462 unsigned long n = (now + cp->timeout) & ~3UL; 463 unsigned int sync_refresh_period; 464 int sync_period; 465 int force; 466 467 /* Check if we sync in current state */ 468 if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE)) 469 force = 0; 470 else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp))) 471 return 0; 472 else if (likely(cp->protocol == IPPROTO_TCP)) { 473 if (!((1 << cp->state) & 474 ((1 << IP_VS_TCP_S_ESTABLISHED) | 475 (1 << IP_VS_TCP_S_FIN_WAIT) | 476 (1 << IP_VS_TCP_S_CLOSE) | 477 (1 << IP_VS_TCP_S_CLOSE_WAIT) | 478 (1 << IP_VS_TCP_S_TIME_WAIT)))) 479 return 0; 480 force = cp->state != cp->old_state; 481 if (force && cp->state != IP_VS_TCP_S_ESTABLISHED) 482 goto set; 483 } else if (unlikely(cp->protocol == IPPROTO_SCTP)) { 484 if (!((1 << cp->state) & 485 ((1 << IP_VS_SCTP_S_ESTABLISHED) | 486 (1 << IP_VS_SCTP_S_SHUTDOWN_SENT) | 487 (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) | 488 (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) | 489 (1 << IP_VS_SCTP_S_CLOSED)))) 490 return 0; 491 force = cp->state != cp->old_state; 492 if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED) 493 goto set; 494 } else { 495 /* UDP or another protocol with single state */ 496 force = 0; 497 } 498 499 sync_refresh_period = sysctl_sync_refresh_period(ipvs); 500 if (sync_refresh_period > 0) { 501 long diff = n - orig; 502 long min_diff = max(cp->timeout >> 1, 10UL * HZ); 503 504 /* Avoid sync if difference is below sync_refresh_period 505 * and below the half timeout. 506 */ 507 if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) { 508 int retries = orig & 3; 509 510 if (retries >= sysctl_sync_retries(ipvs)) 511 return 0; 512 if (time_before(now, orig - cp->timeout + 513 (sync_refresh_period >> 3))) 514 return 0; 515 n |= retries + 1; 516 } 517 } 518 sync_period = sysctl_sync_period(ipvs); 519 if (sync_period > 0) { 520 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) && 521 pkts % sync_period != sysctl_sync_threshold(ipvs)) 522 return 0; 523 } else if (!sync_refresh_period && 524 pkts != sysctl_sync_threshold(ipvs)) 525 return 0; 526 527 set: 528 cp->old_state = cp->state; 529 n = cmpxchg(&cp->sync_endtime, orig, n); 530 return n == orig || force; 531 } 532 533 /* 534 * Version 0 , could be switched in by sys_ctl. 535 * Add an ip_vs_conn information into the current sync_buff. 536 */ 537 static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, 538 int pkts) 539 { 540 struct ip_vs_sync_mesg_v0 *m; 541 struct ip_vs_sync_conn_v0 *s; 542 struct ip_vs_sync_buff *buff; 543 struct ipvs_master_sync_state *ms; 544 int id; 545 unsigned int len; 546 547 if (unlikely(cp->af != AF_INET)) 548 return; 549 /* Do not sync ONE PACKET */ 550 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 551 return; 552 553 if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) 554 return; 555 556 spin_lock_bh(&ipvs->sync_buff_lock); 557 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { 558 spin_unlock_bh(&ipvs->sync_buff_lock); 559 return; 560 } 561 562 id = select_master_thread_id(ipvs, cp); 563 ms = &ipvs->ms[id]; 564 buff = ms->sync_buff; 565 len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : 566 SIMPLE_CONN_SIZE; 567 if (buff) { 568 m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; 569 /* Send buffer if it is for v1 */ 570 if (buff->head + len > buff->end || !m->nr_conns) { 571 sb_queue_tail(ipvs, ms); 572 ms->sync_buff = NULL; 573 buff = NULL; 574 } 575 } 576 if (!buff) { 577 buff = ip_vs_sync_buff_create_v0(ipvs, len); 578 if (!buff) { 579 spin_unlock_bh(&ipvs->sync_buff_lock); 580 pr_err("ip_vs_sync_buff_create failed.\n"); 581 return; 582 } 583 ms->sync_buff = buff; 584 } 585 586 m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; 587 s = (struct ip_vs_sync_conn_v0 *) buff->head; 588 589 /* copy members */ 590 s->reserved = 0; 591 s->protocol = cp->protocol; 592 s->cport = cp->cport; 593 s->vport = cp->vport; 594 s->dport = cp->dport; 595 s->caddr = cp->caddr.ip; 596 s->vaddr = cp->vaddr.ip; 597 s->daddr = cp->daddr.ip; 598 s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); 599 s->state = htons(cp->state); 600 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { 601 struct ip_vs_sync_conn_options *opt = 602 (struct ip_vs_sync_conn_options *)&s[1]; 603 memcpy(opt, &cp->in_seq, sizeof(*opt)); 604 } 605 606 m->nr_conns++; 607 m->size = htons(ntohs(m->size) + len); 608 buff->head += len; 609 spin_unlock_bh(&ipvs->sync_buff_lock); 610 611 /* synchronize its controller if it has */ 612 cp = cp->control; 613 if (cp) { 614 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 615 pkts = atomic_add_return(1, &cp->in_pkts); 616 else 617 pkts = sysctl_sync_threshold(ipvs); 618 ip_vs_sync_conn(ipvs, cp, pkts); 619 } 620 } 621 622 /* 623 * Add an ip_vs_conn information into the current sync_buff. 624 * Called by ip_vs_in. 625 * Sending Version 1 messages 626 */ 627 void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts) 628 { 629 struct ip_vs_sync_mesg *m; 630 union ip_vs_sync_conn *s; 631 struct ip_vs_sync_buff *buff; 632 struct ipvs_master_sync_state *ms; 633 int id; 634 __u8 *p; 635 unsigned int len, pe_name_len, pad; 636 637 /* Handle old version of the protocol */ 638 if (sysctl_sync_ver(ipvs) == 0) { 639 ip_vs_sync_conn_v0(ipvs, cp, pkts); 640 return; 641 } 642 /* Do not sync ONE PACKET */ 643 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 644 goto control; 645 sloop: 646 if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) 647 goto control; 648 649 /* Sanity checks */ 650 pe_name_len = 0; 651 if (cp->pe_data_len) { 652 if (!cp->pe_data || !cp->dest) { 653 IP_VS_ERR_RL("SYNC, connection pe_data invalid\n"); 654 return; 655 } 656 pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN); 657 } 658 659 spin_lock_bh(&ipvs->sync_buff_lock); 660 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { 661 spin_unlock_bh(&ipvs->sync_buff_lock); 662 return; 663 } 664 665 id = select_master_thread_id(ipvs, cp); 666 ms = &ipvs->ms[id]; 667 668 #ifdef CONFIG_IP_VS_IPV6 669 if (cp->af == AF_INET6) 670 len = sizeof(struct ip_vs_sync_v6); 671 else 672 #endif 673 len = sizeof(struct ip_vs_sync_v4); 674 675 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) 676 len += sizeof(struct ip_vs_sync_conn_options) + 2; 677 678 if (cp->pe_data_len) 679 len += cp->pe_data_len + 2; /* + Param hdr field */ 680 if (pe_name_len) 681 len += pe_name_len + 2; 682 683 /* check if there is a space for this one */ 684 pad = 0; 685 buff = ms->sync_buff; 686 if (buff) { 687 m = buff->mesg; 688 pad = (4 - (size_t) buff->head) & 3; 689 /* Send buffer if it is for v0 */ 690 if (buff->head + len + pad > buff->end || m->reserved) { 691 sb_queue_tail(ipvs, ms); 692 ms->sync_buff = NULL; 693 buff = NULL; 694 pad = 0; 695 } 696 } 697 698 if (!buff) { 699 buff = ip_vs_sync_buff_create(ipvs, len); 700 if (!buff) { 701 spin_unlock_bh(&ipvs->sync_buff_lock); 702 pr_err("ip_vs_sync_buff_create failed.\n"); 703 return; 704 } 705 ms->sync_buff = buff; 706 m = buff->mesg; 707 } 708 709 p = buff->head; 710 buff->head += pad + len; 711 m->size = htons(ntohs(m->size) + pad + len); 712 /* Add ev. padding from prev. sync_conn */ 713 while (pad--) 714 *(p++) = 0; 715 716 s = (union ip_vs_sync_conn *)p; 717 718 /* Set message type & copy members */ 719 s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0); 720 s->v4.ver_size = htons(len & SVER_MASK); /* Version 0 */ 721 s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED); 722 s->v4.state = htons(cp->state); 723 s->v4.protocol = cp->protocol; 724 s->v4.cport = cp->cport; 725 s->v4.vport = cp->vport; 726 s->v4.dport = cp->dport; 727 s->v4.fwmark = htonl(cp->fwmark); 728 s->v4.timeout = htonl(cp->timeout / HZ); 729 m->nr_conns++; 730 731 #ifdef CONFIG_IP_VS_IPV6 732 if (cp->af == AF_INET6) { 733 p += sizeof(struct ip_vs_sync_v6); 734 s->v6.caddr = cp->caddr.in6; 735 s->v6.vaddr = cp->vaddr.in6; 736 s->v6.daddr = cp->daddr.in6; 737 } else 738 #endif 739 { 740 p += sizeof(struct ip_vs_sync_v4); /* options ptr */ 741 s->v4.caddr = cp->caddr.ip; 742 s->v4.vaddr = cp->vaddr.ip; 743 s->v4.daddr = cp->daddr.ip; 744 } 745 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { 746 *(p++) = IPVS_OPT_SEQ_DATA; 747 *(p++) = sizeof(struct ip_vs_sync_conn_options); 748 hton_seq((struct ip_vs_seq *)p, &cp->in_seq); 749 p += sizeof(struct ip_vs_seq); 750 hton_seq((struct ip_vs_seq *)p, &cp->out_seq); 751 p += sizeof(struct ip_vs_seq); 752 } 753 /* Handle pe data */ 754 if (cp->pe_data_len && cp->pe_data) { 755 *(p++) = IPVS_OPT_PE_DATA; 756 *(p++) = cp->pe_data_len; 757 memcpy(p, cp->pe_data, cp->pe_data_len); 758 p += cp->pe_data_len; 759 if (pe_name_len) { 760 /* Add PE_NAME */ 761 *(p++) = IPVS_OPT_PE_NAME; 762 *(p++) = pe_name_len; 763 memcpy(p, cp->pe->name, pe_name_len); 764 p += pe_name_len; 765 } 766 } 767 768 spin_unlock_bh(&ipvs->sync_buff_lock); 769 770 control: 771 /* synchronize its controller if it has */ 772 cp = cp->control; 773 if (!cp) 774 return; 775 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 776 pkts = atomic_add_return(1, &cp->in_pkts); 777 else 778 pkts = sysctl_sync_threshold(ipvs); 779 goto sloop; 780 } 781 782 /* 783 * fill_param used by version 1 784 */ 785 static inline int 786 ip_vs_conn_fill_param_sync(struct netns_ipvs *ipvs, int af, union ip_vs_sync_conn *sc, 787 struct ip_vs_conn_param *p, 788 __u8 *pe_data, unsigned int pe_data_len, 789 __u8 *pe_name, unsigned int pe_name_len) 790 { 791 #ifdef CONFIG_IP_VS_IPV6 792 if (af == AF_INET6) 793 ip_vs_conn_fill_param(ipvs, af, sc->v6.protocol, 794 (const union nf_inet_addr *)&sc->v6.caddr, 795 sc->v6.cport, 796 (const union nf_inet_addr *)&sc->v6.vaddr, 797 sc->v6.vport, p); 798 else 799 #endif 800 ip_vs_conn_fill_param(ipvs, af, sc->v4.protocol, 801 (const union nf_inet_addr *)&sc->v4.caddr, 802 sc->v4.cport, 803 (const union nf_inet_addr *)&sc->v4.vaddr, 804 sc->v4.vport, p); 805 /* Handle pe data */ 806 if (pe_data_len) { 807 if (pe_name_len) { 808 char buff[IP_VS_PENAME_MAXLEN+1]; 809 810 memcpy(buff, pe_name, pe_name_len); 811 buff[pe_name_len]=0; 812 p->pe = __ip_vs_pe_getbyname(buff); 813 if (!p->pe) { 814 IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n", 815 buff); 816 return 1; 817 } 818 } else { 819 IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n"); 820 return 1; 821 } 822 823 p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC); 824 if (!p->pe_data) { 825 module_put(p->pe->module); 826 return -ENOMEM; 827 } 828 p->pe_data_len = pe_data_len; 829 } 830 return 0; 831 } 832 833 /* 834 * Connection Add / Update. 835 * Common for version 0 and 1 reception of backup sync_conns. 836 * Param: ... 837 * timeout is in sec. 838 */ 839 static void ip_vs_proc_conn(struct netns_ipvs *ipvs, struct ip_vs_conn_param *param, 840 unsigned int flags, unsigned int state, 841 unsigned int protocol, unsigned int type, 842 const union nf_inet_addr *daddr, __be16 dport, 843 unsigned long timeout, __u32 fwmark, 844 struct ip_vs_sync_conn_options *opt) 845 { 846 struct ip_vs_dest *dest; 847 struct ip_vs_conn *cp; 848 849 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 850 cp = ip_vs_conn_in_get(param); 851 if (cp && ((cp->dport != dport) || 852 !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) { 853 if (!(flags & IP_VS_CONN_F_INACTIVE)) { 854 ip_vs_conn_expire_now(cp); 855 __ip_vs_conn_put(cp); 856 cp = NULL; 857 } else { 858 /* This is the expiration message for the 859 * connection that was already replaced, so we 860 * just ignore it. 861 */ 862 __ip_vs_conn_put(cp); 863 kfree(param->pe_data); 864 return; 865 } 866 } 867 } else { 868 cp = ip_vs_ct_in_get(param); 869 } 870 871 if (cp) { 872 /* Free pe_data */ 873 kfree(param->pe_data); 874 875 dest = cp->dest; 876 spin_lock_bh(&cp->lock); 877 if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE && 878 !(flags & IP_VS_CONN_F_TEMPLATE) && dest) { 879 if (flags & IP_VS_CONN_F_INACTIVE) { 880 atomic_dec(&dest->activeconns); 881 atomic_inc(&dest->inactconns); 882 } else { 883 atomic_inc(&dest->activeconns); 884 atomic_dec(&dest->inactconns); 885 } 886 } 887 flags &= IP_VS_CONN_F_BACKUP_UPD_MASK; 888 flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK; 889 cp->flags = flags; 890 spin_unlock_bh(&cp->lock); 891 if (!dest) 892 ip_vs_try_bind_dest(cp); 893 } else { 894 /* 895 * Find the appropriate destination for the connection. 896 * If it is not found the connection will remain unbound 897 * but still handled. 898 */ 899 rcu_read_lock(); 900 /* This function is only invoked by the synchronization 901 * code. We do not currently support heterogeneous pools 902 * with synchronization, so we can make the assumption that 903 * the svc_af is the same as the dest_af 904 */ 905 dest = ip_vs_find_dest(ipvs, type, type, daddr, dport, 906 param->vaddr, param->vport, protocol, 907 fwmark, flags); 908 909 cp = ip_vs_conn_new(param, type, daddr, dport, flags, dest, 910 fwmark); 911 rcu_read_unlock(); 912 if (!cp) { 913 kfree(param->pe_data); 914 IP_VS_DBG(2, "BACKUP, add new conn. failed\n"); 915 return; 916 } 917 if (!(flags & IP_VS_CONN_F_TEMPLATE)) 918 kfree(param->pe_data); 919 } 920 921 if (opt) { 922 cp->in_seq = opt->in_seq; 923 cp->out_seq = opt->out_seq; 924 } 925 atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs)); 926 cp->state = state; 927 cp->old_state = cp->state; 928 /* 929 * For Ver 0 messages style 930 * - Not possible to recover the right timeout for templates 931 * - can not find the right fwmark 932 * virtual service. If needed, we can do it for 933 * non-fwmark persistent services. 934 * Ver 1 messages style. 935 * - No problem. 936 */ 937 if (timeout) { 938 if (timeout > MAX_SCHEDULE_TIMEOUT / HZ) 939 timeout = MAX_SCHEDULE_TIMEOUT / HZ; 940 cp->timeout = timeout*HZ; 941 } else { 942 struct ip_vs_proto_data *pd; 943 944 pd = ip_vs_proto_data_get(ipvs, protocol); 945 if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table) 946 cp->timeout = pd->timeout_table[state]; 947 else 948 cp->timeout = (3*60*HZ); 949 } 950 ip_vs_conn_put(cp); 951 } 952 953 /* 954 * Process received multicast message for Version 0 955 */ 956 static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer, 957 const size_t buflen) 958 { 959 struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer; 960 struct ip_vs_sync_conn_v0 *s; 961 struct ip_vs_sync_conn_options *opt; 962 struct ip_vs_protocol *pp; 963 struct ip_vs_conn_param param; 964 char *p; 965 int i; 966 967 p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0); 968 for (i=0; i<m->nr_conns; i++) { 969 unsigned int flags, state; 970 971 if (p + SIMPLE_CONN_SIZE > buffer+buflen) { 972 IP_VS_ERR_RL("BACKUP v0, bogus conn\n"); 973 return; 974 } 975 s = (struct ip_vs_sync_conn_v0 *) p; 976 flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; 977 flags &= ~IP_VS_CONN_F_HASHED; 978 if (flags & IP_VS_CONN_F_SEQ_MASK) { 979 opt = (struct ip_vs_sync_conn_options *)&s[1]; 980 p += FULL_CONN_SIZE; 981 if (p > buffer+buflen) { 982 IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n"); 983 return; 984 } 985 } else { 986 opt = NULL; 987 p += SIMPLE_CONN_SIZE; 988 } 989 990 state = ntohs(s->state); 991 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 992 pp = ip_vs_proto_get(s->protocol); 993 if (!pp) { 994 IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n", 995 s->protocol); 996 continue; 997 } 998 if (state >= pp->num_states) { 999 IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n", 1000 pp->name, state); 1001 continue; 1002 } 1003 } else { 1004 /* protocol in templates is not used for state/timeout */ 1005 if (state > 0) { 1006 IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n", 1007 state); 1008 state = 0; 1009 } 1010 } 1011 1012 ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol, 1013 (const union nf_inet_addr *)&s->caddr, 1014 s->cport, 1015 (const union nf_inet_addr *)&s->vaddr, 1016 s->vport, ¶m); 1017 1018 /* Send timeout as Zero */ 1019 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->protocol, AF_INET, 1020 (union nf_inet_addr *)&s->daddr, s->dport, 1021 0, 0, opt); 1022 } 1023 } 1024 1025 /* 1026 * Handle options 1027 */ 1028 static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen, 1029 __u32 *opt_flags, 1030 struct ip_vs_sync_conn_options *opt) 1031 { 1032 struct ip_vs_sync_conn_options *topt; 1033 1034 topt = (struct ip_vs_sync_conn_options *)p; 1035 1036 if (plen != sizeof(struct ip_vs_sync_conn_options)) { 1037 IP_VS_DBG(2, "BACKUP, bogus conn options length\n"); 1038 return -EINVAL; 1039 } 1040 if (*opt_flags & IPVS_OPT_F_SEQ_DATA) { 1041 IP_VS_DBG(2, "BACKUP, conn options found twice\n"); 1042 return -EINVAL; 1043 } 1044 ntoh_seq(&topt->in_seq, &opt->in_seq); 1045 ntoh_seq(&topt->out_seq, &opt->out_seq); 1046 *opt_flags |= IPVS_OPT_F_SEQ_DATA; 1047 return 0; 1048 } 1049 1050 static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len, 1051 __u8 **data, unsigned int maxlen, 1052 __u32 *opt_flags, __u32 flag) 1053 { 1054 if (plen > maxlen) { 1055 IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen); 1056 return -EINVAL; 1057 } 1058 if (*opt_flags & flag) { 1059 IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag); 1060 return -EINVAL; 1061 } 1062 *data_len = plen; 1063 *data = p; 1064 *opt_flags |= flag; 1065 return 0; 1066 } 1067 /* 1068 * Process a Version 1 sync. connection 1069 */ 1070 static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *msg_end) 1071 { 1072 struct ip_vs_sync_conn_options opt; 1073 union ip_vs_sync_conn *s; 1074 struct ip_vs_protocol *pp; 1075 struct ip_vs_conn_param param; 1076 __u32 flags; 1077 unsigned int af, state, pe_data_len=0, pe_name_len=0; 1078 __u8 *pe_data=NULL, *pe_name=NULL; 1079 __u32 opt_flags=0; 1080 int retc=0; 1081 1082 s = (union ip_vs_sync_conn *) p; 1083 1084 if (s->v6.type & STYPE_F_INET6) { 1085 #ifdef CONFIG_IP_VS_IPV6 1086 af = AF_INET6; 1087 p += sizeof(struct ip_vs_sync_v6); 1088 #else 1089 IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n"); 1090 retc = 10; 1091 goto out; 1092 #endif 1093 } else if (!s->v4.type) { 1094 af = AF_INET; 1095 p += sizeof(struct ip_vs_sync_v4); 1096 } else { 1097 return -10; 1098 } 1099 if (p > msg_end) 1100 return -20; 1101 1102 /* Process optional params check Type & Len. */ 1103 while (p < msg_end) { 1104 int ptype; 1105 int plen; 1106 1107 if (p+2 > msg_end) 1108 return -30; 1109 ptype = *(p++); 1110 plen = *(p++); 1111 1112 if (!plen || ((p + plen) > msg_end)) 1113 return -40; 1114 /* Handle seq option p = param data */ 1115 switch (ptype & ~IPVS_OPT_F_PARAM) { 1116 case IPVS_OPT_SEQ_DATA: 1117 if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt)) 1118 return -50; 1119 break; 1120 1121 case IPVS_OPT_PE_DATA: 1122 if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data, 1123 IP_VS_PEDATA_MAXLEN, &opt_flags, 1124 IPVS_OPT_F_PE_DATA)) 1125 return -60; 1126 break; 1127 1128 case IPVS_OPT_PE_NAME: 1129 if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name, 1130 IP_VS_PENAME_MAXLEN, &opt_flags, 1131 IPVS_OPT_F_PE_NAME)) 1132 return -70; 1133 break; 1134 1135 default: 1136 /* Param data mandatory ? */ 1137 if (!(ptype & IPVS_OPT_F_PARAM)) { 1138 IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n", 1139 ptype & ~IPVS_OPT_F_PARAM); 1140 retc = 20; 1141 goto out; 1142 } 1143 } 1144 p += plen; /* Next option */ 1145 } 1146 1147 /* Get flags and Mask off unsupported */ 1148 flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK; 1149 flags |= IP_VS_CONN_F_SYNC; 1150 state = ntohs(s->v4.state); 1151 1152 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 1153 pp = ip_vs_proto_get(s->v4.protocol); 1154 if (!pp) { 1155 IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n", 1156 s->v4.protocol); 1157 retc = 30; 1158 goto out; 1159 } 1160 if (state >= pp->num_states) { 1161 IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n", 1162 pp->name, state); 1163 retc = 40; 1164 goto out; 1165 } 1166 } else { 1167 /* protocol in templates is not used for state/timeout */ 1168 if (state > 0) { 1169 IP_VS_DBG(3, "BACKUP, Invalid template state %u\n", 1170 state); 1171 state = 0; 1172 } 1173 } 1174 if (ip_vs_conn_fill_param_sync(ipvs, af, s, ¶m, pe_data, 1175 pe_data_len, pe_name, pe_name_len)) { 1176 retc = 50; 1177 goto out; 1178 } 1179 /* If only IPv4, just silent skip IPv6 */ 1180 if (af == AF_INET) 1181 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v4.protocol, af, 1182 (union nf_inet_addr *)&s->v4.daddr, s->v4.dport, 1183 ntohl(s->v4.timeout), ntohl(s->v4.fwmark), 1184 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) 1185 ); 1186 #ifdef CONFIG_IP_VS_IPV6 1187 else 1188 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v6.protocol, af, 1189 (union nf_inet_addr *)&s->v6.daddr, s->v6.dport, 1190 ntohl(s->v6.timeout), ntohl(s->v6.fwmark), 1191 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) 1192 ); 1193 #endif 1194 ip_vs_pe_put(param.pe); 1195 return 0; 1196 /* Error exit */ 1197 out: 1198 IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc); 1199 return retc; 1200 1201 } 1202 /* 1203 * Process received multicast message and create the corresponding 1204 * ip_vs_conn entries. 1205 * Handles Version 0 & 1 1206 */ 1207 static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer, 1208 const size_t buflen) 1209 { 1210 struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer; 1211 __u8 *p, *msg_end; 1212 int i, nr_conns; 1213 1214 if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) { 1215 IP_VS_DBG(2, "BACKUP, message header too short\n"); 1216 return; 1217 } 1218 1219 if (buflen != ntohs(m2->size)) { 1220 IP_VS_DBG(2, "BACKUP, bogus message size\n"); 1221 return; 1222 } 1223 /* SyncID sanity check */ 1224 if (ipvs->bcfg.syncid != 0 && m2->syncid != ipvs->bcfg.syncid) { 1225 IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid); 1226 return; 1227 } 1228 /* Handle version 1 message */ 1229 if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0) 1230 && (m2->spare == 0)) { 1231 1232 msg_end = buffer + sizeof(struct ip_vs_sync_mesg); 1233 nr_conns = m2->nr_conns; 1234 1235 for (i=0; i<nr_conns; i++) { 1236 union ip_vs_sync_conn *s; 1237 unsigned int size; 1238 int retc; 1239 1240 p = msg_end; 1241 if (p + sizeof(s->v4) > buffer+buflen) { 1242 IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n"); 1243 return; 1244 } 1245 s = (union ip_vs_sync_conn *)p; 1246 size = ntohs(s->v4.ver_size) & SVER_MASK; 1247 msg_end = p + size; 1248 /* Basic sanity checks */ 1249 if (msg_end > buffer+buflen) { 1250 IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n"); 1251 return; 1252 } 1253 if (ntohs(s->v4.ver_size) >> SVER_SHIFT) { 1254 IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n", 1255 ntohs(s->v4.ver_size) >> SVER_SHIFT); 1256 return; 1257 } 1258 /* Process a single sync_conn */ 1259 retc = ip_vs_proc_sync_conn(ipvs, p, msg_end); 1260 if (retc < 0) { 1261 IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n", 1262 retc); 1263 return; 1264 } 1265 /* Make sure we have 32 bit alignment */ 1266 msg_end = p + ((size + 3) & ~3); 1267 } 1268 } else { 1269 /* Old type of message */ 1270 ip_vs_process_message_v0(ipvs, buffer, buflen); 1271 return; 1272 } 1273 } 1274 1275 1276 /* 1277 * Setup sndbuf (mode=1) or rcvbuf (mode=0) 1278 */ 1279 static void set_sock_size(struct sock *sk, int mode, int val) 1280 { 1281 /* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */ 1282 /* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */ 1283 lock_sock(sk); 1284 if (mode) { 1285 val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2, 1286 sysctl_wmem_max); 1287 sk->sk_sndbuf = val * 2; 1288 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 1289 } else { 1290 val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2, 1291 sysctl_rmem_max); 1292 sk->sk_rcvbuf = val * 2; 1293 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 1294 } 1295 release_sock(sk); 1296 } 1297 1298 /* 1299 * Setup loopback of outgoing multicasts on a sending socket 1300 */ 1301 static void set_mcast_loop(struct sock *sk, u_char loop) 1302 { 1303 struct inet_sock *inet = inet_sk(sk); 1304 1305 /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ 1306 lock_sock(sk); 1307 inet->mc_loop = loop ? 1 : 0; 1308 #ifdef CONFIG_IP_VS_IPV6 1309 if (sk->sk_family == AF_INET6) { 1310 struct ipv6_pinfo *np = inet6_sk(sk); 1311 1312 /* IPV6_MULTICAST_LOOP */ 1313 np->mc_loop = loop ? 1 : 0; 1314 } 1315 #endif 1316 release_sock(sk); 1317 } 1318 1319 /* 1320 * Specify TTL for outgoing multicasts on a sending socket 1321 */ 1322 static void set_mcast_ttl(struct sock *sk, u_char ttl) 1323 { 1324 struct inet_sock *inet = inet_sk(sk); 1325 1326 /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ 1327 lock_sock(sk); 1328 inet->mc_ttl = ttl; 1329 #ifdef CONFIG_IP_VS_IPV6 1330 if (sk->sk_family == AF_INET6) { 1331 struct ipv6_pinfo *np = inet6_sk(sk); 1332 1333 /* IPV6_MULTICAST_HOPS */ 1334 np->mcast_hops = ttl; 1335 } 1336 #endif 1337 release_sock(sk); 1338 } 1339 1340 /* Control fragmentation of messages */ 1341 static void set_mcast_pmtudisc(struct sock *sk, int val) 1342 { 1343 struct inet_sock *inet = inet_sk(sk); 1344 1345 /* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */ 1346 lock_sock(sk); 1347 inet->pmtudisc = val; 1348 #ifdef CONFIG_IP_VS_IPV6 1349 if (sk->sk_family == AF_INET6) { 1350 struct ipv6_pinfo *np = inet6_sk(sk); 1351 1352 /* IPV6_MTU_DISCOVER */ 1353 np->pmtudisc = val; 1354 } 1355 #endif 1356 release_sock(sk); 1357 } 1358 1359 /* 1360 * Specifiy default interface for outgoing multicasts 1361 */ 1362 static int set_mcast_if(struct sock *sk, char *ifname) 1363 { 1364 struct net_device *dev; 1365 struct inet_sock *inet = inet_sk(sk); 1366 struct net *net = sock_net(sk); 1367 1368 dev = __dev_get_by_name(net, ifname); 1369 if (!dev) 1370 return -ENODEV; 1371 1372 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1373 return -EINVAL; 1374 1375 lock_sock(sk); 1376 inet->mc_index = dev->ifindex; 1377 /* inet->mc_addr = 0; */ 1378 #ifdef CONFIG_IP_VS_IPV6 1379 if (sk->sk_family == AF_INET6) { 1380 struct ipv6_pinfo *np = inet6_sk(sk); 1381 1382 /* IPV6_MULTICAST_IF */ 1383 np->mcast_oif = dev->ifindex; 1384 } 1385 #endif 1386 release_sock(sk); 1387 1388 return 0; 1389 } 1390 1391 1392 /* 1393 * Join a multicast group. 1394 * the group is specified by a class D multicast address 224.0.0.0/8 1395 * in the in_addr structure passed in as a parameter. 1396 */ 1397 static int 1398 join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) 1399 { 1400 struct net *net = sock_net(sk); 1401 struct ip_mreqn mreq; 1402 struct net_device *dev; 1403 int ret; 1404 1405 memset(&mreq, 0, sizeof(mreq)); 1406 memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); 1407 1408 dev = __dev_get_by_name(net, ifname); 1409 if (!dev) 1410 return -ENODEV; 1411 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1412 return -EINVAL; 1413 1414 mreq.imr_ifindex = dev->ifindex; 1415 1416 lock_sock(sk); 1417 ret = ip_mc_join_group(sk, &mreq); 1418 release_sock(sk); 1419 1420 return ret; 1421 } 1422 1423 #ifdef CONFIG_IP_VS_IPV6 1424 static int join_mcast_group6(struct sock *sk, struct in6_addr *addr, 1425 char *ifname) 1426 { 1427 struct net *net = sock_net(sk); 1428 struct net_device *dev; 1429 int ret; 1430 1431 dev = __dev_get_by_name(net, ifname); 1432 if (!dev) 1433 return -ENODEV; 1434 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1435 return -EINVAL; 1436 1437 lock_sock(sk); 1438 ret = ipv6_sock_mc_join(sk, dev->ifindex, addr); 1439 release_sock(sk); 1440 1441 return ret; 1442 } 1443 #endif 1444 1445 static int bind_mcastif_addr(struct socket *sock, char *ifname) 1446 { 1447 struct net *net = sock_net(sock->sk); 1448 struct net_device *dev; 1449 __be32 addr; 1450 struct sockaddr_in sin; 1451 1452 dev = __dev_get_by_name(net, ifname); 1453 if (!dev) 1454 return -ENODEV; 1455 1456 addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); 1457 if (!addr) 1458 pr_err("You probably need to specify IP address on " 1459 "multicast interface.\n"); 1460 1461 IP_VS_DBG(7, "binding socket with (%s) %pI4\n", 1462 ifname, &addr); 1463 1464 /* Now bind the socket with the address of multicast interface */ 1465 sin.sin_family = AF_INET; 1466 sin.sin_addr.s_addr = addr; 1467 sin.sin_port = 0; 1468 1469 return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); 1470 } 1471 1472 static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen, 1473 struct ipvs_sync_daemon_cfg *c, int id) 1474 { 1475 if (AF_INET6 == c->mcast_af) { 1476 sa->in6 = (struct sockaddr_in6) { 1477 .sin6_family = AF_INET6, 1478 .sin6_port = htons(c->mcast_port + id), 1479 }; 1480 sa->in6.sin6_addr = c->mcast_group.in6; 1481 *salen = sizeof(sa->in6); 1482 } else { 1483 sa->in = (struct sockaddr_in) { 1484 .sin_family = AF_INET, 1485 .sin_port = htons(c->mcast_port + id), 1486 }; 1487 sa->in.sin_addr = c->mcast_group.in; 1488 *salen = sizeof(sa->in); 1489 } 1490 } 1491 1492 /* 1493 * Set up sending multicast socket over UDP 1494 */ 1495 static struct socket *make_send_sock(struct netns_ipvs *ipvs, int id) 1496 { 1497 /* multicast addr */ 1498 union ipvs_sockaddr mcast_addr; 1499 struct socket *sock; 1500 int result, salen; 1501 1502 /* First create a socket */ 1503 result = sock_create_kern(ipvs->net, ipvs->mcfg.mcast_af, SOCK_DGRAM, 1504 IPPROTO_UDP, &sock); 1505 if (result < 0) { 1506 pr_err("Error during creation of socket; terminating\n"); 1507 return ERR_PTR(result); 1508 } 1509 result = set_mcast_if(sock->sk, ipvs->mcfg.mcast_ifn); 1510 if (result < 0) { 1511 pr_err("Error setting outbound mcast interface\n"); 1512 goto error; 1513 } 1514 1515 set_mcast_loop(sock->sk, 0); 1516 set_mcast_ttl(sock->sk, ipvs->mcfg.mcast_ttl); 1517 /* Allow fragmentation if MTU changes */ 1518 set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT); 1519 result = sysctl_sync_sock_size(ipvs); 1520 if (result > 0) 1521 set_sock_size(sock->sk, 1, result); 1522 1523 if (AF_INET == ipvs->mcfg.mcast_af) 1524 result = bind_mcastif_addr(sock, ipvs->mcfg.mcast_ifn); 1525 else 1526 result = 0; 1527 if (result < 0) { 1528 pr_err("Error binding address of the mcast interface\n"); 1529 goto error; 1530 } 1531 1532 get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id); 1533 result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, 1534 salen, 0); 1535 if (result < 0) { 1536 pr_err("Error connecting to the multicast addr\n"); 1537 goto error; 1538 } 1539 1540 return sock; 1541 1542 error: 1543 sock_release(sock); 1544 return ERR_PTR(result); 1545 } 1546 1547 1548 /* 1549 * Set up receiving multicast socket over UDP 1550 */ 1551 static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id, 1552 int ifindex) 1553 { 1554 /* multicast addr */ 1555 union ipvs_sockaddr mcast_addr; 1556 struct socket *sock; 1557 int result, salen; 1558 1559 /* First create a socket */ 1560 result = sock_create_kern(ipvs->net, ipvs->bcfg.mcast_af, SOCK_DGRAM, 1561 IPPROTO_UDP, &sock); 1562 if (result < 0) { 1563 pr_err("Error during creation of socket; terminating\n"); 1564 return ERR_PTR(result); 1565 } 1566 /* it is equivalent to the REUSEADDR option in user-space */ 1567 sock->sk->sk_reuse = SK_CAN_REUSE; 1568 result = sysctl_sync_sock_size(ipvs); 1569 if (result > 0) 1570 set_sock_size(sock->sk, 0, result); 1571 1572 get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); 1573 sock->sk->sk_bound_dev_if = ifindex; 1574 result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen); 1575 if (result < 0) { 1576 pr_err("Error binding to the multicast addr\n"); 1577 goto error; 1578 } 1579 1580 /* join the multicast group */ 1581 #ifdef CONFIG_IP_VS_IPV6 1582 if (ipvs->bcfg.mcast_af == AF_INET6) 1583 result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr, 1584 ipvs->bcfg.mcast_ifn); 1585 else 1586 #endif 1587 result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr, 1588 ipvs->bcfg.mcast_ifn); 1589 if (result < 0) { 1590 pr_err("Error joining to the multicast group\n"); 1591 goto error; 1592 } 1593 1594 return sock; 1595 1596 error: 1597 sock_release(sock); 1598 return ERR_PTR(result); 1599 } 1600 1601 1602 static int 1603 ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) 1604 { 1605 struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL}; 1606 struct kvec iov; 1607 int len; 1608 1609 EnterFunction(7); 1610 iov.iov_base = (void *)buffer; 1611 iov.iov_len = length; 1612 1613 len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); 1614 1615 LeaveFunction(7); 1616 return len; 1617 } 1618 1619 static int 1620 ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) 1621 { 1622 int msize; 1623 int ret; 1624 1625 msize = ntohs(msg->size); 1626 1627 ret = ip_vs_send_async(sock, (char *)msg, msize); 1628 if (ret >= 0 || ret == -EAGAIN) 1629 return ret; 1630 pr_err("ip_vs_send_async error %d\n", ret); 1631 return 0; 1632 } 1633 1634 static int 1635 ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) 1636 { 1637 struct msghdr msg = {NULL,}; 1638 struct kvec iov; 1639 int len; 1640 1641 EnterFunction(7); 1642 1643 /* Receive a packet */ 1644 iov.iov_base = buffer; 1645 iov.iov_len = (size_t)buflen; 1646 1647 len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, MSG_DONTWAIT); 1648 1649 if (len < 0) 1650 return len; 1651 1652 LeaveFunction(7); 1653 return len; 1654 } 1655 1656 /* Wakeup the master thread for sending */ 1657 static void master_wakeup_work_handler(struct work_struct *work) 1658 { 1659 struct ipvs_master_sync_state *ms = 1660 container_of(work, struct ipvs_master_sync_state, 1661 master_wakeup_work.work); 1662 struct netns_ipvs *ipvs = ms->ipvs; 1663 1664 spin_lock_bh(&ipvs->sync_lock); 1665 if (ms->sync_queue_len && 1666 ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) { 1667 ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE; 1668 wake_up_process(ms->master_thread); 1669 } 1670 spin_unlock_bh(&ipvs->sync_lock); 1671 } 1672 1673 /* Get next buffer to send */ 1674 static inline struct ip_vs_sync_buff * 1675 next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) 1676 { 1677 struct ip_vs_sync_buff *sb; 1678 1679 sb = sb_dequeue(ipvs, ms); 1680 if (sb) 1681 return sb; 1682 /* Do not delay entries in buffer for more than 2 seconds */ 1683 return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME); 1684 } 1685 1686 static int sync_thread_master(void *data) 1687 { 1688 struct ip_vs_sync_thread_data *tinfo = data; 1689 struct netns_ipvs *ipvs = tinfo->ipvs; 1690 struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id]; 1691 struct sock *sk = tinfo->sock->sk; 1692 struct ip_vs_sync_buff *sb; 1693 1694 pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " 1695 "syncid = %d, id = %d\n", 1696 ipvs->mcfg.mcast_ifn, ipvs->mcfg.syncid, tinfo->id); 1697 1698 for (;;) { 1699 sb = next_sync_buff(ipvs, ms); 1700 if (unlikely(kthread_should_stop())) 1701 break; 1702 if (!sb) { 1703 schedule_timeout(IPVS_SYNC_CHECK_PERIOD); 1704 continue; 1705 } 1706 while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) { 1707 /* (Ab)use interruptible sleep to avoid increasing 1708 * the load avg. 1709 */ 1710 __wait_event_interruptible(*sk_sleep(sk), 1711 sock_writeable(sk) || 1712 kthread_should_stop()); 1713 if (unlikely(kthread_should_stop())) 1714 goto done; 1715 } 1716 ip_vs_sync_buff_release(sb); 1717 } 1718 1719 done: 1720 __set_current_state(TASK_RUNNING); 1721 if (sb) 1722 ip_vs_sync_buff_release(sb); 1723 1724 /* clean up the sync_buff queue */ 1725 while ((sb = sb_dequeue(ipvs, ms))) 1726 ip_vs_sync_buff_release(sb); 1727 __set_current_state(TASK_RUNNING); 1728 1729 /* clean up the current sync_buff */ 1730 sb = get_curr_sync_buff(ipvs, ms, 0); 1731 if (sb) 1732 ip_vs_sync_buff_release(sb); 1733 1734 /* release the sending multicast socket */ 1735 sock_release(tinfo->sock); 1736 kfree(tinfo); 1737 1738 return 0; 1739 } 1740 1741 1742 static int sync_thread_backup(void *data) 1743 { 1744 struct ip_vs_sync_thread_data *tinfo = data; 1745 struct netns_ipvs *ipvs = tinfo->ipvs; 1746 int len; 1747 1748 pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " 1749 "syncid = %d, id = %d\n", 1750 ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id); 1751 1752 while (!kthread_should_stop()) { 1753 wait_event_interruptible(*sk_sleep(tinfo->sock->sk), 1754 !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue) 1755 || kthread_should_stop()); 1756 1757 /* do we have data now? */ 1758 while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { 1759 len = ip_vs_receive(tinfo->sock, tinfo->buf, 1760 ipvs->bcfg.sync_maxlen); 1761 if (len <= 0) { 1762 if (len != -EAGAIN) 1763 pr_err("receiving message error\n"); 1764 break; 1765 } 1766 1767 ip_vs_process_message(ipvs, tinfo->buf, len); 1768 } 1769 } 1770 1771 /* release the sending multicast socket */ 1772 sock_release(tinfo->sock); 1773 kfree(tinfo->buf); 1774 kfree(tinfo); 1775 1776 return 0; 1777 } 1778 1779 1780 int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, 1781 int state) 1782 { 1783 struct ip_vs_sync_thread_data *tinfo; 1784 struct task_struct **array = NULL, *task; 1785 struct socket *sock; 1786 struct net_device *dev; 1787 char *name; 1788 int (*threadfn)(void *data); 1789 int id, count, hlen; 1790 int result = -ENOMEM; 1791 u16 mtu, min_mtu; 1792 1793 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); 1794 IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n", 1795 sizeof(struct ip_vs_sync_conn_v0)); 1796 1797 if (!ipvs->sync_state) { 1798 count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX); 1799 ipvs->threads_mask = count - 1; 1800 } else 1801 count = ipvs->threads_mask + 1; 1802 1803 if (c->mcast_af == AF_UNSPEC) { 1804 c->mcast_af = AF_INET; 1805 c->mcast_group.ip = cpu_to_be32(IP_VS_SYNC_GROUP); 1806 } 1807 if (!c->mcast_port) 1808 c->mcast_port = IP_VS_SYNC_PORT; 1809 if (!c->mcast_ttl) 1810 c->mcast_ttl = 1; 1811 1812 dev = __dev_get_by_name(ipvs->net, c->mcast_ifn); 1813 if (!dev) { 1814 pr_err("Unknown mcast interface: %s\n", c->mcast_ifn); 1815 return -ENODEV; 1816 } 1817 hlen = (AF_INET6 == c->mcast_af) ? 1818 sizeof(struct ipv6hdr) + sizeof(struct udphdr) : 1819 sizeof(struct iphdr) + sizeof(struct udphdr); 1820 mtu = (state == IP_VS_STATE_BACKUP) ? 1821 clamp(dev->mtu, 1500U, 65535U) : 1500U; 1822 min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1; 1823 1824 if (c->sync_maxlen) 1825 c->sync_maxlen = clamp_t(unsigned int, 1826 c->sync_maxlen, min_mtu, 1827 65535 - hlen); 1828 else 1829 c->sync_maxlen = mtu - hlen; 1830 1831 if (state == IP_VS_STATE_MASTER) { 1832 if (ipvs->ms) 1833 return -EEXIST; 1834 1835 ipvs->mcfg = *c; 1836 name = "ipvs-m:%d:%d"; 1837 threadfn = sync_thread_master; 1838 } else if (state == IP_VS_STATE_BACKUP) { 1839 if (ipvs->backup_threads) 1840 return -EEXIST; 1841 1842 ipvs->bcfg = *c; 1843 name = "ipvs-b:%d:%d"; 1844 threadfn = sync_thread_backup; 1845 } else { 1846 return -EINVAL; 1847 } 1848 1849 if (state == IP_VS_STATE_MASTER) { 1850 struct ipvs_master_sync_state *ms; 1851 1852 ipvs->ms = kcalloc(count, sizeof(ipvs->ms[0]), GFP_KERNEL); 1853 if (!ipvs->ms) 1854 goto out; 1855 ms = ipvs->ms; 1856 for (id = 0; id < count; id++, ms++) { 1857 INIT_LIST_HEAD(&ms->sync_queue); 1858 ms->sync_queue_len = 0; 1859 ms->sync_queue_delay = 0; 1860 INIT_DELAYED_WORK(&ms->master_wakeup_work, 1861 master_wakeup_work_handler); 1862 ms->ipvs = ipvs; 1863 } 1864 } else { 1865 array = kcalloc(count, sizeof(struct task_struct *), 1866 GFP_KERNEL); 1867 if (!array) 1868 goto out; 1869 } 1870 1871 tinfo = NULL; 1872 for (id = 0; id < count; id++) { 1873 if (state == IP_VS_STATE_MASTER) 1874 sock = make_send_sock(ipvs, id); 1875 else 1876 sock = make_receive_sock(ipvs, id, dev->ifindex); 1877 if (IS_ERR(sock)) { 1878 result = PTR_ERR(sock); 1879 goto outtinfo; 1880 } 1881 tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); 1882 if (!tinfo) 1883 goto outsocket; 1884 tinfo->ipvs = ipvs; 1885 tinfo->sock = sock; 1886 if (state == IP_VS_STATE_BACKUP) { 1887 tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen, 1888 GFP_KERNEL); 1889 if (!tinfo->buf) 1890 goto outtinfo; 1891 } else { 1892 tinfo->buf = NULL; 1893 } 1894 tinfo->id = id; 1895 1896 task = kthread_run(threadfn, tinfo, name, ipvs->gen, id); 1897 if (IS_ERR(task)) { 1898 result = PTR_ERR(task); 1899 goto outtinfo; 1900 } 1901 tinfo = NULL; 1902 if (state == IP_VS_STATE_MASTER) 1903 ipvs->ms[id].master_thread = task; 1904 else 1905 array[id] = task; 1906 } 1907 1908 /* mark as active */ 1909 1910 if (state == IP_VS_STATE_BACKUP) 1911 ipvs->backup_threads = array; 1912 spin_lock_bh(&ipvs->sync_buff_lock); 1913 ipvs->sync_state |= state; 1914 spin_unlock_bh(&ipvs->sync_buff_lock); 1915 1916 /* increase the module use count */ 1917 ip_vs_use_count_inc(); 1918 1919 return 0; 1920 1921 outsocket: 1922 sock_release(sock); 1923 1924 outtinfo: 1925 if (tinfo) { 1926 sock_release(tinfo->sock); 1927 kfree(tinfo->buf); 1928 kfree(tinfo); 1929 } 1930 count = id; 1931 while (count-- > 0) { 1932 if (state == IP_VS_STATE_MASTER) 1933 kthread_stop(ipvs->ms[count].master_thread); 1934 else 1935 kthread_stop(array[count]); 1936 } 1937 kfree(array); 1938 1939 out: 1940 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { 1941 kfree(ipvs->ms); 1942 ipvs->ms = NULL; 1943 } 1944 return result; 1945 } 1946 1947 1948 int stop_sync_thread(struct netns_ipvs *ipvs, int state) 1949 { 1950 struct task_struct **array; 1951 int id; 1952 int retc = -EINVAL; 1953 1954 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); 1955 1956 if (state == IP_VS_STATE_MASTER) { 1957 if (!ipvs->ms) 1958 return -ESRCH; 1959 1960 /* 1961 * The lock synchronizes with sb_queue_tail(), so that we don't 1962 * add sync buffers to the queue, when we are already in 1963 * progress of stopping the master sync daemon. 1964 */ 1965 1966 spin_lock_bh(&ipvs->sync_buff_lock); 1967 spin_lock(&ipvs->sync_lock); 1968 ipvs->sync_state &= ~IP_VS_STATE_MASTER; 1969 spin_unlock(&ipvs->sync_lock); 1970 spin_unlock_bh(&ipvs->sync_buff_lock); 1971 1972 retc = 0; 1973 for (id = ipvs->threads_mask; id >= 0; id--) { 1974 struct ipvs_master_sync_state *ms = &ipvs->ms[id]; 1975 int ret; 1976 1977 pr_info("stopping master sync thread %d ...\n", 1978 task_pid_nr(ms->master_thread)); 1979 cancel_delayed_work_sync(&ms->master_wakeup_work); 1980 ret = kthread_stop(ms->master_thread); 1981 if (retc >= 0) 1982 retc = ret; 1983 } 1984 kfree(ipvs->ms); 1985 ipvs->ms = NULL; 1986 } else if (state == IP_VS_STATE_BACKUP) { 1987 if (!ipvs->backup_threads) 1988 return -ESRCH; 1989 1990 ipvs->sync_state &= ~IP_VS_STATE_BACKUP; 1991 array = ipvs->backup_threads; 1992 retc = 0; 1993 for (id = ipvs->threads_mask; id >= 0; id--) { 1994 int ret; 1995 1996 pr_info("stopping backup sync thread %d ...\n", 1997 task_pid_nr(array[id])); 1998 ret = kthread_stop(array[id]); 1999 if (retc >= 0) 2000 retc = ret; 2001 } 2002 kfree(array); 2003 ipvs->backup_threads = NULL; 2004 } 2005 2006 /* decrease the module use count */ 2007 ip_vs_use_count_dec(); 2008 2009 return retc; 2010 } 2011 2012 /* 2013 * Initialize data struct for each netns 2014 */ 2015 int __net_init ip_vs_sync_net_init(struct netns_ipvs *ipvs) 2016 { 2017 __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key); 2018 spin_lock_init(&ipvs->sync_lock); 2019 spin_lock_init(&ipvs->sync_buff_lock); 2020 return 0; 2021 } 2022 2023 void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs) 2024 { 2025 int retc; 2026 2027 mutex_lock(&ipvs->sync_mutex); 2028 retc = stop_sync_thread(ipvs, IP_VS_STATE_MASTER); 2029 if (retc && retc != -ESRCH) 2030 pr_err("Failed to stop Master Daemon\n"); 2031 2032 retc = stop_sync_thread(ipvs, IP_VS_STATE_BACKUP); 2033 if (retc && retc != -ESRCH) 2034 pr_err("Failed to stop Backup Daemon\n"); 2035 mutex_unlock(&ipvs->sync_mutex); 2036 } 2037
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.