~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/net/netfilter/ipvs/ip_vs_proto_tcp.c

Version: ~ [ linux-5.5-rc1 ] ~ [ linux-5.4.2 ] ~ [ linux-5.3.15 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.88 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.158 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.206 ] ~ [ linux-4.8.17 ] ~ [ linux-4.7.10 ] ~ [ linux-4.6.7 ] ~ [ linux-4.5.7 ] ~ [ linux-4.4.206 ] ~ [ linux-4.3.6 ] ~ [ linux-4.2.8 ] ~ [ linux-4.1.52 ] ~ [ linux-4.0.9 ] ~ [ linux-3.19.8 ] ~ [ linux-3.18.140 ] ~ [ linux-3.17.8 ] ~ [ linux-3.16.78 ] ~ [ linux-3.15.10 ] ~ [ linux-3.14.79 ] ~ [ linux-3.13.11 ] ~ [ linux-3.12.74 ] ~ [ linux-3.11.10 ] ~ [ linux-3.10.108 ] ~ [ linux-3.9.11 ] ~ [ linux-3.8.13 ] ~ [ linux-3.7.10 ] ~ [ linux-3.6.11 ] ~ [ linux-3.5.7 ] ~ [ linux-3.4.113 ] ~ [ linux-3.3.8 ] ~ [ linux-3.2.102 ] ~ [ linux-3.1.10 ] ~ [ linux-3.0.101 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.5 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /*
  2  * ip_vs_proto_tcp.c:   TCP load balancing support for IPVS
  3  *
  4  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
  5  *              Julian Anastasov <ja@ssi.bg>
  6  *
  7  *              This program is free software; you can redistribute it and/or
  8  *              modify it under the terms of the GNU General Public License
  9  *              as published by the Free Software Foundation; either version
 10  *              2 of the License, or (at your option) any later version.
 11  *
 12  * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
 13  *
 14  *              Network name space (netns) aware.
 15  *              Global data moved to netns, i.e. struct netns_ipvs.
 16  *              The tcp_timeouts table has a copy per netns, held in a hash
 17  *              table of per-protocol ip_vs_proto_data, and is handled by netns.
 18  */
 19 
 20 #define KMSG_COMPONENT "IPVS"
 21 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
 22 
 23 #include <linux/kernel.h>
 24 #include <linux/ip.h>
 25 #include <linux/tcp.h>                  /* for tcphdr */
 26 #include <net/ip.h>
 27 #include <net/tcp.h>                    /* for csum_tcpudp_magic */
 28 #include <net/ip6_checksum.h>
 29 #include <linux/netfilter.h>
 30 #include <linux/netfilter_ipv4.h>
 31 
 32 #include <net/ip_vs.h>
 33 
 34 static int
 35 tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 36                   int *verdict, struct ip_vs_conn **cpp,
 37                   struct ip_vs_iphdr *iph)
 38 {
 39         struct net *net;
 40         struct ip_vs_service *svc;
 41         struct tcphdr _tcph, *th;
 42 
 43         th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
 44         if (th == NULL) {
 45                 *verdict = NF_DROP;
 46                 return 0;
 47         }
 48         net = skb_net(skb);
 49         /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
 50         if (th->syn &&
 51             (svc = ip_vs_service_get(net, af, skb->mark, iph->protocol,
 52                                      &iph->daddr, th->dest))) {
 53                 int ignored;
 54 
 55                 if (ip_vs_todrop(net_ipvs(net))) {
 56                         /*
 57                          * It seems that we are very loaded.
 58                          * We have to drop this packet :(
 59                          */
 60                         ip_vs_service_put(svc);
 61                         *verdict = NF_DROP;
 62                         return 0;
 63                 }
 64 
 65                 /*
 66                  * Let the virtual server select a real server for the
 67                  * incoming connection, and create a connection entry.
 68                  */
 69                 *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
 70                 if (!*cpp && ignored <= 0) {
 71                         if (!ignored)
 72                                 *verdict = ip_vs_leave(svc, skb, pd, iph);
 73                         else {
 74                                 ip_vs_service_put(svc);
 75                                 *verdict = NF_DROP;
 76                         }
 77                         return 0;
 78                 }
 79                 ip_vs_service_put(svc);
 80         }
 81         /* NF_ACCEPT */
 82         return 1;
 83 }
 84 
 85 
 86 static inline void
 87 tcp_fast_csum_update(int af, struct tcphdr *tcph,
 88                      const union nf_inet_addr *oldip,
 89                      const union nf_inet_addr *newip,
 90                      __be16 oldport, __be16 newport)
 91 {
 92 #ifdef CONFIG_IP_VS_IPV6
 93         if (af == AF_INET6)
 94                 tcph->check =
 95                         csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
 96                                          ip_vs_check_diff2(oldport, newport,
 97                                                 ~csum_unfold(tcph->check))));
 98         else
 99 #endif
100         tcph->check =
101                 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
102                                  ip_vs_check_diff2(oldport, newport,
103                                                 ~csum_unfold(tcph->check))));
104 }
105 
106 
107 static inline void
108 tcp_partial_csum_update(int af, struct tcphdr *tcph,
109                      const union nf_inet_addr *oldip,
110                      const union nf_inet_addr *newip,
111                      __be16 oldlen, __be16 newlen)
112 {
113 #ifdef CONFIG_IP_VS_IPV6
114         if (af == AF_INET6)
115                 tcph->check =
116                         ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
117                                          ip_vs_check_diff2(oldlen, newlen,
118                                                 csum_unfold(tcph->check))));
119         else
120 #endif
121         tcph->check =
122                 ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
123                                 ip_vs_check_diff2(oldlen, newlen,
124                                                 csum_unfold(tcph->check))));
125 }
126 
127 
128 static int
129 tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
130                  struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
131 {
132         struct tcphdr *tcph;
133         unsigned int tcphoff = iph->len;
134         int oldlen;
135         int payload_csum = 0;
136 
137 #ifdef CONFIG_IP_VS_IPV6
138         if (cp->af == AF_INET6 && iph->fragoffs)
139                 return 1;
140 #endif
141         oldlen = skb->len - tcphoff;
142 
143         /* csum_check requires unshared skb */
144         if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
145                 return 0;
146 
147         if (unlikely(cp->app != NULL)) {
148                 int ret;
149 
150                 /* Some checks before mangling */
151                 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
152                         return 0;
153 
154                 /* Call application helper if needed */
155                 if (!(ret = ip_vs_app_pkt_out(cp, skb)))
156                         return 0;
157                 /* ret=2: csum update is needed after payload mangling */
158                 if (ret == 1)
159                         oldlen = skb->len - tcphoff;
160                 else
161                         payload_csum = 1;
162         }
163 
164         tcph = (void *)skb_network_header(skb) + tcphoff;
165         tcph->source = cp->vport;
166 
167         /* Adjust TCP checksums */
168         if (skb->ip_summed == CHECKSUM_PARTIAL) {
169                 tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
170                                         htons(oldlen),
171                                         htons(skb->len - tcphoff));
172         } else if (!payload_csum) {
173                 /* Only port and addr are changed, do fast csum update */
174                 tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
175                                      cp->dport, cp->vport);
176                 if (skb->ip_summed == CHECKSUM_COMPLETE)
177                         skb->ip_summed = (cp->app && pp->csum_check) ?
178                                          CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
179         } else {
180                 /* full checksum calculation */
181                 tcph->check = 0;
182                 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
183 #ifdef CONFIG_IP_VS_IPV6
184                 if (cp->af == AF_INET6)
185                         tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
186                                                       &cp->caddr.in6,
187                                                       skb->len - tcphoff,
188                                                       cp->protocol, skb->csum);
189                 else
190 #endif
191                         tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
192                                                         cp->caddr.ip,
193                                                         skb->len - tcphoff,
194                                                         cp->protocol,
195                                                         skb->csum);
196                 skb->ip_summed = CHECKSUM_UNNECESSARY;
197 
198                 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
199                           pp->name, tcph->check,
200                           (char*)&(tcph->check) - (char*)tcph);
201         }
202         return 1;
203 }
204 
205 
206 static int
207 tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
208                  struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
209 {
210         struct tcphdr *tcph;
211         unsigned int tcphoff = iph->len;
212         int oldlen;
213         int payload_csum = 0;
214 
215 #ifdef CONFIG_IP_VS_IPV6
216         if (cp->af == AF_INET6 && iph->fragoffs)
217                 return 1;
218 #endif
219         oldlen = skb->len - tcphoff;
220 
221         /* csum_check requires unshared skb */
222         if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
223                 return 0;
224 
225         if (unlikely(cp->app != NULL)) {
226                 int ret;
227 
228                 /* Some checks before mangling */
229                 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
230                         return 0;
231 
232                 /*
233                  *      Attempt ip_vs_app call.
234                  *      It will fix ip_vs_conn and iph ack_seq stuff
235                  */
236                 if (!(ret = ip_vs_app_pkt_in(cp, skb)))
237                         return 0;
238                 /* ret=2: csum update is needed after payload mangling */
239                 if (ret == 1)
240                         oldlen = skb->len - tcphoff;
241                 else
242                         payload_csum = 1;
243         }
244 
245         tcph = (void *)skb_network_header(skb) + tcphoff;
246         tcph->dest = cp->dport;
247 
248         /*
249          *      Adjust TCP checksums
250          */
251         if (skb->ip_summed == CHECKSUM_PARTIAL) {
252                 tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
253                                         htons(oldlen),
254                                         htons(skb->len - tcphoff));
255         } else if (!payload_csum) {
256                 /* Only port and addr are changed, do fast csum update */
257                 tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
258                                      cp->vport, cp->dport);
259                 if (skb->ip_summed == CHECKSUM_COMPLETE)
260                         skb->ip_summed = (cp->app && pp->csum_check) ?
261                                          CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
262         } else {
263                 /* full checksum calculation */
264                 tcph->check = 0;
265                 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
266 #ifdef CONFIG_IP_VS_IPV6
267                 if (cp->af == AF_INET6)
268                         tcph->check = csum_ipv6_magic(&cp->caddr.in6,
269                                                       &cp->daddr.in6,
270                                                       skb->len - tcphoff,
271                                                       cp->protocol, skb->csum);
272                 else
273 #endif
274                         tcph->check = csum_tcpudp_magic(cp->caddr.ip,
275                                                         cp->daddr.ip,
276                                                         skb->len - tcphoff,
277                                                         cp->protocol,
278                                                         skb->csum);
279                 skb->ip_summed = CHECKSUM_UNNECESSARY;
280         }
281         return 1;
282 }
283 
284 
285 static int
286 tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
287 {
288         unsigned int tcphoff;
289 
290 #ifdef CONFIG_IP_VS_IPV6
291         if (af == AF_INET6)
292                 tcphoff = sizeof(struct ipv6hdr);
293         else
294 #endif
295                 tcphoff = ip_hdrlen(skb);
296 
297         switch (skb->ip_summed) {
298         case CHECKSUM_NONE:
299                 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
300         case CHECKSUM_COMPLETE:
301 #ifdef CONFIG_IP_VS_IPV6
302                 if (af == AF_INET6) {
303                         if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
304                                             &ipv6_hdr(skb)->daddr,
305                                             skb->len - tcphoff,
306                                             ipv6_hdr(skb)->nexthdr,
307                                             skb->csum)) {
308                                 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
309                                                  "Failed checksum for");
310                                 return 0;
311                         }
312                 } else
313 #endif
314                         if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
315                                               ip_hdr(skb)->daddr,
316                                               skb->len - tcphoff,
317                                               ip_hdr(skb)->protocol,
318                                               skb->csum)) {
319                                 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
320                                                  "Failed checksum for");
321                                 return 0;
322                         }
323                 break;
324         default:
325                 /* No need to checksum. */
326                 break;
327         }
328 
329         return 1;
330 }
331 
332 
333 #define TCP_DIR_INPUT           0
334 #define TCP_DIR_OUTPUT          4
335 #define TCP_DIR_INPUT_ONLY      8
336 
337 static const int tcp_state_off[IP_VS_DIR_LAST] = {
338         [IP_VS_DIR_INPUT]               =       TCP_DIR_INPUT,
339         [IP_VS_DIR_OUTPUT]              =       TCP_DIR_OUTPUT,
340         [IP_VS_DIR_INPUT_ONLY]          =       TCP_DIR_INPUT_ONLY,
341 };
342 
343 /*
344  *      Timeout table[state]
345  */
346 static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
347         [IP_VS_TCP_S_NONE]              =       2*HZ,
348         [IP_VS_TCP_S_ESTABLISHED]       =       15*60*HZ,
349         [IP_VS_TCP_S_SYN_SENT]          =       2*60*HZ,
350         [IP_VS_TCP_S_SYN_RECV]          =       1*60*HZ,
351         [IP_VS_TCP_S_FIN_WAIT]          =       2*60*HZ,
352         [IP_VS_TCP_S_TIME_WAIT]         =       2*60*HZ,
353         [IP_VS_TCP_S_CLOSE]             =       10*HZ,
354         [IP_VS_TCP_S_CLOSE_WAIT]        =       60*HZ,
355         [IP_VS_TCP_S_LAST_ACK]          =       30*HZ,
356         [IP_VS_TCP_S_LISTEN]            =       2*60*HZ,
357         [IP_VS_TCP_S_SYNACK]            =       120*HZ,
358         [IP_VS_TCP_S_LAST]              =       2*HZ,
359 };
360 
361 static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
362         [IP_VS_TCP_S_NONE]              =       "NONE",
363         [IP_VS_TCP_S_ESTABLISHED]       =       "ESTABLISHED",
364         [IP_VS_TCP_S_SYN_SENT]          =       "SYN_SENT",
365         [IP_VS_TCP_S_SYN_RECV]          =       "SYN_RECV",
366         [IP_VS_TCP_S_FIN_WAIT]          =       "FIN_WAIT",
367         [IP_VS_TCP_S_TIME_WAIT]         =       "TIME_WAIT",
368         [IP_VS_TCP_S_CLOSE]             =       "CLOSE",
369         [IP_VS_TCP_S_CLOSE_WAIT]        =       "CLOSE_WAIT",
370         [IP_VS_TCP_S_LAST_ACK]          =       "LAST_ACK",
371         [IP_VS_TCP_S_LISTEN]            =       "LISTEN",
372         [IP_VS_TCP_S_SYNACK]            =       "SYNACK",
373         [IP_VS_TCP_S_LAST]              =       "BUG!",
374 };
375 
/* Short aliases for the IP_VS_TCP_S_* states, used in the state tables */
#define sNO	IP_VS_TCP_S_NONE
#define sES	IP_VS_TCP_S_ESTABLISHED
#define sSS	IP_VS_TCP_S_SYN_SENT
#define sSR	IP_VS_TCP_S_SYN_RECV
#define sFW	IP_VS_TCP_S_FIN_WAIT
#define sTW	IP_VS_TCP_S_TIME_WAIT
#define sCL	IP_VS_TCP_S_CLOSE
#define sCW	IP_VS_TCP_S_CLOSE_WAIT
#define sLA	IP_VS_TCP_S_LAST_ACK
#define sLI	IP_VS_TCP_S_LISTEN
#define sSA	IP_VS_TCP_S_SYNACK
387 
388 struct tcp_states_t {
389         int next_state[IP_VS_TCP_S_LAST];
390 };
391 
392 static const char * tcp_state_name(int state)
393 {
394         if (state >= IP_VS_TCP_S_LAST)
395                 return "ERR!";
396         return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
397 }
398 
399 static struct tcp_states_t tcp_states [] = {
400 /*      INPUT */
401 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
402 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
403 /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
404 /*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
405 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
406 
407 /*      OUTPUT */
408 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
409 /*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
410 /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
411 /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
412 /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
413 
414 /*      INPUT-ONLY */
415 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
416 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
417 /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
418 /*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
419 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
420 };
421 
422 static struct tcp_states_t tcp_states_dos [] = {
423 /*      INPUT */
424 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
425 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
426 /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
427 /*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
428 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
429 
430 /*      OUTPUT */
431 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
432 /*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
433 /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
434 /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
435 /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
436 
437 /*      INPUT-ONLY */
438 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
439 /*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
440 /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
441 /*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
442 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
443 };
444 
445 static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags)
446 {
447         int on = (flags & 1);           /* secure_tcp */
448 
449         /*
450         ** FIXME: change secure_tcp to independent sysctl var
451         ** or make it per-service or per-app because it is valid
452         ** for most if not for all of the applications. Something
453         ** like "capabilities" (flags) for each object.
454         */
455         pd->tcp_state_table = (on ? tcp_states_dos : tcp_states);
456 }
457 
458 static inline int tcp_state_idx(struct tcphdr *th)
459 {
460         if (th->rst)
461                 return 3;
462         if (th->syn)
463                 return 0;
464         if (th->fin)
465                 return 1;
466         if (th->ack)
467                 return 2;
468         return -1;
469 }
470 
471 static inline void
472 set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
473               int direction, struct tcphdr *th)
474 {
475         int state_idx;
476         int new_state = IP_VS_TCP_S_CLOSE;
477         int state_off = tcp_state_off[direction];
478 
479         /*
480          *    Update state offset to INPUT_ONLY if necessary
481          *    or delete NO_OUTPUT flag if output packet detected
482          */
483         if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
484                 if (state_off == TCP_DIR_OUTPUT)
485                         cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
486                 else
487                         state_off = TCP_DIR_INPUT_ONLY;
488         }
489 
490         if ((state_idx = tcp_state_idx(th)) < 0) {
491                 IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
492                 goto tcp_state_out;
493         }
494 
495         new_state =
496                 pd->tcp_state_table[state_off+state_idx].next_state[cp->state];
497 
498   tcp_state_out:
499         if (new_state != cp->state) {
500                 struct ip_vs_dest *dest = cp->dest;
501 
502                 IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
503                               "%s:%d state: %s->%s conn->refcnt:%d\n",
504                               pd->pp->name,
505                               ((state_off == TCP_DIR_OUTPUT) ?
506                                "output " : "input "),
507                               th->syn ? 'S' : '.',
508                               th->fin ? 'F' : '.',
509                               th->ack ? 'A' : '.',
510                               th->rst ? 'R' : '.',
511                               IP_VS_DBG_ADDR(cp->af, &cp->daddr),
512                               ntohs(cp->dport),
513                               IP_VS_DBG_ADDR(cp->af, &cp->caddr),
514                               ntohs(cp->cport),
515                               tcp_state_name(cp->state),
516                               tcp_state_name(new_state),
517                               atomic_read(&cp->refcnt));
518 
519                 if (dest) {
520                         if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
521                             (new_state != IP_VS_TCP_S_ESTABLISHED)) {
522                                 atomic_dec(&dest->activeconns);
523                                 atomic_inc(&dest->inactconns);
524                                 cp->flags |= IP_VS_CONN_F_INACTIVE;
525                         } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
526                                    (new_state == IP_VS_TCP_S_ESTABLISHED)) {
527                                 atomic_inc(&dest->activeconns);
528                                 atomic_dec(&dest->inactconns);
529                                 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
530                         }
531                 }
532         }
533 
534         if (likely(pd))
535                 cp->timeout = pd->timeout_table[cp->state = new_state];
536         else    /* What to do ? */
537                 cp->timeout = tcp_timeouts[cp->state = new_state];
538 }
539 
540 /*
541  *      Handle state transitions
542  */
543 static void
544 tcp_state_transition(struct ip_vs_conn *cp, int direction,
545                      const struct sk_buff *skb,
546                      struct ip_vs_proto_data *pd)
547 {
548         struct tcphdr _tcph, *th;
549 
550 #ifdef CONFIG_IP_VS_IPV6
551         int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
552 #else
553         int ihl = ip_hdrlen(skb);
554 #endif
555 
556         th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
557         if (th == NULL)
558                 return;
559 
560         spin_lock(&cp->lock);
561         set_tcp_state(pd, cp, direction, th);
562         spin_unlock(&cp->lock);
563 }
564 
565 static inline __u16 tcp_app_hashkey(__be16 port)
566 {
567         return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
568                 & TCP_APP_TAB_MASK;
569 }
570 
571 
572 static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
573 {
574         struct ip_vs_app *i;
575         __u16 hash;
576         __be16 port = inc->port;
577         int ret = 0;
578         struct netns_ipvs *ipvs = net_ipvs(net);
579         struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
580 
581         hash = tcp_app_hashkey(port);
582 
583         spin_lock_bh(&ipvs->tcp_app_lock);
584         list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
585                 if (i->port == port) {
586                         ret = -EEXIST;
587                         goto out;
588                 }
589         }
590         list_add(&inc->p_list, &ipvs->tcp_apps[hash]);
591         atomic_inc(&pd->appcnt);
592 
593   out:
594         spin_unlock_bh(&ipvs->tcp_app_lock);
595         return ret;
596 }
597 
598 
599 static void
600 tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
601 {
602         struct netns_ipvs *ipvs = net_ipvs(net);
603         struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
604 
605         spin_lock_bh(&ipvs->tcp_app_lock);
606         atomic_dec(&pd->appcnt);
607         list_del(&inc->p_list);
608         spin_unlock_bh(&ipvs->tcp_app_lock);
609 }
610 
611 
612 static int
613 tcp_app_conn_bind(struct ip_vs_conn *cp)
614 {
615         struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
616         int hash;
617         struct ip_vs_app *inc;
618         int result = 0;
619 
620         /* Default binding: bind app only for NAT */
621         if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
622                 return 0;
623 
624         /* Lookup application incarnations and bind the right one */
625         hash = tcp_app_hashkey(cp->vport);
626 
627         spin_lock(&ipvs->tcp_app_lock);
628         list_for_each_entry(inc, &ipvs->tcp_apps[hash], p_list) {
629                 if (inc->port == cp->vport) {
630                         if (unlikely(!ip_vs_app_inc_get(inc)))
631                                 break;
632                         spin_unlock(&ipvs->tcp_app_lock);
633 
634                         IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
635                                       "%s:%u to app %s on port %u\n",
636                                       __func__,
637                                       IP_VS_DBG_ADDR(cp->af, &cp->caddr),
638                                       ntohs(cp->cport),
639                                       IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
640                                       ntohs(cp->vport),
641                                       inc->name, ntohs(inc->port));
642 
643                         cp->app = inc;
644                         if (inc->init_conn)
645                                 result = inc->init_conn(inc, cp);
646                         goto out;
647                 }
648         }
649         spin_unlock(&ipvs->tcp_app_lock);
650 
651   out:
652         return result;
653 }
654 
655 
656 /*
657  *      Set LISTEN timeout. (ip_vs_conn_put will setup timer)
658  */
659 void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
660 {
661         struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
662 
663         spin_lock(&cp->lock);
664         cp->state = IP_VS_TCP_S_LISTEN;
665         cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
666                            : tcp_timeouts[IP_VS_TCP_S_LISTEN]);
667         spin_unlock(&cp->lock);
668 }
669 
670 /* ---------------------------------------------
671  *   timeouts is netns related now.
672  * ---------------------------------------------
673  */
674 static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
675 {
676         struct netns_ipvs *ipvs = net_ipvs(net);
677 
678         ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
679         spin_lock_init(&ipvs->tcp_app_lock);
680         pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
681                                                         sizeof(tcp_timeouts));
682         if (!pd->timeout_table)
683                 return -ENOMEM;
684         pd->tcp_state_table =  tcp_states;
685         return 0;
686 }
687 
688 static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd)
689 {
690         kfree(pd->timeout_table);
691 }
692 
693 
694 struct ip_vs_protocol ip_vs_protocol_tcp = {
695         .name =                 "TCP",
696         .protocol =             IPPROTO_TCP,
697         .num_states =           IP_VS_TCP_S_LAST,
698         .dont_defrag =          0,
699         .init =                 NULL,
700         .exit =                 NULL,
701         .init_netns =           __ip_vs_tcp_init,
702         .exit_netns =           __ip_vs_tcp_exit,
703         .register_app =         tcp_register_app,
704         .unregister_app =       tcp_unregister_app,
705         .conn_schedule =        tcp_conn_schedule,
706         .conn_in_get =          ip_vs_conn_in_get_proto,
707         .conn_out_get =         ip_vs_conn_out_get_proto,
708         .snat_handler =         tcp_snat_handler,
709         .dnat_handler =         tcp_dnat_handler,
710         .csum_check =           tcp_csum_check,
711         .state_name =           tcp_state_name,
712         .state_transition =     tcp_state_transition,
713         .app_conn_bind =        tcp_app_conn_bind,
714         .debug_packet =         ip_vs_tcpudp_debug_packet,
715         .timeout_change =       tcp_timeout_change,
716 };
717 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | Wiki (Japanese) | Wiki (English) | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

osdn.jp