// SPDX-License-Identifier: GPL-2.0-only
/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
 * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/siphash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/socket.h>
#include <linux/mm.h>
#include <linux/nsproxy.h>
#include <linux/rculist_nulls.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_timestamp.h>
#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
#include <net/netns/hash.h>
#include <net/ip.h>

#include "nf_internals.h"

__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
EXPORT_SYMBOL_GPL(nf_conntrack_locks);

__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);

struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_hash);

struct conntrack_gc_work {
	struct delayed_work	dwork;
	u32			last_bucket;
	bool			exiting;
	bool			early_drop;
	long			next_gc_run;
};

static __read_mostly struct kmem_cache *nf_conntrack_cachep;
static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
static __read_mostly bool nf_conntrack_locks_all;

/* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */
#define GC_MAX_BUCKETS_DIV	128u
/* upper bound of full table scan */
#define GC_MAX_SCAN_JIFFIES	(16u * HZ)
/* desired ratio of entries found to be expired */
#define GC_EVICT_RATIO	50u

static struct conntrack_gc_work conntrack_gc_work;

void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
{
	/* 1) Acquire the lock */
	spin_lock(lock);

	/* 2) read nf_conntrack_locks_all, with ACQUIRE semantics
	 * It pairs with the smp_store_release() in nf_conntrack_all_unlock()
	 */
	if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false))
		return;

	/* fast path failed, unlock */
	spin_unlock(lock);

	/* Slow path 1) get global lock */
	spin_lock(&nf_conntrack_locks_all_lock);

	/* Slow path 2) get the lock we want */
	spin_lock(lock);

	/* Slow path 3) release the global lock */
	spin_unlock(&nf_conntrack_locks_all_lock);
}
EXPORT_SYMBOL_GPL(nf_conntrack_lock);

static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	spin_unlock(&nf_conntrack_locks[h1]);
	if (h1 != h2)
		spin_unlock(&nf_conntrack_locks[h2]);
}

/* return true if we need to recompute hashes (in case hash table was resized) */
static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
				     unsigned int h2, unsigned int sequence)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	if (h1 <= h2) {
		nf_conntrack_lock(&nf_conntrack_locks[h1]);
		if (h1 != h2)
			spin_lock_nested(&nf_conntrack_locks[h2],
					 SINGLE_DEPTH_NESTING);
	} else {
		nf_conntrack_lock(&nf_conntrack_locks[h2]);
		spin_lock_nested(&nf_conntrack_locks[h1],
				 SINGLE_DEPTH_NESTING);
	}
	if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
		nf_conntrack_double_unlock(h1, h2);
		return true;
	}
	return false;
}

static void nf_conntrack_all_lock(void)
{
	int i;

	spin_lock(&nf_conntrack_locks_all_lock);

	nf_conntrack_locks_all = true;

	for (i = 0; i < CONNTRACK_LOCKS; i++) {
		spin_lock(&nf_conntrack_locks[i]);

		/* This spin_unlock provides the "release" to ensure that
		 * nf_conntrack_locks_all==true is visible to everyone that
		 * acquired spin_lock(&nf_conntrack_locks[]).
		 */
		spin_unlock(&nf_conntrack_locks[i]);
	}
}

static void nf_conntrack_all_unlock(void)
{
	/* All prior stores must be complete before we clear
	 * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
	 * might observe the false value but not the entire
	 * critical section.
	 * It pairs with the smp_load_acquire() in nf_conntrack_lock()
	 */
	smp_store_release(&nf_conntrack_locks_all, false);
	spin_unlock(&nf_conntrack_locks_all_lock);
}

unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);

unsigned int nf_conntrack_max __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_max);
seqcount_t nf_conntrack_generation __read_mostly;
static unsigned int nf_conntrack_hash_rnd __read_mostly;

static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
			      const struct net *net)
{
	unsigned int n;
	u32 seed;

	get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));

	/* The direction must be ignored, so we hash everything up to the
	 * destination ports (which is a multiple of 4) and treat the last
	 * three bytes manually.
	 */
	seed = nf_conntrack_hash_rnd ^ net_hash_mix(net);
	n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
	return jhash2((u32 *)tuple, n, seed ^
		      (((__force __u16)tuple->dst.u.all << 16) |
		      tuple->dst.protonum));
}

static u32 scale_hash(u32 hash)
{
	return reciprocal_scale(hash, nf_conntrack_htable_size);
}

static u32 __hash_conntrack(const struct net *net,
			    const struct nf_conntrack_tuple *tuple,
			    unsigned int size)
{
	return reciprocal_scale(hash_conntrack_raw(tuple, net), size);
}

static u32 hash_conntrack(const struct net *net,
			  const struct nf_conntrack_tuple *tuple)
{
	return scale_hash(hash_conntrack_raw(tuple, net));
}
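
/* Illustrative note (added for clarity, not in the original source):
 * scale_hash() relies on reciprocal_scale() from <linux/kernel.h>, which
 * maps a full 32-bit hash onto [0, size) with a multiply and shift instead
 * of a modulo, roughly:
 *
 *	bucket = (u32)(((u64)hash * nf_conntrack_htable_size) >> 32);
 *
 * A table resize therefore only changes the scaling factor; the jhash2()
 * value produced by hash_conntrack_raw() stays valid and can be rescaled
 * against whatever table size the current generation uses.
 */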

static bool nf_ct_get_tuple_ports(const struct sk_buff *skb,
				  unsigned int dataoff,
				  struct nf_conntrack_tuple *tuple)
{
	struct {
		__be16 sport;
		__be16 dport;
	} _inet_hdr, *inet_hdr;

	/* Actually only need first 4 bytes to get ports. */
	inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr);
	if (!inet_hdr)
		return false;

	tuple->src.u.udp.port = inet_hdr->sport;
	tuple->dst.u.udp.port = inet_hdr->dport;
	return true;
}

static bool
nf_ct_get_tuple(const struct sk_buff *skb,
		unsigned int nhoff,
		unsigned int dataoff,
		u_int16_t l3num,
		u_int8_t protonum,
		struct net *net,
		struct nf_conntrack_tuple *tuple)
{
	unsigned int size;
	const __be32 *ap;
	__be32 _addrs[8];

	memset(tuple, 0, sizeof(*tuple));

	tuple->src.l3num = l3num;
	switch (l3num) {
	case NFPROTO_IPV4:
		nhoff += offsetof(struct iphdr, saddr);
		size = 2 * sizeof(__be32);
		break;
	case NFPROTO_IPV6:
		nhoff += offsetof(struct ipv6hdr, saddr);
		size = sizeof(_addrs);
		break;
	default:
		return true;
	}

	ap = skb_header_pointer(skb, nhoff, size, _addrs);
	if (!ap)
		return false;

	switch (l3num) {
	case NFPROTO_IPV4:
		tuple->src.u3.ip = ap[0];
		tuple->dst.u3.ip = ap[1];
		break;
	case NFPROTO_IPV6:
		memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
		memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
		break;
	}

	tuple->dst.protonum = protonum;
	tuple->dst.dir = IP_CT_DIR_ORIGINAL;

	switch (protonum) {
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return icmpv6_pkt_to_tuple(skb, dataoff, net, tuple);
#endif
	case IPPROTO_ICMP:
		return icmp_pkt_to_tuple(skb, dataoff, net, tuple);
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE:
		return gre_pkt_to_tuple(skb, dataoff, net, tuple);
#endif
	case IPPROTO_TCP:
	case IPPROTO_UDP: /* fallthrough */
		return nf_ct_get_tuple_ports(skb, dataoff, tuple);
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
	case IPPROTO_UDPLITE:
		return nf_ct_get_tuple_ports(skb, dataoff, tuple);
#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
	case IPPROTO_SCTP:
		return nf_ct_get_tuple_ports(skb, dataoff, tuple);
#endif
#ifdef CONFIG_NF_CT_PROTO_DCCP
	case IPPROTO_DCCP:
		return nf_ct_get_tuple_ports(skb, dataoff, tuple);
#endif
	default:
		break;
	}

	return true;
}

static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
			    u_int8_t *protonum)
{
	int dataoff = -1;
	const struct iphdr *iph;
	struct iphdr _iph;

	iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
	if (!iph)
		return -1;

	/* Conntrack defragments packets, we might still see fragments
	 * inside ICMP packets though.
	 */
	if (iph->frag_off & htons(IP_OFFSET))
		return -1;

	dataoff = nhoff + (iph->ihl << 2);
	*protonum = iph->protocol;

	/* Check bogus IP headers */
	if (dataoff > skb->len) {
		pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n",
			 nhoff, iph->ihl << 2, skb->len);
		return -1;
	}
	return dataoff;
}

#if IS_ENABLED(CONFIG_IPV6)
static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
			    u8 *protonum)
{
	int protoff = -1;
	unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
	__be16 frag_off;
	u8 nexthdr;

	if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
			  &nexthdr, sizeof(nexthdr)) != 0) {
		pr_debug("can't get nexthdr\n");
		return -1;
	}
	protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
	/*
	 * (protoff == skb->len) means the packet has no data, just the IPv6
	 * header and possibly extension headers; it is tracked anyway.
	 */
	if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
		pr_debug("can't find proto in pkt\n");
		return -1;
	}

	*protonum = nexthdr;
	return protoff;
}
#endif

static int get_l4proto(const struct sk_buff *skb,
		       unsigned int nhoff, u8 pf, u8 *l4num)
{
	switch (pf) {
	case NFPROTO_IPV4:
		return ipv4_get_l4proto(skb, nhoff, l4num);
#if IS_ENABLED(CONFIG_IPV6)
	case NFPROTO_IPV6:
		return ipv6_get_l4proto(skb, nhoff, l4num);
#endif
	default:
		*l4num = 0;
		break;
	}
	return -1;
}

bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
		       u_int16_t l3num,
		       struct net *net, struct nf_conntrack_tuple *tuple)
{
	u8 protonum;
	int protoff;

	protoff = get_l4proto(skb, nhoff, l3num, &protonum);
	if (protoff <= 0)
		return false;

	return nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple);
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);

bool
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
		   const struct nf_conntrack_tuple *orig)
{
	memset(inverse, 0, sizeof(*inverse));

	inverse->src.l3num = orig->src.l3num;

	switch (orig->src.l3num) {
	case NFPROTO_IPV4:
		inverse->src.u3.ip = orig->dst.u3.ip;
		inverse->dst.u3.ip = orig->src.u3.ip;
		break;
	case NFPROTO_IPV6:
		inverse->src.u3.in6 = orig->dst.u3.in6;
		inverse->dst.u3.in6 = orig->src.u3.in6;
		break;
	default:
		break;
	}

	inverse->dst.dir = !orig->dst.dir;

	inverse->dst.protonum = orig->dst.protonum;

	switch (orig->dst.protonum) {
	case IPPROTO_ICMP:
		return nf_conntrack_invert_icmp_tuple(inverse, orig);
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return nf_conntrack_invert_icmpv6_tuple(inverse, orig);
#endif
	}

	inverse->src.u.all = orig->dst.u.all;
	inverse->dst.u.all = orig->src.u.all;
	return true;
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);

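/* Illustrative sketch (added for clarity, not part of the original file):
 * for a plain TCP flow 10.0.0.1:12345 -> 192.0.2.1:80, inverting the
 * original-direction tuple yields the reply-direction tuple
 * 192.0.2.1:80 -> 10.0.0.1:12345:
 *
 *	struct nf_conntrack_tuple reply;
 *
 *	if (nf_ct_invert_tuple(&reply, &orig)) {
 *		// reply.src.u3 == orig.dst.u3, reply.dst.u3 == orig.src.u3,
 *		// ports swapped likewise; ICMP[v6] id/type/code are mapped
 *		// by the protocol-specific invert helpers above instead.
 *	}
 */
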
/* Generate an almost-unique pseudo-id for a given conntrack.
 *
 * This intentionally doesn't re-use any of the seeds used for hash
 * table location; we assume the id gets exposed to userspace.
 *
 * Following nf_conn items do not change throughout lifetime
 * of the nf_conn:
 *
 * 1. nf_conn address
 * 2. nf_conn->master address (normally NULL)
 * 3. the associated net namespace
 * 4. the original direction tuple
 */
u32 nf_ct_get_id(const struct nf_conn *ct)
{
	static __read_mostly siphash_key_t ct_id_seed;
	unsigned long a, b, c, d;

	net_get_random_once(&ct_id_seed, sizeof(ct_id_seed));

	a = (unsigned long)ct;
	b = (unsigned long)ct->master;
	c = (unsigned long)nf_ct_net(ct);
	d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				   sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple),
				   &ct_id_seed);
#ifdef CONFIG_64BIT
	return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed);
#else
	return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed);
#endif
}
EXPORT_SYMBOL_GPL(nf_ct_get_id);

static void
clean_from_lists(struct nf_conn *ct)
{
	pr_debug("clean_from_lists(%p)\n", ct);
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);

	/* Destroy all pending expectations */
	nf_ct_remove_expectations(ct);
}

/* must be called with local_bh_disable */
static void nf_ct_add_to_dying_list(struct nf_conn *ct)
{
	struct ct_pcpu *pcpu;

	/* add this conntrack to the (per cpu) dying list */
	ct->cpu = smp_processor_id();
	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);

	spin_lock(&pcpu->lock);
	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
			     &pcpu->dying);
	spin_unlock(&pcpu->lock);
}

/* must be called with local_bh_disable */
static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct)
{
	struct ct_pcpu *pcpu;

	/* add this conntrack to the (per cpu) unconfirmed list */
	ct->cpu = smp_processor_id();
	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);

	spin_lock(&pcpu->lock);
	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
			     &pcpu->unconfirmed);
	spin_unlock(&pcpu->lock);
}

/* must be called with local_bh_disable */
static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
{
	struct ct_pcpu *pcpu;

	/* We overload first tuple to link into unconfirmed or dying list.*/
	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);

	spin_lock(&pcpu->lock);
	BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	spin_unlock(&pcpu->lock);
}

#define NFCT_ALIGN(len)	(((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)

/* Released via destroy_conntrack() */
struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
				 const struct nf_conntrack_zone *zone,
				 gfp_t flags)
{
	struct nf_conn *tmpl, *p;

	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) {
		tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags);
		if (!tmpl)
			return NULL;

		p = tmpl;
		tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
		if (tmpl != p) {
			tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
			tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
		}
	} else {
		tmpl = kzalloc(sizeof(*tmpl), flags);
		if (!tmpl)
			return NULL;
	}

	tmpl->status = IPS_TEMPLATE;
	write_pnet(&tmpl->ct_net, net);
	nf_ct_zone_add(tmpl, zone);
	atomic_set(&tmpl->ct_general.use, 0);

	return tmpl;
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);

void nf_ct_tmpl_free(struct nf_conn *tmpl)
{
	nf_ct_ext_destroy(tmpl);

	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
		kfree((char *)tmpl - tmpl->proto.tmpl_padto);
	else
		kfree(tmpl);
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);
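
/* Usage sketch (illustrative, not part of the original file): callers such
 * as the xt_CT target allocate a template, configure extensions on it, and
 * release it with nf_ct_tmpl_free() when the rule goes away, roughly:
 *
 *	struct nf_conn *tmpl = nf_ct_tmpl_alloc(net, &zone, GFP_KERNEL);
 *
 *	if (tmpl) {
 *		// ... attach helper/timeout extensions, then later ...
 *		nf_ct_tmpl_free(tmpl);
 *	}
 *
 * Templates carry IPS_TEMPLATE and start with a zero refcount, so they are
 * never confirmed into the hash table themselves.
 */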

static void destroy_gre_conntrack(struct nf_conn *ct)
{
#ifdef CONFIG_NF_CT_PROTO_GRE
	struct nf_conn *master = ct->master;

	if (master)
		nf_ct_gre_keymap_destroy(master);
#endif
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
	struct nf_conn *ct = (struct nf_conn *)nfct;

	pr_debug("destroy_conntrack(%p)\n", ct);
	WARN_ON(atomic_read(&nfct->use) != 0);

	if (unlikely(nf_ct_is_template(ct))) {
		nf_ct_tmpl_free(ct);
		return;
	}

	if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE))
		destroy_gre_conntrack(ct);

	local_bh_disable();
	/* Expectations will have been removed in clean_from_lists,
	 * except TFTP can create an expectation on the first packet,
	 * before connection is in the list, so we need to clean here,
	 * too.
	 */
	nf_ct_remove_expectations(ct);

	nf_ct_del_from_dying_or_unconfirmed_list(ct);

	local_bh_enable();

	if (ct->master)
		nf_ct_put(ct->master);

	pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
	nf_conntrack_free(ct);
}

static void nf_ct_delete_from_lists(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	unsigned int hash, reply_hash;
	unsigned int sequence;

	nf_ct_helper_destroy(ct);

	local_bh_disable();
	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	clean_from_lists(ct);
	nf_conntrack_double_unlock(hash, reply_hash);

	nf_ct_add_to_dying_list(ct);

	local_bh_enable();
}

bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
{
	struct nf_conn_tstamp *tstamp;

	if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
		return false;

	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp && tstamp->stop == 0)
		tstamp->stop = ktime_get_real_ns();

	if (nf_conntrack_event_report(IPCT_DESTROY, ct,
				      portid, report) < 0) {
		/* destroy event was not delivered. nf_ct_put will
		 * be done by event cache worker on redelivery.
		 */
		nf_ct_delete_from_lists(ct);
		nf_conntrack_ecache_delayed_work(nf_ct_net(ct));
		return false;
	}

	nf_conntrack_ecache_work(nf_ct_net(ct));
	nf_ct_delete_from_lists(ct);
	nf_ct_put(ct);
	return true;
}
EXPORT_SYMBOL_GPL(nf_ct_delete);

static inline bool
nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
		const struct nf_conntrack_tuple *tuple,
		const struct nf_conntrack_zone *zone,
		const struct net *net)
{
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

	/* A conntrack can be recreated with the equal tuple,
	 * so we need to check that the conntrack is confirmed
	 */
	return nf_ct_tuple_equal(tuple, &h->tuple) &&
	       nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
	       nf_ct_is_confirmed(ct) &&
	       net_eq(net, nf_ct_net(ct));
}

static inline bool
nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
{
	return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				 &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
	       nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
				 &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
	       net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
}

/* caller must hold rcu readlock and none of the nf_conntrack_locks */
static void nf_ct_gc_expired(struct nf_conn *ct)
{
	if (!atomic_inc_not_zero(&ct->ct_general.use))
		return;

	if (nf_ct_should_gc(ct))
		nf_ct_kill(ct);

	nf_ct_put(ct);
}

/*
 * Warning :
 * - Caller must take a reference on returned object
 *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
 */
static struct nf_conntrack_tuple_hash *
____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
		      const struct nf_conntrack_tuple *tuple, u32 hash)
{
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_head *ct_hash;
	struct hlist_nulls_node *n;
	unsigned int bucket, hsize;

begin:
	nf_conntrack_get_ht(&ct_hash, &hsize);
	bucket = reciprocal_scale(hash, hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
		struct nf_conn *ct;

		ct = nf_ct_tuplehash_to_ctrack(h);
		if (nf_ct_is_expired(ct)) {
			nf_ct_gc_expired(ct);
			continue;
		}

		if (nf_ct_key_equal(h, tuple, zone, net))
			return h;
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(n) != bucket) {
		NF_CT_STAT_INC_ATOMIC(net, search_restart);
		goto begin;
	}

	return NULL;
}

/* Find a connection corresponding to a tuple. */
static struct nf_conntrack_tuple_hash *
__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
			const struct nf_conntrack_tuple *tuple, u32 hash)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	rcu_read_lock();

	h = ____nf_conntrack_find(net, zone, tuple, hash);
	if (h) {
		/* We have a candidate that matches the tuple we're interested
		 * in, try to obtain a reference and re-check tuple
		 */
		ct = nf_ct_tuplehash_to_ctrack(h);
		if (likely(atomic_inc_not_zero(&ct->ct_general.use))) {
			if (likely(nf_ct_key_equal(h, tuple, zone, net)))
				goto found;

			/* TYPESAFE_BY_RCU recycled the candidate */
			nf_ct_put(ct);
		}

		h = NULL;
	}
found:
	rcu_read_unlock();

	return h;
}

struct nf_conntrack_tuple_hash *
nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
		      const struct nf_conntrack_tuple *tuple)
{
	return __nf_conntrack_find_get(net, zone, tuple,
				       hash_conntrack_raw(tuple, net));
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
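
/* Usage sketch (illustrative, not part of the original file): a typical
 * lookup takes its own reference and must drop it when done:
 *
 *	struct nf_conntrack_tuple_hash *h;
 *	struct nf_conn *ct;
 *
 *	h = nf_conntrack_find_get(net, &nf_ct_zone_dflt, &tuple);
 *	if (h) {
 *		ct = nf_ct_tuplehash_to_ctrack(h);
 *		// ... use ct ...
 *		nf_ct_put(ct);
 *	}
 *
 * The reference obtained here is what makes the SLAB_TYPESAFE_BY_RCU
 * re-check in __nf_conntrack_find_get() safe for the caller.
 */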

static void __nf_conntrack_hash_insert(struct nf_conn *ct,
				       unsigned int hash,
				       unsigned int reply_hash)
{
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &nf_conntrack_hash[hash]);
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &nf_conntrack_hash[reply_hash]);
}

int
nf_conntrack_hash_check_insert(struct nf_conn *ct)
{
	const struct nf_conntrack_zone *zone;
	struct net *net = nf_ct_net(ct);
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int sequence;

	zone = nf_ct_zone(ct);

	local_bh_disable();
	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	/* See if there's one in the list already, including reverse */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;

	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;

	smp_wmb();
	/* The caller holds a reference to this object */
	atomic_set(&ct->ct_general.use, 2);
	__nf_conntrack_hash_insert(ct, hash, reply_hash);
	nf_conntrack_double_unlock(hash, reply_hash);
	NF_CT_STAT_INC(net, insert);
	local_bh_enable();
	return 0;

out:
	nf_conntrack_double_unlock(hash, reply_hash);
	NF_CT_STAT_INC(net, insert_failed);
	local_bh_enable();
	return -EEXIST;
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);

static inline void nf_ct_acct_update(struct nf_conn *ct,
				     enum ip_conntrack_info ctinfo,
				     unsigned int len)
{
	struct nf_conn_acct *acct;

	acct = nf_conn_acct_find(ct);
	if (acct) {
		struct nf_conn_counter *counter = acct->counter;

		atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
		atomic64_add(len, &counter[CTINFO2DIR(ctinfo)].bytes);
	}
}

static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
			     const struct nf_conn *loser_ct)
{
	struct nf_conn_acct *acct;

	acct = nf_conn_acct_find(loser_ct);
	if (acct) {
		struct nf_conn_counter *counter = acct->counter;
		unsigned int bytes;

		/* u32 should be fine since we must have seen one packet. */
		bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
		nf_ct_acct_update(ct, ctinfo, bytes);
	}
}

/* Resolve race on insertion if this protocol allows this. */
static __cold noinline int
nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
		    enum ip_conntrack_info ctinfo,
		    struct nf_conntrack_tuple_hash *h)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	const struct nf_conntrack_l4proto *l4proto;
	enum ip_conntrack_info oldinfo;
	struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);

	l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
	if (l4proto->allow_clash &&
	    !nf_ct_is_dying(ct) &&
	    atomic_inc_not_zero(&ct->ct_general.use)) {
		if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
		    nf_ct_match(ct, loser_ct)) {
			nf_ct_acct_merge(ct, ctinfo, loser_ct);
			nf_conntrack_put(&loser_ct->ct_general);
			nf_ct_set(skb, ct, oldinfo);
			return NF_ACCEPT;
		}
		nf_ct_put(ct);
	}
	NF_CT_STAT_INC(net, drop);
	return NF_DROP;
}

/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
	const struct nf_conntrack_zone *zone;
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct nf_conn_tstamp *tstamp;
	struct hlist_nulls_node *n;
	enum ip_conntrack_info ctinfo;
	struct net *net;
	unsigned int sequence;
	int ret = NF_DROP;

	ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(ct);

	/* ipt_REJECT uses nf_conntrack_attach to attach related
	   ICMP/TCP RST packets in other direction.  Actual packet
	   which created connection will be IP_CT_NEW or for an
	   expected connection, IP_CT_RELATED. */
	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
		return NF_ACCEPT;

	zone = nf_ct_zone(ct);
	local_bh_disable();

	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		/* reuse the hash saved before */
		hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
		hash = scale_hash(hash);
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	/* We're not in hash table, and we refuse to set up related
	 * connections for unconfirmed conns.  But packet copies and
	 * REJECT will give spurious warnings here.
	 */

	/* Another skb with the same unconfirmed conntrack may
	 * win the race. This may happen for bridge(br_flood)
	 * or broadcast/multicast packets do skb_clone with
	 * unconfirmed conntrack.
	 */
	if (unlikely(nf_ct_is_confirmed(ct))) {
		WARN_ON_ONCE(1);
		nf_conntrack_double_unlock(hash, reply_hash);
		local_bh_enable();
		return NF_DROP;
	}

	pr_debug("Confirming conntrack %p\n", ct);
	/* We have to check the DYING flag after unlink to prevent
	 * a race against nf_ct_get_next_corpse() possibly called from
	 * user context, else we insert an already 'dead' hash, blocking
	 * further use of that particular connection -JM.
	 */
	nf_ct_del_from_dying_or_unconfirmed_list(ct);

	if (unlikely(nf_ct_is_dying(ct))) {
		nf_ct_add_to_dying_list(ct);
		goto dying;
	}

	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash.  If there is, we lost race. */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;

	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;

	/* Timer relative to confirmation time, not original
	   setting time, otherwise we'd get timer wrap in
	   weird delay cases. */
	ct->timeout += nfct_time_stamp;
	atomic_inc(&ct->ct_general.use);
	ct->status |= IPS_CONFIRMED;

	/* set conntrack timestamp, if enabled. */
	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp)
		tstamp->start = ktime_get_real_ns();

	/* Since the lookup is lockless, hash insertion must be done after
	 * starting the timer and setting the CONFIRMED bit. The RCU barriers
	 * guarantee that no other CPU can find the conntrack before the above
	 * stores are visible.
	 */
	__nf_conntrack_hash_insert(ct, hash, reply_hash);
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();

	help = nfct_help(ct);
	if (help && help->helper)
		nf_conntrack_event_cache(IPCT_HELPER, ct);

	nf_conntrack_event_cache(master_ct(ct) ?
				 IPCT_RELATED : IPCT_NEW, ct);
	return NF_ACCEPT;

out:
	nf_ct_add_to_dying_list(ct);
	ret = nf_ct_resolve_clash(net, skb, ctinfo, h);
dying:
	nf_conntrack_double_unlock(hash, reply_hash);
	NF_CT_STAT_INC(net, insert_failed);
	local_bh_enable();
	return ret;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
			 const struct nf_conn *ignored_conntrack)
{
	struct net *net = nf_ct_net(ignored_conntrack);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_head *ct_hash;
	unsigned int hash, hsize;
	struct hlist_nulls_node *n;
	struct nf_conn *ct;

	zone = nf_ct_zone(ignored_conntrack);

	rcu_read_lock();
begin:
	nf_conntrack_get_ht(&ct_hash, &hsize);
	hash = __hash_conntrack(net, tuple, hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
		ct = nf_ct_tuplehash_to_ctrack(h);

		if (ct == ignored_conntrack)
			continue;

		if (nf_ct_is_expired(ct)) {
			nf_ct_gc_expired(ct);
			continue;
		}

		if (nf_ct_key_equal(h, tuple, zone, net)) {
			/* Tuple is taken already, so caller will need to find
			 * a new source port to use.
			 *
			 * Only exception:
			 * If the *original tuples* are identical, then both
			 * conntracks refer to the same flow.
			 * This is a rare situation, it can occur e.g. when
			 * more than one UDP packet is sent from same socket
			 * in different threads.
			 *
			 * Let nf_ct_resolve_clash() deal with this later.
			 */
			if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
					      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple))
				continue;

			NF_CT_STAT_INC_ATOMIC(net, found);
			rcu_read_unlock();
			return 1;
		}
	}

	if (get_nulls_value(n) != hash) {
		NF_CT_STAT_INC_ATOMIC(net, search_restart);
		goto begin;
	}

	rcu_read_unlock();

	return 0;
}
EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);

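/* Illustrative note (added for clarity, not in the original source): the
 * NAT core is the main caller; when searching for a free source port it
 * builds a candidate tuple and probes it, roughly:
 *
 *	tuple.src.u.tcp.port = htons(candidate_port);
 *	if (!nf_conntrack_tuple_taken(&tuple, ct))
 *		// candidate_port is free, use it
 *
 * The "ignored_conntrack" argument keeps the entry currently being NATed
 * from colliding with itself during that search.
 */
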
#define NF_CT_EVICTION_RANGE	8

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static unsigned int early_drop_list(struct net *net,
				    struct hlist_nulls_head *head)
{
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int drops = 0;
	struct nf_conn *tmp;

	hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
		tmp = nf_ct_tuplehash_to_ctrack(h);

		if (test_bit(IPS_OFFLOAD_BIT, &tmp->status))
			continue;

		if (nf_ct_is_expired(tmp)) {
			nf_ct_gc_expired(tmp);
			continue;
		}

		if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
		    !net_eq(nf_ct_net(tmp), net) ||
		    nf_ct_is_dying(tmp))
			continue;

		if (!atomic_inc_not_zero(&tmp->ct_general.use))
			continue;

		/* kill only if still in same netns -- might have moved due to
		 * SLAB_TYPESAFE_BY_RCU rules.
		 *
		 * We steal the timer reference.  If that fails timer has
		 * already fired or someone else deleted it. Just drop ref
		 * and move to next entry.
		 */
		if (net_eq(nf_ct_net(tmp), net) &&
		    nf_ct_is_confirmed(tmp) &&
		    nf_ct_delete(tmp, 0, 0))
			drops++;

		nf_ct_put(tmp);
	}

	return drops;
}

static noinline int early_drop(struct net *net, unsigned int hash)
{
	unsigned int i, bucket;

	for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
		struct hlist_nulls_head *ct_hash;
		unsigned int hsize, drops;

		rcu_read_lock();
		nf_conntrack_get_ht(&ct_hash, &hsize);
		if (!i)
			bucket = reciprocal_scale(hash, hsize);
		else
			bucket = (bucket + 1) % hsize;

		drops = early_drop_list(net, &ct_hash[bucket]);
		rcu_read_unlock();

		if (drops) {
			NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops);
			return true;
		}
	}

	return false;
}

static bool gc_worker_skip_ct(const struct nf_conn *ct)
{
	return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct);
}

static bool gc_worker_can_early_drop(const struct nf_conn *ct)
{
	const struct nf_conntrack_l4proto *l4proto;

	if (!test_bit(IPS_ASSURED_BIT, &ct->status))
		return true;

	l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
	if (l4proto->can_early_drop && l4proto->can_early_drop(ct))
		return true;

	return false;
}

#define	DAY	(86400 * HZ)

/* Set an arbitrary timeout large enough not to ever expire, this saves
 * us a check for the IPS_OFFLOAD_BIT from the packet path via
 * nf_ct_is_expired().
 */
static void nf_ct_offload_timeout(struct nf_conn *ct)
{
	if (nf_ct_expires(ct) < DAY / 2)
		ct->timeout = nfct_time_stamp + DAY;
}

static void gc_worker(struct work_struct *work)
{
	unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u);
	unsigned int i, goal, buckets = 0, expired_count = 0;
	unsigned int nf_conntrack_max95 = 0;
	struct conntrack_gc_work *gc_work;
	unsigned int ratio, scanned = 0;
	unsigned long next_run;

	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);

	goal = nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV;
	i = gc_work->last_bucket;
	if (gc_work->early_drop)
		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;

	do {
		struct nf_conntrack_tuple_hash *h;
		struct hlist_nulls_head *ct_hash;
		struct hlist_nulls_node *n;
		unsigned int hashsz;
		struct nf_conn *tmp;

		i++;
		rcu_read_lock();

		nf_conntrack_get_ht(&ct_hash, &hashsz);
		if (i >= hashsz)
			i = 0;

		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
			struct net *net;

			tmp = nf_ct_tuplehash_to_ctrack(h);

			scanned++;
			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
				nf_ct_offload_timeout(tmp);
				continue;
			}

			if (nf_ct_is_expired(tmp)) {
				nf_ct_gc_expired(tmp);
				expired_count++;
				continue;
			}

			if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
				continue;

			net = nf_ct_net(tmp);
			if (atomic_read(&net->ct.count) < nf_conntrack_max95)
				continue;

			/* need to take reference to avoid possible races */
			if (!atomic_inc_not_zero(&tmp->ct_general.use))
				continue;

			if (gc_worker_skip_ct(tmp)) {
				nf_ct_put(tmp);
				continue;
			}

			if (gc_worker_can_early_drop(tmp))
				nf_ct_kill(tmp);

			nf_ct_put(tmp);
		}

		/* could check get_nulls_value() here and restart if ct
		 * was moved to another chain.  But given gc is best-effort
		 * we will just continue with next hash slot.
		 */
		rcu_read_unlock();
		cond_resched();
	} while (++buckets < goal);

	if (gc_work->exiting)
		return;

	/*
	 * Eviction will normally happen from the packet path, and not
	 * from this gc worker.
	 *
	 * This worker is only here to reap expired entries when system went
	 * idle after a busy period.
	 *
	 * The heuristics below are supposed to balance conflicting goals:
	 *
	 * 1. Minimize time until we notice a stale entry
	 * 2. Maximize scan intervals to not waste cycles
	 *
	 * Normally, expire ratio will be close to 0.
	 *
	 * As soon as a sizeable fraction of the entries have expired
	 * increase scan frequency.
	 */
	ratio = scanned ? expired_count * 100 / scanned : 0;
	if (ratio > GC_EVICT_RATIO) {
		gc_work->next_gc_run = min_interval;
	} else {
		unsigned int max = GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV;

		BUILD_BUG_ON((GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV) == 0);

		gc_work->next_gc_run += min_interval;
		if (gc_work->next_gc_run > max)
			gc_work->next_gc_run = max;
	}

	next_run = gc_work->next_gc_run;
	gc_work->last_bucket = i;
	gc_work->early_drop = false;
	queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
}

static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
{
	INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker);
	gc_work->next_gc_run = HZ;
	gc_work->exiting = false;
}

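/* Worked example (added for clarity, not part of the original file),
 * assuming HZ == 1000: min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1) =
 * 7 jiffies, and the ceiling is GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV =
 * 125 jiffies.  While fewer than GC_EVICT_RATIO percent of the scanned
 * entries are expired, next_gc_run grows by 7 jiffies per pass up to 125;
 * once the expired ratio exceeds 50%, it snaps back to the 7-jiffy minimum
 * so stale entries are reaped quickly after a busy period.
 */
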
static struct nf_conn *
__nf_conntrack_alloc(struct net *net,
		     const struct nf_conntrack_zone *zone,
		     const struct nf_conntrack_tuple *orig,
		     const struct nf_conntrack_tuple *repl,
		     gfp_t gfp, u32 hash)
{
	struct nf_conn *ct;

	/* We don't want any race condition at early drop stage */
	atomic_inc(&net->ct.count);

	if (nf_conntrack_max &&
	    unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
		if (!early_drop(net, hash)) {
			if (!conntrack_gc_work.early_drop)
				conntrack_gc_work.early_drop = true;
			atomic_dec(&net->ct.count);
			net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
			return ERR_PTR(-ENOMEM);
		}
	}

	/*
	 * Do not use kmem_cache_zalloc(), as this cache uses
	 * SLAB_TYPESAFE_BY_RCU.
	 */
	ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
	if (ct == NULL)
		goto out;

	spin_lock_init(&ct->lock);
	ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
	ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
	/* save hash for reusing when confirming */
	*(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
	ct->status = 0;
	ct->timeout = 0;
	write_pnet(&ct->ct_net, net);
	memset(&ct->__nfct_init_offset[0], 0,
	       offsetof(struct nf_conn, proto) -
	       offsetof(struct nf_conn, __nfct_init_offset[0]));

	nf_ct_zone_add(ct, zone);

	/* Because we use RCU lookups, we set ct_general.use to zero before
	 * this is inserted in any list.
	 */
	atomic_set(&ct->ct_general.use, 0);
	return ct;
out:
	atomic_dec(&net->ct.count);
	return ERR_PTR(-ENOMEM);
}

struct nf_conn *nf_conntrack_alloc(struct net *net,
				   const struct nf_conntrack_zone *zone,
				   const struct nf_conntrack_tuple *orig,
				   const struct nf_conntrack_tuple *repl,
				   gfp_t gfp)
{
	return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
}
EXPORT_SYMBOL_GPL(nf_conntrack_alloc);

void nf_conntrack_free(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);

	/* A freed object has refcnt == 0, that's
	 * the golden rule for SLAB_TYPESAFE_BY_RCU
	 */
	WARN_ON(atomic_read(&ct->ct_general.use) != 0);

	nf_ct_ext_destroy(ct);
	kmem_cache_free(nf_conntrack_cachep, ct);
	smp_mb__before_atomic();
	atomic_dec(&net->ct.count);
}
EXPORT_SYMBOL_GPL(nf_conntrack_free);


/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
static noinline struct nf_conntrack_tuple_hash *
init_conntrack(struct net *net, struct nf_conn *tmpl,
	       const struct nf_conntrack_tuple *tuple,
	       struct sk_buff *skb,
	       unsigned int dataoff, u32 hash)
{
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct nf_conntrack_tuple repl_tuple;
	struct nf_conntrack_ecache *ecache;
	struct nf_conntrack_expect *exp = NULL;
	const struct nf_conntrack_zone *zone;
	struct nf_conn_timeout *timeout_ext;
	struct nf_conntrack_zone tmp;

	if (!nf_ct_invert_tuple(&repl_tuple, tuple)) {
		pr_debug("Can't invert tuple.\n");
		return NULL;
	}

	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
	ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
				  hash);
	if (IS_ERR(ct))
		return (struct nf_conntrack_tuple_hash *)ct;

	if (!nf_ct_add_synproxy(ct, tmpl)) {
		nf_conntrack_free(ct);
		return ERR_PTR(-ENOMEM);
	}

	timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;

	if (timeout_ext)
		nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
				      GFP_ATOMIC);

	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
	nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
	nf_ct_labels_ext_add(ct);

	ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
	nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
			     ecache ? ecache->expmask : 0,
			     GFP_ATOMIC);

	local_bh_disable();
	if (net->ct.expect_count) {
		spin_lock(&nf_conntrack_expect_lock);
		exp = nf_ct_find_expectation(net, zone, tuple);
		if (exp) {
			pr_debug("expectation arrives ct=%p exp=%p\n",
				 ct, exp);
			/* Welcome, Mr. Bond.  We've been expecting you... */
			__set_bit(IPS_EXPECTED_BIT, &ct->status);
			/* exp->master safe, refcnt bumped in nf_ct_find_expectation */
			ct->master = exp->master;
			if (exp->helper) {
				help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
				if (help)
					rcu_assign_pointer(help->helper, exp->helper);
			}

#ifdef CONFIG_NF_CONNTRACK_MARK
			ct->mark = exp->master->mark;
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
			ct->secmark = exp->master->secmark;
#endif
			NF_CT_STAT_INC(net, expect_new);
		}
		spin_unlock(&nf_conntrack_expect_lock);
	}
	if (!exp)
		__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);

	/* Now it is inserted into the unconfirmed list, bump refcount */
	nf_conntrack_get(&ct->ct_general);
	nf_ct_add_to_unconfirmed_list(ct);

	local_bh_enable();

	if (exp) {
		if (exp->expectfn)
			exp->expectfn(ct, exp);
		nf_ct_expect_put(exp);
	}

	return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns 0, sets skb->_nfct | ctinfo */
static int
resolve_normal_ct(struct nf_conn *tmpl,
		  struct sk_buff *skb,
		  unsigned int dataoff,
		  u_int8_t protonum,
		  const struct nf_hook_state *state)
{
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_tuple_hash *h;
	enum ip_conntrack_info ctinfo;
	struct nf_conntrack_zone tmp;
	struct nf_conn *ct;
	u32 hash;

	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
			     dataoff, state->pf, protonum, state->net,
			     &tuple)) {
		pr_debug("Can't get tuple\n");
		return 0;
	}

	/* look for tuple match */
	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
	hash = hash_conntrack_raw(&tuple, state->net);
	h = __nf_conntrack_find_get(state->net, zone, &tuple, hash);
	if (!h) {
		h = init_conntrack(state->net, tmpl, &tuple,
				   skb, dataoff, hash);
		if (!h)
			return 0;
		if (IS_ERR(h))
			return PTR_ERR(h);
	}
	ct = nf_ct_tuplehash_to_ctrack(h);

	/* It exists; we have (non-exclusive) reference. */
	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
		ctinfo = IP_CT_ESTABLISHED_REPLY;
	} else {
		/* Once we've had two way comms, always ESTABLISHED. */
		if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
			pr_debug("normal packet for %p\n", ct);
			ctinfo = IP_CT_ESTABLISHED;
		} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
			pr_debug("related packet for %p\n", ct);
			ctinfo = IP_CT_RELATED;
		} else {
			pr_debug("new packet for %p\n", ct);
			ctinfo = IP_CT_NEW;
		}
	}
	nf_ct_set(skb, ct, ctinfo);
	return 0;
}

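/* Summary (descriptive comment added for clarity, not in the original):
 * the ctinfo chosen above maps as follows:
 *
 *	reply direction                   -> IP_CT_ESTABLISHED_REPLY
 *	original dir, reply already seen  -> IP_CT_ESTABLISHED
 *	original dir, expected flow       -> IP_CT_RELATED
 *	original dir, first packet        -> IP_CT_NEW
 *
 * Together with the ct pointer, this is what nf_ct_set() packs into
 * skb->_nfct for the rest of the hook traversal.
 */
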
/*
 * icmp packets need special treatment to handle error messages that are
 * related to a connection.
 *
 * Callers need to check if skb has a conntrack assigned when this
 * helper returns; in such case skb belongs to an already known connection.
 */
static unsigned int __cold
nf_conntrack_handle_icmp(struct nf_conn *tmpl,
			 struct sk_buff *skb,
			 unsigned int dataoff,
			 u8 protonum,
			 const struct nf_hook_state *state)
{
	int ret;

	if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP)
		ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state);
#if IS_ENABLED(CONFIG_IPV6)
	else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6)
		ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state);
#endif
	else
		return NF_ACCEPT;

	if (ret <= 0) {
		NF_CT_STAT_INC_ATOMIC(state->net, error);
		NF_CT_STAT_INC_ATOMIC(state->net, invalid);
	}

	return ret;
}

static int generic_packet(struct nf_conn *ct, struct sk_buff *skb,
			  enum ip_conntrack_info ctinfo)
{
	const unsigned int *timeout = nf_ct_timeout_lookup(ct);

	if (!timeout)
		timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout;

	nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
	return NF_ACCEPT;
}

/* Returns verdict for packet, or -1 for invalid. */
static int nf_conntrack_handle_packet(struct nf_conn *ct,
				      struct sk_buff *skb,
				      unsigned int dataoff,
				      enum ip_conntrack_info ctinfo,
				      const struct nf_hook_state *state)
{
	switch (nf_ct_protonum(ct)) {
	case IPPROTO_TCP:
		return nf_conntrack_tcp_packet(ct, skb, dataoff,
					       ctinfo, state);
	case IPPROTO_UDP:
		return nf_conntrack_udp_packet(ct, skb, dataoff,
					       ctinfo, state);
	case IPPROTO_ICMP:
		return nf_conntrack_icmp_packet(ct, skb, ctinfo, state);
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
	case IPPROTO_UDPLITE:
		return nf_conntrack_udplite_packet(ct, skb, dataoff,
						   ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
	case IPPROTO_SCTP:
		return nf_conntrack_sctp_packet(ct, skb, dataoff,
						ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_DCCP
	case IPPROTO_DCCP:
		return nf_conntrack_dccp_packet(ct, skb, dataoff,
						ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE:
		return nf_conntrack_gre_packet(ct, skb, dataoff,
					       ctinfo, state);
#endif
	}

	return generic_packet(ct, skb, ctinfo);
}

unsigned int
nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct, *tmpl;
	u_int8_t protonum;
	int dataoff, ret;

	tmpl = nf_ct_get(skb, &ctinfo);
	if (tmpl || ctinfo == IP_CT_UNTRACKED) {
		/* Previously seen (loopback or untracked)?  Ignore. */
		if ((tmpl && !nf_ct_is_template(tmpl)) ||
		     ctinfo == IP_CT_UNTRACKED) {
			NF_CT_STAT_INC_ATOMIC(state->net, ignore);
			return NF_ACCEPT;
		}
		skb->_nfct = 0;
	}

	/* rcu_read_lock()ed by nf_hook_thresh */
	dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum);
	if (dataoff <= 0) {
		pr_debug("not prepared to track yet or error occurred\n");
		NF_CT_STAT_INC_ATOMIC(state->net, error);
		NF_CT_STAT_INC_ATOMIC(state->net, invalid);
		ret = NF_ACCEPT;
		goto out;
	}

	if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) {
		ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff,
					       protonum, state);
		if (ret <= 0) {
			ret = -ret;
			goto out;
		}
		/* ICMP[v6] protocol trackers may assign one conntrack. */
		if (skb->_nfct)
			goto out;
	}
repeat:
	ret = resolve_normal_ct(tmpl, skb, dataoff,
				protonum, state);
	if (ret < 0) {
		/* Too stressed to deal. */
		NF_CT_STAT_INC_ATOMIC(state->net, drop);
		ret = NF_DROP;
		goto out;
	}

	ct = nf_ct_get(skb, &ctinfo);
	if (!ct) {
		/* Not valid part of a connection */
		NF_CT_STAT_INC_ATOMIC(state->net, invalid);
		ret = NF_ACCEPT;
		goto out;
	}

	ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state);
	if (ret <= 0) {
		/* Invalid: inverse of the return code tells
		 * the netfilter core what to do */
		pr_debug("nf_conntrack_in: Can't track with proto module\n");
		nf_conntrack_put(&ct->ct_general);
		skb->_nfct = 0;
		NF_CT_STAT_INC_ATOMIC(state->net, invalid);
		if (ret == -NF_DROP)
			NF_CT_STAT_INC_ATOMIC(state->net, drop);
		/* Special case: TCP tracker reports an attempt to reopen a
		 * closed/aborted connection. We have to go back and create a
		 * fresh conntrack.
		 */
		if (ret == -NF_REPEAT)
			goto repeat;
		ret = -ret;
		goto out;
	}

	if (ctinfo == IP_CT_ESTABLISHED_REPLY &&
	    !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
		nf_conntrack_event_cache(IPCT_REPLY, ct);
out:
	if (tmpl)
		nf_ct_put(tmpl);

	return ret;
}
EXPORT_SYMBOL_GPL(nf_conntrack_in);

/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
   implicitly racy: see __nf_conntrack_confirm */
void nf_conntrack_alter_reply(struct nf_conn *ct,
			      const struct nf_conntrack_tuple *newreply)
{
	struct nf_conn_help *help = nfct_help(ct);

	/* Should be unconfirmed, so not in hash table yet */
	WARN_ON(nf_ct_is_confirmed(ct));

	pr_debug("Altering reply tuple of %p to ", ct);
	nf_ct_dump_tuple(newreply);

	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
	if (ct->master || (help && !hlist_empty(&help->expectations)))
		return;

	rcu_read_lock();
	__nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);

/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
void __nf_ct_refresh_acct(struct nf_conn *ct,
			  enum ip_conntrack_info ctinfo,
			  const struct sk_buff *skb,
			  u32 extra_jiffies,
			  bool do_acct)
{
	/* Only update if this is not a fixed timeout */
	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
		goto acct;

	/* If not in hash table, timer will not be active yet */
	if (nf_ct_is_confirmed(ct))
		extra_jiffies += nfct_time_stamp;

	if (READ_ONCE(ct->timeout) != extra_jiffies)
		WRITE_ONCE(ct->timeout, extra_jiffies);
acct:
	if (do_acct)
		nf_ct_acct_update(ct, ctinfo, skb->len);
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);

bool nf_ct_kill_acct(struct nf_conn *ct,
		     enum ip_conntrack_info ctinfo,
		     const struct sk_buff *skb)
{
	nf_ct_acct_update(ct, ctinfo, skb->len);

	return nf_ct_delete(ct, 0, 0);
}
EXPORT_SYMBOL_GPL(nf_ct_kill_acct);

#if IS_ENABLED(CONFIG_NF_CT_NETLINK)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
#include <linux/mutex.h>

/* Generic function for tcp/udp/sctp/dccp and alike. */
int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
			       const struct nf_conntrack_tuple *tuple)
{
	if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) ||
	    nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port))
		goto nla_put_failure;
	return 0;

nla_put_failure:
	return -1;
}
EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);

const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
	[CTA_PROTO_SRC_PORT]  = { .type = NLA_U16 },
	[CTA_PROTO_DST_PORT]  = { .type = NLA_U16 },
};
EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);

int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
			       struct nf_conntrack_tuple *t)
{
	if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT])
		return -EINVAL;

	t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
	t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);

	return 0;
}
EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);

unsigned int nf_ct_port_nlattr_tuple_size(void)
{
	static unsigned int size __read_mostly;

	if (!size)
		size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);

	return size;
}
EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
#endif

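/* Illustrative sketch (added for clarity, not part of the original file):
 * these helpers give ctnetlink a symmetric round trip for the port part of
 * a tuple, roughly:
 *
 *	nf_ct_port_tuple_to_nlattr(skb, tuple);   // dump side
 *	nf_ct_port_nlattr_to_tuple(tb, &tuple);   // parse side, after the
 *						  // attributes were validated
 *						  // against nf_ct_port_nla_policy
 *
 * The attributes stay in network byte order end to end, which is why the
 * be16 accessors are used on both sides.
 */
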
1929 */ 1930 status = ct->status; 1931 1932 nf_ct_put(ct); 1933 ct = nf_ct_tuplehash_to_ctrack(h); 1934 nf_ct_set(skb, ct, ctinfo); 1935 1936 nat_hook = rcu_dereference(nf_nat_hook); 1937 if (!nat_hook) 1938 return 0; 1939 1940 if (status & IPS_SRC_NAT && 1941 nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC, 1942 IP_CT_DIR_ORIGINAL) == NF_DROP) 1943 return -1; 1944 1945 if (status & IPS_DST_NAT && 1946 nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST, 1947 IP_CT_DIR_ORIGINAL) == NF_DROP) 1948 return -1; 1949 1950 return 0; 1951 } 1952 1953 static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, 1954 const struct sk_buff *skb) 1955 { 1956 const struct nf_conntrack_tuple *src_tuple; 1957 const struct nf_conntrack_tuple_hash *hash; 1958 struct nf_conntrack_tuple srctuple; 1959 enum ip_conntrack_info ctinfo; 1960 struct nf_conn *ct; 1961 1962 ct = nf_ct_get(skb, &ctinfo); 1963 if (ct) { 1964 src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo)); 1965 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); 1966 return true; 1967 } 1968 1969 if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), 1970 NFPROTO_IPV4, dev_net(skb->dev), 1971 &srctuple)) 1972 return false; 1973 1974 hash = nf_conntrack_find_get(dev_net(skb->dev), 1975 &nf_ct_zone_dflt, 1976 &srctuple); 1977 if (!hash) 1978 return false; 1979 1980 ct = nf_ct_tuplehash_to_ctrack(hash); 1981 src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir); 1982 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); 1983 nf_ct_put(ct); 1984 1985 return true; 1986 } 1987 1988 /* Bring out ya dead! */ 1989 static struct nf_conn * 1990 get_next_corpse(int (*iter)(struct nf_conn *i, void *data), 1991 void *data, unsigned int *bucket) 1992 { 1993 struct nf_conntrack_tuple_hash *h; 1994 struct nf_conn *ct; 1995 struct hlist_nulls_node *n; 1996 spinlock_t *lockp; 1997 1998 for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { 1999 lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; 2000 local_bh_disable(); 2001 nf_conntrack_lock(lockp); 2002 if (*bucket < nf_conntrack_htable_size) { 2003 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) { 2004 if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) 2005 continue; 2006 ct = nf_ct_tuplehash_to_ctrack(h); 2007 if (iter(ct, data)) 2008 goto found; 2009 } 2010 } 2011 spin_unlock(lockp); 2012 local_bh_enable(); 2013 cond_resched(); 2014 } 2015 2016 return NULL; 2017 found: 2018 atomic_inc(&ct->ct_general.use); 2019 spin_unlock(lockp); 2020 local_bh_enable(); 2021 return ct; 2022 } 2023 2024 static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), 2025 void *data, u32 portid, int report) 2026 { 2027 unsigned int bucket = 0, sequence; 2028 struct nf_conn *ct; 2029 2030 might_sleep(); 2031 2032 for (;;) { 2033 sequence = read_seqcount_begin(&nf_conntrack_generation); 2034 2035 while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) { 2036 /* Time to push up daises... 
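 * get_next_corpse() returned with a reference on the entry and the bucket
 * lock already dropped, so it is safe to delete and put it here.  The iter
 * callback only decides which entries should die; a per-netns flush, for
 * instance, boils down to (sketch, see kill_all() further below):
 *
 *	static int kill_all(struct nf_conn *i, void *data)
 *	{
 *		return net_eq(nf_ct_net(i), data);
 *	}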
*/ 2037 2038 nf_ct_delete(ct, portid, report); 2039 nf_ct_put(ct); 2040 cond_resched(); 2041 } 2042 2043 if (!read_seqcount_retry(&nf_conntrack_generation, sequence)) 2044 break; 2045 bucket = 0; 2046 } 2047 } 2048 2049 struct iter_data { 2050 int (*iter)(struct nf_conn *i, void *data); 2051 void *data; 2052 struct net *net; 2053 }; 2054 2055 static int iter_net_only(struct nf_conn *i, void *data) 2056 { 2057 struct iter_data *d = data; 2058 2059 if (!net_eq(d->net, nf_ct_net(i))) 2060 return 0; 2061 2062 return d->iter(i, d->data); 2063 } 2064 2065 static void 2066 __nf_ct_unconfirmed_destroy(struct net *net) 2067 { 2068 int cpu; 2069 2070 for_each_possible_cpu(cpu) { 2071 struct nf_conntrack_tuple_hash *h; 2072 struct hlist_nulls_node *n; 2073 struct ct_pcpu *pcpu; 2074 2075 pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); 2076 2077 spin_lock_bh(&pcpu->lock); 2078 hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) { 2079 struct nf_conn *ct; 2080 2081 ct = nf_ct_tuplehash_to_ctrack(h); 2082 2083 /* we cannot call iter() on unconfirmed list, the 2084 * owning cpu can reallocate ct->ext at any time. 2085 */ 2086 set_bit(IPS_DYING_BIT, &ct->status); 2087 } 2088 spin_unlock_bh(&pcpu->lock); 2089 cond_resched(); 2090 } 2091 } 2092 2093 void nf_ct_unconfirmed_destroy(struct net *net) 2094 { 2095 might_sleep(); 2096 2097 if (atomic_read(&net->ct.count) > 0) { 2098 __nf_ct_unconfirmed_destroy(net); 2099 nf_queue_nf_hook_drop(net); 2100 synchronize_net(); 2101 } 2102 } 2103 EXPORT_SYMBOL_GPL(nf_ct_unconfirmed_destroy); 2104 2105 void nf_ct_iterate_cleanup_net(struct net *net, 2106 int (*iter)(struct nf_conn *i, void *data), 2107 void *data, u32 portid, int report) 2108 { 2109 struct iter_data d; 2110 2111 might_sleep(); 2112 2113 if (atomic_read(&net->ct.count) == 0) 2114 return; 2115 2116 d.iter = iter; 2117 d.data = data; 2118 d.net = net; 2119 2120 nf_ct_iterate_cleanup(iter_net_only, &d, portid, report); 2121 } 2122 EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net); 2123 2124 /** 2125 * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table 2126 * @iter: callback to invoke for each conntrack 2127 * @data: data to pass to @iter 2128 * 2129 * Like nf_ct_iterate_cleanup, but first marks conntracks on the 2130 * unconfirmed list as dying (so they will not be inserted into 2131 * main table). 2132 * 2133 * Can only be called in module exit path. 2134 */ 2135 void 2136 nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) 2137 { 2138 struct net *net; 2139 2140 down_read(&net_rwsem); 2141 for_each_net(net) { 2142 if (atomic_read(&net->ct.count) == 0) 2143 continue; 2144 __nf_ct_unconfirmed_destroy(net); 2145 nf_queue_nf_hook_drop(net); 2146 } 2147 up_read(&net_rwsem); 2148 2149 /* Need to wait for netns cleanup worker to finish, if its 2150 * running -- it might have deleted a net namespace from 2151 * the global list, so our __nf_ct_unconfirmed_destroy() might 2152 * not have affected all namespaces. 2153 */ 2154 net_ns_barrier(); 2155 2156 /* a conntrack could have been unlinked from unconfirmed list 2157 * before we grabbed pcpu lock in __nf_ct_unconfirmed_destroy(). 2158 * This makes sure its inserted into conntrack table. 
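 * (Confirmation runs in the packet-processing path under its RCU read-side
 * section, so once the synchronize_net() below returns, any such entry is
 * either visible in the hash table, where the cleanup walk will find it,
 * or already gone.)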
2159 */ 2160 synchronize_net(); 2161 2162 nf_ct_iterate_cleanup(iter, data, 0, 0); 2163 } 2164 EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy); 2165 2166 static int kill_all(struct nf_conn *i, void *data) 2167 { 2168 return net_eq(nf_ct_net(i), data); 2169 } 2170 2171 void nf_conntrack_cleanup_start(void) 2172 { 2173 conntrack_gc_work.exiting = true; 2174 RCU_INIT_POINTER(ip_ct_attach, NULL); 2175 } 2176 2177 void nf_conntrack_cleanup_end(void) 2178 { 2179 RCU_INIT_POINTER(nf_ct_hook, NULL); 2180 cancel_delayed_work_sync(&conntrack_gc_work.dwork); 2181 kvfree(nf_conntrack_hash); 2182 2183 nf_conntrack_proto_fini(); 2184 nf_conntrack_seqadj_fini(); 2185 nf_conntrack_labels_fini(); 2186 nf_conntrack_helper_fini(); 2187 nf_conntrack_timeout_fini(); 2188 nf_conntrack_ecache_fini(); 2189 nf_conntrack_tstamp_fini(); 2190 nf_conntrack_acct_fini(); 2191 nf_conntrack_expect_fini(); 2192 2193 kmem_cache_destroy(nf_conntrack_cachep); 2194 } 2195 2196 /* 2197 * Mishearing the voices in his head, our hero wonders how he's 2198 * supposed to kill the mall. 2199 */ 2200 void nf_conntrack_cleanup_net(struct net *net) 2201 { 2202 LIST_HEAD(single); 2203 2204 list_add(&net->exit_list, &single); 2205 nf_conntrack_cleanup_net_list(&single); 2206 } 2207 2208 void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) 2209 { 2210 int busy; 2211 struct net *net; 2212 2213 /* 2214 * This makes sure all current packets have passed through 2215 * netfilter framework. Roll on, two-stage module 2216 * delete... 2217 */ 2218 synchronize_net(); 2219 i_see_dead_people: 2220 busy = 0; 2221 list_for_each_entry(net, net_exit_list, exit_list) { 2222 nf_ct_iterate_cleanup(kill_all, net, 0, 0); 2223 if (atomic_read(&net->ct.count) != 0) 2224 busy = 1; 2225 } 2226 if (busy) { 2227 schedule(); 2228 goto i_see_dead_people; 2229 } 2230 2231 list_for_each_entry(net, net_exit_list, exit_list) { 2232 nf_conntrack_proto_pernet_fini(net); 2233 nf_conntrack_ecache_pernet_fini(net); 2234 nf_conntrack_expect_pernet_fini(net); 2235 free_percpu(net->ct.stat); 2236 free_percpu(net->ct.pcpu_lists); 2237 } 2238 } 2239 2240 void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) 2241 { 2242 struct hlist_nulls_head *hash; 2243 unsigned int nr_slots, i; 2244 2245 if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head))) 2246 return NULL; 2247 2248 BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); 2249 nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); 2250 2251 hash = kvmalloc_array(nr_slots, sizeof(struct hlist_nulls_head), 2252 GFP_KERNEL | __GFP_ZERO); 2253 2254 if (hash && nulls) 2255 for (i = 0; i < nr_slots; i++) 2256 INIT_HLIST_NULLS_HEAD(&hash[i], i); 2257 2258 return hash; 2259 } 2260 EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable); 2261 2262 int nf_conntrack_hash_resize(unsigned int hashsize) 2263 { 2264 int i, bucket; 2265 unsigned int old_size; 2266 struct hlist_nulls_head *hash, *old_hash; 2267 struct nf_conntrack_tuple_hash *h; 2268 struct nf_conn *ct; 2269 2270 if (!hashsize) 2271 return -EINVAL; 2272 2273 hash = nf_ct_alloc_hashtable(&hashsize, 1); 2274 if (!hash) 2275 return -ENOMEM; 2276 2277 old_size = nf_conntrack_htable_size; 2278 if (old_size == hashsize) { 2279 kvfree(hash); 2280 return 0; 2281 } 2282 2283 local_bh_disable(); 2284 nf_conntrack_all_lock(); 2285 write_seqcount_begin(&nf_conntrack_generation); 2286 2287 /* Lookups in the old hash might happen in parallel, which means we 2288 * might get false negatives during connection lookup. 
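 * (The old bucket array itself stays valid for RCU readers until the
 * synchronize_net() below, after which it is freed.)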
New connections 2289 * created because of a false negative won't make it into the hash 2290 * though since that required taking the locks. 2291 */ 2292 2293 for (i = 0; i < nf_conntrack_htable_size; i++) { 2294 while (!hlist_nulls_empty(&nf_conntrack_hash[i])) { 2295 h = hlist_nulls_entry(nf_conntrack_hash[i].first, 2296 struct nf_conntrack_tuple_hash, hnnode); 2297 ct = nf_ct_tuplehash_to_ctrack(h); 2298 hlist_nulls_del_rcu(&h->hnnode); 2299 bucket = __hash_conntrack(nf_ct_net(ct), 2300 &h->tuple, hashsize); 2301 hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); 2302 } 2303 } 2304 old_size = nf_conntrack_htable_size; 2305 old_hash = nf_conntrack_hash; 2306 2307 nf_conntrack_hash = hash; 2308 nf_conntrack_htable_size = hashsize; 2309 2310 write_seqcount_end(&nf_conntrack_generation); 2311 nf_conntrack_all_unlock(); 2312 local_bh_enable(); 2313 2314 synchronize_net(); 2315 kvfree(old_hash); 2316 return 0; 2317 } 2318 2319 int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp) 2320 { 2321 unsigned int hashsize; 2322 int rc; 2323 2324 if (current->nsproxy->net_ns != &init_net) 2325 return -EOPNOTSUPP; 2326 2327 /* On boot, we can set this without any fancy locking. */ 2328 if (!nf_conntrack_hash) 2329 return param_set_uint(val, kp); 2330 2331 rc = kstrtouint(val, 0, &hashsize); 2332 if (rc) 2333 return rc; 2334 2335 return nf_conntrack_hash_resize(hashsize); 2336 } 2337 EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); 2338 2339 static __always_inline unsigned int total_extension_size(void) 2340 { 2341 /* remember to add new extensions below */ 2342 BUILD_BUG_ON(NF_CT_EXT_NUM > 9); 2343 2344 return sizeof(struct nf_ct_ext) + 2345 sizeof(struct nf_conn_help) 2346 #if IS_ENABLED(CONFIG_NF_NAT) 2347 + sizeof(struct nf_conn_nat) 2348 #endif 2349 + sizeof(struct nf_conn_seqadj) 2350 + sizeof(struct nf_conn_acct) 2351 #ifdef CONFIG_NF_CONNTRACK_EVENTS 2352 + sizeof(struct nf_conntrack_ecache) 2353 #endif 2354 #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP 2355 + sizeof(struct nf_conn_tstamp) 2356 #endif 2357 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT 2358 + sizeof(struct nf_conn_timeout) 2359 #endif 2360 #ifdef CONFIG_NF_CONNTRACK_LABELS 2361 + sizeof(struct nf_conn_labels) 2362 #endif 2363 #if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) 2364 + sizeof(struct nf_conn_synproxy) 2365 #endif 2366 ; 2367 }; 2368 2369 int nf_conntrack_init_start(void) 2370 { 2371 unsigned long nr_pages = totalram_pages(); 2372 int max_factor = 8; 2373 int ret = -ENOMEM; 2374 int i; 2375 2376 /* struct nf_ct_ext uses u8 to store offsets/size */ 2377 BUILD_BUG_ON(total_extension_size() > 255u); 2378 2379 seqcount_init(&nf_conntrack_generation); 2380 2381 for (i = 0; i < CONNTRACK_LOCKS; i++) 2382 spin_lock_init(&nf_conntrack_locks[i]); 2383 2384 if (!nf_conntrack_htable_size) { 2385 /* Idea from tcp.c: use 1/16384 of memory. 2386 * On i386: 32MB machine has 512 buckets. 2387 * >= 1GB machines have 16384 buckets. 2388 * >= 4GB machines have 65536 buckets. 2389 */ 2390 nf_conntrack_htable_size 2391 = (((nr_pages << PAGE_SHIFT) / 16384) 2392 / sizeof(struct hlist_head)); 2393 if (nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) 2394 nf_conntrack_htable_size = 65536; 2395 else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) 2396 nf_conntrack_htable_size = 16384; 2397 if (nf_conntrack_htable_size < 32) 2398 nf_conntrack_htable_size = 32; 2399 2400 /* Use a max. factor of four by default to get the same max as 2401 * with the old struct list_heads. 
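 * (With the automatic sizing above this means nf_conntrack_max =
 * 4 * nf_conntrack_htable_size, e.g. 65536 buckets on a >= 4GB machine
 * allow 262144 tracked connections by default.)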
When a table size is given 2402 * we use the old value of 8 to avoid reducing the max. 2403 * entries. */ 2404 max_factor = 4; 2405 } 2406 2407 nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1); 2408 if (!nf_conntrack_hash) 2409 return -ENOMEM; 2410 2411 nf_conntrack_max = max_factor * nf_conntrack_htable_size; 2412 2413 nf_conntrack_cachep = kmem_cache_create("nf_conntrack", 2414 sizeof(struct nf_conn), 2415 NFCT_INFOMASK + 1, 2416 SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); 2417 if (!nf_conntrack_cachep) 2418 goto err_cachep; 2419 2420 ret = nf_conntrack_expect_init(); 2421 if (ret < 0) 2422 goto err_expect; 2423 2424 ret = nf_conntrack_acct_init(); 2425 if (ret < 0) 2426 goto err_acct; 2427 2428 ret = nf_conntrack_tstamp_init(); 2429 if (ret < 0) 2430 goto err_tstamp; 2431 2432 ret = nf_conntrack_ecache_init(); 2433 if (ret < 0) 2434 goto err_ecache; 2435 2436 ret = nf_conntrack_timeout_init(); 2437 if (ret < 0) 2438 goto err_timeout; 2439 2440 ret = nf_conntrack_helper_init(); 2441 if (ret < 0) 2442 goto err_helper; 2443 2444 ret = nf_conntrack_labels_init(); 2445 if (ret < 0) 2446 goto err_labels; 2447 2448 ret = nf_conntrack_seqadj_init(); 2449 if (ret < 0) 2450 goto err_seqadj; 2451 2452 ret = nf_conntrack_proto_init(); 2453 if (ret < 0) 2454 goto err_proto; 2455 2456 conntrack_gc_work_init(&conntrack_gc_work); 2457 queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ); 2458 2459 return 0; 2460 2461 err_proto: 2462 nf_conntrack_seqadj_fini(); 2463 err_seqadj: 2464 nf_conntrack_labels_fini(); 2465 err_labels: 2466 nf_conntrack_helper_fini(); 2467 err_helper: 2468 nf_conntrack_timeout_fini(); 2469 err_timeout: 2470 nf_conntrack_ecache_fini(); 2471 err_ecache: 2472 nf_conntrack_tstamp_fini(); 2473 err_tstamp: 2474 nf_conntrack_acct_fini(); 2475 err_acct: 2476 nf_conntrack_expect_fini(); 2477 err_expect: 2478 kmem_cache_destroy(nf_conntrack_cachep); 2479 err_cachep: 2480 kvfree(nf_conntrack_hash); 2481 return ret; 2482 } 2483 2484 static struct nf_ct_hook nf_conntrack_hook = { 2485 .update = nf_conntrack_update, 2486 .destroy = destroy_conntrack, 2487 .get_tuple_skb = nf_conntrack_get_tuple_skb, 2488 }; 2489 2490 void nf_conntrack_init_end(void) 2491 { 2492 /* For use by REJECT target */ 2493 RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach); 2494 RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook); 2495 } 2496 2497 /* 2498 * We need to use special "null" values, not used in hash table 2499 */ 2500 #define UNCONFIRMED_NULLS_VAL ((1<<30)+0) 2501 #define DYING_NULLS_VAL ((1<<30)+1) 2502 #define TEMPLATE_NULLS_VAL ((1<<30)+2) 2503 2504 int nf_conntrack_init_net(struct net *net) 2505 { 2506 int ret = -ENOMEM; 2507 int cpu; 2508 2509 BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER); 2510 BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS); 2511 atomic_set(&net->ct.count, 0); 2512 2513 net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu); 2514 if (!net->ct.pcpu_lists) 2515 goto err_stat; 2516 2517 for_each_possible_cpu(cpu) { 2518 struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); 2519 2520 spin_lock_init(&pcpu->lock); 2521 INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL); 2522 INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL); 2523 } 2524 2525 net->ct.stat = alloc_percpu(struct ip_conntrack_stat); 2526 if (!net->ct.stat) 2527 goto err_pcpu_lists; 2528 2529 ret = nf_conntrack_expect_pernet_init(net); 2530 if (ret < 0) 2531 goto err_expect; 2532 2533 nf_conntrack_acct_pernet_init(net); 2534 nf_conntrack_tstamp_pernet_init(net); 
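	/* The *_pernet_init() calls in this block only set per-netns
	 * defaults and are not expected to fail, which is why the unwind
	 * path below stops at err_expect.
	 */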
2535 nf_conntrack_ecache_pernet_init(net); 2536 nf_conntrack_helper_pernet_init(net); 2537 nf_conntrack_proto_pernet_init(net); 2538 2539 return 0; 2540 2541 err_expect: 2542 free_percpu(net->ct.stat); 2543 err_pcpu_lists: 2544 free_percpu(net->ct.pcpu_lists); 2545 err_stat: 2546 return ret; 2547 } 2548
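/* Illustrative, userspace-only sketch (not part of this file): runtime
 * resizing goes through nf_conntrack_set_hashsize() above, which on typical
 * builds is hooked up to a module parameter exposed as
 * /sys/module/nf_conntrack/parameters/hashsize (path assumed here).  The
 * write must be done as root in the initial netns, and the kernel rounds
 * the requested bucket count up to whole pages of hlist_nulls_head slots
 * (see nf_ct_alloc_hashtable() above).
 */
#include <stdio.h>

int main(int argc, char *argv[])
{
	const char *path = "/sys/module/nf_conntrack/parameters/hashsize";
	FILE *f;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <buckets>\n", argv[0]);
		return 1;
	}

	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return 1;
	}

	/* The new size takes effect via nf_conntrack_hash_resize(). */
	fprintf(f, "%s\n", argv[1]);

	return fclose(f) ? 1 : 0;
}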