~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/net/ipv4/netfilter/ip_nat_core.c

Version: ~ [ linux-5.10-rc1 ] ~ [ linux-5.9.1 ] ~ [ linux-5.8.16 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.72 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.152 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.202 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.240 ] ~ [ linux-4.8.17 ] ~ [ linux-4.7.10 ] ~ [ linux-4.6.7 ] ~ [ linux-4.5.7 ] ~ [ linux-4.4.240 ] ~ [ linux-4.3.6 ] ~ [ linux-4.2.8 ] ~ [ linux-4.1.52 ] ~ [ linux-4.0.9 ] ~ [ linux-3.19.8 ] ~ [ linux-3.18.140 ] ~ [ linux-3.17.8 ] ~ [ linux-3.16.85 ] ~ [ linux-3.15.10 ] ~ [ linux-3.14.79 ] ~ [ linux-3.13.11 ] ~ [ linux-3.12.74 ] ~ [ linux-3.11.10 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.5 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /* NAT for netfilter; shared with compatibility layer. */
  2 
  3 /* (c) 1999 Paul `Rusty' Russell.  Licenced under the GNU General
  4    Public Licence. */
  5 #include <linux/module.h>
  6 #include <linux/types.h>
  7 #include <linux/timer.h>
  8 #include <linux/skbuff.h>
  9 #include <linux/netfilter_ipv4.h>
 10 #include <linux/vmalloc.h>
 11 #include <net/checksum.h>
 12 #include <net/icmp.h>
 13 #include <net/ip.h>
 14 #include <net/tcp.h>  /* For tcp_prot in getorigdst */
 15 #include <linux/icmp.h>
 16 #include <linux/udp.h>
 17 
 18 #define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
 19 #define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
 20 
 21 #include <linux/netfilter_ipv4/ip_conntrack.h>
 22 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
 23 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
 24 #include <linux/netfilter_ipv4/ip_nat.h>
 25 #include <linux/netfilter_ipv4/ip_nat_protocol.h>
 26 #include <linux/netfilter_ipv4/ip_nat_core.h>
 27 #include <linux/netfilter_ipv4/ip_nat_helper.h>
 28 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
 29 #include <linux/netfilter_ipv4/listhelp.h>
 30 
 31 #if 0
 32 #define DEBUGP printk
 33 #else
 34 #define DEBUGP(format, args...)
 35 #endif
 36 
/* Protects the two NAT hash tables and the protos/helpers lists below. */
DECLARE_RWLOCK(ip_nat_lock);
DECLARE_RWLOCK_EXTERN(ip_conntrack_lock);

/* Calculated at init based on memory size */
static unsigned int ip_nat_htable_size;

/* bysource: conntracks keyed by original source manip (for consistent
   re-mapping); byipsproto: keyed by outgoing src/dst/proto triple (for
   least-used IP selection).  Both sized ip_nat_htable_size. */
static struct list_head *bysource;
static struct list_head *byipsproto;
/* Registered per-protocol NAT handlers and NAT helpers. */
LIST_HEAD(protos);
LIST_HEAD(helpers);

/* Fallback handler used when no protocol-specific NAT is registered. */
extern struct ip_nat_protocol unknown_nat_protocol;
 49 
 50 /* We keep extra hashes for each conntrack, for fast searching. */
 51 static inline size_t
 52 hash_by_ipsproto(u_int32_t src, u_int32_t dst, u_int16_t proto)
 53 {
 54         /* Modified src and dst, to ensure we don't create two
 55            identical streams. */
 56         return (src + dst + proto) % ip_nat_htable_size;
 57 }
 58 
 59 static inline size_t
 60 hash_by_src(const struct ip_conntrack_manip *manip, u_int16_t proto)
 61 {
 62         /* Original src, to ensure we map it consistently if poss. */
 63         return (manip->ip + manip->u.all + proto) % ip_nat_htable_size;
 64 }
 65 
 66 /* Noone using conntrack by the time this called. */
 67 static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
 68 {
 69         struct ip_nat_info *info = &conn->nat.info;
 70         unsigned int hs, hp;
 71 
 72         if (!info->initialized)
 73                 return;
 74 
 75         IP_NF_ASSERT(info->bysource.conntrack);
 76         IP_NF_ASSERT(info->byipsproto.conntrack);
 77 
 78         hs = hash_by_src(&conn->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src,
 79                          conn->tuplehash[IP_CT_DIR_ORIGINAL]
 80                          .tuple.dst.protonum);
 81 
 82         hp = hash_by_ipsproto(conn->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip,
 83                               conn->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip,
 84                               conn->tuplehash[IP_CT_DIR_REPLY]
 85                               .tuple.dst.protonum);
 86 
 87         WRITE_LOCK(&ip_nat_lock);
 88         LIST_DELETE(&bysource[hs], &info->bysource);
 89         LIST_DELETE(&byipsproto[hp], &info->byipsproto);
 90         WRITE_UNLOCK(&ip_nat_lock);
 91 }
 92 
 93 /* We do checksum mangling, so if they were wrong before they're still
 94  * wrong.  Also works for incomplete packets (eg. ICMP dest
 95  * unreachables.) */
 96 u_int16_t
 97 ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
 98 {
 99         u_int32_t diffs[] = { oldvalinv, newval };
100         return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
101                                       oldcheck^0xFFFF));
102 }
103 
104 static inline int cmp_proto(const struct ip_nat_protocol *i, int proto)
105 {
106         return i->protonum == proto;
107 }
108 
109 struct ip_nat_protocol *
110 find_nat_proto(u_int16_t protonum)
111 {
112         struct ip_nat_protocol *i;
113 
114         MUST_BE_READ_LOCKED(&ip_nat_lock);
115         i = LIST_FIND(&protos, cmp_proto, struct ip_nat_protocol *, protonum);
116         if (!i)
117                 i = &unknown_nat_protocol;
118         return i;
119 }
120 
121 /* Is this tuple already taken? (not by us) */
122 int
123 ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
124                   const struct ip_conntrack *ignored_conntrack)
125 {
126         /* Conntrack tracking doesn't keep track of outgoing tuples; only
127            incoming ones.  NAT means they don't have a fixed mapping,
128            so we invert the tuple and look for the incoming reply.
129 
130            We could keep a separate hash if this proves too slow. */
131         struct ip_conntrack_tuple reply;
132 
133         invert_tuplepr(&reply, tuple);
134         return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
135 }
136 
137 /* Does tuple + the source manip come within the range mr */
138 static int
139 in_range(const struct ip_conntrack_tuple *tuple,
140          const struct ip_conntrack_manip *manip,
141          const struct ip_nat_multi_range *mr)
142 {
143         struct ip_nat_protocol *proto = find_nat_proto(tuple->dst.protonum);
144         unsigned int i;
145         struct ip_conntrack_tuple newtuple = { *manip, tuple->dst };
146 
147         for (i = 0; i < mr->rangesize; i++) {
148                 /* If we are allowed to map IPs, then we must be in the
149                    range specified, otherwise we must be unchanged. */
150                 if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
151                         if (ntohl(newtuple.src.ip) < ntohl(mr->range[i].min_ip)
152                             || (ntohl(newtuple.src.ip)
153                                 > ntohl(mr->range[i].max_ip)))
154                                 continue;
155                 } else {
156                         if (newtuple.src.ip != tuple->src.ip)
157                                 continue;
158                 }
159 
160                 if (!(mr->range[i].flags & IP_NAT_RANGE_PROTO_SPECIFIED)
161                     || proto->in_range(&newtuple, IP_NAT_MANIP_SRC,
162                                        &mr->range[i].min, &mr->range[i].max))
163                         return 1;
164         }
165         return 0;
166 }
167 
168 static inline int
169 src_cmp(const struct ip_nat_hash *i,
170         const struct ip_conntrack_tuple *tuple,
171         const struct ip_nat_multi_range *mr)
172 {
173         return (i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
174                 == tuple->dst.protonum
175                 && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
176                 == tuple->src.ip
177                 && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
178                 == tuple->src.u.all
179                 && in_range(tuple,
180                             &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
181                             .tuple.src,
182                             mr));
183 }
184 
185 /* Only called for SRC manip */
186 static struct ip_conntrack_manip *
187 find_appropriate_src(const struct ip_conntrack_tuple *tuple,
188                      const struct ip_nat_multi_range *mr)
189 {
190         unsigned int h = hash_by_src(&tuple->src, tuple->dst.protonum);
191         struct ip_nat_hash *i;
192 
193         MUST_BE_READ_LOCKED(&ip_nat_lock);
194         i = LIST_FIND(&bysource[h], src_cmp, struct ip_nat_hash *, tuple, mr);
195         if (i)
196                 return &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src;
197         else
198                 return NULL;
199 }
200 
#ifdef CONFIG_IP_NF_NAT_LOCAL
/* If it's really a local destination manip, it may need to do a
   source manip too.  Routes to var_ip and writes the route's preferred
   source address into *other_ipp.  Returns 1 on success, 0 if no
   route. */
static int
do_extra_mangle(u_int32_t var_ip, u_int32_t *other_ipp)
{
        struct rtable *rt;
        struct flowi fl = { .nl_u = { .ip4_u = { .daddr = var_ip } } };

        /* FIXME: IPTOS_TOS(iph->tos) --RR */
        if (ip_route_output_key(&rt, &fl)) {
                DEBUGP("do_extra_mangle: Can't get route to %u.%u.%u.%u\n",
                       NIPQUAD(var_ip));
                return 0;
        }

        *other_ipp = rt->rt_src;
        ip_rt_put(rt);
        return 1;
}
#endif
222 
223 /* Simple way to iterate through all. */
224 static inline int fake_cmp(const struct ip_nat_hash *i,
225                            u_int32_t src, u_int32_t dst, u_int16_t protonum,
226                            unsigned int *score,
227                            const struct ip_conntrack *conntrack)
228 {
229         /* Compare backwards: we're dealing with OUTGOING tuples, and
230            inside the conntrack is the REPLY tuple.  Don't count this
231            conntrack. */
232         if (i->conntrack != conntrack
233             && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip == dst
234             && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip == src
235             && (i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum
236                 == protonum))
237                 (*score)++;
238         return 0;
239 }
240 
241 static inline unsigned int
242 count_maps(u_int32_t src, u_int32_t dst, u_int16_t protonum,
243            const struct ip_conntrack *conntrack)
244 {
245         unsigned int score = 0;
246         unsigned int h;
247 
248         MUST_BE_READ_LOCKED(&ip_nat_lock);
249         h = hash_by_ipsproto(src, dst, protonum);
250         LIST_FIND(&byipsproto[h], fake_cmp, struct ip_nat_hash *,
251                   src, dst, protonum, &score, conntrack);
252 
253         return score;
254 }
255 
/* For [FUTURE] fragmentation handling, we want the least-used
   src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
   if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
   1-65535, we don't do pro-rata allocation based on ports; we choose
   the ip with the lowest src-ip/dst-ip/proto usage.

   If an allocation then fails (eg. all 6 ports used in the 1.2.3.4
   range), we eliminate that and try again.  This is not the most
   efficient approach, but if you're worried about that, don't hand us
   ranges you don't really have.  */
static struct ip_nat_range *
find_best_ips_proto(struct ip_conntrack_tuple *tuple,
                    const struct ip_nat_multi_range *mr,
                    const struct ip_conntrack *conntrack,
                    unsigned int hooknum)
{
        unsigned int i;
        /* Best candidate so far: range it came from, its usage score,
           and the fully-built tuple. */
        struct {
                const struct ip_nat_range *range;
                unsigned int score;
                struct ip_conntrack_tuple tuple;
        } best = { NULL,  0xFFFFFFFF };
        u_int32_t *var_ipp, *other_ipp, saved_ip, orig_dstip;
        /* Rotates the starting IP between calls so we don't always pick
           the first address of the range. */
        static unsigned int randomness;

        /* var_ipp: the IP we're allowed to vary (src for SNAT, dst for
           DNAT); other_ipp/saved_ip: the opposite IP, which
           do_extra_mangle() may clobber and we must restore. */
        if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) {
                var_ipp = &tuple->src.ip;
                saved_ip = tuple->dst.ip;
                other_ipp = &tuple->dst.ip;
        } else {
                var_ipp = &tuple->dst.ip;
                saved_ip = tuple->src.ip;
                other_ipp = &tuple->src.ip;
        }
        /* Don't do do_extra_mangle unless necessary (overrides
           explicit socket bindings, for example) */
        orig_dstip = tuple->dst.ip;

        IP_NF_ASSERT(mr->rangesize >= 1);
        for (i = 0; i < mr->rangesize; i++) {
                /* Host order */
                u_int32_t minip, maxip, j;

                /* Don't do ranges which are already eliminated. */
                if (mr->range[i].flags & IP_NAT_RANGE_FULL) {
                        continue;
                }

                if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
                        minip = ntohl(mr->range[i].min_ip);
                        maxip = ntohl(mr->range[i].max_ip);
                } else
                        /* No IP mapping: the only candidate is the
                           current IP, unchanged. */
                        minip = maxip = ntohl(*var_ipp);

                randomness++;
                /* Try every IP in [minip, maxip], starting at a
                   rotating offset. */
                for (j = 0; j < maxip - minip + 1; j++) {
                        unsigned int score;

                        *var_ipp = htonl(minip + (randomness + j)
                                         % (maxip - minip + 1));

                        /* Reset the other ip in case it was mangled by
                         * do_extra_mangle last time. */
                        *other_ipp = saved_ip;

#ifdef CONFIG_IP_NF_NAT_LOCAL
                        if (hooknum == NF_IP_LOCAL_OUT
                            && *var_ipp != orig_dstip
                            && !do_extra_mangle(*var_ipp, other_ipp)) {
                                DEBUGP("Range %u %u.%u.%u.%u rt failed!\n",
                                       i, NIPQUAD(*var_ipp));
                                /* Can't route?  This whole range part is
                                 * probably screwed, but keep trying
                                 * anyway. */
                                continue;
                        }
#endif

                        /* Count how many others map onto this. */
                        score = count_maps(tuple->src.ip, tuple->dst.ip,
                                           tuple->dst.protonum, conntrack);
                        if (score < best.score) {
                                /* Optimization: doesn't get any better than
                                   this. */
                                if (score == 0)
                                        return (struct ip_nat_range *)
                                                &mr->range[i];

                                best.score = score;
                                best.tuple = *tuple;
                                best.range = &mr->range[i];
                        }
                }
        }
        /* NOTE(review): if every range was FULL/unroutable, best.tuple
           was never written and this copies indeterminate bytes; the
           NULL return below makes callers discard the tuple — confirm
           all callers check for NULL before using *tuple. */
        *tuple = best.tuple;

        /* Discard const. */
        return (struct ip_nat_range *)best.range;
}
355 
356 /* Fast version doesn't iterate through hash chains, but only handles
357    common case of single IP address (null NAT, masquerade) */
358 static struct ip_nat_range *
359 find_best_ips_proto_fast(struct ip_conntrack_tuple *tuple,
360                          const struct ip_nat_multi_range *mr,
361                          const struct ip_conntrack *conntrack,
362                          unsigned int hooknum)
363 {
364         if (mr->rangesize != 1
365             || (mr->range[0].flags & IP_NAT_RANGE_FULL)
366             || ((mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
367                 && mr->range[0].min_ip != mr->range[0].max_ip))
368                 return find_best_ips_proto(tuple, mr, conntrack, hooknum);
369 
370         if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
371                 if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC)
372                         tuple->src.ip = mr->range[0].min_ip;
373                 else {
374                         /* Only do extra mangle when required (breaks
375                            socket binding) */
376 #ifdef CONFIG_IP_NF_NAT_LOCAL
377                         if (tuple->dst.ip != mr->range[0].min_ip
378                             && hooknum == NF_IP_LOCAL_OUT
379                             && !do_extra_mangle(mr->range[0].min_ip,
380                                                 &tuple->src.ip))
381                                 return NULL;
382 #endif
383                         tuple->dst.ip = mr->range[0].min_ip;
384                 }
385         }
386 
387         /* Discard const. */
388         return (struct ip_nat_range *)&mr->range[0];
389 }
390 
/* Build a unique NAT'd tuple for this conntrack within the given
   multi-range.  Returns 1 with *tuple filled in on success, 0 if the
   whole range is exhausted.  Caller holds ip_nat_lock for writing. */
static int
get_unique_tuple(struct ip_conntrack_tuple *tuple,
                 const struct ip_conntrack_tuple *orig_tuple,
                 const struct ip_nat_multi_range *mrr,
                 struct ip_conntrack *conntrack,
                 unsigned int hooknum)
{
        struct ip_nat_protocol *proto
                = find_nat_proto(orig_tuple->dst.protonum);
        struct ip_nat_range *rptr;
        unsigned int i;
        int ret;

        /* We temporarily use flags for marking full parts, but we
           always clean up afterwards */
        /* NOTE: deliberately casts away const — IP_NAT_RANGE_FULL bits
           are set during the search and cleared at clear_fulls. */
        struct ip_nat_multi_range *mr = (void *)mrr;

        /* 1) If this srcip/proto/src-proto-part is currently mapped,
           and that same mapping gives a unique tuple within the given
           range, use that.

           This is only required for source (ie. NAT/masq) mappings.
           So far, we don't do local source mappings, so multiple
           manips not an issue.  */
        if (hooknum == NF_IP_POST_ROUTING) {
                struct ip_conntrack_manip *manip;

                manip = find_appropriate_src(orig_tuple, mr);
                if (manip) {
                        /* Apply same source manipulation. */
                        *tuple = ((struct ip_conntrack_tuple)
                                  { *manip, orig_tuple->dst });
                        DEBUGP("get_unique_tuple: Found current src map\n");
                        if (!ip_nat_used_tuple(tuple, conntrack))
                                return 1;
                }
        }

        /* 2) Select the least-used IP/proto combination in the given
           range.
        */
        *tuple = *orig_tuple;
        while ((rptr = find_best_ips_proto_fast(tuple, mr, conntrack, hooknum))
               != NULL) {
                DEBUGP("Found best for "); DUMP_TUPLE(tuple);
                /* 3) The per-protocol part of the manip is made to
                   map into the range to make a unique tuple. */

                /* Only bother mapping if it's not already in range
                   and unique */
                if ((!(rptr->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
                     || proto->in_range(tuple, HOOK2MANIP(hooknum),
                                        &rptr->min, &rptr->max))
                    && !ip_nat_used_tuple(tuple, conntrack)) {
                        ret = 1;
                        goto clear_fulls;
                } else {
                        if (proto->unique_tuple(tuple, rptr,
                                                HOOK2MANIP(hooknum),
                                                conntrack)) {
                                /* Must be unique. */
                                IP_NF_ASSERT(!ip_nat_used_tuple(tuple,
                                                                conntrack));
                                ret = 1;
                                goto clear_fulls;
                        } else if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST) {
                                /* Try implicit source NAT; protocol
                                   may be able to play with ports to
                                   make it unique. */
                                /* One-IP range pinned to the current
                                   source address, no proto range. */
                                struct ip_nat_range r
                                        = { IP_NAT_RANGE_MAP_IPS,
                                            tuple->src.ip, tuple->src.ip,
                                            { 0 }, { 0 } };
                                DEBUGP("Trying implicit mapping\n");
                                if (proto->unique_tuple(tuple, &r,
                                                        IP_NAT_MANIP_SRC,
                                                        conntrack)) {
                                        /* Must be unique. */
                                        IP_NF_ASSERT(!ip_nat_used_tuple
                                                     (tuple, conntrack));
                                        ret = 1;
                                        goto clear_fulls;
                                }
                        }
                        DEBUGP("Protocol can't get unique tuple %u.\n",
                               hooknum);
                }

                /* Eliminate that from range, and try again. */
                rptr->flags |= IP_NAT_RANGE_FULL;
                *tuple = *orig_tuple;
        }

        ret = 0;

 clear_fulls:
        /* Clear full flags. */
        IP_NF_ASSERT(mr->rangesize >= 1);
        for (i = 0; i < mr->rangesize; i++)
                mr->range[i].flags &= ~IP_NAT_RANGE_FULL;

        return ret;
}
494 
/* listhelp comparator: does `tuple` match this helper's tuple on the
   bits selected by the helper's mask? */
static inline int
helper_cmp(const struct ip_nat_helper *helper,
           const struct ip_conntrack_tuple *tuple)
{
        return ip_ct_tuple_mask_cmp(tuple, &helper->tuple, &helper->mask);
}
501 
/* Where to manip the reply packets (will be reverse manip). */
/* Indexed by the hook a manip was installed on; gives the hook where
   the reverse manip applies to reply-direction traffic.  LOCAL_IN/OUT
   pairs exist only with local NAT support compiled in. */
static unsigned int opposite_hook[NF_IP_NUMHOOKS]
= { [NF_IP_PRE_ROUTING] = NF_IP_POST_ROUTING,
    [NF_IP_POST_ROUTING] = NF_IP_PRE_ROUTING,
#ifdef CONFIG_IP_NF_NAT_LOCAL
    [NF_IP_LOCAL_OUT] = NF_IP_LOCAL_IN,
    [NF_IP_LOCAL_IN] = NF_IP_LOCAL_OUT,
#endif
};
511 
/* Set up NAT on this conntrack for the given hook/range: pick a unique
   tuple, update the conntrack's reply tuple, record the needed manips,
   bind a helper, and (re)insert into the NAT hashes.  Returns
   NF_ACCEPT, or NF_DROP if no unique tuple can be found.  Caller holds
   ip_nat_lock for writing. */
unsigned int
ip_nat_setup_info(struct ip_conntrack *conntrack,
                  const struct ip_nat_multi_range *mr,
                  unsigned int hooknum)
{
        struct ip_conntrack_tuple new_tuple, inv_tuple, reply;
        struct ip_conntrack_tuple orig_tp;
        struct ip_nat_info *info = &conntrack->nat.info;
        /* Nonzero if a previous manip already hashed this conntrack. */
        int in_hashes = info->initialized;

        MUST_BE_WRITE_LOCKED(&ip_nat_lock);
        IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
                     || hooknum == NF_IP_POST_ROUTING
                     || hooknum == NF_IP_LOCAL_OUT);
        IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
        IP_NF_ASSERT(!(info->initialized & (1 << HOOK2MANIP(hooknum))));

        /* What we've got will look like inverse of reply. Normally
           this is what is in the conntrack, except for prior
           manipulations (future optimization: if num_manips == 0,
           orig_tp =
           conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
        invert_tuplepr(&orig_tp,
                       &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);

#if 0
        {
        unsigned int i;

        DEBUGP("Hook %u (%s), ", hooknum,
               HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST");
        DUMP_TUPLE(&orig_tp);
        DEBUGP("Range %p: ", mr);
        for (i = 0; i < mr->rangesize; i++) {
                DEBUGP("%u:%s%s%s %u.%u.%u.%u - %u.%u.%u.%u %u - %u\n",
                       i,
                       (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS)
                       ? " MAP_IPS" : "",
                       (mr->range[i].flags
                        & IP_NAT_RANGE_PROTO_SPECIFIED)
                       ? " PROTO_SPECIFIED" : "",
                       (mr->range[i].flags & IP_NAT_RANGE_FULL)
                       ? " FULL" : "",
                       NIPQUAD(mr->range[i].min_ip),
                       NIPQUAD(mr->range[i].max_ip),
                       mr->range[i].min.all,
                       mr->range[i].max.all);
        }
        }
#endif

        /* Retry until the conntrack core accepts our reply tuple:
           another CPU may grab the tuple between selection and
           ip_conntrack_alter_reply(). */
        do {
                if (!get_unique_tuple(&new_tuple, &orig_tp, mr, conntrack,
                                      hooknum)) {
                        DEBUGP("ip_nat_setup_info: Can't get unique for %p.\n",
                               conntrack);
                        return NF_DROP;
                }

#if 0
                DEBUGP("Hook %u (%s) %p\n", hooknum,
                       HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST",
                       conntrack);
                DEBUGP("Original: ");
                DUMP_TUPLE(&orig_tp);
                DEBUGP("New: ");
                DUMP_TUPLE(&new_tuple);
#endif

                /* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT):
                   the original (A/B/C/D') and the mangled one (E/F/G/H').

                   We're only allowed to work with the SRC per-proto
                   part, so we create inverses of both to start, then
                   derive the other fields we need.  */

                /* Reply connection: simply invert the new tuple
                   (G/H/E/F') */
                invert_tuplepr(&reply, &new_tuple);

                /* Alter conntrack table so it recognizes replies.
                   If fail this race (reply tuple now used), repeat. */
        } while (!ip_conntrack_alter_reply(conntrack, &reply));

        /* FIXME: We can simply used existing conntrack reply tuple
           here --RR */
        /* Create inverse of original: C/D/A/B' */
        invert_tuplepr(&inv_tuple, &orig_tp);

        /* Has source changed?. */
        if (!ip_ct_tuple_src_equal(&new_tuple, &orig_tp)) {
                /* In this direction, a source manip. */
                info->manips[info->num_manips++] =
                        ((struct ip_nat_info_manip)
                         { IP_CT_DIR_ORIGINAL, hooknum,
                           IP_NAT_MANIP_SRC, new_tuple.src });

                IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);

                /* In the reverse direction, a destination manip. */
                info->manips[info->num_manips++] =
                        ((struct ip_nat_info_manip)
                         { IP_CT_DIR_REPLY, opposite_hook[hooknum],
                           IP_NAT_MANIP_DST, orig_tp.src });
                IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
        }

        /* Has destination changed? */
        if (!ip_ct_tuple_dst_equal(&new_tuple, &orig_tp)) {
                /* In this direction, a destination manip */
                info->manips[info->num_manips++] =
                        ((struct ip_nat_info_manip)
                         { IP_CT_DIR_ORIGINAL, hooknum,
                           IP_NAT_MANIP_DST, reply.src });

                IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);

                /* In the reverse direction, a source manip. */
                info->manips[info->num_manips++] =
                        ((struct ip_nat_info_manip)
                         { IP_CT_DIR_REPLY, opposite_hook[hooknum],
                           IP_NAT_MANIP_SRC, inv_tuple.src });
                IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
        }

        /* If there's a helper, assign it; based on new tuple. */
        if (!conntrack->master)
                info->helper = LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *,
                                         &reply);

        /* It's done. */
        info->initialized |= (1 << HOOK2MANIP(hooknum));

        /* First manip inserts into the hashes; a second manip on the
           same conntrack re-keys the existing entries. */
        if (in_hashes) {
                IP_NF_ASSERT(info->bysource.conntrack);
                replace_in_hashes(conntrack, info);
        } else {
                place_in_hashes(conntrack, info);
        }

        return NF_ACCEPT;
}
654 
/* Re-key an already-hashed conntrack after its tuples changed: unlink
   from the old buckets and prepend into the buckets matching the new
   tuples.  Caller holds ip_nat_lock for writing. */
void replace_in_hashes(struct ip_conntrack *conntrack,
                       struct ip_nat_info *info)
{
        /* Source has changed, so replace in hashes. */
        unsigned int srchash
                = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
                              .tuple.src,
                              conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
                              .tuple.dst.protonum);
        /* We place packet as seen OUTGOUNG in byips_proto hash
           (ie. reverse dst and src of reply packet. */
        unsigned int ipsprotohash
                = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.dst.ip,
                                   conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.src.ip,
                                   conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.dst.protonum);

        IP_NF_ASSERT(info->bysource.conntrack == conntrack);
        MUST_BE_WRITE_LOCKED(&ip_nat_lock);

        list_del(&info->bysource.list);
        list_del(&info->byipsproto.list);

        list_prepend(&bysource[srchash], &info->bysource);
        list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
}
683 
/* First-time insertion of a conntrack into both NAT hash tables; also
   sets the back-pointers used by the comparators and by cleanup.
   Caller holds ip_nat_lock for writing. */
void place_in_hashes(struct ip_conntrack *conntrack,
                     struct ip_nat_info *info)
{
        unsigned int srchash
                = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
                              .tuple.src,
                              conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
                              .tuple.dst.protonum);
        /* We place packet as seen OUTGOUNG in byips_proto hash
           (ie. reverse dst and src of reply packet. */
        unsigned int ipsprotohash
                = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.dst.ip,
                                   conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.src.ip,
                                   conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.dst.protonum);

        /* Must not already be hashed. */
        IP_NF_ASSERT(!info->bysource.conntrack);

        MUST_BE_WRITE_LOCKED(&ip_nat_lock);
        info->byipsproto.conntrack = conntrack;
        info->bysource.conntrack = conntrack;

        list_prepend(&bysource[srchash], &info->bysource);
        list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
}
711 
712 /* Returns true if succeeded. */
713 static int
714 manip_pkt(u_int16_t proto,
715           struct sk_buff **pskb,
716           unsigned int iphdroff,
717           const struct ip_conntrack_manip *manip,
718           enum ip_nat_manip_type maniptype)
719 {
720         struct iphdr *iph;
721 
722         (*pskb)->nfcache |= NFC_ALTERED;
723         if (!skb_ip_make_writable(pskb, iphdroff+sizeof(iph)))
724                 return 0;
725 
726         iph = (void *)(*pskb)->data + iphdroff;
727 
728         /* Manipulate protcol part. */
729         if (!find_nat_proto(proto)->manip_pkt(pskb,
730                                               iphdroff + iph->ihl*4,
731                                               manip, maniptype))
732                 return 0;
733 
734         iph = (void *)(*pskb)->data + iphdroff;
735 
736         if (maniptype == IP_NAT_MANIP_SRC) {
737                 iph->check = ip_nat_cheat_check(~iph->saddr, manip->ip,
738                                                 iph->check);
739                 iph->saddr = manip->ip;
740         } else {
741                 iph->check = ip_nat_cheat_check(~iph->daddr, manip->ip,
742                                                 iph->check);
743                 iph->daddr = manip->ip;
744         }
745         return 1;
746 }
747 
748 static inline int exp_for_packet(struct ip_conntrack_expect *exp,
749                                  struct sk_buff *skb)
750 {
751         struct ip_conntrack_protocol *proto;
752         int ret = 1;
753 
754         MUST_BE_READ_LOCKED(&ip_conntrack_lock);
755         proto = __ip_ct_find_proto(skb->nh.iph->protocol);
756         if (proto->exp_matches_pkt)
757                 ret = proto->exp_matches_pkt(exp, skb);
758 
759         return ret;
760 }
761 
/* Do packet manipulations according to binding: apply every stored NAT
   manip matching this packet's direction and hook, then give the NAT
   helper (if any) a chance to mangle the payload, and finally fix up
   TCP sequence numbers once per packet.  Returns an NF_* verdict. */
unsigned int
do_bindings(struct ip_conntrack *ct,
            enum ip_conntrack_info ctinfo,
            struct ip_nat_info *info,
            unsigned int hooknum,
            struct sk_buff **pskb)
{
        unsigned int i;
        struct ip_nat_helper *helper;
        enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
        /* Cache the protocol before any mangling touches the header. */
        int proto = (*pskb)->nh.iph->protocol;

        /* Need nat lock to protect against modification, but neither
           conntrack (referenced) and helper (deleted with
           synchronize_bh()) can vanish. */
        READ_LOCK(&ip_nat_lock);
        for (i = 0; i < info->num_manips; i++) {
                if (info->manips[i].direction == dir
                    && info->manips[i].hooknum == hooknum) {
                        DEBUGP("Mangling %p: %s to %u.%u.%u.%u %u\n",
                               *pskb,
                               info->manips[i].maniptype == IP_NAT_MANIP_SRC
                               ? "SRC" : "DST",
                               NIPQUAD(info->manips[i].manip.ip),
                               htons(info->manips[i].manip.u.all));
                        if (!manip_pkt(proto, pskb, 0,
                                       &info->manips[i].manip,
                                       info->manips[i].maniptype)) {
                                READ_UNLOCK(&ip_nat_lock);
                                return NF_DROP;
                        }
                }
        }
        /* Snapshot the helper pointer under the lock; per the comment
           above it cannot vanish after we drop it. */
        helper = info->helper;
        READ_UNLOCK(&ip_nat_lock);

        if (helper) {
                struct ip_conntrack_expect *exp = NULL;
                struct list_head *cur_item;
                int ret = NF_ACCEPT;
                int helper_called = 0;

                DEBUGP("do_bindings: helper existing for (%p)\n", ct);

                /* Always defragged for helpers */
                IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
                               & htons(IP_MF|IP_OFFSET)));

                /* Have to grab read lock before sibling_list traversal */
                READ_LOCK(&ip_conntrack_lock);
                list_for_each(cur_item, &ct->sibling_list) { 
                        exp = list_entry(cur_item, struct ip_conntrack_expect, 
                                         expected_list);
                                         
                        /* if this expectation is already established, skip */
                        if (exp->sibling)
                                continue;

                        if (exp_for_packet(exp, *pskb)) {
                                /* FIXME: May be true multiple times in the
                                 * case of UDP!! */
                                DEBUGP("calling nat helper (exp=%p) for packet\n", exp);
                                ret = helper->help(ct, exp, info, ctinfo, 
                                                   hooknum, pskb);
                                if (ret != NF_ACCEPT) {
                                        READ_UNLOCK(&ip_conntrack_lock);
                                        return ret;
                                }
                                helper_called = 1;
                        }
                }
                /* Helper might want to manip the packet even when there is no
                 * matching expectation for this packet */
                if (!helper_called && helper->flags & IP_NAT_HELPER_F_ALWAYS) {
                        DEBUGP("calling nat helper for packet without expectation\n");
                        ret = helper->help(ct, NULL, info, ctinfo, 
                                           hooknum, pskb);
                        if (ret != NF_ACCEPT) {
                                READ_UNLOCK(&ip_conntrack_lock);
                                return ret;
                        }
                }
                READ_UNLOCK(&ip_conntrack_lock);
                
                /* Adjust sequence number only once per packet 
                 * (helper is called at all hooks) */
                if (proto == IPPROTO_TCP
                    && (hooknum == NF_IP_POST_ROUTING
                        || hooknum == NF_IP_LOCAL_IN)) {
                        DEBUGP("ip_nat_core: adjusting sequence number\n");
                        /* future: put this in a l4-proto specific function,
                         * and call this function here. */
                        if (!ip_nat_seq_adjust(pskb, ct, ctinfo))
                                ret = NF_DROP;
                }

                return ret;

        } else 
                return NF_ACCEPT;

        /* not reached */
}
866 
/* Translate an ICMP error packet that relates to a NATed connection.
   The embedded (inner) packet is mangled with the manip type INVERTED
   (it was never src/dst reversed), then the outer IP header is NATed
   like a normal reply.  Regenerates the ICMP checksum at the end.
   Returns 1 on success, 0 to have the caller drop the packet. */
int
icmp_reply_translation(struct sk_buff **pskb,
                       struct ip_conntrack *conntrack,
                       unsigned int hooknum,
                       int dir)
{
        /* Overlay of the ICMP header plus the embedded IP header that
           directly follows it in the ICMP error payload. */
        struct {
                struct icmphdr icmp;
                struct iphdr ip;
        } *inside;
        unsigned int i;
        struct ip_nat_info *info = &conntrack->nat.info;
        int hdrlen;

        if (!skb_ip_make_writable(pskb,(*pskb)->nh.iph->ihl*4+sizeof(*inside)))
                return 0;
        inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;

        /* We're actually going to mangle it beyond trivial checksum
           adjustment, so make sure the current checksum is correct. */
        if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) {
                hdrlen = (*pskb)->nh.iph->ihl * 4;
                if ((u16)csum_fold(skb_checksum(*pskb, hdrlen,
                                                (*pskb)->len - hdrlen, 0)))
                        return 0;
        }

        /* Must be RELATED */
        /* NOTE(review): this pointer difference recovers the ctinfo
           index from nfct — presumably nfct points into an array inside
           the master conntrack; verify against the conntrack layout. */
        IP_NF_ASSERT((*pskb)->nfct
                     - (struct ip_conntrack *)(*pskb)->nfct->master
                     == IP_CT_RELATED
                     || (*pskb)->nfct
                     - (struct ip_conntrack *)(*pskb)->nfct->master
                     == IP_CT_RELATED+IP_CT_IS_REPLY);

        /* Redirects on non-null nats must be dropped, else they'll
           start talking to each other without our translation, and be
           confused... --RR */
        if (inside->icmp.type == ICMP_REDIRECT) {
                /* Don't care about races here. */
                if (info->initialized
                    != ((1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST))
                    || info->num_manips != 0)
                        return 0;
        }

        DEBUGP("icmp_reply_translation: translating error %p hook %u dir %s\n",
               *pskb, hooknum, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
        /* Note: May not be from a NAT'd host, but probably safest to
           do translation always as if it came from the host itself
           (even though a "host unreachable" coming from the host
           itself is a bit weird).

           More explanation: some people use NAT for anonymizing.
           Also, CERT recommends dropping all packets from private IP
           addresses (although ICMP errors from internal links with
           such addresses are not too uncommon, as Alan Cox points
           out) */

        READ_LOCK(&ip_nat_lock);
        for (i = 0; i < info->num_manips; i++) {
                DEBUGP("icmp_reply: manip %u dir %s hook %u\n",
                       i, info->manips[i].direction == IP_CT_DIR_ORIGINAL ?
                       "ORIG" : "REPLY", info->manips[i].hooknum);

                if (info->manips[i].direction != dir)
                        continue;

                /* Mapping the inner packet is just like a normal
                   packet, except it was never src/dst reversed, so
                   where we would normally apply a dst manip, we apply
                   a src, and vice versa. */
                if (info->manips[i].hooknum == hooknum) {
                        DEBUGP("icmp_reply: inner %s -> %u.%u.%u.%u %u\n",
                               info->manips[i].maniptype == IP_NAT_MANIP_SRC
                               ? "DST" : "SRC",
                               NIPQUAD(info->manips[i].manip.ip),
                               ntohs(info->manips[i].manip.u.udp.port));
                        /* "!maniptype" flips SRC<->DST for the embedded
                           packet, as explained above. */
                        if (!manip_pkt(inside->ip.protocol, pskb,
                                       (*pskb)->nh.iph->ihl*4
                                       + sizeof(inside->icmp),
                                       &info->manips[i].manip,
                                       !info->manips[i].maniptype))
                                goto unlock_fail;

                        /* Outer packet needs to have IP header NATed like
                           it's a reply. */

                        /* Use mapping to map outer packet: 0 give no
                           per-proto mapping */
                        DEBUGP("icmp_reply: outer %s -> %u.%u.%u.%u\n",
                               info->manips[i].maniptype == IP_NAT_MANIP_SRC
                               ? "SRC" : "DST",
                               NIPQUAD(info->manips[i].manip.ip));
                        if (!manip_pkt(0, pskb, 0,
                                       &info->manips[i].manip,
                                       info->manips[i].maniptype))
                                goto unlock_fail;
                }
        }
        READ_UNLOCK(&ip_nat_lock);

        hdrlen = (*pskb)->nh.iph->ihl * 4;

        /* Reload: manip_pkt() may have reallocated the skb data. */
        inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;

        /* Recompute the ICMP checksum over the mangled payload
           (zero the field first, per the usual checksum recipe). */
        inside->icmp.checksum = 0;
        inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen,
                                                       (*pskb)->len - hdrlen,
                                                       0));
        return 1;

 unlock_fail:
        READ_UNLOCK(&ip_nat_lock);
        return 0;
}
983 
/* Module init: allocate both NAT hash tables, register the built-in
   protocol handlers and install the conntrack-destruction callback.
   Returns 0 on success, -ENOMEM on allocation failure. */
int __init ip_nat_init(void)
{
        size_t i;

        /* Leave them the same for the moment. */
        ip_nat_htable_size = ip_conntrack_htable_size;

        /* One vmalloc for both hash tables */
        bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size*2);
        if (!bysource) {
                return -ENOMEM;
        }
        /* byipsproto lives in the second half of the same allocation;
           only bysource is ever vfree'd. */
        byipsproto = bysource + ip_nat_htable_size;

        /* Sew in builtin protocols. */
        WRITE_LOCK(&ip_nat_lock);
        list_append(&protos, &ip_nat_protocol_tcp);
        list_append(&protos, &ip_nat_protocol_udp);
        list_append(&protos, &ip_nat_protocol_icmp);
        WRITE_UNLOCK(&ip_nat_lock);

        /* Initialise every bucket to an empty list before any lookup
           can run (hooks are not registered yet at this point). */
        for (i = 0; i < ip_nat_htable_size; i++) {
                INIT_LIST_HEAD(&bysource[i]);
                INIT_LIST_HEAD(&byipsproto[i]);
        }

        /* FIXME: Man, this is a hack.  <SIGH> */
        IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
        ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;

        return 0;
}
1016 
/* Clear NAT section of all conntracks, in case we're loaded again. */
static int clean_nat(const struct ip_conntrack *i, void *data)
{
        /* Cast away const: the iterator hands us a const pointer, but
           wiping the nat area is the whole point of this callback. */
        memset((void *)&i->nat, 0, sizeof(i->nat));
        /* Returning 0 — presumably "no match, keep the conntrack" for
           ip_ct_selective_cleanup; verify against its definition. */
        return 0;
}
1023 
/* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
void ip_nat_cleanup(void)
{
        /* Wipe per-conntrack NAT state so a later reload starts clean. */
        ip_ct_selective_cleanup(&clean_nat, NULL);
        /* Unhook the destruction callback installed by ip_nat_init(). */
        ip_conntrack_destroyed = NULL;
        /* Single vfree covers both tables: byipsproto points into the
           same allocation as bysource (see ip_nat_init). */
        vfree(bysource);
}
1031 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | Wiki (Japanese) | Wiki (English) | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

osdn.jp