~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/net/netfilter/ipvs/ip_vs_mh.c

Version: ~ [ linux-6.1-rc7 ] ~ [ linux-6.0.10 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.80 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.156 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.225 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.267 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.300 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.334 ] ~ [ linux-4.8.17 ] ~ [ linux-4.7.10 ] ~ [ linux-4.6.7 ] ~ [ linux-4.5.7 ] ~ [ linux-4.4.302 ] ~ [ linux-4.3.6 ] ~ [ linux-4.2.8 ] ~ [ linux-4.1.52 ] ~ [ linux-4.0.9 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.9 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 // SPDX-License-Identifier: GPL-2.0
  2 /* IPVS:        Maglev Hashing scheduling module
  3  *
  4  * Authors:     Inju Song <inju.song@navercorp.com>
  5  *
  6  */
  7 
/* The mh algorithm is to assign a preference list of all the lookup
 * table positions to each destination and populate the table with
 * the most-preferred position of destinations. Then it is to select
 * destination with the hash key of source IP address through looking
 * up the lookup table.
 *
 * The algorithm is detailed in:
 * [3.4 Consistent Hashing]
 * https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf
 *
 */
 19 
 20 #define KMSG_COMPONENT "IPVS"
 21 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
 22 
 23 #include <linux/ip.h>
 24 #include <linux/slab.h>
 25 #include <linux/module.h>
 26 #include <linux/kernel.h>
 27 #include <linux/skbuff.h>
 28 
 29 #include <net/ip_vs.h>
 30 
 31 #include <linux/siphash.h>
 32 #include <linux/bitops.h>
 33 #include <linux/gcd.h>
 34 
 35 #define IP_VS_SVC_F_SCHED_MH_FALLBACK   IP_VS_SVC_F_SCHED1 /* MH fallback */
 36 #define IP_VS_SVC_F_SCHED_MH_PORT       IP_VS_SVC_F_SCHED2 /* MH use port */
 37 
 38 struct ip_vs_mh_lookup {
 39         struct ip_vs_dest __rcu *dest;  /* real server (cache) */
 40 };
 41 
 42 struct ip_vs_mh_dest_setup {
 43         unsigned int    offset; /* starting offset */
 44         unsigned int    skip;   /* skip */
 45         unsigned int    perm;   /* next_offset */
 46         int             turns;  /* weight / gcd() and rshift */
 47 };
 48 
 49 /* Available prime numbers for MH table */
 50 static int primes[] = {251, 509, 1021, 2039, 4093,
 51                        8191, 16381, 32749, 65521, 131071};
 52 
 53 /* For IPVS MH entry hash table */
 54 #ifndef CONFIG_IP_VS_MH_TAB_INDEX
 55 #define CONFIG_IP_VS_MH_TAB_INDEX       12
 56 #endif
 57 #define IP_VS_MH_TAB_BITS               (CONFIG_IP_VS_MH_TAB_INDEX / 2)
 58 #define IP_VS_MH_TAB_INDEX              (CONFIG_IP_VS_MH_TAB_INDEX - 8)
 59 #define IP_VS_MH_TAB_SIZE               primes[IP_VS_MH_TAB_INDEX]
 60 
 61 struct ip_vs_mh_state {
 62         struct rcu_head                 rcu_head;
 63         struct ip_vs_mh_lookup          *lookup;
 64         struct ip_vs_mh_dest_setup      *dest_setup;
 65         hsiphash_key_t                  hash1, hash2;
 66         int                             gcd;
 67         int                             rshift;
 68 };
 69 
 70 static inline void generate_hash_secret(hsiphash_key_t *hash1,
 71                                         hsiphash_key_t *hash2)
 72 {
 73         hash1->key[0] = 2654435761UL;
 74         hash1->key[1] = 2654435761UL;
 75 
 76         hash2->key[0] = 2654446892UL;
 77         hash2->key[1] = 2654446892UL;
 78 }
 79 
 80 /* Helper function to determine if server is unavailable */
 81 static inline bool is_unavailable(struct ip_vs_dest *dest)
 82 {
 83         return atomic_read(&dest->weight) <= 0 ||
 84                dest->flags & IP_VS_DEST_F_OVERLOAD;
 85 }
 86 
 87 /* Returns hash value for IPVS MH entry */
 88 static inline unsigned int
 89 ip_vs_mh_hashkey(int af, const union nf_inet_addr *addr,
 90                  __be16 port, hsiphash_key_t *key, unsigned int offset)
 91 {
 92         unsigned int v;
 93         __be32 addr_fold = addr->ip;
 94 
 95 #ifdef CONFIG_IP_VS_IPV6
 96         if (af == AF_INET6)
 97                 addr_fold = addr->ip6[0] ^ addr->ip6[1] ^
 98                             addr->ip6[2] ^ addr->ip6[3];
 99 #endif
100         v = (offset + ntohs(port) + ntohl(addr_fold));
101         return hsiphash(&v, sizeof(v), key);
102 }
103 
104 /* Reset all the hash buckets of the specified table. */
105 static void ip_vs_mh_reset(struct ip_vs_mh_state *s)
106 {
107         int i;
108         struct ip_vs_mh_lookup *l;
109         struct ip_vs_dest *dest;
110 
111         l = &s->lookup[0];
112         for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) {
113                 dest = rcu_dereference_protected(l->dest, 1);
114                 if (dest) {
115                         ip_vs_dest_put(dest);
116                         RCU_INIT_POINTER(l->dest, NULL);
117                 }
118                 l++;
119         }
120 }
121 
122 static int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
123                               struct ip_vs_service *svc)
124 {
125         struct list_head *p;
126         struct ip_vs_mh_dest_setup *ds;
127         struct ip_vs_dest *dest;
128         int lw;
129 
130         /* If gcd is smaller then 1, number of dests or
131          * all last_weight of dests are zero. So, skip
132          * permutation for the dests.
133          */
134         if (s->gcd < 1)
135                 return 0;
136 
137         /* Set dest_setup for the dests permutation */
138         p = &svc->destinations;
139         ds = &s->dest_setup[0];
140         while ((p = p->next) != &svc->destinations) {
141                 dest = list_entry(p, struct ip_vs_dest, n_list);
142 
143                 ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr,
144                                               dest->port, &s->hash1, 0) %
145                                               IP_VS_MH_TAB_SIZE;
146                 ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr,
147                                             dest->port, &s->hash2, 0) %
148                                             (IP_VS_MH_TAB_SIZE - 1) + 1;
149                 ds->perm = ds->offset;
150 
151                 lw = atomic_read(&dest->last_weight);
152                 ds->turns = ((lw / s->gcd) >> s->rshift) ? : (lw != 0);
153                 ds++;
154         }
155 
156         return 0;
157 }
158 
159 static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
160                              struct ip_vs_service *svc)
161 {
162         int n, c, dt_count;
163         unsigned long *table;
164         struct list_head *p;
165         struct ip_vs_mh_dest_setup *ds;
166         struct ip_vs_dest *dest, *new_dest;
167 
168         /* If gcd is smaller then 1, number of dests or
169          * all last_weight of dests are zero. So, skip
170          * the population for the dests and reset lookup table.
171          */
172         if (s->gcd < 1) {
173                 ip_vs_mh_reset(s);
174                 return 0;
175         }
176 
177         table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE),
178                         sizeof(unsigned long), GFP_KERNEL);
179         if (!table)
180                 return -ENOMEM;
181 
182         p = &svc->destinations;
183         n = 0;
184         dt_count = 0;
185         while (n < IP_VS_MH_TAB_SIZE) {
186                 if (p == &svc->destinations)
187                         p = p->next;
188 
189                 ds = &s->dest_setup[0];
190                 while (p != &svc->destinations) {
191                         /* Ignore added server with zero weight */
192                         if (ds->turns < 1) {
193                                 p = p->next;
194                                 ds++;
195                                 continue;
196                         }
197 
198                         c = ds->perm;
199                         while (test_bit(c, table)) {
200                                 /* Add skip, mod IP_VS_MH_TAB_SIZE */
201                                 ds->perm += ds->skip;
202                                 if (ds->perm >= IP_VS_MH_TAB_SIZE)
203                                         ds->perm -= IP_VS_MH_TAB_SIZE;
204                                 c = ds->perm;
205                         }
206 
207                         __set_bit(c, table);
208 
209                         dest = rcu_dereference_protected(s->lookup[c].dest, 1);
210                         new_dest = list_entry(p, struct ip_vs_dest, n_list);
211                         if (dest != new_dest) {
212                                 if (dest)
213                                         ip_vs_dest_put(dest);
214                                 ip_vs_dest_hold(new_dest);
215                                 RCU_INIT_POINTER(s->lookup[c].dest, new_dest);
216                         }
217 
218                         if (++n == IP_VS_MH_TAB_SIZE)
219                                 goto out;
220 
221                         if (++dt_count >= ds->turns) {
222                                 dt_count = 0;
223                                 p = p->next;
224                                 ds++;
225                         }
226                 }
227         }
228 
229 out:
230         kfree(table);
231         return 0;
232 }
233 
234 /* Get ip_vs_dest associated with supplied parameters. */
235 static inline struct ip_vs_dest *
236 ip_vs_mh_get(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
237              const union nf_inet_addr *addr, __be16 port)
238 {
239         unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0)
240                                              % IP_VS_MH_TAB_SIZE;
241         struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest);
242 
243         return (!dest || is_unavailable(dest)) ? NULL : dest;
244 }
245 
246 /* As ip_vs_mh_get, but with fallback if selected server is unavailable */
247 static inline struct ip_vs_dest *
248 ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
249                       const union nf_inet_addr *addr, __be16 port)
250 {
251         unsigned int offset, roffset;
252         unsigned int hash, ihash;
253         struct ip_vs_dest *dest;
254 
255         /* First try the dest it's supposed to go to */
256         ihash = ip_vs_mh_hashkey(svc->af, addr, port,
257                                  &s->hash1, 0) % IP_VS_MH_TAB_SIZE;
258         dest = rcu_dereference(s->lookup[ihash].dest);
259         if (!dest)
260                 return NULL;
261         if (!is_unavailable(dest))
262                 return dest;
263 
264         IP_VS_DBG_BUF(6, "MH: selected unavailable server %s:%u, reselecting",
265                       IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));
266 
267         /* If the original dest is unavailable, loop around the table
268          * starting from ihash to find a new dest
269          */
270         for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) {
271                 roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE;
272                 hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1,
273                                         roffset) % IP_VS_MH_TAB_SIZE;
274                 dest = rcu_dereference(s->lookup[hash].dest);
275                 if (!dest)
276                         break;
277                 if (!is_unavailable(dest))
278                         return dest;
279                 IP_VS_DBG_BUF(6,
280                               "MH: selected unavailable server %s:%u (offset %u), reselecting",
281                               IP_VS_DBG_ADDR(dest->af, &dest->addr),
282                               ntohs(dest->port), roffset);
283         }
284 
285         return NULL;
286 }
287 
288 /* Assign all the hash buckets of the specified table with the service. */
289 static int ip_vs_mh_reassign(struct ip_vs_mh_state *s,
290                              struct ip_vs_service *svc)
291 {
292         int ret;
293 
294         if (svc->num_dests > IP_VS_MH_TAB_SIZE)
295                 return -EINVAL;
296 
297         if (svc->num_dests >= 1) {
298                 s->dest_setup = kcalloc(svc->num_dests,
299                                         sizeof(struct ip_vs_mh_dest_setup),
300                                         GFP_KERNEL);
301                 if (!s->dest_setup)
302                         return -ENOMEM;
303         }
304 
305         ip_vs_mh_permutate(s, svc);
306 
307         ret = ip_vs_mh_populate(s, svc);
308         if (ret < 0)
309                 goto out;
310 
311         IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%u\n",
312                       IP_VS_DBG_ADDR(svc->af, &svc->addr),
313                       ntohs(svc->port));
314 
315 out:
316         if (svc->num_dests >= 1) {
317                 kfree(s->dest_setup);
318                 s->dest_setup = NULL;
319         }
320         return ret;
321 }
322 
323 static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc)
324 {
325         struct ip_vs_dest *dest;
326         int weight;
327         int g = 0;
328 
329         list_for_each_entry(dest, &svc->destinations, n_list) {
330                 weight = atomic_read(&dest->last_weight);
331                 if (weight > 0) {
332                         if (g > 0)
333                                 g = gcd(weight, g);
334                         else
335                                 g = weight;
336                 }
337         }
338         return g;
339 }
340 
341 /* To avoid assigning huge weight for the MH table,
342  * calculate shift value with gcd.
343  */
344 static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd)
345 {
346         struct ip_vs_dest *dest;
347         int new_weight, weight = 0;
348         int mw, shift;
349 
350         /* If gcd is smaller then 1, number of dests or
351          * all last_weight of dests are zero. So, return
352          * shift value as zero.
353          */
354         if (gcd < 1)
355                 return 0;
356 
357         list_for_each_entry(dest, &svc->destinations, n_list) {
358                 new_weight = atomic_read(&dest->last_weight);
359                 if (new_weight > weight)
360                         weight = new_weight;
361         }
362 
363         /* Because gcd is greater than zero,
364          * the maximum weight and gcd are always greater than zero
365          */
366         mw = weight / gcd;
367 
368         /* shift = occupied bits of weight/gcd - MH highest bits */
369         shift = fls(mw) - IP_VS_MH_TAB_BITS;
370         return (shift >= 0) ? shift : 0;
371 }
372 
373 static void ip_vs_mh_state_free(struct rcu_head *head)
374 {
375         struct ip_vs_mh_state *s;
376 
377         s = container_of(head, struct ip_vs_mh_state, rcu_head);
378         kfree(s->lookup);
379         kfree(s);
380 }
381 
382 static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
383 {
384         int ret;
385         struct ip_vs_mh_state *s;
386 
387         /* Allocate the MH table for this service */
388         s = kzalloc(sizeof(*s), GFP_KERNEL);
389         if (!s)
390                 return -ENOMEM;
391 
392         s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup),
393                             GFP_KERNEL);
394         if (!s->lookup) {
395                 kfree(s);
396                 return -ENOMEM;
397         }
398 
399         generate_hash_secret(&s->hash1, &s->hash2);
400         s->gcd = ip_vs_mh_gcd_weight(svc);
401         s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
402 
403         IP_VS_DBG(6,
404                   "MH lookup table (memory=%zdbytes) allocated for current service\n",
405                   sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
406 
407         /* Assign the lookup table with current dests */
408         ret = ip_vs_mh_reassign(s, svc);
409         if (ret < 0) {
410                 ip_vs_mh_reset(s);
411                 ip_vs_mh_state_free(&s->rcu_head);
412                 return ret;
413         }
414 
415         /* No more failures, attach state */
416         svc->sched_data = s;
417         return 0;
418 }
419 
420 static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
421 {
422         struct ip_vs_mh_state *s = svc->sched_data;
423 
424         /* Got to clean up lookup entry here */
425         ip_vs_mh_reset(s);
426 
427         call_rcu(&s->rcu_head, ip_vs_mh_state_free);
428         IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n",
429                   sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
430 }
431 
432 static int ip_vs_mh_dest_changed(struct ip_vs_service *svc,
433                                  struct ip_vs_dest *dest)
434 {
435         struct ip_vs_mh_state *s = svc->sched_data;
436 
437         s->gcd = ip_vs_mh_gcd_weight(svc);
438         s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
439 
440         /* Assign the lookup table with the updated service */
441         return ip_vs_mh_reassign(s, svc);
442 }
443 
444 /* Helper function to get port number */
445 static inline __be16
446 ip_vs_mh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
447 {
448         __be16 _ports[2], *ports;
449 
450         /* At this point we know that we have a valid packet of some kind.
451          * Because ICMP packets are only guaranteed to have the first 8
452          * bytes, let's just grab the ports.  Fortunately they're in the
453          * same position for all three of the protocols we care about.
454          */
455         switch (iph->protocol) {
456         case IPPROTO_TCP:
457         case IPPROTO_UDP:
458         case IPPROTO_SCTP:
459                 ports = skb_header_pointer(skb, iph->len, sizeof(_ports),
460                                            &_ports);
461                 if (unlikely(!ports))
462                         return 0;
463 
464                 if (likely(!ip_vs_iph_inverse(iph)))
465                         return ports[0];
466                 else
467                         return ports[1];
468         default:
469                 return 0;
470         }
471 }
472 
473 /* Maglev Hashing scheduling */
474 static struct ip_vs_dest *
475 ip_vs_mh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
476                   struct ip_vs_iphdr *iph)
477 {
478         struct ip_vs_dest *dest;
479         struct ip_vs_mh_state *s;
480         __be16 port = 0;
481         const union nf_inet_addr *hash_addr;
482 
483         hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr;
484 
485         IP_VS_DBG(6, "%s : Scheduling...\n", __func__);
486 
487         if (svc->flags & IP_VS_SVC_F_SCHED_MH_PORT)
488                 port = ip_vs_mh_get_port(skb, iph);
489 
490         s = (struct ip_vs_mh_state *)svc->sched_data;
491 
492         if (svc->flags & IP_VS_SVC_F_SCHED_MH_FALLBACK)
493                 dest = ip_vs_mh_get_fallback(svc, s, hash_addr, port);
494         else
495                 dest = ip_vs_mh_get(svc, s, hash_addr, port);
496 
497         if (!dest) {
498                 ip_vs_scheduler_err(svc, "no destination available");
499                 return NULL;
500         }
501 
502         IP_VS_DBG_BUF(6, "MH: source IP address %s:%u --> server %s:%u\n",
503                       IP_VS_DBG_ADDR(svc->af, hash_addr),
504                       ntohs(port),
505                       IP_VS_DBG_ADDR(dest->af, &dest->addr),
506                       ntohs(dest->port));
507 
508         return dest;
509 }
510 
511 /* IPVS MH Scheduler structure */
512 static struct ip_vs_scheduler ip_vs_mh_scheduler = {
513         .name =                 "mh",
514         .refcnt =               ATOMIC_INIT(0),
515         .module =               THIS_MODULE,
516         .n_list  =              LIST_HEAD_INIT(ip_vs_mh_scheduler.n_list),
517         .init_service =         ip_vs_mh_init_svc,
518         .done_service =         ip_vs_mh_done_svc,
519         .add_dest =             ip_vs_mh_dest_changed,
520         .del_dest =             ip_vs_mh_dest_changed,
521         .upd_dest =             ip_vs_mh_dest_changed,
522         .schedule =             ip_vs_mh_schedule,
523 };
524 
525 static int __init ip_vs_mh_init(void)
526 {
527         return register_ip_vs_scheduler(&ip_vs_mh_scheduler);
528 }
529 
530 static void __exit ip_vs_mh_cleanup(void)
531 {
532         unregister_ip_vs_scheduler(&ip_vs_mh_scheduler);
533         rcu_barrier();
534 }
535 
536 module_init(ip_vs_mh_init);
537 module_exit(ip_vs_mh_cleanup);
538 MODULE_DESCRIPTION("Maglev hashing ipvs scheduler");
539 MODULE_LICENSE("GPL v2");
540 MODULE_AUTHOR("Inju Song <inju.song@navercorp.com>");
541 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | Wiki (Japanese) | Wiki (English) | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

osdn.jp