~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/net/netfilter/nft_set_pipapo_avx2.c

Version: ~ [ linux-5.8 ] ~ [ linux-5.7.14 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.57 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.138 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.193 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.232 ] ~ [ linux-4.8.17 ] ~ [ linux-4.7.10 ] ~ [ linux-4.6.7 ] ~ [ linux-4.5.7 ] ~ [ linux-4.4.232 ] ~ [ linux-4.3.6 ] ~ [ linux-4.2.8 ] ~ [ linux-4.1.52 ] ~ [ linux-4.0.9 ] ~ [ linux-3.19.8 ] ~ [ linux-3.18.140 ] ~ [ linux-3.17.8 ] ~ [ linux-3.16.85 ] ~ [ linux-3.15.10 ] ~ [ linux-3.14.79 ] ~ [ linux-3.13.11 ] ~ [ linux-3.12.74 ] ~ [ linux-3.11.10 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.5 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 // SPDX-License-Identifier: GPL-2.0-only
  2 
  3 /* PIPAPO: PIle PAcket POlicies: AVX2 packet lookup routines
  4  *
  5  * Copyright (c) 2019-2020 Red Hat GmbH
  6  *
  7  * Author: Stefano Brivio <sbrivio@redhat.com>
  8  */
  9 
 10 #include <linux/kernel.h>
 11 #include <linux/init.h>
 12 #include <linux/module.h>
 13 #include <linux/netlink.h>
 14 #include <linux/netfilter.h>
 15 #include <linux/netfilter/nf_tables.h>
 16 #include <net/netfilter/nf_tables_core.h>
 17 #include <uapi/linux/netfilter/nf_tables.h>
 18 #include <linux/bitmap.h>
 19 #include <linux/bitops.h>
 20 
 21 #include <linux/compiler.h>
 22 #include <asm/fpu/api.h>
 23 
 24 #include "nft_set_pipapo_avx2.h"
 25 #include "nft_set_pipapo.h"
 26 
 27 #define NFT_PIPAPO_LONGS_PER_M256       (XSAVE_YMM_SIZE / BITS_PER_LONG)
 28 
 29 /* Load from memory into YMM register with non-temporal hint ("stream load"),
 30  * that is, don't fetch lines from memory into the cache. This avoids pushing
 31  * precious packet data out of the cache hierarchy, and is appropriate when:
 32  *
 33  * - loading buckets from lookup tables, as they are not going to be used
 34  *   again before packets are entirely classified
 35  *
 36  * - loading the result bitmap from the previous field, as it's never used
 37  *   again
 38  */
 39 #define NFT_PIPAPO_AVX2_LOAD(reg, loc)                                  \
 40         asm volatile("vmovntdqa %0, %%ymm" #reg : : "m" (loc))
 41 
 42 /* Stream a single lookup table bucket into YMM register given lookup table,
 43  * group index, value of packet bits, bucket size.
 44  */
 45 #define NFT_PIPAPO_AVX2_BUCKET_LOAD4(reg, lt, group, v, bsize)          \
 46         NFT_PIPAPO_AVX2_LOAD(reg,                                       \
 47                              lt[((group) * NFT_PIPAPO_BUCKETS(4) +      \
 48                                  (v)) * (bsize)])
 49 #define NFT_PIPAPO_AVX2_BUCKET_LOAD8(reg, lt, group, v, bsize)          \
 50         NFT_PIPAPO_AVX2_LOAD(reg,                                       \
 51                              lt[((group) * NFT_PIPAPO_BUCKETS(8) +      \
 52                                  (v)) * (bsize)])
 53 
 54 /* Bitwise AND: the staple operation of this algorithm */
 55 #define NFT_PIPAPO_AVX2_AND(dst, a, b)                                  \
 56         asm volatile("vpand %ymm" #a ", %ymm" #b ", %ymm" #dst)
 57 
 58 /* Jump to label if @reg is zero */
 59 #define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label)                        \
 60         asm_volatile_goto("vptest %%ymm" #reg ", %%ymm" #reg ";"        \
 61                           "je %l[" #label "]" : : : : label)
 62 
 63 /* Store 256 bits from YMM register into memory. Contrary to bucket load
 64  * operation, we don't bypass the cache here, as stored matching results
 65  * are always used shortly after.
 66  */
 67 #define NFT_PIPAPO_AVX2_STORE(loc, reg)                                 \
 68         asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (loc))
 69 
 70 /* Zero out a complete YMM register, @reg */
 71 #define NFT_PIPAPO_AVX2_ZERO(reg)                                       \
 72         asm volatile("vpxor %ymm" #reg ", %ymm" #reg ", %ymm" #reg)
 73 
 74 /* Current working bitmap index, toggled between field matches */
 75 static DEFINE_PER_CPU(bool, nft_pipapo_avx2_scratch_index);
 76 
 77 /**
 78  * nft_pipapo_avx2_prepare() - Prepare before main algorithm body
 79  *
 80  * This zeroes out ymm15, which is later used whenever we need to clear a
 81  * memory location, by storing its content into memory.
 82  */
 83 static void nft_pipapo_avx2_prepare(void)
 84 {
 85         NFT_PIPAPO_AVX2_ZERO(15);
 86 }
 87 
 88 /**
 89  * nft_pipapo_avx2_fill() - Fill a bitmap region with ones
 90  * @data:       Base memory area
 91  * @start:      First bit to set
 92  * @len:        Count of bits to fill
 93  *
 94  * This is nothing else than a version of bitmap_set(), as used e.g. by
 95  * pipapo_refill(), tailored for the microarchitectures using it and better
 96  * suited for the specific usage: it's very likely that we'll set a small number
 97  * of bits, not crossing a word boundary, and correct branch prediction is
 98  * critical here.
 99  *
100  * This function doesn't actually use any AVX2 instruction.
101  */
102 static void nft_pipapo_avx2_fill(unsigned long *data, int start, int len)
103 {
104         int offset = start % BITS_PER_LONG;
105         unsigned long mask;
106 
107         data += start / BITS_PER_LONG;
108 
109         if (likely(len == 1)) {
110                 *data |= BIT(offset);
111                 return;
112         }
113 
114         if (likely(len < BITS_PER_LONG || offset)) {
115                 if (likely(len + offset <= BITS_PER_LONG)) {
116                         *data |= GENMASK(len - 1 + offset, offset);
117                         return;
118                 }
119 
120                 *data |= ~0UL << offset;
121                 len -= BITS_PER_LONG - offset;
122                 data++;
123 
124                 if (len <= BITS_PER_LONG) {
125                         mask = ~0UL >> (BITS_PER_LONG - len);
126                         *data |= mask;
127                         return;
128                 }
129         }
130 
131         memset(data, 0xff, len / BITS_PER_BYTE);
132         data += len / BITS_PER_LONG;
133 
134         len %= BITS_PER_LONG;
135         if (len)
136                 *data |= ~0UL >> (BITS_PER_LONG - len);
137 }
138 
139 /**
140  * nft_pipapo_avx2_refill() - Scan bitmap, select mapping table item, set bits
141  * @offset:     Start from given bitmap (equivalent to bucket) offset, in longs
142  * @map:        Bitmap to be scanned for set bits
143  * @dst:        Destination bitmap
144  * @mt:         Mapping table containing bit set specifiers
145  * @len:        Length of bitmap in longs
146  * @last:       Return index of first set bit, if this is the last field
147  *
148  * This is an alternative implementation of pipapo_refill() suitable for usage
149  * with AVX2 lookup routines: we know there are four words to be scanned, at
150  * a given offset inside the map, for each matching iteration.
151  *
152  * This function doesn't actually use any AVX2 instruction.
153  *
154  * Return: first set bit index if @last, index of first filled word otherwise.
155  */
156 static int nft_pipapo_avx2_refill(int offset, unsigned long *map,
157                                   unsigned long *dst,
158                                   union nft_pipapo_map_bucket *mt, bool last)
159 {
160         int ret = -1;
161 
162 #define NFT_PIPAPO_AVX2_REFILL_ONE_WORD(x)                              \
163         do {                                                            \
164                 while (map[(x)]) {                                      \
165                         int r = __builtin_ctzl(map[(x)]);               \
166                         int i = (offset + (x)) * BITS_PER_LONG + r;     \
167                                                                         \
168                         if (last)                                       \
169                                 return i;                               \
170                                                                         \
171                         nft_pipapo_avx2_fill(dst, mt[i].to, mt[i].n);   \
172                                                                         \
173                         if (ret == -1)                                  \
174                                 ret = mt[i].to;                         \
175                                                                         \
176                         map[(x)] &= ~(1UL << r);                        \
177                 }                                                       \
178         } while (0)
179 
180         NFT_PIPAPO_AVX2_REFILL_ONE_WORD(0);
181         NFT_PIPAPO_AVX2_REFILL_ONE_WORD(1);
182         NFT_PIPAPO_AVX2_REFILL_ONE_WORD(2);
183         NFT_PIPAPO_AVX2_REFILL_ONE_WORD(3);
184 #undef NFT_PIPAPO_AVX2_REFILL_ONE_WORD
185 
186         return ret;
187 }
188 
189 /**
190  * nft_pipapo_avx2_lookup_4b_2() - AVX2-based lookup for 2 four-bit groups
191  * @map:        Previous match result, used as initial bitmap
192  * @fill:       Destination bitmap to be filled with current match result
193  * @f:          Field, containing lookup and mapping tables
194  * @offset:     Ignore buckets before the given index, no bits are filled there
195  * @pkt:        Packet data, pointer to input nftables register
196  * @first:      If this is the first field, don't source previous result
197  * @last:       Last field: stop at the first match and return bit index
198  *
199  * Load buckets from lookup table corresponding to the values of each 4-bit
200  * group of packet bytes, and perform a bitwise intersection between them. If
201  * this is the first field in the set, simply AND the buckets together
202  * (equivalent to using an all-ones starting bitmap), use the provided starting
203  * bitmap otherwise. Then call nft_pipapo_avx2_refill() to generate the next
204  * working bitmap, @fill.
205  *
206  * This is used for 8-bit fields (i.e. protocol numbers).
207  *
208  * Out-of-order (and superscalar) execution is vital here, so it's critical to
209  * avoid false data dependencies. CPU and compiler could (mostly) take care of
210  * this on their own, but the operation ordering is explicitly given here with
211  * a likely execution order in mind, to highlight possible stalls. That's why
212  * a number of logically distinct operations (i.e. loading buckets, intersecting
213  * buckets) are interleaved.
214  *
215  * Return: -1 on no match, rule index of match if @last, otherwise first long
216  * word index to be checked next (i.e. first filled word).
217  */
218 static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill,
219                                        struct nft_pipapo_field *f, int offset,
220                                        const u8 *pkt, bool first, bool last)
221 {
222         int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
223         u8 pg[2] = { pkt[0] >> 4, pkt[0] & 0xf };
224         unsigned long *lt = f->lt, bsize = f->bsize;
225 
226         lt += offset * NFT_PIPAPO_LONGS_PER_M256;
227         for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
228                 int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
229 
230                 if (first) {
231                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
232                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
233                         NFT_PIPAPO_AVX2_AND(4, 0, 1);
234                 } else {
235                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
236                         NFT_PIPAPO_AVX2_LOAD(2, map[i_ul]);
237                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
238                         NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nothing);
239                         NFT_PIPAPO_AVX2_AND(3, 0, 1);
240                         NFT_PIPAPO_AVX2_AND(4, 2, 3);
241                 }
242 
243                 NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
244                 NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
245 
246                 b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
247                 if (last)
248                         return b;
249 
250                 if (unlikely(ret == -1))
251                         ret = b / XSAVE_YMM_SIZE;
252 
253                 continue;
254 nomatch:
255                 NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
256 nothing:
257                 ;
258         }
259 
260         return ret;
261 }
262 
263 /**
264  * nft_pipapo_avx2_lookup_4b_4() - AVX2-based lookup for 4 four-bit groups
265  * @map:        Previous match result, used as initial bitmap
266  * @fill:       Destination bitmap to be filled with current match result
267  * @f:          Field, containing lookup and mapping tables
268  * @offset:     Ignore buckets before the given index, no bits are filled there
269  * @pkt:        Packet data, pointer to input nftables register
270  * @first:      If this is the first field, don't source previous result
271  * @last:       Last field: stop at the first match and return bit index
272  *
273  * See nft_pipapo_avx2_lookup_4b_2().
274  *
275  * This is used for 16-bit fields (i.e. ports).
276  *
277  * Return: -1 on no match, rule index of match if @last, otherwise first long
278  * word index to be checked next (i.e. first filled word).
279  */
280 static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill,
281                                        struct nft_pipapo_field *f, int offset,
282                                        const u8 *pkt, bool first, bool last)
283 {
284         int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
285         u8 pg[4] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf };
286         unsigned long *lt = f->lt, bsize = f->bsize;
287 
288         lt += offset * NFT_PIPAPO_LONGS_PER_M256;
289         for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
290                 int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
291 
292                 if (first) {
293                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
294                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
295                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize);
296                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize);
297                         NFT_PIPAPO_AVX2_AND(4, 0, 1);
298                         NFT_PIPAPO_AVX2_AND(5, 2, 3);
299                         NFT_PIPAPO_AVX2_AND(7, 4, 5);
300                 } else {
301                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
302 
303                         NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
304 
305                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
306                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
307                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
308                         NFT_PIPAPO_AVX2_AND(5, 0, 1);
309 
310                         NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
311 
312                         NFT_PIPAPO_AVX2_AND(6, 2, 3);
313                         NFT_PIPAPO_AVX2_AND(7, 4, 5);
314                         /* Stall */
315                         NFT_PIPAPO_AVX2_AND(7, 6, 7);
316                 }
317 
318                 /* Stall */
319                 NFT_PIPAPO_AVX2_NOMATCH_GOTO(7, nomatch);
320                 NFT_PIPAPO_AVX2_STORE(map[i_ul], 7);
321 
322                 b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
323                 if (last)
324                         return b;
325 
326                 if (unlikely(ret == -1))
327                         ret = b / XSAVE_YMM_SIZE;
328 
329                 continue;
330 nomatch:
331                 NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
332 nothing:
333                 ;
334         }
335 
336         return ret;
337 }
338 
339 /**
340  * nft_pipapo_avx2_lookup_4b_8() - AVX2-based lookup for 8 four-bit groups
341  * @map:        Previous match result, used as initial bitmap
342  * @fill:       Destination bitmap to be filled with current match result
343  * @f:          Field, containing lookup and mapping tables
344  * @offset:     Ignore buckets before the given index, no bits are filled there
345  * @pkt:        Packet data, pointer to input nftables register
346  * @first:      If this is the first field, don't source previous result
347  * @last:       Last field: stop at the first match and return bit index
348  *
349  * See nft_pipapo_avx2_lookup_4b_2().
350  *
351  * This is used for 32-bit fields (i.e. IPv4 addresses).
352  *
353  * Return: -1 on no match, rule index of match if @last, otherwise first long
354  * word index to be checked next (i.e. first filled word).
355  */
356 static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill,
357                                        struct nft_pipapo_field *f, int offset,
358                                        const u8 *pkt, bool first, bool last)
359 {
360         u8 pg[8] = {  pkt[0] >> 4,  pkt[0] & 0xf,  pkt[1] >> 4,  pkt[1] & 0xf,
361                       pkt[2] >> 4,  pkt[2] & 0xf,  pkt[3] >> 4,  pkt[3] & 0xf,
362                    };
363         int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
364         unsigned long *lt = f->lt, bsize = f->bsize;
365 
366         lt += offset * NFT_PIPAPO_LONGS_PER_M256;
367         for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
368                 int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
369 
370                 if (first) {
371                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(0,  lt, 0, pg[0], bsize);
372                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(1,  lt, 1, pg[1], bsize);
373                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(2,  lt, 2, pg[2], bsize);
374                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt, 3, pg[3], bsize);
375                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(4,  lt, 4, pg[4], bsize);
376                         NFT_PIPAPO_AVX2_AND(5,   0,  1);
377                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(6,  lt, 5, pg[5], bsize);
378                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(7,  lt, 6, pg[6], bsize);
379                         NFT_PIPAPO_AVX2_AND(8,   2,  3);
380                         NFT_PIPAPO_AVX2_AND(9,   4,  5);
381                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize);
382                         NFT_PIPAPO_AVX2_AND(11,  6,  7);
383                         NFT_PIPAPO_AVX2_AND(12,  8,  9);
384                         NFT_PIPAPO_AVX2_AND(13, 10, 11);
385 
386                         /* Stall */
387                         NFT_PIPAPO_AVX2_AND(1,  12, 13);
388                 } else {
389                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(0,  lt, 0, pg[0], bsize);
390                         NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
391                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(2,  lt, 1, pg[1], bsize);
392                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt, 2, pg[2], bsize);
393                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(4,  lt, 3, pg[3], bsize);
394 
395                         NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
396 
397                         NFT_PIPAPO_AVX2_AND(5,   0,  1);
398                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(6,  lt, 4, pg[4], bsize);
399                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(7,  lt, 5, pg[5], bsize);
400                         NFT_PIPAPO_AVX2_AND(8,   2,  3);
401                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(9,  lt, 6, pg[6], bsize);
402                         NFT_PIPAPO_AVX2_AND(10,  4,  5);
403                         NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize);
404                         NFT_PIPAPO_AVX2_AND(12,  6,  7);
405                         NFT_PIPAPO_AVX2_AND(13,  8,  9);
406                         NFT_PIPAPO_AVX2_AND(14, 10, 11);
407 
408                         /* Stall */
409                         NFT_PIPAPO_AVX2_AND(1,  12, 13);
410                         NFT_PIPAPO_AVX2_AND(1,   1, 14);
411                 }
412 
413                 NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nomatch);
414                 NFT_PIPAPO_AVX2_STORE(map[i_ul], 1);
415 
416                 b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
417                 if (last)
418                         return b;
419 
420                 if (unlikely(ret == -1))
421                         ret = b / XSAVE_YMM_SIZE;
422 
423                 continue;
424 
425 nomatch:
426                 NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
427 nothing:
428                 ;
429         }
430 
431         return ret;
432 }
433 
434 /**
435  * nft_pipapo_avx2_lookup_4b_12() - AVX2-based lookup for 12 four-bit groups
436  * @map:        Previous match result, used as initial bitmap
437  * @fill:       Destination bitmap to be filled with current match result
438  * @f:          Field, containing lookup and mapping tables
439  * @offset:     Ignore buckets before the given index, no bits are filled there
440  * @pkt:        Packet data, pointer to input nftables register
441  * @first:      If this is the first field, don't source previous result
442  * @last:       Last field: stop at the first match and return bit index
443  *
444  * See nft_pipapo_avx2_lookup_4b_2().
445  *
446  * This is used for 48-bit fields (i.e. MAC addresses/EUI-48).
447  *
448  * Return: -1 on no match, rule index of match if @last, otherwise first long
449  * word index to be checked next (i.e. first filled word).
450  */
451 static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill,
452                                         struct nft_pipapo_field *f, int offset,
453                                         const u8 *pkt, bool first, bool last)
454 {
455         u8 pg[12] = {  pkt[0] >> 4,  pkt[0] & 0xf,  pkt[1] >> 4,  pkt[1] & 0xf,
456                        pkt[2] >> 4,  pkt[2] & 0xf,  pkt[3] >> 4,  pkt[3] & 0xf,
457                        pkt[4] >> 4,  pkt[4] & 0xf,  pkt[5] >> 4,  pkt[5] & 0xf,
458                     };
459         int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
460         unsigned long *lt = f->lt, bsize = f->bsize;
461 
462         lt += offset * NFT_PIPAPO_LONGS_PER_M256;
463         for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
464                 int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
465 
466                 if (!first)
467                         NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
468 
469                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(1,  lt,  0,  pg[0], bsize);
470                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(2,  lt,  1,  pg[1], bsize);
471                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt,  2,  pg[2], bsize);
472 
473                 if (!first) {
474                         NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
475                         NFT_PIPAPO_AVX2_AND(1, 1, 0);
476                 }
477 
478                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(4,  lt,  3,  pg[3], bsize);
479                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(5,  lt,  4,  pg[4], bsize);
480                 NFT_PIPAPO_AVX2_AND(6,   2,  3);
481                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(7,  lt,  5,  pg[5], bsize);
482                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(8,  lt,  6,  pg[6], bsize);
483                 NFT_PIPAPO_AVX2_AND(9,   1,  4);
484                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt,  7,  pg[7], bsize);
485                 NFT_PIPAPO_AVX2_AND(11,  5,  6);
486                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt,  8,  pg[8], bsize);
487                 NFT_PIPAPO_AVX2_AND(13,  7,  8);
488                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt,  9,  pg[9], bsize);
489 
490                 NFT_PIPAPO_AVX2_AND(0,   9, 10);
491                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(1,  lt, 10,  pg[10], bsize);
492                 NFT_PIPAPO_AVX2_AND(2,  11, 12);
493                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt, 11,  pg[11], bsize);
494                 NFT_PIPAPO_AVX2_AND(4,  13, 14);
495                 NFT_PIPAPO_AVX2_AND(5,   0,  1);
496 
497                 NFT_PIPAPO_AVX2_AND(6,   2,  3);
498 
499                 /* Stalls */
500                 NFT_PIPAPO_AVX2_AND(7,   4,  5);
501                 NFT_PIPAPO_AVX2_AND(8,   6,  7);
502 
503                 NFT_PIPAPO_AVX2_NOMATCH_GOTO(8, nomatch);
504                 NFT_PIPAPO_AVX2_STORE(map[i_ul], 8);
505 
506                 b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
507                 if (last)
508                         return b;
509 
510                 if (unlikely(ret == -1))
511                         ret = b / XSAVE_YMM_SIZE;
512 
513                 continue;
514 nomatch:
515                 NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
516 nothing:
517                 ;
518         }
519 
520         return ret;
521 }
522 
523 /**
524  * nft_pipapo_avx2_lookup_4b_32() - AVX2-based lookup for 32 four-bit groups
525  * @map:        Previous match result, used as initial bitmap
526  * @fill:       Destination bitmap to be filled with current match result
527  * @f:          Field, containing lookup and mapping tables
528  * @offset:     Ignore buckets before the given index, no bits are filled there
529  * @pkt:        Packet data, pointer to input nftables register
530  * @first:      If this is the first field, don't source previous result
531  * @last:       Last field: stop at the first match and return bit index
532  *
533  * See nft_pipapo_avx2_lookup_4b_2().
534  *
535  * This is used for 128-bit fields (i.e. IPv6 addresses).
536  *
537  * Return: -1 on no match, rule index of match if @last, otherwise first long
538  * word index to be checked next (i.e. first filled word).
539  */
540 static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill,
541                                         struct nft_pipapo_field *f, int offset,
542                                         const u8 *pkt, bool first, bool last)
543 {
544         u8 pg[32] = {  pkt[0] >> 4,  pkt[0] & 0xf,  pkt[1] >> 4,  pkt[1] & 0xf,
545                        pkt[2] >> 4,  pkt[2] & 0xf,  pkt[3] >> 4,  pkt[3] & 0xf,
546                        pkt[4] >> 4,  pkt[4] & 0xf,  pkt[5] >> 4,  pkt[5] & 0xf,
547                        pkt[6] >> 4,  pkt[6] & 0xf,  pkt[7] >> 4,  pkt[7] & 0xf,
548                        pkt[8] >> 4,  pkt[8] & 0xf,  pkt[9] >> 4,  pkt[9] & 0xf,
549                       pkt[10] >> 4, pkt[10] & 0xf, pkt[11] >> 4, pkt[11] & 0xf,
550                       pkt[12] >> 4, pkt[12] & 0xf, pkt[13] >> 4, pkt[13] & 0xf,
551                       pkt[14] >> 4, pkt[14] & 0xf, pkt[15] >> 4, pkt[15] & 0xf,
552                     };
553         int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
554         unsigned long *lt = f->lt, bsize = f->bsize;
555 
556         lt += offset * NFT_PIPAPO_LONGS_PER_M256;
557         for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
558                 int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
559 
560                 if (!first)
561                         NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
562 
563                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(1,  lt,  0,  pg[0], bsize);
564                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(2,  lt,  1,  pg[1], bsize);
565                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt,  2,  pg[2], bsize);
566                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(4,  lt,  3,  pg[3], bsize);
567                 if (!first) {
568                         NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
569                         NFT_PIPAPO_AVX2_AND(1, 1, 0);
570                 }
571 
572                 NFT_PIPAPO_AVX2_AND(5,   2,  3);
573                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(6,  lt,  4,  pg[4], bsize);
574                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(7,  lt,  5,  pg[5], bsize);
575                 NFT_PIPAPO_AVX2_AND(8,   1,  4);
576                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(9,  lt,  6,  pg[6], bsize);
577                 NFT_PIPAPO_AVX2_AND(10,  5,  6);
578                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt,  7,  pg[7], bsize);
579                 NFT_PIPAPO_AVX2_AND(12,  7,  8);
580                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(13, lt,  8,  pg[8], bsize);
581                 NFT_PIPAPO_AVX2_AND(14,  9, 10);
582 
583                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(0,  lt,  9,  pg[9], bsize);
584                 NFT_PIPAPO_AVX2_AND(1,  11, 12);
585                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(2,  lt, 10, pg[10], bsize);
586                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt, 11, pg[11], bsize);
587                 NFT_PIPAPO_AVX2_AND(4,  13, 14);
588                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(5,  lt, 12, pg[12], bsize);
589                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(6,  lt, 13, pg[13], bsize);
590                 NFT_PIPAPO_AVX2_AND(7,   0,  1);
591                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(8,  lt, 14, pg[14], bsize);
592                 NFT_PIPAPO_AVX2_AND(9,   2,  3);
593                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 15, pg[15], bsize);
594                 NFT_PIPAPO_AVX2_AND(11,  4,  5);
595                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 16, pg[16], bsize);
596                 NFT_PIPAPO_AVX2_AND(13,  6,  7);
597                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 17, pg[17], bsize);
598 
599                 NFT_PIPAPO_AVX2_AND(0,   8,  9);
600                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(1,  lt, 18, pg[18], bsize);
601                 NFT_PIPAPO_AVX2_AND(2,  10, 11);
602                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt, 19, pg[19], bsize);
603                 NFT_PIPAPO_AVX2_AND(4,  12, 13);
604                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(5,  lt, 20, pg[20], bsize);
605                 NFT_PIPAPO_AVX2_AND(6,  14,  0);
606                 NFT_PIPAPO_AVX2_AND(7,   1,  2);
607                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(8,  lt, 21, pg[21], bsize);
608                 NFT_PIPAPO_AVX2_AND(9,   3,  4);
609                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 22, pg[22], bsize);
610                 NFT_PIPAPO_AVX2_AND(11,  5,  6);
611                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 23, pg[23], bsize);
612                 NFT_PIPAPO_AVX2_AND(13,  7,  8);
613 
614                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 24, pg[24], bsize);
615                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(0,  lt, 25, pg[25], bsize);
616                 NFT_PIPAPO_AVX2_AND(1,   9, 10);
617                 NFT_PIPAPO_AVX2_AND(2,  11, 12);
618                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt, 26, pg[26], bsize);
619                 NFT_PIPAPO_AVX2_AND(4,  13, 14);
620                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(5,  lt, 27, pg[27], bsize);
621                 NFT_PIPAPO_AVX2_AND(6,   0,  1);
622                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(7,  lt, 28, pg[28], bsize);
623                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(8,  lt, 29, pg[29], bsize);
624                 NFT_PIPAPO_AVX2_AND(9,   2,  3);
625                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 30, pg[30], bsize);
626                 NFT_PIPAPO_AVX2_AND(11,  4,  5);
627                 NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 31, pg[31], bsize);
628 
629                 NFT_PIPAPO_AVX2_AND(0,   6,  7);
630                 NFT_PIPAPO_AVX2_AND(1,   8,  9);
631                 NFT_PIPAPO_AVX2_AND(2,  10, 11);
632                 NFT_PIPAPO_AVX2_AND(3,  12,  0);
633 
634                 /* Stalls */
635                 NFT_PIPAPO_AVX2_AND(4,   1,  2);
636                 NFT_PIPAPO_AVX2_AND(5,   3,  4);
637 
638                 NFT_PIPAPO_AVX2_NOMATCH_GOTO(5, nomatch);
639                 NFT_PIPAPO_AVX2_STORE(map[i_ul], 5);
640 
641                 b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
642                 if (last)
643                         return b;
644 
645                 if (unlikely(ret == -1))
646                         ret = b / XSAVE_YMM_SIZE;
647 
648                 continue;
649 nomatch:
650                 NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
651 nothing:
652                 ;
653         }
654 
655         return ret;
656 }
657 
658 /**
659  * nft_pipapo_avx2_lookup_8b_1() - AVX2-based lookup for one eight-bit group
660  * @map:        Previous match result, used as initial bitmap
661  * @fill:       Destination bitmap to be filled with current match result
662  * @f:          Field, containing lookup and mapping tables
663  * @offset:     Ignore buckets before the given index, no bits are filled there
664  * @pkt:        Packet data, pointer to input nftables register
665  * @first:      If this is the first field, don't source previous result
666  * @last:       Last field: stop at the first match and return bit index
667  *
668  * See nft_pipapo_avx2_lookup_4b_2().
669  *
670  * This is used for 8-bit fields (i.e. protocol numbers).
671  *
672  * Return: -1 on no match, rule index of match if @last, otherwise first long
673  * word index to be checked next (i.e. first filled word).
674  */
675 static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill,
676                                        struct nft_pipapo_field *f, int offset,
677                                        const u8 *pkt, bool first, bool last)
678 {
679         int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
680         unsigned long *lt = f->lt, bsize = f->bsize;
681 
682         lt += offset * NFT_PIPAPO_LONGS_PER_M256;
683         for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
684                 int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
685 
686                 if (first) {
687                         NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 0, pkt[0], bsize);
688                 } else {
689                         NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
690                         NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
691                         NFT_PIPAPO_AVX2_AND(2, 0, 1);
692                         NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
693                 }
694 
695                 NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nomatch);
696                 NFT_PIPAPO_AVX2_STORE(map[i_ul], 2);
697 
698                 b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
699                 if (last)
700                         return b;
701 
702                 if (unlikely(ret == -1))
703                         ret = b / XSAVE_YMM_SIZE;
704 
705                 continue;
706 nomatch:
707                 NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
708 nothing:
709                 ;
710         }
711 
712         return ret;
713 }
714 
715 /**
716  * nft_pipapo_avx2_lookup_8b_2() - AVX2-based lookup for 2 eight-bit groups
717  * @map:        Previous match result, used as initial bitmap
718  * @fill:       Destination bitmap to be filled with current match result
719  * @f:          Field, containing lookup and mapping tables
720  * @offset:     Ignore buckets before the given index, no bits are filled there
721  * @pkt:        Packet data, pointer to input nftables register
722  * @first:      If this is the first field, don't source previous result
723  * @last:       Last field: stop at the first match and return bit index
724  *
725  * See nft_pipapo_avx2_lookup_4b_2().
726  *
727  * This is used for 16-bit fields (i.e. ports).
728  *
729  * Return: -1 on no match, rule index of match if @last, otherwise first long
730  * word index to be checked next (i.e. first filled word).
731  */
732 static int nft_pipapo_avx2_lookup_8b_2(unsigned long *map, unsigned long *fill,
733                                        struct nft_pipapo_field *f, int offset,
734                                        const u8 *pkt, bool first, bool last)
735 {
736         int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
737         unsigned long *lt = f->lt, bsize = f->bsize;
738 
739         lt += offset * NFT_PIPAPO_LONGS_PER_M256;
740         for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
741                 int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
742 
743                 if (first) {
744                         NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
745                         NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
746                         NFT_PIPAPO_AVX2_AND(4, 0, 1);
747                 } else {
748                         NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
749                         NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 0, pkt[0], bsize);
750                         NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
751 
752                         /* Stall */
753                         NFT_PIPAPO_AVX2_AND(3, 0, 1);
754                         NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
755                         NFT_PIPAPO_AVX2_AND(4, 3, 2);
756                 }
757 
758                 /* Stall */
759                 NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
760                 NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
761 
762                 b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
763                 if (last)
764                         return b;
765 
766                 if (unlikely(ret == -1))
767                         ret = b / XSAVE_YMM_SIZE;
768 
769                 continue;
770 nomatch:
771                 NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
772 nothing:
773                 ;
774         }
775 
776         return ret;
777 }
778 
779 /**
780  * nft_pipapo_avx2_lookup_8b_4() - AVX2-based lookup for 4 eight-bit groups
781  * @map:        Previous match result, used as initial bitmap
782  * @fill:       Destination bitmap to be filled with current match result
783  * @f:          Field, containing lookup and mapping tables
784  * @offset:     Ignore buckets before the given index, no bits are filled there
785  * @pkt:        Packet data, pointer to input nftables register
786  * @first:      If this is the first field, don't source previous result
787  * @last:       Last field: stop at the first match and return bit index
788  *
789  * See nft_pipapo_avx2_lookup_4b_2().
790  *
791  * This is used for 32-bit fields (i.e. IPv4 addresses).
792  *
793  * Return: -1 on no match, rule index of match if @last, otherwise first long
794  * word index to be checked next (i.e. first filled word).
795  */
796 static int nft_pipapo_avx2_lookup_8b_4(unsigned long *map, unsigned long *fill,
797                                        struct nft_pipapo_field *f, int offset,
798                                        const u8 *pkt, bool first, bool last)
799 {
800         int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
801         unsigned long *lt = f->lt, bsize = f->bsize;
802 
803         lt += offset * NFT_PIPAPO_LONGS_PER_M256;
804         for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
805                 int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
806 
807                 if (first) {
808                         NFT_PIPAPO_AVX2_BUCKET_LOAD8(0,  lt, 0, pkt[0], bsize);
809                         NFT_PIPAPO_AVX2_BUCKET_LOAD8(1,  lt, 1, pkt[1], bsize);
810                         NFT_PIPAPO_AVX2_BUCKET_LOAD8(2,  lt, 2, pkt[2], bsize);
811                         NFT_PIPAPO_AVX2_BUCKET_LOAD8(3,  lt, 3, pkt[3], bsize);
812 
813                         /* Stall */
814                         NFT_PIPAPO_AVX2_AND(4, 0, 1);
815                         NFT_PIPAPO_AVX2_AND(5, 2, 3);
816                         NFT_PIPAPO_AVX2_AND(0, 4, 5);
817                 } else {
818                         NFT_PIPAPO_AVX2_BUCKET_LOAD8(0,  lt, 0, pkt[0], bsize);
819                         NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
820                         NFT_PIPAPO_AVX2_BUCKET_LOAD8(2,  lt, 1, pkt[1], bsize);
821                         NFT_PIPAPO_AVX2_BUCKET_LOAD8(3,  lt, 2, pkt[2], bsize);
822                         NFT_PIPAPO_AVX2_BUCKET_LOAD8(4,  lt, 3, pkt[3], bsize);
823 
824                         NFT_PIPAPO_AVX2_AND(5, 0, 1);
825                         NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
826                         NFT_PIPAPO_AVX2_AND(6, 2, 3);
827 
828                         /* Stall */
829                         NFT_PIPAPO_AVX2_AND(7, 4, 5);
830                         NFT_PIPAPO_AVX2_AND(0, 6, 7);
831                 }
832 
833                 NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nomatch);
834                 NFT_PIPAPO_AVX2_STORE(map[i_ul], 0);
835 
836                 b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
837                 if (last)
838                         return b;
839 
840                 if (unlikely(ret == -1))
841                         ret = b / XSAVE_YMM_SIZE;
842 
843                 continue;
844 
845 nomatch:
846                 NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
847 nothing:
848                 ;
849         }
850 
851         return ret;
852 }
853 
854 /**
855  * nft_pipapo_avx2_lookup_8b_6() - AVX2-based lookup for 6 eight-bit groups
856  * @map:        Previous match result, used as initial bitmap
857  * @fill:       Destination bitmap to be filled with current match result
858  * @f:          Field, containing lookup and mapping tables
859  * @offset:     Ignore buckets before the given index, no bits are filled there
860  * @pkt:        Packet data, pointer to input nftables register
861  * @first:      If this is the first field, don't source previous result
862  * @last:       Last field: stop at the first match and return bit index
863  *
864  * See nft_pipapo_avx2_lookup_4b_2().
865  *
866  * This is used for 48-bit fields (i.e. MAC addresses/EUI-48).
867  *
868  * Return: -1 on no match, rule index of match if @last, otherwise first long
869  * word index to be checked next (i.e. first filled word).
870  */
871 static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill,
872                                        struct nft_pipapo_field *f, int offset,
873                                        const u8 *pkt, bool first, bool last)
874 {
875         int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
876         unsigned long *lt = f->lt, bsize = f->bsize;
877 
878         lt += offset * NFT_PIPAPO_LONGS_PER_M256;
879         for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
880                 int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
881 
882                 if (first) {
883                         NFT_PIPAPO_AVX2_BUCKET_LOAD8(0,  lt, 0, pkt[0], bsize);
884                         NFT_PIPAPO_AVX2_BUCKET_LOAD8(1,  lt, 1, pkt[1], bsize);
885                         NFT_PIPAPO_AVX2_BUCKET_LOAD8(2,  lt, 2, pkt[2], bsize);
886                         NFT_PIPAPO_AVX2_BUCKET_LOAD8(3,  lt, 3, pkt[3], bsize);
887                         NFT_PIPAPO_AVX2_BUCKET_LOAD8(4,  lt, 4, pkt[4], bsize);
888 
889                         NFT_PIPAPO_AVX2_AND(5, 0, 1);
890                         NFT_PIPAPO_AVX2_BUCKET_LOAD8(6,  lt, 6, pkt[5], bsize);
891                         NFT_PIPAPO_AVX2_AND(7, 2, 3);
892 
893                         /* Stall */
894                         NFT_PIPAPO_AVX2_AND(0, 4, 5);
895                         NFT_PIPAPO_AVX2_AND(1, 6, 7);
896                         NFT_PIPAPO_AVX2_AND(4, 0, 1);
897                 } else {
898                         NFT_PIPAPO_AVX2_BUCKET_LOAD8(0,  lt, 0, pkt[0], bsize);
899                         NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
900                         NFT_PIPAPO_AVX2_BUCKET_LOAD8(2,  lt, 1, pkt[1], bsize);
901                         NFT_PIPAPO_AVX2_BUCKET_LOAD8(3,  lt, 2, pkt[2], bsize);
902                         NFT_PIPAPO_AVX2_BUCKET_LOAD8(4,  lt, 3, pkt[3], bsize);
903 
904                         NFT_PIPAPO_AVX2_AND(5, 0, 1);
905                         NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
906 
907                         NFT_PIPAPO_AVX2_AND(6, 2, 3);
908                         NFT_PIPAPO_AVX2_BUCKET_LOAD8(7,  lt, 4, pkt[4], bsize);
909                         NFT_PIPAPO_AVX2_AND(0, 4, 5);
910                         NFT_PIPAPO_AVX2_BUCKET_LOAD8(1,  lt, 5, pkt[5], bsize);
911                         NFT_PIPAPO_AVX2_AND(2, 6, 7);
912 
913                         /* Stall */
914                         NFT_PIPAPO_AVX2_AND(3, 0, 1);
915                         NFT_PIPAPO_AVX2_AND(4, 2, 3);
916                 }
917 
918                 NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
919                 NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
920 
921                 b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
922                 if (last)
923                         return b;
924 
925                 if (unlikely(ret == -1))
926                         ret = b / XSAVE_YMM_SIZE;
927 
928                 continue;
929 
930 nomatch:
931                 NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
932 nothing:
933                 ;
934         }
935 
936         return ret;
937 }
938 
939 /**
940  * nft_pipapo_avx2_lookup_8b_16() - AVX2-based lookup for 16 eight-bit groups
941  * @map:        Previous match result, used as initial bitmap
942  * @fill:       Destination bitmap to be filled with current match result
943  * @f:          Field, containing lookup and mapping tables
944  * @offset:     Ignore buckets before the given index, no bits are filled there
945  * @pkt:        Packet data, pointer to input nftables register
946  * @first:      If this is the first field, don't source previous result
947  * @last:       Last field: stop at the first match and return bit index
948  *
949  * See nft_pipapo_avx2_lookup_4b_2().
950  *
951  * This is used for 128-bit fields (i.e. IPv6 addresses).
952  *
953  * Return: -1 on no match, rule index of match if @last, otherwise first long
954  * word index to be checked next (i.e. first filled word).
955  */
956 static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill,
957                                         struct nft_pipapo_field *f, int offset,
958                                         const u8 *pkt, bool first, bool last)
959 {
960         int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
961         unsigned long *lt = f->lt, bsize = f->bsize;
962 
963         lt += offset * NFT_PIPAPO_LONGS_PER_M256;
964         for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
965                 int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
966 
967                 if (!first)
968                         NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
969 
970                 NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt,  0,  pkt[0], bsize);
971                 NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt,  1,  pkt[1], bsize);
972                 NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt,  2,  pkt[2], bsize);
973                 if (!first) {
974                         NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
975                         NFT_PIPAPO_AVX2_AND(1, 1, 0);
976                 }
977                 NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt,  3,  pkt[3], bsize);
978 
979                 NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt,  4,  pkt[4], bsize);
980                 NFT_PIPAPO_AVX2_AND(6, 1, 2);
981                 NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt,  5,  pkt[5], bsize);
982                 NFT_PIPAPO_AVX2_AND(0, 3, 4);
983                 NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt,  6,  pkt[6], bsize);
984 
985                 NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt,  7,  pkt[7], bsize);
986                 NFT_PIPAPO_AVX2_AND(3, 5, 6);
987                 NFT_PIPAPO_AVX2_AND(4, 0, 1);
988                 NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt,  8,  pkt[8], bsize);
989 
990                 NFT_PIPAPO_AVX2_AND(6, 2, 3);
991                 NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt,  9,  pkt[9], bsize);
992                 NFT_PIPAPO_AVX2_AND(0, 4, 5);
993                 NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 10, pkt[10], bsize);
994                 NFT_PIPAPO_AVX2_AND(2, 6, 7);
995                 NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 11, pkt[11], bsize);
996                 NFT_PIPAPO_AVX2_AND(4, 0, 1);
997                 NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 12, pkt[12], bsize);
998                 NFT_PIPAPO_AVX2_AND(6, 2, 3);
999                 NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 13, pkt[13], bsize);
1000                 NFT_PIPAPO_AVX2_AND(0, 4, 5);
1001                 NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 14, pkt[14], bsize);
1002                 NFT_PIPAPO_AVX2_AND(2, 6, 7);
1003                 NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 15, pkt[15], bsize);
1004                 NFT_PIPAPO_AVX2_AND(4, 0, 1);
1005 
1006                 /* Stall */
1007                 NFT_PIPAPO_AVX2_AND(5, 2, 3);
1008                 NFT_PIPAPO_AVX2_AND(6, 4, 5);
1009 
1010                 NFT_PIPAPO_AVX2_NOMATCH_GOTO(6, nomatch);
1011                 NFT_PIPAPO_AVX2_STORE(map[i_ul], 6);
1012 
1013                 b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
1014                 if (last)
1015                         return b;
1016 
1017                 if (unlikely(ret == -1))
1018                         ret = b / XSAVE_YMM_SIZE;
1019 
1020                 continue;
1021 
1022 nomatch:
1023                 NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
1024 nothing:
1025                 ;
1026         }
1027 
1028         return ret;
1029 }
1030 
1031 /**
1032  * nft_pipapo_avx2_lookup_slow() - Fallback function for uncommon field sizes
1033  * @map:        Previous match result, used as initial bitmap
1034  * @fill:       Destination bitmap to be filled with current match result
1035  * @f:          Field, containing lookup and mapping tables
1036  * @offset:     Ignore buckets before the given index, no bits are filled there
1037  * @pkt:        Packet data, pointer to input nftables register
1038  * @first:      If this is the first field, don't source previous result
1039  * @last:       Last field: stop at the first match and return bit index
1040  *
1041  * This function should never be called, but is provided for the case the field
1042  * size doesn't match any of the known data types. Matching rate is
1043  * substantially lower than AVX2 routines.
1044  *
1045  * Return: -1 on no match, rule index of match if @last, otherwise first long
1046  * word index to be checked next (i.e. first filled word).
1047  */
1048 static int nft_pipapo_avx2_lookup_slow(unsigned long *map, unsigned long *fill,
1049                                         struct nft_pipapo_field *f, int offset,
1050                                         const u8 *pkt, bool first, bool last)
1051 {
1052         unsigned long *lt = f->lt, bsize = f->bsize;
1053         int i, ret = -1, b;
1054 
1055         lt += offset * NFT_PIPAPO_LONGS_PER_M256;
1056 
1057         if (first)
1058                 memset(map, 0xff, bsize * sizeof(*map));
1059 
1060         for (i = offset; i < bsize; i++) {
1061                 if (f->bb == 8)
1062                         pipapo_and_field_buckets_8bit(f, map, pkt);
1063                 else
1064                         pipapo_and_field_buckets_4bit(f, map, pkt);
1065                 NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
1066 
1067                 b = pipapo_refill(map, bsize, f->rules, fill, f->mt, last);
1068 
1069                 if (last)
1070                         return b;
1071 
1072                 if (ret == -1)
1073                         ret = b / XSAVE_YMM_SIZE;
1074         }
1075 
1076         return ret;
1077 }
1078 
1079 /**
1080  * nft_pipapo_avx2_estimate() - Set size, space and lookup complexity
1081  * @desc:       Set description, element count and field description used
1082  * @features:   Flags: NFT_SET_INTERVAL needs to be there
1083  * @est:        Storage for estimation data
1084  *
1085  * Return: true if set is compatible and AVX2 available, false otherwise.
1086  */
1087 bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
1088                               struct nft_set_estimate *est)
1089 {
1090         if (!(features & NFT_SET_INTERVAL) ||
1091             desc->field_count < NFT_PIPAPO_MIN_FIELDS)
1092                 return false;
1093 
1094         if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_AVX))
1095                 return false;
1096 
1097         est->size = pipapo_estimate_size(desc);
1098         if (!est->size)
1099                 return false;
1100 
1101         est->lookup = NFT_SET_CLASS_O_LOG_N;
1102 
1103         est->space = NFT_SET_CLASS_O_N;
1104 
1105         return true;
1106 }
1107 
1108 /**
1109  * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation
1110  * @net:        Network namespace
1111  * @set:        nftables API set representation
1112  * @elem:       nftables API element representation containing key data
1113  * @ext:        nftables API extension pointer, filled with matching reference
1114  *
1115  * For more details, see DOC: Theory of Operation in nft_set_pipapo.c.
1116  *
1117  * This implementation exploits the repetitive characteristic of the algorithm
1118  * to provide a fast, vectorised version using the AVX2 SIMD instruction set.
1119  *
1120  * Return: true on match, false otherwise.
1121  */
1122 bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
1123                             const u32 *key, const struct nft_set_ext **ext)
1124 {
1125         struct nft_pipapo *priv = nft_set_priv(set);
1126         unsigned long *res, *fill, *scratch;
1127         u8 genmask = nft_genmask_cur(net);
1128         const u8 *rp = (const u8 *)key;
1129         struct nft_pipapo_match *m;
1130         struct nft_pipapo_field *f;
1131         bool map_index;
1132         int i, ret = 0;
1133 
1134         m = rcu_dereference(priv->match);
1135 
1136         /* This also protects access to all data related to scratch maps */
1137         kernel_fpu_begin();
1138 
1139         scratch = *raw_cpu_ptr(m->scratch_aligned);
1140         if (unlikely(!scratch)) {
1141                 kernel_fpu_end();
1142                 return false;
1143         }
1144         map_index = raw_cpu_read(nft_pipapo_avx2_scratch_index);
1145 
1146         res  = scratch + (map_index ? m->bsize_max : 0);
1147         fill = scratch + (map_index ? 0 : m->bsize_max);
1148 
1149         /* Starting map doesn't need to be set for this implementation */
1150 
1151         nft_pipapo_avx2_prepare();
1152 
1153 next_match:
1154         nft_pipapo_for_each_field(f, i, m) {
1155                 bool last = i == m->field_count - 1, first = !i;
1156 
1157 #define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n)                                \
1158                 (ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f,  \
1159                                                          ret, rp,       \
1160                                                          first, last))
1161 
1162                 if (likely(f->bb == 8)) {
1163                         if (f->groups == 1) {
1164                                 NFT_SET_PIPAPO_AVX2_LOOKUP(8, 1);
1165                         } else if (f->groups == 2) {
1166                                 NFT_SET_PIPAPO_AVX2_LOOKUP(8, 2);
1167                         } else if (f->groups == 4) {
1168                                 NFT_SET_PIPAPO_AVX2_LOOKUP(8, 4);
1169                         } else if (f->groups == 6) {
1170                                 NFT_SET_PIPAPO_AVX2_LOOKUP(8, 6);
1171                         } else if (f->groups == 16) {
1172                                 NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16);
1173                         } else {
1174                                 ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
1175                                                                   ret, rp,
1176                                                                   first, last);
1177                         }
1178                 } else {
1179                         if (f->groups == 2) {
1180                                 NFT_SET_PIPAPO_AVX2_LOOKUP(4, 2);
1181                         } else if (f->groups == 4) {
1182                                 NFT_SET_PIPAPO_AVX2_LOOKUP(4, 4);
1183                         } else if (f->groups == 8) {
1184                                 NFT_SET_PIPAPO_AVX2_LOOKUP(4, 8);
1185                         } else if (f->groups == 12) {
1186                                 NFT_SET_PIPAPO_AVX2_LOOKUP(4, 12);
1187                         } else if (f->groups == 32) {
1188                                 NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32);
1189                         } else {
1190                                 ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
1191                                                                   ret, rp,
1192                                                                   first, last);
1193                         }
1194                 }
1195                 NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
1196 
1197 #undef NFT_SET_PIPAPO_AVX2_LOOKUP
1198 
1199                 if (ret < 0)
1200                         goto out;
1201 
1202                 if (last) {
1203                         *ext = &f->mt[ret].e->ext;
1204                         if (unlikely(nft_set_elem_expired(*ext) ||
1205                                      !nft_set_elem_active(*ext, genmask))) {
1206                                 ret = 0;
1207                                 goto next_match;
1208                         }
1209 
1210                         goto out;
1211                 }
1212 
1213                 swap(res, fill);
1214                 rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
1215         }
1216 
1217 out:
1218         if (i % 2)
1219                 raw_cpu_write(nft_pipapo_avx2_scratch_index, !map_index);
1220         kernel_fpu_end();
1221 
1222         return ret >= 0;
1223 }
1224 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | Wiki (Japanese) | Wiki (English) | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

osdn.jp