1 // SPDX-License-Identifier: GPL-2.0 2 /* Copyright(c) 2017 - 2018 Intel Corporation. */ 3 4 #include <asm/barrier.h> 5 #include <errno.h> 6 #include <getopt.h> 7 #include <libgen.h> 8 #include <linux/bpf.h> 9 #include <linux/compiler.h> 10 #include <linux/if_link.h> 11 #include <linux/if_xdp.h> 12 #include <linux/if_ether.h> 13 #include <linux/ip.h> 14 #include <linux/limits.h> 15 #include <linux/udp.h> 16 #include <arpa/inet.h> 17 #include <locale.h> 18 #include <net/ethernet.h> 19 #include <net/if.h> 20 #include <poll.h> 21 #include <pthread.h> 22 #include <signal.h> 23 #include <stdbool.h> 24 #include <stdio.h> 25 #include <stdlib.h> 26 #include <string.h> 27 #include <sys/capability.h> 28 #include <sys/mman.h> 29 #include <sys/resource.h> 30 #include <sys/socket.h> 31 #include <sys/types.h> 32 #include <sys/un.h> 33 #include <time.h> 34 #include <unistd.h> 35 36 #include <bpf/libbpf.h> 37 #include <bpf/xsk.h> 38 #include <bpf/bpf.h> 39 #include "xdpsock.h" 40 41 #ifndef SOL_XDP 42 #define SOL_XDP 283 43 #endif 44 45 #ifndef AF_XDP 46 #define AF_XDP 44 47 #endif 48 49 #ifndef PF_XDP 50 #define PF_XDP AF_XDP 51 #endif 52 53 #define NUM_FRAMES (4 * 1024) 54 #define MIN_PKT_SIZE 64 55 56 #define DEBUG_HEXDUMP 0 57 58 typedef __u64 u64; 59 typedef __u32 u32; 60 typedef __u16 u16; 61 typedef __u8 u8; 62 63 static unsigned long prev_time; 64 65 enum benchmark_type { 66 BENCH_RXDROP = 0, 67 BENCH_TXONLY = 1, 68 BENCH_L2FWD = 2, 69 }; 70 71 static enum benchmark_type opt_bench = BENCH_RXDROP; 72 static u32 opt_xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; 73 static const char *opt_if = ""; 74 static int opt_ifindex; 75 static int opt_queue; 76 static unsigned long opt_duration; 77 static unsigned long start_time; 78 static bool benchmark_done; 79 static u32 opt_batch_size = 64; 80 static int opt_pkt_count; 81 static u16 opt_pkt_size = MIN_PKT_SIZE; 82 static u32 opt_pkt_fill_pattern = 0x12345678; 83 static bool opt_extra_stats; 84 static bool opt_quiet; 85 static bool opt_app_stats; 86 static const char *opt_irq_str = ""; 87 static u32 irq_no; 88 static int irqs_at_init = -1; 89 static int opt_poll; 90 static int opt_interval = 1; 91 static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; 92 static u32 opt_umem_flags; 93 static int opt_unaligned_chunks; 94 static int opt_mmap_flags; 95 static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; 96 static int opt_timeout = 1000; 97 static bool opt_need_wakeup = true; 98 static u32 opt_num_xsks = 1; 99 static u32 prog_id; 100 static bool opt_busy_poll; 101 static bool opt_reduced_cap; 102 103 struct xsk_ring_stats { 104 unsigned long rx_npkts; 105 unsigned long tx_npkts; 106 unsigned long rx_dropped_npkts; 107 unsigned long rx_invalid_npkts; 108 unsigned long tx_invalid_npkts; 109 unsigned long rx_full_npkts; 110 unsigned long rx_fill_empty_npkts; 111 unsigned long tx_empty_npkts; 112 unsigned long prev_rx_npkts; 113 unsigned long prev_tx_npkts; 114 unsigned long prev_rx_dropped_npkts; 115 unsigned long prev_rx_invalid_npkts; 116 unsigned long prev_tx_invalid_npkts; 117 unsigned long prev_rx_full_npkts; 118 unsigned long prev_rx_fill_empty_npkts; 119 unsigned long prev_tx_empty_npkts; 120 }; 121 122 struct xsk_driver_stats { 123 unsigned long intrs; 124 unsigned long prev_intrs; 125 }; 126 127 struct xsk_app_stats { 128 unsigned long rx_empty_polls; 129 unsigned long fill_fail_polls; 130 unsigned long copy_tx_sendtos; 131 unsigned long tx_wakeup_sendtos; 132 unsigned long opt_polls; 133 unsigned long prev_rx_empty_polls; 134 unsigned long prev_fill_fail_polls; 135 unsigned long prev_copy_tx_sendtos; 136 unsigned long prev_tx_wakeup_sendtos; 137 unsigned long prev_opt_polls; 138 }; 139 140 struct xsk_umem_info { 141 struct xsk_ring_prod fq; 142 struct xsk_ring_cons cq; 143 struct xsk_umem *umem; 144 void *buffer; 145 }; 146 147 struct xsk_socket_info { 148 struct xsk_ring_cons rx; 149 struct xsk_ring_prod tx; 150 struct xsk_umem_info *umem; 151 struct xsk_socket *xsk; 152 struct xsk_ring_stats ring_stats; 153 struct xsk_app_stats app_stats; 154 struct xsk_driver_stats drv_stats; 155 u32 outstanding_tx; 156 }; 157 158 static int num_socks; 159 struct xsk_socket_info *xsks[MAX_SOCKS]; 160 int sock; 161 162 static unsigned long get_nsecs(void) 163 { 164 struct timespec ts; 165 166 clock_gettime(CLOCK_MONOTONIC, &ts); 167 return ts.tv_sec * 1000000000UL + ts.tv_nsec; 168 } 169 170 static void print_benchmark(bool running) 171 { 172 const char *bench_str = "INVALID"; 173 174 if (opt_bench == BENCH_RXDROP) 175 bench_str = "rxdrop"; 176 else if (opt_bench == BENCH_TXONLY) 177 bench_str = "txonly"; 178 else if (opt_bench == BENCH_L2FWD) 179 bench_str = "l2fwd"; 180 181 printf("%s:%d %s ", opt_if, opt_queue, bench_str); 182 if (opt_xdp_flags & XDP_FLAGS_SKB_MODE) 183 printf("xdp-skb "); 184 else if (opt_xdp_flags & XDP_FLAGS_DRV_MODE) 185 printf("xdp-drv "); 186 else 187 printf(" "); 188 189 if (opt_poll) 190 printf("poll() "); 191 192 if (running) { 193 printf("running..."); 194 fflush(stdout); 195 } 196 } 197 198 static int xsk_get_xdp_stats(int fd, struct xsk_socket_info *xsk) 199 { 200 struct xdp_statistics stats; 201 socklen_t optlen; 202 int err; 203 204 optlen = sizeof(stats); 205 err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen); 206 if (err) 207 return err; 208 209 if (optlen == sizeof(struct xdp_statistics)) { 210 xsk->ring_stats.rx_dropped_npkts = stats.rx_dropped; 211 xsk->ring_stats.rx_invalid_npkts = stats.rx_invalid_descs; 212 xsk->ring_stats.tx_invalid_npkts = stats.tx_invalid_descs; 213 xsk->ring_stats.rx_full_npkts = stats.rx_ring_full; 214 xsk->ring_stats.rx_fill_empty_npkts = stats.rx_fill_ring_empty_descs; 215 xsk->ring_stats.tx_empty_npkts = stats.tx_ring_empty_descs; 216 return 0; 217 } 218 219 return -EINVAL; 220 } 221 222 static void dump_app_stats(long dt) 223 { 224 int i; 225 226 for (i = 0; i < num_socks && xsks[i]; i++) { 227 char *fmt = "%-18s %'-14.0f %'-14lu\n"; 228 double rx_empty_polls_ps, fill_fail_polls_ps, copy_tx_sendtos_ps, 229 tx_wakeup_sendtos_ps, opt_polls_ps; 230 231 rx_empty_polls_ps = (xsks[i]->app_stats.rx_empty_polls - 232 xsks[i]->app_stats.prev_rx_empty_polls) * 1000000000. / dt; 233 fill_fail_polls_ps = (xsks[i]->app_stats.fill_fail_polls - 234 xsks[i]->app_stats.prev_fill_fail_polls) * 1000000000. / dt; 235 copy_tx_sendtos_ps = (xsks[i]->app_stats.copy_tx_sendtos - 236 xsks[i]->app_stats.prev_copy_tx_sendtos) * 1000000000. / dt; 237 tx_wakeup_sendtos_ps = (xsks[i]->app_stats.tx_wakeup_sendtos - 238 xsks[i]->app_stats.prev_tx_wakeup_sendtos) 239 * 1000000000. / dt; 240 opt_polls_ps = (xsks[i]->app_stats.opt_polls - 241 xsks[i]->app_stats.prev_opt_polls) * 1000000000. / dt; 242 243 printf("\n%-18s %-14s %-14s\n", "", "calls/s", "count"); 244 printf(fmt, "rx empty polls", rx_empty_polls_ps, xsks[i]->app_stats.rx_empty_polls); 245 printf(fmt, "fill fail polls", fill_fail_polls_ps, 246 xsks[i]->app_stats.fill_fail_polls); 247 printf(fmt, "copy tx sendtos", copy_tx_sendtos_ps, 248 xsks[i]->app_stats.copy_tx_sendtos); 249 printf(fmt, "tx wakeup sendtos", tx_wakeup_sendtos_ps, 250 xsks[i]->app_stats.tx_wakeup_sendtos); 251 printf(fmt, "opt polls", opt_polls_ps, xsks[i]->app_stats.opt_polls); 252 253 xsks[i]->app_stats.prev_rx_empty_polls = xsks[i]->app_stats.rx_empty_polls; 254 xsks[i]->app_stats.prev_fill_fail_polls = xsks[i]->app_stats.fill_fail_polls; 255 xsks[i]->app_stats.prev_copy_tx_sendtos = xsks[i]->app_stats.copy_tx_sendtos; 256 xsks[i]->app_stats.prev_tx_wakeup_sendtos = xsks[i]->app_stats.tx_wakeup_sendtos; 257 xsks[i]->app_stats.prev_opt_polls = xsks[i]->app_stats.opt_polls; 258 } 259 } 260 261 static bool get_interrupt_number(void) 262 { 263 FILE *f_int_proc; 264 char line[4096]; 265 bool found = false; 266 267 f_int_proc = fopen("/proc/interrupts", "r"); 268 if (f_int_proc == NULL) { 269 printf("Failed to open /proc/interrupts.\n"); 270 return found; 271 } 272 273 while (!feof(f_int_proc) && !found) { 274 /* Make sure to read a full line at a time */ 275 if (fgets(line, sizeof(line), f_int_proc) == NULL || 276 line[strlen(line) - 1] != '\n') { 277 printf("Error reading from interrupts file\n"); 278 break; 279 } 280 281 /* Extract interrupt number from line */ 282 if (strstr(line, opt_irq_str) != NULL) { 283 irq_no = atoi(line); 284 found = true; 285 break; 286 } 287 } 288 289 fclose(f_int_proc); 290 291 return found; 292 } 293 294 static int get_irqs(void) 295 { 296 char count_path[PATH_MAX]; 297 int total_intrs = -1; 298 FILE *f_count_proc; 299 char line[4096]; 300 301 snprintf(count_path, sizeof(count_path), 302 "/sys/kernel/irq/%i/per_cpu_count", irq_no); 303 f_count_proc = fopen(count_path, "r"); 304 if (f_count_proc == NULL) { 305 printf("Failed to open %s\n", count_path); 306 return total_intrs; 307 } 308 309 if (fgets(line, sizeof(line), f_count_proc) == NULL || 310 line[strlen(line) - 1] != '\n') { 311 printf("Error reading from %s\n", count_path); 312 } else { 313 static const char com[2] = ","; 314 char *token; 315 316 total_intrs = 0; 317 token = strtok(line, com); 318 while (token != NULL) { 319 /* sum up interrupts across all cores */ 320 total_intrs += atoi(token); 321 token = strtok(NULL, com); 322 } 323 } 324 325 fclose(f_count_proc); 326 327 return total_intrs; 328 } 329 330 static void dump_driver_stats(long dt) 331 { 332 int i; 333 334 for (i = 0; i < num_socks && xsks[i]; i++) { 335 char *fmt = "%-18s %'-14.0f %'-14lu\n"; 336 double intrs_ps; 337 int n_ints = get_irqs(); 338 339 if (n_ints < 0) { 340 printf("error getting intr info for intr %i\n", irq_no); 341 return; 342 } 343 xsks[i]->drv_stats.intrs = n_ints - irqs_at_init; 344 345 intrs_ps = (xsks[i]->drv_stats.intrs - xsks[i]->drv_stats.prev_intrs) * 346 1000000000. / dt; 347 348 printf("\n%-18s %-14s %-14s\n", "", "intrs/s", "count"); 349 printf(fmt, "irqs", intrs_ps, xsks[i]->drv_stats.intrs); 350 351 xsks[i]->drv_stats.prev_intrs = xsks[i]->drv_stats.intrs; 352 } 353 } 354 355 static void dump_stats(void) 356 { 357 unsigned long now = get_nsecs(); 358 long dt = now - prev_time; 359 int i; 360 361 prev_time = now; 362 363 for (i = 0; i < num_socks && xsks[i]; i++) { 364 char *fmt = "%-18s %'-14.0f %'-14lu\n"; 365 double rx_pps, tx_pps, dropped_pps, rx_invalid_pps, full_pps, fill_empty_pps, 366 tx_invalid_pps, tx_empty_pps; 367 368 rx_pps = (xsks[i]->ring_stats.rx_npkts - xsks[i]->ring_stats.prev_rx_npkts) * 369 1000000000. / dt; 370 tx_pps = (xsks[i]->ring_stats.tx_npkts - xsks[i]->ring_stats.prev_tx_npkts) * 371 1000000000. / dt; 372 373 printf("\n sock%d@", i); 374 print_benchmark(false); 375 printf("\n"); 376 377 printf("%-18s %-14s %-14s %-14.2f\n", "", "pps", "pkts", 378 dt / 1000000000.); 379 printf(fmt, "rx", rx_pps, xsks[i]->ring_stats.rx_npkts); 380 printf(fmt, "tx", tx_pps, xsks[i]->ring_stats.tx_npkts); 381 382 xsks[i]->ring_stats.prev_rx_npkts = xsks[i]->ring_stats.rx_npkts; 383 xsks[i]->ring_stats.prev_tx_npkts = xsks[i]->ring_stats.tx_npkts; 384 385 if (opt_extra_stats) { 386 if (!xsk_get_xdp_stats(xsk_socket__fd(xsks[i]->xsk), xsks[i])) { 387 dropped_pps = (xsks[i]->ring_stats.rx_dropped_npkts - 388 xsks[i]->ring_stats.prev_rx_dropped_npkts) * 389 1000000000. / dt; 390 rx_invalid_pps = (xsks[i]->ring_stats.rx_invalid_npkts - 391 xsks[i]->ring_stats.prev_rx_invalid_npkts) * 392 1000000000. / dt; 393 tx_invalid_pps = (xsks[i]->ring_stats.tx_invalid_npkts - 394 xsks[i]->ring_stats.prev_tx_invalid_npkts) * 395 1000000000. / dt; 396 full_pps = (xsks[i]->ring_stats.rx_full_npkts - 397 xsks[i]->ring_stats.prev_rx_full_npkts) * 398 1000000000. / dt; 399 fill_empty_pps = (xsks[i]->ring_stats.rx_fill_empty_npkts - 400 xsks[i]->ring_stats.prev_rx_fill_empty_npkts) * 401 1000000000. / dt; 402 tx_empty_pps = (xsks[i]->ring_stats.tx_empty_npkts - 403 xsks[i]->ring_stats.prev_tx_empty_npkts) * 404 1000000000. / dt; 405 406 printf(fmt, "rx dropped", dropped_pps, 407 xsks[i]->ring_stats.rx_dropped_npkts); 408 printf(fmt, "rx invalid", rx_invalid_pps, 409 xsks[i]->ring_stats.rx_invalid_npkts); 410 printf(fmt, "tx invalid", tx_invalid_pps, 411 xsks[i]->ring_stats.tx_invalid_npkts); 412 printf(fmt, "rx queue full", full_pps, 413 xsks[i]->ring_stats.rx_full_npkts); 414 printf(fmt, "fill ring empty", fill_empty_pps, 415 xsks[i]->ring_stats.rx_fill_empty_npkts); 416 printf(fmt, "tx ring empty", tx_empty_pps, 417 xsks[i]->ring_stats.tx_empty_npkts); 418 419 xsks[i]->ring_stats.prev_rx_dropped_npkts = 420 xsks[i]->ring_stats.rx_dropped_npkts; 421 xsks[i]->ring_stats.prev_rx_invalid_npkts = 422 xsks[i]->ring_stats.rx_invalid_npkts; 423 xsks[i]->ring_stats.prev_tx_invalid_npkts = 424 xsks[i]->ring_stats.tx_invalid_npkts; 425 xsks[i]->ring_stats.prev_rx_full_npkts = 426 xsks[i]->ring_stats.rx_full_npkts; 427 xsks[i]->ring_stats.prev_rx_fill_empty_npkts = 428 xsks[i]->ring_stats.rx_fill_empty_npkts; 429 xsks[i]->ring_stats.prev_tx_empty_npkts = 430 xsks[i]->ring_stats.tx_empty_npkts; 431 } else { 432 printf("%-15s\n", "Error retrieving extra stats"); 433 } 434 } 435 } 436 437 if (opt_app_stats) 438 dump_app_stats(dt); 439 if (irq_no) 440 dump_driver_stats(dt); 441 } 442 443 static bool is_benchmark_done(void) 444 { 445 if (opt_duration > 0) { 446 unsigned long dt = (get_nsecs() - start_time); 447 448 if (dt >= opt_duration) 449 benchmark_done = true; 450 } 451 return benchmark_done; 452 } 453 454 static void *poller(void *arg) 455 { 456 (void)arg; 457 while (!is_benchmark_done()) { 458 sleep(opt_interval); 459 dump_stats(); 460 } 461 462 return NULL; 463 } 464 465 static void remove_xdp_program(void) 466 { 467 u32 curr_prog_id = 0; 468 int cmd = CLOSE_CONN; 469 470 if (bpf_get_link_xdp_id(opt_ifindex, &curr_prog_id, opt_xdp_flags)) { 471 printf("bpf_get_link_xdp_id failed\n"); 472 exit(EXIT_FAILURE); 473 } 474 if (prog_id == curr_prog_id) 475 bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags); 476 else if (!curr_prog_id) 477 printf("couldn't find a prog id on a given interface\n"); 478 else 479 printf("program on interface changed, not removing\n"); 480 481 if (opt_reduced_cap) { 482 if (write(sock, &cmd, sizeof(int)) < 0) { 483 fprintf(stderr, "Error writing into stream socket: %s", strerror(errno)); 484 exit(EXIT_FAILURE); 485 } 486 } 487 } 488 489 static void int_exit(int sig) 490 { 491 benchmark_done = true; 492 } 493 494 static void xdpsock_cleanup(void) 495 { 496 struct xsk_umem *umem = xsks[0]->umem->umem; 497 int i; 498 499 dump_stats(); 500 for (i = 0; i < num_socks; i++) 501 xsk_socket__delete(xsks[i]->xsk); 502 (void)xsk_umem__delete(umem); 503 remove_xdp_program(); 504 } 505 506 static void __exit_with_error(int error, const char *file, const char *func, 507 int line) 508 { 509 fprintf(stderr, "%s:%s:%i: errno: %d/\"%s\"\n", file, func, 510 line, error, strerror(error)); 511 dump_stats(); 512 remove_xdp_program(); 513 exit(EXIT_FAILURE); 514 } 515 516 #define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, \ 517 __LINE__) 518 static void swap_mac_addresses(void *data) 519 { 520 struct ether_header *eth = (struct ether_header *)data; 521 struct ether_addr *src_addr = (struct ether_addr *)ð->ether_shost; 522 struct ether_addr *dst_addr = (struct ether_addr *)ð->ether_dhost; 523 struct ether_addr tmp; 524 525 tmp = *src_addr; 526 *src_addr = *dst_addr; 527 *dst_addr = tmp; 528 } 529 530 static void hex_dump(void *pkt, size_t length, u64 addr) 531 { 532 const unsigned char *address = (unsigned char *)pkt; 533 const unsigned char *line = address; 534 size_t line_size = 32; 535 unsigned char c; 536 char buf[32]; 537 int i = 0; 538 539 if (!DEBUG_HEXDUMP) 540 return; 541 542 sprintf(buf, "addr=%llu", addr); 543 printf("length = %zu\n", length); 544 printf("%s | ", buf); 545 while (length-- > 0) { 546 printf("%02X ", *address++); 547 if (!(++i % line_size) || (length == 0 && i % line_size)) { 548 if (length == 0) { 549 while (i++ % line_size) 550 printf("__ "); 551 } 552 printf(" | "); /* right close */ 553 while (line < address) { 554 c = *line++; 555 printf("%c", (c < 33 || c == 255) ? 0x2E : c); 556 } 557 printf("\n"); 558 if (length > 0) 559 printf("%s | ", buf); 560 } 561 } 562 printf("\n"); 563 } 564 565 static void *memset32_htonl(void *dest, u32 val, u32 size) 566 { 567 u32 *ptr = (u32 *)dest; 568 int i; 569 570 val = htonl(val); 571 572 for (i = 0; i < (size & (~0x3)); i += 4) 573 ptr[i >> 2] = val; 574 575 for (; i < size; i++) 576 ((char *)dest)[i] = ((char *)&val)[i & 3]; 577 578 return dest; 579 } 580 581 /* 582 * This function code has been taken from 583 * Linux kernel lib/checksum.c 584 */ 585 static inline unsigned short from32to16(unsigned int x) 586 { 587 /* add up 16-bit and 16-bit for 16+c bit */ 588 x = (x & 0xffff) + (x >> 16); 589 /* add up carry.. */ 590 x = (x & 0xffff) + (x >> 16); 591 return x; 592 } 593 594 /* 595 * This function code has been taken from 596 * Linux kernel lib/checksum.c 597 */ 598 static unsigned int do_csum(const unsigned char *buff, int len) 599 { 600 unsigned int result = 0; 601 int odd; 602 603 if (len <= 0) 604 goto out; 605 odd = 1 & (unsigned long)buff; 606 if (odd) { 607 #ifdef __LITTLE_ENDIAN 608 result += (*buff << 8); 609 #else 610 result = *buff; 611 #endif 612 len--; 613 buff++; 614 } 615 if (len >= 2) { 616 if (2 & (unsigned long)buff) { 617 result += *(unsigned short *)buff; 618 len -= 2; 619 buff += 2; 620 } 621 if (len >= 4) { 622 const unsigned char *end = buff + 623 ((unsigned int)len & ~3); 624 unsigned int carry = 0; 625 626 do { 627 unsigned int w = *(unsigned int *)buff; 628 629 buff += 4; 630 result += carry; 631 result += w; 632 carry = (w > result); 633 } while (buff < end); 634 result += carry; 635 result = (result & 0xffff) + (result >> 16); 636 } 637 if (len & 2) { 638 result += *(unsigned short *)buff; 639 buff += 2; 640 } 641 } 642 if (len & 1) 643 #ifdef __LITTLE_ENDIAN 644 result += *buff; 645 #else 646 result += (*buff << 8); 647 #endif 648 result = from32to16(result); 649 if (odd) 650 result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); 651 out: 652 return result; 653 } 654 655 __sum16 ip_fast_csum(const void *iph, unsigned int ihl); 656 657 /* 658 * This is a version of ip_compute_csum() optimized for IP headers, 659 * which always checksum on 4 octet boundaries. 660 * This function code has been taken from 661 * Linux kernel lib/checksum.c 662 */ 663 __sum16 ip_fast_csum(const void *iph, unsigned int ihl) 664 { 665 return (__force __sum16)~do_csum(iph, ihl * 4); 666 } 667 668 /* 669 * Fold a partial checksum 670 * This function code has been taken from 671 * Linux kernel include/asm-generic/checksum.h 672 */ 673 static inline __sum16 csum_fold(__wsum csum) 674 { 675 u32 sum = (__force u32)csum; 676 677 sum = (sum & 0xffff) + (sum >> 16); 678 sum = (sum & 0xffff) + (sum >> 16); 679 return (__force __sum16)~sum; 680 } 681 682 /* 683 * This function code has been taken from 684 * Linux kernel lib/checksum.c 685 */ 686 static inline u32 from64to32(u64 x) 687 { 688 /* add up 32-bit and 32-bit for 32+c bit */ 689 x = (x & 0xffffffff) + (x >> 32); 690 /* add up carry.. */ 691 x = (x & 0xffffffff) + (x >> 32); 692 return (u32)x; 693 } 694 695 __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, 696 __u32 len, __u8 proto, __wsum sum); 697 698 /* 699 * This function code has been taken from 700 * Linux kernel lib/checksum.c 701 */ 702 __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, 703 __u32 len, __u8 proto, __wsum sum) 704 { 705 unsigned long long s = (__force u32)sum; 706 707 s += (__force u32)saddr; 708 s += (__force u32)daddr; 709 #ifdef __BIG_ENDIAN__ 710 s += proto + len; 711 #else 712 s += (proto + len) << 8; 713 #endif 714 return (__force __wsum)from64to32(s); 715 } 716 717 /* 718 * This function has been taken from 719 * Linux kernel include/asm-generic/checksum.h 720 */ 721 static inline __sum16 722 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, 723 __u8 proto, __wsum sum) 724 { 725 return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); 726 } 727 728 static inline u16 udp_csum(u32 saddr, u32 daddr, u32 len, 729 u8 proto, u16 *udp_pkt) 730 { 731 u32 csum = 0; 732 u32 cnt = 0; 733 734 /* udp hdr and data */ 735 for (; cnt < len; cnt += 2) 736 csum += udp_pkt[cnt >> 1]; 737 738 return csum_tcpudp_magic(saddr, daddr, len, proto, csum); 739 } 740 741 #define ETH_FCS_SIZE 4 742 743 #define PKT_HDR_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ 744 sizeof(struct udphdr)) 745 746 #define PKT_SIZE (opt_pkt_size - ETH_FCS_SIZE) 747 #define IP_PKT_SIZE (PKT_SIZE - sizeof(struct ethhdr)) 748 #define UDP_PKT_SIZE (IP_PKT_SIZE - sizeof(struct iphdr)) 749 #define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr)) 750 751 static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE]; 752 753 static void gen_eth_hdr_data(void) 754 { 755 struct udphdr *udp_hdr = (struct udphdr *)(pkt_data + 756 sizeof(struct ethhdr) + 757 sizeof(struct iphdr)); 758 struct iphdr *ip_hdr = (struct iphdr *)(pkt_data + 759 sizeof(struct ethhdr)); 760 struct ethhdr *eth_hdr = (struct ethhdr *)pkt_data; 761 762 /* ethernet header */ 763 memcpy(eth_hdr->h_dest, "\x3c\xfd\xfe\x9e\x7f\x71", ETH_ALEN); 764 memcpy(eth_hdr->h_source, "\xec\xb1\xd7\x98\x3a\xc0", ETH_ALEN); 765 eth_hdr->h_proto = htons(ETH_P_IP); 766 767 /* IP header */ 768 ip_hdr->version = IPVERSION; 769 ip_hdr->ihl = 0x5; /* 20 byte header */ 770 ip_hdr->tos = 0x0; 771 ip_hdr->tot_len = htons(IP_PKT_SIZE); 772 ip_hdr->id = 0; 773 ip_hdr->frag_off = 0; 774 ip_hdr->ttl = IPDEFTTL; 775 ip_hdr->protocol = IPPROTO_UDP; 776 ip_hdr->saddr = htonl(0x0a0a0a10); 777 ip_hdr->daddr = htonl(0x0a0a0a20); 778 779 /* IP header checksum */ 780 ip_hdr->check = 0; 781 ip_hdr->check = ip_fast_csum((const void *)ip_hdr, ip_hdr->ihl); 782 783 /* UDP header */ 784 udp_hdr->source = htons(0x1000); 785 udp_hdr->dest = htons(0x1000); 786 udp_hdr->len = htons(UDP_PKT_SIZE); 787 788 /* UDP data */ 789 memset32_htonl(pkt_data + PKT_HDR_SIZE, opt_pkt_fill_pattern, 790 UDP_PKT_DATA_SIZE); 791 792 /* UDP header checksum */ 793 udp_hdr->check = 0; 794 udp_hdr->check = udp_csum(ip_hdr->saddr, ip_hdr->daddr, UDP_PKT_SIZE, 795 IPPROTO_UDP, (u16 *)udp_hdr); 796 } 797 798 static void gen_eth_frame(struct xsk_umem_info *umem, u64 addr) 799 { 800 memcpy(xsk_umem__get_data(umem->buffer, addr), pkt_data, 801 PKT_SIZE); 802 } 803 804 static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size) 805 { 806 struct xsk_umem_info *umem; 807 struct xsk_umem_config cfg = { 808 /* We recommend that you set the fill ring size >= HW RX ring size + 809 * AF_XDP RX ring size. Make sure you fill up the fill ring 810 * with buffers at regular intervals, and you will with this setting 811 * avoid allocation failures in the driver. These are usually quite 812 * expensive since drivers have not been written to assume that 813 * allocation failures are common. For regular sockets, kernel 814 * allocated memory is used that only runs out in OOM situations 815 * that should be rare. 816 */ 817 .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2, 818 .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, 819 .frame_size = opt_xsk_frame_size, 820 .frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM, 821 .flags = opt_umem_flags 822 }; 823 int ret; 824 825 umem = calloc(1, sizeof(*umem)); 826 if (!umem) 827 exit_with_error(errno); 828 829 ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq, 830 &cfg); 831 if (ret) 832 exit_with_error(-ret); 833 834 umem->buffer = buffer; 835 return umem; 836 } 837 838 static void xsk_populate_fill_ring(struct xsk_umem_info *umem) 839 { 840 int ret, i; 841 u32 idx; 842 843 ret = xsk_ring_prod__reserve(&umem->fq, 844 XSK_RING_PROD__DEFAULT_NUM_DESCS * 2, &idx); 845 if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS * 2) 846 exit_with_error(-ret); 847 for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS * 2; i++) 848 *xsk_ring_prod__fill_addr(&umem->fq, idx++) = 849 i * opt_xsk_frame_size; 850 xsk_ring_prod__submit(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS * 2); 851 } 852 853 static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem, 854 bool rx, bool tx) 855 { 856 struct xsk_socket_config cfg; 857 struct xsk_socket_info *xsk; 858 struct xsk_ring_cons *rxr; 859 struct xsk_ring_prod *txr; 860 int ret; 861 862 xsk = calloc(1, sizeof(*xsk)); 863 if (!xsk) 864 exit_with_error(errno); 865 866 xsk->umem = umem; 867 cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; 868 cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; 869 if (opt_num_xsks > 1 || opt_reduced_cap) 870 cfg.libbpf_flags = XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD; 871 else 872 cfg.libbpf_flags = 0; 873 cfg.xdp_flags = opt_xdp_flags; 874 cfg.bind_flags = opt_xdp_bind_flags; 875 876 rxr = rx ? &xsk->rx : NULL; 877 txr = tx ? &xsk->tx : NULL; 878 ret = xsk_socket__create(&xsk->xsk, opt_if, opt_queue, umem->umem, 879 rxr, txr, &cfg); 880 if (ret) 881 exit_with_error(-ret); 882 883 ret = bpf_get_link_xdp_id(opt_ifindex, &prog_id, opt_xdp_flags); 884 if (ret) 885 exit_with_error(-ret); 886 887 xsk->app_stats.rx_empty_polls = 0; 888 xsk->app_stats.fill_fail_polls = 0; 889 xsk->app_stats.copy_tx_sendtos = 0; 890 xsk->app_stats.tx_wakeup_sendtos = 0; 891 xsk->app_stats.opt_polls = 0; 892 xsk->app_stats.prev_rx_empty_polls = 0; 893 xsk->app_stats.prev_fill_fail_polls = 0; 894 xsk->app_stats.prev_copy_tx_sendtos = 0; 895 xsk->app_stats.prev_tx_wakeup_sendtos = 0; 896 xsk->app_stats.prev_opt_polls = 0; 897 898 return xsk; 899 } 900 901 static struct option long_options[] = { 902 {"rxdrop", no_argument, 0, 'r'}, 903 {"txonly", no_argument, 0, 't'}, 904 {"l2fwd", no_argument, 0, 'l'}, 905 {"interface", required_argument, 0, 'i'}, 906 {"queue", required_argument, 0, 'q'}, 907 {"poll", no_argument, 0, 'p'}, 908 {"xdp-skb", no_argument, 0, 'S'}, 909 {"xdp-native", no_argument, 0, 'N'}, 910 {"interval", required_argument, 0, 'n'}, 911 {"zero-copy", no_argument, 0, 'z'}, 912 {"copy", no_argument, 0, 'c'}, 913 {"frame-size", required_argument, 0, 'f'}, 914 {"no-need-wakeup", no_argument, 0, 'm'}, 915 {"unaligned", no_argument, 0, 'u'}, 916 {"shared-umem", no_argument, 0, 'M'}, 917 {"force", no_argument, 0, 'F'}, 918 {"duration", required_argument, 0, 'd'}, 919 {"batch-size", required_argument, 0, 'b'}, 920 {"tx-pkt-count", required_argument, 0, 'C'}, 921 {"tx-pkt-size", required_argument, 0, 's'}, 922 {"tx-pkt-pattern", required_argument, 0, 'P'}, 923 {"extra-stats", no_argument, 0, 'x'}, 924 {"quiet", no_argument, 0, 'Q'}, 925 {"app-stats", no_argument, 0, 'a'}, 926 {"irq-string", no_argument, 0, 'I'}, 927 {"busy-poll", no_argument, 0, 'B'}, 928 {"reduce-cap", no_argument, 0, 'R'}, 929 {0, 0, 0, 0} 930 }; 931 932 static void usage(const char *prog) 933 { 934 const char *str = 935 " Usage: %s [OPTIONS]\n" 936 " Options:\n" 937 " -r, --rxdrop Discard all incoming packets (default)\n" 938 " -t, --txonly Only send packets\n" 939 " -l, --l2fwd MAC swap L2 forwarding\n" 940 " -i, --interface=n Run on interface n\n" 941 " -q, --queue=n Use queue n (default 0)\n" 942 " -p, --poll Use poll syscall\n" 943 " -S, --xdp-skb=n Use XDP skb-mod\n" 944 " -N, --xdp-native=n Enforce XDP native mode\n" 945 " -n, --interval=n Specify statistics update interval (default 1 sec).\n" 946 " -z, --zero-copy Force zero-copy mode.\n" 947 " -c, --copy Force copy mode.\n" 948 " -m, --no-need-wakeup Turn off use of driver need wakeup flag.\n" 949 " -f, --frame-size=n Set the frame size (must be a power of two in aligned mode, default is %d).\n" 950 " -u, --unaligned Enable unaligned chunk placement\n" 951 " -M, --shared-umem Enable XDP_SHARED_UMEM (cannot be used with -R)\n" 952 " -F, --force Force loading the XDP prog\n" 953 " -d, --duration=n Duration in secs to run command.\n" 954 " Default: forever.\n" 955 " -b, --batch-size=n Batch size for sending or receiving\n" 956 " packets. Default: %d\n" 957 " -C, --tx-pkt-count=n Number of packets to send.\n" 958 " Default: Continuous packets.\n" 959 " -s, --tx-pkt-size=n Transmit packet size.\n" 960 " (Default: %d bytes)\n" 961 " Min size: %d, Max size %d.\n" 962 " -P, --tx-pkt-pattern=nPacket fill pattern. Default: 0x%x\n" 963 " -x, --extra-stats Display extra statistics.\n" 964 " -Q, --quiet Do not display any stats.\n" 965 " -a, --app-stats Display application (syscall) statistics.\n" 966 " -I, --irq-string Display driver interrupt statistics for interface associated with irq-string.\n" 967 " -B, --busy-poll Busy poll.\n" 968 " -R, --reduce-cap Use reduced capabilities (cannot be used with -M)\n" 969 "\n"; 970 fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE, 971 opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE, 972 XSK_UMEM__DEFAULT_FRAME_SIZE, opt_pkt_fill_pattern); 973 974 exit(EXIT_FAILURE); 975 } 976 977 static void parse_command_line(int argc, char **argv) 978 { 979 int option_index, c; 980 981 opterr = 0; 982 983 for (;;) { 984 c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQaI:BR", 985 long_options, &option_index); 986 if (c == -1) 987 break; 988 989 switch (c) { 990 case 'r': 991 opt_bench = BENCH_RXDROP; 992 break; 993 case 't': 994 opt_bench = BENCH_TXONLY; 995 break; 996 case 'l': 997 opt_bench = BENCH_L2FWD; 998 break; 999 case 'i': 1000 opt_if = optarg; 1001 break; 1002 case 'q': 1003 opt_queue = atoi(optarg); 1004 break; 1005 case 'p': 1006 opt_poll = 1; 1007 break; 1008 case 'S': 1009 opt_xdp_flags |= XDP_FLAGS_SKB_MODE; 1010 opt_xdp_bind_flags |= XDP_COPY; 1011 break; 1012 case 'N': 1013 /* default, set below */ 1014 break; 1015 case 'n': 1016 opt_interval = atoi(optarg); 1017 break; 1018 case 'z': 1019 opt_xdp_bind_flags |= XDP_ZEROCOPY; 1020 break; 1021 case 'c': 1022 opt_xdp_bind_flags |= XDP_COPY; 1023 break; 1024 case 'u': 1025 opt_umem_flags |= XDP_UMEM_UNALIGNED_CHUNK_FLAG; 1026 opt_unaligned_chunks = 1; 1027 opt_mmap_flags = MAP_HUGETLB; 1028 break; 1029 case 'F': 1030 opt_xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; 1031 break; 1032 case 'f': 1033 opt_xsk_frame_size = atoi(optarg); 1034 break; 1035 case 'm': 1036 opt_need_wakeup = false; 1037 opt_xdp_bind_flags &= ~XDP_USE_NEED_WAKEUP; 1038 break; 1039 case 'M': 1040 opt_num_xsks = MAX_SOCKS; 1041 break; 1042 case 'd': 1043 opt_duration = atoi(optarg); 1044 opt_duration *= 1000000000; 1045 break; 1046 case 'b': 1047 opt_batch_size = atoi(optarg); 1048 break; 1049 case 'C': 1050 opt_pkt_count = atoi(optarg); 1051 break; 1052 case 's': 1053 opt_pkt_size = atoi(optarg); 1054 if (opt_pkt_size > (XSK_UMEM__DEFAULT_FRAME_SIZE) || 1055 opt_pkt_size < MIN_PKT_SIZE) { 1056 fprintf(stderr, 1057 "ERROR: Invalid frame size %d\n", 1058 opt_pkt_size); 1059 usage(basename(argv[0])); 1060 } 1061 break; 1062 case 'P': 1063 opt_pkt_fill_pattern = strtol(optarg, NULL, 16); 1064 break; 1065 case 'x': 1066 opt_extra_stats = 1; 1067 break; 1068 case 'Q': 1069 opt_quiet = 1; 1070 break; 1071 case 'a': 1072 opt_app_stats = 1; 1073 break; 1074 case 'I': 1075 opt_irq_str = optarg; 1076 if (get_interrupt_number()) 1077 irqs_at_init = get_irqs(); 1078 if (irqs_at_init < 0) { 1079 fprintf(stderr, "ERROR: Failed to get irqs for %s\n", opt_irq_str); 1080 usage(basename(argv[0])); 1081 } 1082 break; 1083 case 'B': 1084 opt_busy_poll = 1; 1085 break; 1086 case 'R': 1087 opt_reduced_cap = true; 1088 break; 1089 default: 1090 usage(basename(argv[0])); 1091 } 1092 } 1093 1094 if (!(opt_xdp_flags & XDP_FLAGS_SKB_MODE)) 1095 opt_xdp_flags |= XDP_FLAGS_DRV_MODE; 1096 1097 opt_ifindex = if_nametoindex(opt_if); 1098 if (!opt_ifindex) { 1099 fprintf(stderr, "ERROR: interface \"%s\" does not exist\n", 1100 opt_if); 1101 usage(basename(argv[0])); 1102 } 1103 1104 if ((opt_xsk_frame_size & (opt_xsk_frame_size - 1)) && 1105 !opt_unaligned_chunks) { 1106 fprintf(stderr, "--frame-size=%d is not a power of two\n", 1107 opt_xsk_frame_size); 1108 usage(basename(argv[0])); 1109 } 1110 1111 if (opt_reduced_cap && opt_num_xsks > 1) { 1112 fprintf(stderr, "ERROR: -M and -R cannot be used together\n"); 1113 usage(basename(argv[0])); 1114 } 1115 } 1116 1117 static void kick_tx(struct xsk_socket_info *xsk) 1118 { 1119 int ret; 1120 1121 ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); 1122 if (ret >= 0 || errno == ENOBUFS || errno == EAGAIN || 1123 errno == EBUSY || errno == ENETDOWN) 1124 return; 1125 exit_with_error(errno); 1126 } 1127 1128 static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk) 1129 { 1130 struct xsk_umem_info *umem = xsk->umem; 1131 u32 idx_cq = 0, idx_fq = 0; 1132 unsigned int rcvd; 1133 size_t ndescs; 1134 1135 if (!xsk->outstanding_tx) 1136 return; 1137 1138 /* In copy mode, Tx is driven by a syscall so we need to use e.g. sendto() to 1139 * really send the packets. In zero-copy mode we do not have to do this, since Tx 1140 * is driven by the NAPI loop. So as an optimization, we do not have to call 1141 * sendto() all the time in zero-copy mode for l2fwd. 1142 */ 1143 if (opt_xdp_bind_flags & XDP_COPY) { 1144 xsk->app_stats.copy_tx_sendtos++; 1145 kick_tx(xsk); 1146 } 1147 1148 ndescs = (xsk->outstanding_tx > opt_batch_size) ? opt_batch_size : 1149 xsk->outstanding_tx; 1150 1151 /* re-add completed Tx buffers */ 1152 rcvd = xsk_ring_cons__peek(&umem->cq, ndescs, &idx_cq); 1153 if (rcvd > 0) { 1154 unsigned int i; 1155 int ret; 1156 1157 ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); 1158 while (ret != rcvd) { 1159 if (ret < 0) 1160 exit_with_error(-ret); 1161 if (opt_busy_poll || xsk_ring_prod__needs_wakeup(&umem->fq)) { 1162 xsk->app_stats.fill_fail_polls++; 1163 recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 1164 NULL); 1165 } 1166 ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); 1167 } 1168 1169 for (i = 0; i < rcvd; i++) 1170 *xsk_ring_prod__fill_addr(&umem->fq, idx_fq++) = 1171 *xsk_ring_cons__comp_addr(&umem->cq, idx_cq++); 1172 1173 xsk_ring_prod__submit(&xsk->umem->fq, rcvd); 1174 xsk_ring_cons__release(&xsk->umem->cq, rcvd); 1175 xsk->outstanding_tx -= rcvd; 1176 } 1177 } 1178 1179 static inline void complete_tx_only(struct xsk_socket_info *xsk, 1180 int batch_size) 1181 { 1182 unsigned int rcvd; 1183 u32 idx; 1184 1185 if (!xsk->outstanding_tx) 1186 return; 1187 1188 if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) { 1189 xsk->app_stats.tx_wakeup_sendtos++; 1190 kick_tx(xsk); 1191 } 1192 1193 rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx); 1194 if (rcvd > 0) { 1195 xsk_ring_cons__release(&xsk->umem->cq, rcvd); 1196 xsk->outstanding_tx -= rcvd; 1197 } 1198 } 1199 1200 static void rx_drop(struct xsk_socket_info *xsk) 1201 { 1202 unsigned int rcvd, i; 1203 u32 idx_rx = 0, idx_fq = 0; 1204 int ret; 1205 1206 rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); 1207 if (!rcvd) { 1208 if (opt_busy_poll || xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { 1209 xsk->app_stats.rx_empty_polls++; 1210 recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL); 1211 } 1212 return; 1213 } 1214 1215 ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); 1216 while (ret != rcvd) { 1217 if (ret < 0) 1218 exit_with_error(-ret); 1219 if (opt_busy_poll || xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { 1220 xsk->app_stats.fill_fail_polls++; 1221 recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL); 1222 } 1223 ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); 1224 } 1225 1226 for (i = 0; i < rcvd; i++) { 1227 u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; 1228 u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len; 1229 u64 orig = xsk_umem__extract_addr(addr); 1230 1231 addr = xsk_umem__add_offset_to_addr(addr); 1232 char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr); 1233 1234 hex_dump(pkt, len, addr); 1235 *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = orig; 1236 } 1237 1238 xsk_ring_prod__submit(&xsk->umem->fq, rcvd); 1239 xsk_ring_cons__release(&xsk->rx, rcvd); 1240 xsk->ring_stats.rx_npkts += rcvd; 1241 } 1242 1243 static void rx_drop_all(void) 1244 { 1245 struct pollfd fds[MAX_SOCKS] = {}; 1246 int i, ret; 1247 1248 for (i = 0; i < num_socks; i++) { 1249 fds[i].fd = xsk_socket__fd(xsks[i]->xsk); 1250 fds[i].events = POLLIN; 1251 } 1252 1253 for (;;) { 1254 if (opt_poll) { 1255 for (i = 0; i < num_socks; i++) 1256 xsks[i]->app_stats.opt_polls++; 1257 ret = poll(fds, num_socks, opt_timeout); 1258 if (ret <= 0) 1259 continue; 1260 } 1261 1262 for (i = 0; i < num_socks; i++) 1263 rx_drop(xsks[i]); 1264 1265 if (benchmark_done) 1266 break; 1267 } 1268 } 1269 1270 static void tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, int batch_size) 1271 { 1272 u32 idx; 1273 unsigned int i; 1274 1275 while (xsk_ring_prod__reserve(&xsk->tx, batch_size, &idx) < 1276 batch_size) { 1277 complete_tx_only(xsk, batch_size); 1278 if (benchmark_done) 1279 return; 1280 } 1281 1282 for (i = 0; i < batch_size; i++) { 1283 struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, 1284 idx + i); 1285 tx_desc->addr = (*frame_nb + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT; 1286 tx_desc->len = PKT_SIZE; 1287 } 1288 1289 xsk_ring_prod__submit(&xsk->tx, batch_size); 1290 xsk->ring_stats.tx_npkts += batch_size; 1291 xsk->outstanding_tx += batch_size; 1292 *frame_nb += batch_size; 1293 *frame_nb %= NUM_FRAMES; 1294 complete_tx_only(xsk, batch_size); 1295 } 1296 1297 static inline int get_batch_size(int pkt_cnt) 1298 { 1299 if (!opt_pkt_count) 1300 return opt_batch_size; 1301 1302 if (pkt_cnt + opt_batch_size <= opt_pkt_count) 1303 return opt_batch_size; 1304 1305 return opt_pkt_count - pkt_cnt; 1306 } 1307 1308 static void complete_tx_only_all(void) 1309 { 1310 bool pending; 1311 int i; 1312 1313 do { 1314 pending = false; 1315 for (i = 0; i < num_socks; i++) { 1316 if (xsks[i]->outstanding_tx) { 1317 complete_tx_only(xsks[i], opt_batch_size); 1318 pending = !!xsks[i]->outstanding_tx; 1319 } 1320 } 1321 } while (pending); 1322 } 1323 1324 static void tx_only_all(void) 1325 { 1326 struct pollfd fds[MAX_SOCKS] = {}; 1327 u32 frame_nb[MAX_SOCKS] = {}; 1328 int pkt_cnt = 0; 1329 int i, ret; 1330 1331 for (i = 0; i < num_socks; i++) { 1332 fds[0].fd = xsk_socket__fd(xsks[i]->xsk); 1333 fds[0].events = POLLOUT; 1334 } 1335 1336 while ((opt_pkt_count && pkt_cnt < opt_pkt_count) || !opt_pkt_count) { 1337 int batch_size = get_batch_size(pkt_cnt); 1338 1339 if (opt_poll) { 1340 for (i = 0; i < num_socks; i++) 1341 xsks[i]->app_stats.opt_polls++; 1342 ret = poll(fds, num_socks, opt_timeout); 1343 if (ret <= 0) 1344 continue; 1345 1346 if (!(fds[0].revents & POLLOUT)) 1347 continue; 1348 } 1349 1350 for (i = 0; i < num_socks; i++) 1351 tx_only(xsks[i], &frame_nb[i], batch_size); 1352 1353 pkt_cnt += batch_size; 1354 1355 if (benchmark_done) 1356 break; 1357 } 1358 1359 if (opt_pkt_count) 1360 complete_tx_only_all(); 1361 } 1362 1363 static void l2fwd(struct xsk_socket_info *xsk) 1364 { 1365 unsigned int rcvd, i; 1366 u32 idx_rx = 0, idx_tx = 0; 1367 int ret; 1368 1369 complete_tx_l2fwd(xsk); 1370 1371 rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); 1372 if (!rcvd) { 1373 if (opt_busy_poll || xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { 1374 xsk->app_stats.rx_empty_polls++; 1375 recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL); 1376 } 1377 return; 1378 } 1379 xsk->ring_stats.rx_npkts += rcvd; 1380 1381 ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); 1382 while (ret != rcvd) { 1383 if (ret < 0) 1384 exit_with_error(-ret); 1385 complete_tx_l2fwd(xsk); 1386 if (opt_busy_poll || xsk_ring_prod__needs_wakeup(&xsk->tx)) { 1387 xsk->app_stats.tx_wakeup_sendtos++; 1388 kick_tx(xsk); 1389 } 1390 ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); 1391 } 1392 1393 for (i = 0; i < rcvd; i++) { 1394 u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; 1395 u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len; 1396 u64 orig = addr; 1397 1398 addr = xsk_umem__add_offset_to_addr(addr); 1399 char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr); 1400 1401 swap_mac_addresses(pkt); 1402 1403 hex_dump(pkt, len, addr); 1404 xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = orig; 1405 xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len; 1406 } 1407 1408 xsk_ring_prod__submit(&xsk->tx, rcvd); 1409 xsk_ring_cons__release(&xsk->rx, rcvd); 1410 1411 xsk->ring_stats.tx_npkts += rcvd; 1412 xsk->outstanding_tx += rcvd; 1413 } 1414 1415 static void l2fwd_all(void) 1416 { 1417 struct pollfd fds[MAX_SOCKS] = {}; 1418 int i, ret; 1419 1420 for (;;) { 1421 if (opt_poll) { 1422 for (i = 0; i < num_socks; i++) { 1423 fds[i].fd = xsk_socket__fd(xsks[i]->xsk); 1424 fds[i].events = POLLOUT | POLLIN; 1425 xsks[i]->app_stats.opt_polls++; 1426 } 1427 ret = poll(fds, num_socks, opt_timeout); 1428 if (ret <= 0) 1429 continue; 1430 } 1431 1432 for (i = 0; i < num_socks; i++) 1433 l2fwd(xsks[i]); 1434 1435 if (benchmark_done) 1436 break; 1437 } 1438 } 1439 1440 static void load_xdp_program(char **argv, struct bpf_object **obj) 1441 { 1442 struct bpf_prog_load_attr prog_load_attr = { 1443 .prog_type = BPF_PROG_TYPE_XDP, 1444 }; 1445 char xdp_filename[256]; 1446 int prog_fd; 1447 1448 snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv[0]); 1449 prog_load_attr.file = xdp_filename; 1450 1451 if (bpf_prog_load_xattr(&prog_load_attr, obj, &prog_fd)) 1452 exit(EXIT_FAILURE); 1453 if (prog_fd < 0) { 1454 fprintf(stderr, "ERROR: no program found: %s\n", 1455 strerror(prog_fd)); 1456 exit(EXIT_FAILURE); 1457 } 1458 1459 if (bpf_set_link_xdp_fd(opt_ifindex, prog_fd, opt_xdp_flags) < 0) { 1460 fprintf(stderr, "ERROR: link set xdp fd failed\n"); 1461 exit(EXIT_FAILURE); 1462 } 1463 } 1464 1465 static void enter_xsks_into_map(struct bpf_object *obj) 1466 { 1467 struct bpf_map *map; 1468 int i, xsks_map; 1469 1470 map = bpf_object__find_map_by_name(obj, "xsks_map"); 1471 xsks_map = bpf_map__fd(map); 1472 if (xsks_map < 0) { 1473 fprintf(stderr, "ERROR: no xsks map found: %s\n", 1474 strerror(xsks_map)); 1475 exit(EXIT_FAILURE); 1476 } 1477 1478 for (i = 0; i < num_socks; i++) { 1479 int fd = xsk_socket__fd(xsks[i]->xsk); 1480 int key, ret; 1481 1482 key = i; 1483 ret = bpf_map_update_elem(xsks_map, &key, &fd, 0); 1484 if (ret) { 1485 fprintf(stderr, "ERROR: bpf_map_update_elem %d\n", i); 1486 exit(EXIT_FAILURE); 1487 } 1488 } 1489 } 1490 1491 static void apply_setsockopt(struct xsk_socket_info *xsk) 1492 { 1493 int sock_opt; 1494 1495 if (!opt_busy_poll) 1496 return; 1497 1498 sock_opt = 1; 1499 if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_PREFER_BUSY_POLL, 1500 (void *)&sock_opt, sizeof(sock_opt)) < 0) 1501 exit_with_error(errno); 1502 1503 sock_opt = 20; 1504 if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL, 1505 (void *)&sock_opt, sizeof(sock_opt)) < 0) 1506 exit_with_error(errno); 1507 1508 sock_opt = opt_batch_size; 1509 if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL_BUDGET, 1510 (void *)&sock_opt, sizeof(sock_opt)) < 0) 1511 exit_with_error(errno); 1512 } 1513 1514 static int recv_xsks_map_fd_from_ctrl_node(int sock, int *_fd) 1515 { 1516 char cms[CMSG_SPACE(sizeof(int))]; 1517 struct cmsghdr *cmsg; 1518 struct msghdr msg; 1519 struct iovec iov; 1520 int value; 1521 int len; 1522 1523 iov.iov_base = &value; 1524 iov.iov_len = sizeof(int); 1525 1526 msg.msg_name = 0; 1527 msg.msg_namelen = 0; 1528 msg.msg_iov = &iov; 1529 msg.msg_iovlen = 1; 1530 msg.msg_flags = 0; 1531 msg.msg_control = (caddr_t)cms; 1532 msg.msg_controllen = sizeof(cms); 1533 1534 len = recvmsg(sock, &msg, 0); 1535 1536 if (len < 0) { 1537 fprintf(stderr, "Recvmsg failed length incorrect.\n"); 1538 return -EINVAL; 1539 } 1540 1541 if (len == 0) { 1542 fprintf(stderr, "Recvmsg failed no data\n"); 1543 return -EINVAL; 1544 } 1545 1546 cmsg = CMSG_FIRSTHDR(&msg); 1547 *_fd = *(int *)CMSG_DATA(cmsg); 1548 1549 return 0; 1550 } 1551 1552 static int 1553 recv_xsks_map_fd(int *xsks_map_fd) 1554 { 1555 struct sockaddr_un server; 1556 int err; 1557 1558 sock = socket(AF_UNIX, SOCK_STREAM, 0); 1559 if (sock < 0) { 1560 fprintf(stderr, "Error opening socket stream: %s", strerror(errno)); 1561 return errno; 1562 } 1563 1564 server.sun_family = AF_UNIX; 1565 strcpy(server.sun_path, SOCKET_NAME); 1566 1567 if (connect(sock, (struct sockaddr *)&server, sizeof(struct sockaddr_un)) < 0) { 1568 close(sock); 1569 fprintf(stderr, "Error connecting stream socket: %s", strerror(errno)); 1570 return errno; 1571 } 1572 1573 err = recv_xsks_map_fd_from_ctrl_node(sock, xsks_map_fd); 1574 if (err) { 1575 fprintf(stderr, "Error %d receiving fd\n", err); 1576 return err; 1577 } 1578 return 0; 1579 } 1580 1581 int main(int argc, char **argv) 1582 { 1583 struct __user_cap_header_struct hdr = { _LINUX_CAPABILITY_VERSION_3, 0 }; 1584 struct __user_cap_data_struct data[2] = { { 0 } }; 1585 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 1586 bool rx = false, tx = false; 1587 struct xsk_umem_info *umem; 1588 struct bpf_object *obj; 1589 int xsks_map_fd = 0; 1590 pthread_t pt; 1591 int i, ret; 1592 void *bufs; 1593 1594 parse_command_line(argc, argv); 1595 1596 if (opt_reduced_cap) { 1597 if (capget(&hdr, data) < 0) 1598 fprintf(stderr, "Error getting capabilities\n"); 1599 1600 data->effective &= CAP_TO_MASK(CAP_NET_RAW); 1601 data->permitted &= CAP_TO_MASK(CAP_NET_RAW); 1602 1603 if (capset(&hdr, data) < 0) 1604 fprintf(stderr, "Setting capabilities failed\n"); 1605 1606 if (capget(&hdr, data) < 0) { 1607 fprintf(stderr, "Error getting capabilities\n"); 1608 } else { 1609 fprintf(stderr, "Capabilities EFF %x Caps INH %x Caps Per %x\n", 1610 data[0].effective, data[0].inheritable, data[0].permitted); 1611 fprintf(stderr, "Capabilities EFF %x Caps INH %x Caps Per %x\n", 1612 data[1].effective, data[1].inheritable, data[1].permitted); 1613 } 1614 } else { 1615 if (setrlimit(RLIMIT_MEMLOCK, &r)) { 1616 fprintf(stderr, "ERROR: setrlimit(RLIMIT_MEMLOCK) \"%s\"\n", 1617 strerror(errno)); 1618 exit(EXIT_FAILURE); 1619 } 1620 1621 if (opt_num_xsks > 1) 1622 load_xdp_program(argv, &obj); 1623 } 1624 1625 /* Reserve memory for the umem. Use hugepages if unaligned chunk mode */ 1626 bufs = mmap(NULL, NUM_FRAMES * opt_xsk_frame_size, 1627 PROT_READ | PROT_WRITE, 1628 MAP_PRIVATE | MAP_ANONYMOUS | opt_mmap_flags, -1, 0); 1629 if (bufs == MAP_FAILED) { 1630 printf("ERROR: mmap failed\n"); 1631 exit(EXIT_FAILURE); 1632 } 1633 1634 /* Create sockets... */ 1635 umem = xsk_configure_umem(bufs, NUM_FRAMES * opt_xsk_frame_size); 1636 if (opt_bench == BENCH_RXDROP || opt_bench == BENCH_L2FWD) { 1637 rx = true; 1638 xsk_populate_fill_ring(umem); 1639 } 1640 if (opt_bench == BENCH_L2FWD || opt_bench == BENCH_TXONLY) 1641 tx = true; 1642 for (i = 0; i < opt_num_xsks; i++) 1643 xsks[num_socks++] = xsk_configure_socket(umem, rx, tx); 1644 1645 for (i = 0; i < opt_num_xsks; i++) 1646 apply_setsockopt(xsks[i]); 1647 1648 if (opt_bench == BENCH_TXONLY) { 1649 gen_eth_hdr_data(); 1650 1651 for (i = 0; i < NUM_FRAMES; i++) 1652 gen_eth_frame(umem, i * opt_xsk_frame_size); 1653 } 1654 1655 if (opt_num_xsks > 1 && opt_bench != BENCH_TXONLY) 1656 enter_xsks_into_map(obj); 1657 1658 if (opt_reduced_cap) { 1659 ret = recv_xsks_map_fd(&xsks_map_fd); 1660 if (ret) { 1661 fprintf(stderr, "Error %d receiving xsks_map_fd\n", ret); 1662 exit_with_error(ret); 1663 } 1664 if (xsks[0]->xsk) { 1665 ret = xsk_socket__update_xskmap(xsks[0]->xsk, xsks_map_fd); 1666 if (ret) { 1667 fprintf(stderr, "Update of BPF map failed(%d)\n", ret); 1668 exit_with_error(ret); 1669 } 1670 } 1671 } 1672 1673 signal(SIGINT, int_exit); 1674 signal(SIGTERM, int_exit); 1675 signal(SIGABRT, int_exit); 1676 1677 setlocale(LC_ALL, ""); 1678 1679 if (!opt_quiet) { 1680 ret = pthread_create(&pt, NULL, poller, NULL); 1681 if (ret) 1682 exit_with_error(ret); 1683 } 1684 1685 prev_time = get_nsecs(); 1686 start_time = prev_time; 1687 1688 if (opt_bench == BENCH_RXDROP) 1689 rx_drop_all(); 1690 else if (opt_bench == BENCH_TXONLY) 1691 tx_only_all(); 1692 else 1693 l2fwd_all(); 1694 1695 benchmark_done = true; 1696 1697 if (!opt_quiet) 1698 pthread_join(pt, NULL); 1699 1700 xdpsock_cleanup(); 1701 1702 munmap(bufs, NUM_FRAMES * opt_xsk_frame_size); 1703 1704 return 0; 1705 } 1706
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.