/*
 * Linux Socket Filter - Kernel level socket filtering
 *
 * Based on the design of the Berkeley Packet Filter. The new
 * internal format has been designed by PLUMgrid:
 *
 *	Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
 *
 * Authors:
 *
 *	Jay Schulist <jschlst@samba.org>
 *	Alexei Starovoitov <ast@plumgrid.com>
 *	Daniel Borkmann <dborkman@redhat.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Andi Kleen - Fix a few bad bugs and races.
 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/gfp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/flow_dissector.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
#include <asm/unaligned.h>
#include <linux/filter.h>
#include <linux/ratelimit.h>
#include <linux/seccomp.h>
#include <linux/if_vlan.h>
#include <linux/bpf.h>
#include <net/sch_generic.h>
#include <net/cls_cgroup.h>
#include <net/dst_metadata.h>
#include <net/dst.h>
#include <net/sock_reuseport.h>

/**
 *	sk_filter_trim_cap - run a packet through a socket filter
 *	@sk: sock associated with &sk_buff
 *	@skb: buffer to filter
 *	@cap: limit on how short the eBPF program may trim the packet
 *
 * Run the eBPF program and then trim skb->data to the size returned by
 * the program, but never below @cap. If the returned length is 0 we toss
 * the packet. If skb->len is already smaller than the returned length we
 * keep the whole skb->data. This is the socket level wrapper to
 * BPF_PROG_RUN. It returns 0 if the packet should be accepted or -EPERM
 * if the packet should be tossed.
 */
int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
{
	int err;
	struct sk_filter *filter;

	/*
	 * If the skb was allocated from pfmemalloc reserves, only
	 * allow SOCK_MEMALLOC sockets to use it as this socket is
	 * helping free memory
	 */
	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
		return -ENOMEM;

	err = security_sock_rcv_skb(sk, skb);
	if (err)
		return err;

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter) {
		unsigned int pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
		err = pkt_len ?
		      pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
	}
	rcu_read_unlock();

	return err;
}
EXPORT_SYMBOL(sk_filter_trim_cap);

static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
{
	return skb_get_poff((struct sk_buff *)(unsigned long) ctx);
}

static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
{
	struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
	struct nlattr *nla;

	if (skb_is_nonlinear(skb))
		return 0;

	if (skb->len < sizeof(struct nlattr))
		return 0;

	if (a > skb->len - sizeof(struct nlattr))
		return 0;

	nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
	if (nla)
		return (void *) nla - (void *) skb->data;

	return 0;
}

static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
{
	struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
	struct nlattr *nla;

	if (skb_is_nonlinear(skb))
		return 0;

	if (skb->len < sizeof(struct nlattr))
		return 0;

	if (a > skb->len - sizeof(struct nlattr))
		return 0;

	nla = (struct nlattr *) &skb->data[a];
	if (nla->nla_len > skb->len - a)
		return 0;

	nla = nla_find_nested(nla, x);
	if (nla)
		return (void *) nla - (void *) skb->data;

	return 0;
}

static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
{
	return raw_smp_processor_id();
}

static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
	.func		= __get_raw_cpu_id,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
};

static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
			      struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;

	switch (skb_field) {
	case SKF_AD_MARK:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);

		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sk_buff, mark));
		break;

	case SKF_AD_PKTTYPE:
		*insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET());
		*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
		*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
#endif
		break;

	case SKF_AD_QUEUE:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);

		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
				      offsetof(struct sk_buff, queue_mapping));
		break;

	case SKF_AD_VLAN_TAG:
	case SKF_AD_VLAN_TAG_PRESENT:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
		BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);

		/* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
				      offsetof(struct sk_buff, vlan_tci));
		if (skb_field == SKF_AD_VLAN_TAG) {
			*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg,
						~VLAN_TAG_PRESENT);
		} else {
			/* dst_reg >>= 12 */
			*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12);
			/* dst_reg &= 1 */
			*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1);
		}
		break;
	}

	return insn - insn_buf;
}

static bool convert_bpf_extensions(struct sock_filter *fp,
				   struct bpf_insn **insnp)
{
	struct bpf_insn *insn = *insnp;
	u32 cnt;

	switch (fp->k) {
	case SKF_AD_OFF + SKF_AD_PROTOCOL:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);

		/* A = *(u16 *) (CTX + offsetof(protocol)) */
		*insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
				      offsetof(struct
					       sk_buff, protocol));
		/* A = ntohs(A) [emitting a nop or swap16] */
		*insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
		break;

	case SKF_AD_OFF + SKF_AD_PKTTYPE:
		cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_IFINDEX:
	case SKF_AD_OFF + SKF_AD_HATYPE:
		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2);
		BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0);

		*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)),
				      BPF_REG_TMP, BPF_REG_CTX,
				      offsetof(struct sk_buff, dev));
		/* if (tmp != 0) goto pc + 1 */
		*insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
		*insn++ = BPF_EXIT_INSN();
		if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
			*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
					    offsetof(struct net_device, ifindex));
		else
			*insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
					    offsetof(struct net_device, type));
		break;

	case SKF_AD_OFF + SKF_AD_MARK:
		cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_RXHASH:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);

		*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
				    offsetof(struct sk_buff, hash));
		break;

	case SKF_AD_OFF + SKF_AD_QUEUE:
		cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_VLAN_TAG:
		cnt = convert_skb_access(SKF_AD_VLAN_TAG,
					 BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
		cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
					 BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_VLAN_TPID:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);

		/* A = *(u16 *) (CTX + offsetof(vlan_proto)) */
		*insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
				      offsetof(struct sk_buff, vlan_proto));
		/* A = ntohs(A) [emitting a nop or swap16] */
		*insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
		break;

	case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
	case SKF_AD_OFF + SKF_AD_NLATTR:
	case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
	case SKF_AD_OFF + SKF_AD_CPU:
	case SKF_AD_OFF + SKF_AD_RANDOM:
		/* arg1 = CTX */
		*insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
		/* arg2 = A */
		*insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
		/* arg3 = X */
		*insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
		/* Emit call(arg1=CTX, arg2=A, arg3=X) */
		switch (fp->k) {
		case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
			*insn = BPF_EMIT_CALL(__skb_get_pay_offset);
			break;
		case SKF_AD_OFF + SKF_AD_NLATTR:
			*insn = BPF_EMIT_CALL(__skb_get_nlattr);
			break;
		case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
			*insn = BPF_EMIT_CALL(__skb_get_nlattr_nest);
			break;
		case SKF_AD_OFF + SKF_AD_CPU:
			*insn = BPF_EMIT_CALL(__get_raw_cpu_id);
			break;
		case SKF_AD_OFF + SKF_AD_RANDOM:
			*insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
			bpf_user_rnd_init_once();
			break;
		}
		break;

	case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
		/* A ^= X */
		*insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
		break;

	default:
		/* This is just a dummy call to avoid letting the compiler
		 * evict __bpf_call_base() as
		 * an optimization. Placed here where no-one bothers.
		 */
		BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
		return false;
	}

	*insnp = insn;
	return true;
}

/**
 *	bpf_convert_filter - convert filter program
 *	@prog: the user passed filter program
 *	@len: the length of the user passed filter program
 *	@new_prog: buffer where converted program will be stored
 *	@new_len: pointer to store length of converted program
 *
 * Remap 'sock_filter' style BPF instruction set to 'sock_filter_ext' style.
 * Conversion workflow:
 *
 * 1) First pass for calculating the new program length:
 *	bpf_convert_filter(old_prog, old_len, NULL, &new_len)
 *
 * 2) 2nd pass to remap in two passes: 1st pass finds new
 *    jump offsets, 2nd pass remapping:
 *	new_prog = kmalloc(sizeof(struct bpf_insn) * new_len);
 *	bpf_convert_filter(old_prog, old_len, new_prog, &new_len);
 */
static int bpf_convert_filter(struct sock_filter *prog, int len,
			      struct bpf_insn *new_prog, int *new_len)
{
	int new_flen = 0, pass = 0, target, i;
	struct bpf_insn *new_insn;
	struct sock_filter *fp;
	int *addrs = NULL;
	u8 bpf_src;

	BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
	BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);

	if (len <= 0 || len > BPF_MAXINSNS)
		return -EINVAL;

	if (new_prog) {
		addrs = kcalloc(len, sizeof(*addrs),
				GFP_KERNEL | __GFP_NOWARN);
		if (!addrs)
			return -ENOMEM;
	}

do_pass:
	new_insn = new_prog;
	fp = prog;

	/* Classic BPF related prologue emission. */
	if (new_insn) {
		/* Classic BPF expects A and X to be reset first. These need
		 * to be guaranteed to be the first two instructions.
		 */
		*new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
		*new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);

		/* All programs must keep CTX in callee saved BPF_REG_CTX.
		 * In eBPF case it's done by the compiler, here we need to
		 * do this ourselves. Initial CTX is present in BPF_REG_ARG1.
		 */
		*new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
	} else {
		new_insn += 3;
	}

	for (i = 0; i < len; fp++, i++) {
		struct bpf_insn tmp_insns[6] = { };
		struct bpf_insn *insn = tmp_insns;

		if (addrs)
			addrs[i] = new_insn - new_prog;

		switch (fp->code) {
		/* All arithmetic insns and skb loads map as-is. */
		case BPF_ALU | BPF_ADD | BPF_X:
		case BPF_ALU | BPF_ADD | BPF_K:
		case BPF_ALU | BPF_SUB | BPF_X:
		case BPF_ALU | BPF_SUB | BPF_K:
		case BPF_ALU | BPF_AND | BPF_X:
		case BPF_ALU | BPF_AND | BPF_K:
		case BPF_ALU | BPF_OR | BPF_X:
		case BPF_ALU | BPF_OR | BPF_K:
		case BPF_ALU | BPF_LSH | BPF_X:
		case BPF_ALU | BPF_LSH | BPF_K:
		case BPF_ALU | BPF_RSH | BPF_X:
		case BPF_ALU | BPF_RSH | BPF_K:
		case BPF_ALU | BPF_XOR | BPF_X:
		case BPF_ALU | BPF_XOR | BPF_K:
		case BPF_ALU | BPF_MUL | BPF_X:
		case BPF_ALU | BPF_MUL | BPF_K:
		case BPF_ALU | BPF_DIV | BPF_X:
		case BPF_ALU | BPF_DIV | BPF_K:
		case BPF_ALU | BPF_MOD | BPF_X:
		case BPF_ALU | BPF_MOD | BPF_K:
		case BPF_ALU | BPF_NEG:
		case BPF_LD | BPF_ABS | BPF_W:
		case BPF_LD | BPF_ABS | BPF_H:
		case BPF_LD | BPF_ABS | BPF_B:
		case BPF_LD | BPF_IND | BPF_W:
		case BPF_LD | BPF_IND | BPF_H:
		case BPF_LD | BPF_IND | BPF_B:
			/* Check for overloaded BPF extension and
			 * directly convert it if found, otherwise
			 * just move on with mapping.
			 */
			if (BPF_CLASS(fp->code) == BPF_LD &&
			    BPF_MODE(fp->code) == BPF_ABS &&
			    convert_bpf_extensions(fp, &insn))
				break;

			*insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
			break;

		/* Jump transformation cannot use BPF block macros
		 * everywhere as offset calculation and target updates
		 * require a bit more work than the rest, i.e. jump
		 * opcodes map as-is, but offsets need adjustment.
		 */

#define BPF_EMIT_JMP							\
	do {								\
		if (target >= len || target < 0)			\
			goto err;					\
		insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0;	\
		/* Adjust pc relative offset for 2nd or 3rd insn. */	\
		insn->off -= insn - tmp_insns;				\
	} while (0)

		case BPF_JMP | BPF_JA:
			target = i + fp->k + 1;
			insn->code = fp->code;
			BPF_EMIT_JMP;
			break;

		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP | BPF_JSET | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_X:
			if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
				/* BPF immediates are signed, zero extend
				 * immediate into tmp register and use it
				 * in compare insn.
				 */
				*insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k);

				insn->dst_reg = BPF_REG_A;
				insn->src_reg = BPF_REG_TMP;
				bpf_src = BPF_X;
			} else {
				insn->dst_reg = BPF_REG_A;
				insn->imm = fp->k;
				bpf_src = BPF_SRC(fp->code);
				insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0;
			}

			/* Common case where 'jump_false' is next insn. */
			if (fp->jf == 0) {
				insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
				target = i + fp->jt + 1;
				BPF_EMIT_JMP;
				break;
			}

			/* Convert JEQ into JNE when 'jump_true' is next insn. */
			if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) {
				insn->code = BPF_JMP | BPF_JNE | bpf_src;
				target = i + fp->jf + 1;
				BPF_EMIT_JMP;
				break;
			}

			/* Other jumps are mapped into two insns: Jxx and JA. */
			target = i + fp->jt + 1;
			insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
			BPF_EMIT_JMP;
			insn++;

			insn->code = BPF_JMP | BPF_JA;
			target = i + fp->jf + 1;
			BPF_EMIT_JMP;
			break;

		/* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */
		case BPF_LDX | BPF_MSH | BPF_B:
			/* tmp = A */
			*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A);
			/* A = BPF_R0 = *(u8 *) (skb->data + K) */
			*insn++ = BPF_LD_ABS(BPF_B, fp->k);
			/* A &= 0xf */
			*insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
			/* A <<= 2 */
			*insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
			/* X = A */
			*insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
			/* A = tmp */
			*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
			break;

		/* RET_K is remapped into 2 insns. RET_A case doesn't need an
		 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
		 */
		case BPF_RET | BPF_A:
		case BPF_RET | BPF_K:
			if (BPF_RVAL(fp->code) == BPF_K)
				*insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0,
							0, fp->k);
			*insn = BPF_EXIT_INSN();
			break;

		/* Store to stack. */
		case BPF_ST:
		case BPF_STX:
			*insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
					    BPF_ST ? BPF_REG_A : BPF_REG_X,
					    -(BPF_MEMWORDS - fp->k) * 4);
			break;

		/* Load from stack. */
		case BPF_LD | BPF_MEM:
		case BPF_LDX | BPF_MEM:
			*insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
					    BPF_REG_A : BPF_REG_X, BPF_REG_FP,
					    -(BPF_MEMWORDS - fp->k) * 4);
			break;

		/* A = K or X = K */
		case BPF_LD | BPF_IMM:
		case BPF_LDX | BPF_IMM:
			*insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
					      BPF_REG_A : BPF_REG_X, fp->k);
			break;

		/* X = A */
		case BPF_MISC | BPF_TAX:
			*insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
			break;

		/* A = X */
		case BPF_MISC | BPF_TXA:
			*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
			break;

		/* A = skb->len or X = skb->len */
		case BPF_LD | BPF_W | BPF_LEN:
		case BPF_LDX | BPF_W | BPF_LEN:
			*insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
					    BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
					    offsetof(struct sk_buff, len));
			break;

		/* Access seccomp_data fields. */
		case BPF_LDX | BPF_ABS | BPF_W:
			/* A = *(u32 *) (ctx + K) */
			*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
			break;

		/* Unknown instruction. */
		default:
			goto err;
		}

		insn++;
		if (new_prog)
			memcpy(new_insn, tmp_insns,
			       sizeof(*insn) * (insn - tmp_insns));
		new_insn += insn - tmp_insns;
	}

	if (!new_prog) {
		/* Only calculating new length. */
		*new_len = new_insn - new_prog;
		return 0;
	}

	pass++;
	if (new_flen != new_insn - new_prog) {
		new_flen = new_insn - new_prog;
		if (pass > 2)
			goto err;
		goto do_pass;
	}

	kfree(addrs);
	BUG_ON(*new_len != new_flen);
	return 0;
err:
	kfree(addrs);
	return -EINVAL;
}

/* Security:
 *
 * As we don't want to clear the mem[] array for each packet going through
 * __bpf_prog_run(), we check that a filter loaded by the user never tries
 * to read a cell that has not previously been written, and we check all
 * branches to be sure a malicious user doesn't try to abuse us.
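 *
 * For example (an illustrative sketch only), a classic filter whose first
 * instruction is BPF_STMT(BPF_LD | BPF_MEM, 3) reads mem[3] before any
 * BPF_STMT(BPF_ST, 3) could have written it, so check_load_and_stores()
 * below rejects the whole program with -EINVAL.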
 */
static int check_load_and_stores(const struct sock_filter *filter, int flen)
{
	u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */
	int pc, ret = 0;

	BUILD_BUG_ON(BPF_MEMWORDS > 16);

	masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);
	if (!masks)
		return -ENOMEM;

	memset(masks, 0xff, flen * sizeof(*masks));

	for (pc = 0; pc < flen; pc++) {
		memvalid &= masks[pc];

		switch (filter[pc].code) {
		case BPF_ST:
		case BPF_STX:
			memvalid |= (1 << filter[pc].k);
			break;
		case BPF_LD | BPF_MEM:
		case BPF_LDX | BPF_MEM:
			if (!(memvalid & (1 << filter[pc].k))) {
				ret = -EINVAL;
				goto error;
			}
			break;
		case BPF_JMP | BPF_JA:
			/* A jump must set masks on target */
			masks[pc + 1 + filter[pc].k] &= memvalid;
			memvalid = ~0;
			break;
		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP | BPF_JSET | BPF_X:
			/* A jump must set masks on targets */
			masks[pc + 1 + filter[pc].jt] &= memvalid;
			masks[pc + 1 + filter[pc].jf] &= memvalid;
			memvalid = ~0;
			break;
		}
	}
error:
	kfree(masks);
	return ret;
}

static bool chk_code_allowed(u16 code_to_probe)
{
	static const bool codes[] = {
		/* 32 bit ALU operations */
		[BPF_ALU | BPF_ADD | BPF_K] = true,
		[BPF_ALU | BPF_ADD | BPF_X] = true,
		[BPF_ALU | BPF_SUB | BPF_K] = true,
		[BPF_ALU | BPF_SUB | BPF_X] = true,
		[BPF_ALU | BPF_MUL | BPF_K] = true,
		[BPF_ALU | BPF_MUL | BPF_X] = true,
		[BPF_ALU | BPF_DIV | BPF_K] = true,
		[BPF_ALU | BPF_DIV | BPF_X] = true,
		[BPF_ALU | BPF_MOD | BPF_K] = true,
		[BPF_ALU | BPF_MOD | BPF_X] = true,
		[BPF_ALU | BPF_AND | BPF_K] = true,
		[BPF_ALU | BPF_AND | BPF_X] = true,
		[BPF_ALU | BPF_OR | BPF_K] = true,
		[BPF_ALU | BPF_OR | BPF_X] = true,
		[BPF_ALU | BPF_XOR | BPF_K] = true,
		[BPF_ALU | BPF_XOR | BPF_X] = true,
		[BPF_ALU | BPF_LSH | BPF_K] = true,
		[BPF_ALU | BPF_LSH | BPF_X] = true,
		[BPF_ALU | BPF_RSH | BPF_K] = true,
		[BPF_ALU | BPF_RSH | BPF_X] = true,
		[BPF_ALU | BPF_NEG] = true,
		/* Load instructions */
		[BPF_LD | BPF_W | BPF_ABS] = true,
		[BPF_LD | BPF_H | BPF_ABS] = true,
		[BPF_LD | BPF_B | BPF_ABS] = true,
		[BPF_LD | BPF_W | BPF_LEN] = true,
		[BPF_LD | BPF_W | BPF_IND] = true,
		[BPF_LD | BPF_H | BPF_IND] = true,
		[BPF_LD | BPF_B | BPF_IND] = true,
		[BPF_LD | BPF_IMM] = true,
		[BPF_LD | BPF_MEM] = true,
		[BPF_LDX | BPF_W | BPF_LEN] = true,
		[BPF_LDX | BPF_B | BPF_MSH] = true,
		[BPF_LDX | BPF_IMM] = true,
		[BPF_LDX | BPF_MEM] = true,
		/* Store instructions */
		[BPF_ST] = true,
		[BPF_STX] = true,
		/* Misc instructions */
		[BPF_MISC | BPF_TAX] = true,
		[BPF_MISC | BPF_TXA] = true,
		/* Return instructions */
		[BPF_RET | BPF_K] = true,
		[BPF_RET | BPF_A] = true,
		/* Jump instructions */
		[BPF_JMP | BPF_JA] = true,
		[BPF_JMP | BPF_JEQ | BPF_K] = true,
		[BPF_JMP | BPF_JEQ | BPF_X] = true,
		[BPF_JMP | BPF_JGE | BPF_K] = true,
		[BPF_JMP | BPF_JGE | BPF_X] = true,
		[BPF_JMP | BPF_JGT | BPF_K] = true,
		[BPF_JMP | BPF_JGT | BPF_X] = true,
		[BPF_JMP | BPF_JSET | BPF_K] = true,
		[BPF_JMP | BPF_JSET | BPF_X] = true,
	};

	if (code_to_probe >= ARRAY_SIZE(codes))
		return false;

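	/* Any opcode not whitelisted in the table above (including the
	 * eBPF-only opcodes) is reported as disallowed here, which makes
	 * bpf_check_classic() fail before any conversion work is done.
	 */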
754 return codes[code_to_probe]; 755 } 756 757 static bool bpf_check_basics_ok(const struct sock_filter *filter, 758 unsigned int flen) 759 { 760 if (filter == NULL) 761 return false; 762 if (flen == 0 || flen > BPF_MAXINSNS) 763 return false; 764 765 return true; 766 } 767 768 /** 769 * bpf_check_classic - verify socket filter code 770 * @filter: filter to verify 771 * @flen: length of filter 772 * 773 * Check the user's filter code. If we let some ugly 774 * filter code slip through kaboom! The filter must contain 775 * no references or jumps that are out of range, no illegal 776 * instructions, and must end with a RET instruction. 777 * 778 * All jumps are forward as they are not signed. 779 * 780 * Returns 0 if the rule set is legal or -EINVAL if not. 781 */ 782 static int bpf_check_classic(const struct sock_filter *filter, 783 unsigned int flen) 784 { 785 bool anc_found; 786 int pc; 787 788 /* Check the filter code now */ 789 for (pc = 0; pc < flen; pc++) { 790 const struct sock_filter *ftest = &filter[pc]; 791 792 /* May we actually operate on this code? */ 793 if (!chk_code_allowed(ftest->code)) 794 return -EINVAL; 795 796 /* Some instructions need special checks */ 797 switch (ftest->code) { 798 case BPF_ALU | BPF_DIV | BPF_K: 799 case BPF_ALU | BPF_MOD | BPF_K: 800 /* Check for division by zero */ 801 if (ftest->k == 0) 802 return -EINVAL; 803 break; 804 case BPF_ALU | BPF_LSH | BPF_K: 805 case BPF_ALU | BPF_RSH | BPF_K: 806 if (ftest->k >= 32) 807 return -EINVAL; 808 break; 809 case BPF_LD | BPF_MEM: 810 case BPF_LDX | BPF_MEM: 811 case BPF_ST: 812 case BPF_STX: 813 /* Check for invalid memory addresses */ 814 if (ftest->k >= BPF_MEMWORDS) 815 return -EINVAL; 816 break; 817 case BPF_JMP | BPF_JA: 818 /* Note, the large ftest->k might cause loops. 819 * Compare this with conditional jumps below, 820 * where offsets are limited. 
--ANK (981016) 821 */ 822 if (ftest->k >= (unsigned int)(flen - pc - 1)) 823 return -EINVAL; 824 break; 825 case BPF_JMP | BPF_JEQ | BPF_K: 826 case BPF_JMP | BPF_JEQ | BPF_X: 827 case BPF_JMP | BPF_JGE | BPF_K: 828 case BPF_JMP | BPF_JGE | BPF_X: 829 case BPF_JMP | BPF_JGT | BPF_K: 830 case BPF_JMP | BPF_JGT | BPF_X: 831 case BPF_JMP | BPF_JSET | BPF_K: 832 case BPF_JMP | BPF_JSET | BPF_X: 833 /* Both conditionals must be safe */ 834 if (pc + ftest->jt + 1 >= flen || 835 pc + ftest->jf + 1 >= flen) 836 return -EINVAL; 837 break; 838 case BPF_LD | BPF_W | BPF_ABS: 839 case BPF_LD | BPF_H | BPF_ABS: 840 case BPF_LD | BPF_B | BPF_ABS: 841 anc_found = false; 842 if (bpf_anc_helper(ftest) & BPF_ANC) 843 anc_found = true; 844 /* Ancillary operation unknown or unsupported */ 845 if (anc_found == false && ftest->k >= SKF_AD_OFF) 846 return -EINVAL; 847 } 848 } 849 850 /* Last instruction must be a RET code */ 851 switch (filter[flen - 1].code) { 852 case BPF_RET | BPF_K: 853 case BPF_RET | BPF_A: 854 return check_load_and_stores(filter, flen); 855 } 856 857 return -EINVAL; 858 } 859 860 static int bpf_prog_store_orig_filter(struct bpf_prog *fp, 861 const struct sock_fprog *fprog) 862 { 863 unsigned int fsize = bpf_classic_proglen(fprog); 864 struct sock_fprog_kern *fkprog; 865 866 fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL); 867 if (!fp->orig_prog) 868 return -ENOMEM; 869 870 fkprog = fp->orig_prog; 871 fkprog->len = fprog->len; 872 873 fkprog->filter = kmemdup(fp->insns, fsize, 874 GFP_KERNEL | __GFP_NOWARN); 875 if (!fkprog->filter) { 876 kfree(fp->orig_prog); 877 return -ENOMEM; 878 } 879 880 return 0; 881 } 882 883 static void bpf_release_orig_filter(struct bpf_prog *fp) 884 { 885 struct sock_fprog_kern *fprog = fp->orig_prog; 886 887 if (fprog) { 888 kfree(fprog->filter); 889 kfree(fprog); 890 } 891 } 892 893 static void __bpf_prog_release(struct bpf_prog *prog) 894 { 895 if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) { 896 bpf_prog_put(prog); 897 } else { 898 bpf_release_orig_filter(prog); 899 bpf_prog_free(prog); 900 } 901 } 902 903 static void __sk_filter_release(struct sk_filter *fp) 904 { 905 __bpf_prog_release(fp->prog); 906 kfree(fp); 907 } 908 909 /** 910 * sk_filter_release_rcu - Release a socket filter by rcu_head 911 * @rcu: rcu_head that contains the sk_filter to free 912 */ 913 static void sk_filter_release_rcu(struct rcu_head *rcu) 914 { 915 struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); 916 917 __sk_filter_release(fp); 918 } 919 920 /** 921 * sk_filter_release - release a socket filter 922 * @fp: filter to remove 923 * 924 * Remove a filter from a socket and release its resources. 
925 */ 926 static void sk_filter_release(struct sk_filter *fp) 927 { 928 if (atomic_dec_and_test(&fp->refcnt)) 929 call_rcu(&fp->rcu, sk_filter_release_rcu); 930 } 931 932 void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) 933 { 934 u32 filter_size = bpf_prog_size(fp->prog->len); 935 936 atomic_sub(filter_size, &sk->sk_omem_alloc); 937 sk_filter_release(fp); 938 } 939 940 /* try to charge the socket memory if there is space available 941 * return true on success 942 */ 943 bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) 944 { 945 u32 filter_size = bpf_prog_size(fp->prog->len); 946 947 /* same check as in sock_kmalloc() */ 948 if (filter_size <= sysctl_optmem_max && 949 atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) { 950 atomic_inc(&fp->refcnt); 951 atomic_add(filter_size, &sk->sk_omem_alloc); 952 return true; 953 } 954 return false; 955 } 956 957 static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) 958 { 959 struct sock_filter *old_prog; 960 struct bpf_prog *old_fp; 961 int err, new_len, old_len = fp->len; 962 963 /* We are free to overwrite insns et al right here as it 964 * won't be used at this point in time anymore internally 965 * after the migration to the internal BPF instruction 966 * representation. 967 */ 968 BUILD_BUG_ON(sizeof(struct sock_filter) != 969 sizeof(struct bpf_insn)); 970 971 /* Conversion cannot happen on overlapping memory areas, 972 * so we need to keep the user BPF around until the 2nd 973 * pass. At this time, the user BPF is stored in fp->insns. 974 */ 975 old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter), 976 GFP_KERNEL | __GFP_NOWARN); 977 if (!old_prog) { 978 err = -ENOMEM; 979 goto out_err; 980 } 981 982 /* 1st pass: calculate the new program length. */ 983 err = bpf_convert_filter(old_prog, old_len, NULL, &new_len); 984 if (err) 985 goto out_err_free; 986 987 /* Expand fp for appending the new filter representation. */ 988 old_fp = fp; 989 fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0); 990 if (!fp) { 991 /* The old_fp is still around in case we couldn't 992 * allocate new memory, so uncharge on that one. 993 */ 994 fp = old_fp; 995 err = -ENOMEM; 996 goto out_err_free; 997 } 998 999 fp->len = new_len; 1000 1001 /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ 1002 err = bpf_convert_filter(old_prog, old_len, fp->insnsi, &new_len); 1003 if (err) 1004 /* 2nd bpf_convert_filter() can fail only if it fails 1005 * to allocate memory, remapping must succeed. Note, 1006 * that at this time old_fp has already been released 1007 * by krealloc(). 1008 */ 1009 goto out_err_free; 1010 1011 /* We are guaranteed to never error here with cBPF to eBPF 1012 * transitions, since there's no issue with type compatibility 1013 * checks on program arrays. 1014 */ 1015 fp = bpf_prog_select_runtime(fp, &err); 1016 1017 kfree(old_prog); 1018 return fp; 1019 1020 out_err_free: 1021 kfree(old_prog); 1022 out_err: 1023 __bpf_prog_release(fp); 1024 return ERR_PTR(err); 1025 } 1026 1027 static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, 1028 bpf_aux_classic_check_t trans) 1029 { 1030 int err; 1031 1032 fp->bpf_func = NULL; 1033 fp->jited = 0; 1034 1035 err = bpf_check_classic(fp->insns, fp->len); 1036 if (err) { 1037 __bpf_prog_release(fp); 1038 return ERR_PTR(err); 1039 } 1040 1041 /* There might be additional checks and transformations 1042 * needed on classic filters, f.e. in case of seccomp. 
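	 * Seccomp, for instance, passes its own classic-filter checker as
	 * @trans, which validates (and may rewrite) loads of struct
	 * seccomp_data fields.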
1043 */ 1044 if (trans) { 1045 err = trans(fp->insns, fp->len); 1046 if (err) { 1047 __bpf_prog_release(fp); 1048 return ERR_PTR(err); 1049 } 1050 } 1051 1052 /* Probe if we can JIT compile the filter and if so, do 1053 * the compilation of the filter. 1054 */ 1055 bpf_jit_compile(fp); 1056 1057 /* JIT compiler couldn't process this filter, so do the 1058 * internal BPF translation for the optimized interpreter. 1059 */ 1060 if (!fp->jited) 1061 fp = bpf_migrate_filter(fp); 1062 1063 return fp; 1064 } 1065 1066 /** 1067 * bpf_prog_create - create an unattached filter 1068 * @pfp: the unattached filter that is created 1069 * @fprog: the filter program 1070 * 1071 * Create a filter independent of any socket. We first run some 1072 * sanity checks on it to make sure it does not explode on us later. 1073 * If an error occurs or there is insufficient memory for the filter 1074 * a negative errno code is returned. On success the return is zero. 1075 */ 1076 int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) 1077 { 1078 unsigned int fsize = bpf_classic_proglen(fprog); 1079 struct bpf_prog *fp; 1080 1081 /* Make sure new filter is there and in the right amounts. */ 1082 if (!bpf_check_basics_ok(fprog->filter, fprog->len)) 1083 return -EINVAL; 1084 1085 fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); 1086 if (!fp) 1087 return -ENOMEM; 1088 1089 memcpy(fp->insns, fprog->filter, fsize); 1090 1091 fp->len = fprog->len; 1092 /* Since unattached filters are not copied back to user 1093 * space through sk_get_filter(), we do not need to hold 1094 * a copy here, and can spare us the work. 1095 */ 1096 fp->orig_prog = NULL; 1097 1098 /* bpf_prepare_filter() already takes care of freeing 1099 * memory in case something goes wrong. 1100 */ 1101 fp = bpf_prepare_filter(fp, NULL); 1102 if (IS_ERR(fp)) 1103 return PTR_ERR(fp); 1104 1105 *pfp = fp; 1106 return 0; 1107 } 1108 EXPORT_SYMBOL_GPL(bpf_prog_create); 1109 1110 /** 1111 * bpf_prog_create_from_user - create an unattached filter from user buffer 1112 * @pfp: the unattached filter that is created 1113 * @fprog: the filter program 1114 * @trans: post-classic verifier transformation handler 1115 * @save_orig: save classic BPF program 1116 * 1117 * This function effectively does the same as bpf_prog_create(), only 1118 * that it builds up its insns buffer from user space provided buffer. 1119 * It also allows for passing a bpf_aux_classic_check_t handler. 1120 */ 1121 int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, 1122 bpf_aux_classic_check_t trans, bool save_orig) 1123 { 1124 unsigned int fsize = bpf_classic_proglen(fprog); 1125 struct bpf_prog *fp; 1126 int err; 1127 1128 /* Make sure new filter is there and in the right amounts. */ 1129 if (!bpf_check_basics_ok(fprog->filter, fprog->len)) 1130 return -EINVAL; 1131 1132 fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); 1133 if (!fp) 1134 return -ENOMEM; 1135 1136 if (copy_from_user(fp->insns, fprog->filter, fsize)) { 1137 __bpf_prog_free(fp); 1138 return -EFAULT; 1139 } 1140 1141 fp->len = fprog->len; 1142 fp->orig_prog = NULL; 1143 1144 if (save_orig) { 1145 err = bpf_prog_store_orig_filter(fp, fprog); 1146 if (err) { 1147 __bpf_prog_free(fp); 1148 return -ENOMEM; 1149 } 1150 } 1151 1152 /* bpf_prepare_filter() already takes care of freeing 1153 * memory in case something goes wrong. 
1154 */ 1155 fp = bpf_prepare_filter(fp, trans); 1156 if (IS_ERR(fp)) 1157 return PTR_ERR(fp); 1158 1159 *pfp = fp; 1160 return 0; 1161 } 1162 EXPORT_SYMBOL_GPL(bpf_prog_create_from_user); 1163 1164 void bpf_prog_destroy(struct bpf_prog *fp) 1165 { 1166 __bpf_prog_release(fp); 1167 } 1168 EXPORT_SYMBOL_GPL(bpf_prog_destroy); 1169 1170 static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) 1171 { 1172 struct sk_filter *fp, *old_fp; 1173 1174 fp = kmalloc(sizeof(*fp), GFP_KERNEL); 1175 if (!fp) 1176 return -ENOMEM; 1177 1178 fp->prog = prog; 1179 atomic_set(&fp->refcnt, 0); 1180 1181 if (!sk_filter_charge(sk, fp)) { 1182 kfree(fp); 1183 return -ENOMEM; 1184 } 1185 1186 old_fp = rcu_dereference_protected(sk->sk_filter, 1187 lockdep_sock_is_held(sk)); 1188 rcu_assign_pointer(sk->sk_filter, fp); 1189 1190 if (old_fp) 1191 sk_filter_uncharge(sk, old_fp); 1192 1193 return 0; 1194 } 1195 1196 static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk) 1197 { 1198 struct bpf_prog *old_prog; 1199 int err; 1200 1201 if (bpf_prog_size(prog->len) > sysctl_optmem_max) 1202 return -ENOMEM; 1203 1204 if (sk_unhashed(sk) && sk->sk_reuseport) { 1205 err = reuseport_alloc(sk); 1206 if (err) 1207 return err; 1208 } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { 1209 /* The socket wasn't bound with SO_REUSEPORT */ 1210 return -EINVAL; 1211 } 1212 1213 old_prog = reuseport_attach_prog(sk, prog); 1214 if (old_prog) 1215 bpf_prog_destroy(old_prog); 1216 1217 return 0; 1218 } 1219 1220 static 1221 struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) 1222 { 1223 unsigned int fsize = bpf_classic_proglen(fprog); 1224 struct bpf_prog *prog; 1225 int err; 1226 1227 if (sock_flag(sk, SOCK_FILTER_LOCKED)) 1228 return ERR_PTR(-EPERM); 1229 1230 /* Make sure new filter is there and in the right amounts. */ 1231 if (!bpf_check_basics_ok(fprog->filter, fprog->len)) 1232 return ERR_PTR(-EINVAL); 1233 1234 prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); 1235 if (!prog) 1236 return ERR_PTR(-ENOMEM); 1237 1238 if (copy_from_user(prog->insns, fprog->filter, fsize)) { 1239 __bpf_prog_free(prog); 1240 return ERR_PTR(-EFAULT); 1241 } 1242 1243 prog->len = fprog->len; 1244 1245 err = bpf_prog_store_orig_filter(prog, fprog); 1246 if (err) { 1247 __bpf_prog_free(prog); 1248 return ERR_PTR(-ENOMEM); 1249 } 1250 1251 /* bpf_prepare_filter() already takes care of freeing 1252 * memory in case something goes wrong. 1253 */ 1254 return bpf_prepare_filter(prog, NULL); 1255 } 1256 1257 /** 1258 * sk_attach_filter - attach a socket filter 1259 * @fprog: the filter program 1260 * @sk: the socket to use 1261 * 1262 * Attach the user's filter code. We first run some sanity checks on 1263 * it to make sure it does not explode on us later. If an error 1264 * occurs or there is insufficient memory for the filter a negative 1265 * errno code is returned. On success the return is zero. 
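 *
 * A minimal userspace sketch (illustrative only) that reaches this function
 * via the SO_ATTACH_FILTER socket option, using a one instruction accept-all
 * classic filter built from the <linux/filter.h> definitions:
 *
 *	struct sock_filter code[] = {
 *		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),
 *	};
 *	struct sock_fprog fprog = {
 *		.len	= sizeof(code) / sizeof(code[0]),
 *		.filter	= code,
 *	};
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog, sizeof(fprog));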
1266 */ 1267 int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) 1268 { 1269 struct bpf_prog *prog = __get_filter(fprog, sk); 1270 int err; 1271 1272 if (IS_ERR(prog)) 1273 return PTR_ERR(prog); 1274 1275 err = __sk_attach_prog(prog, sk); 1276 if (err < 0) { 1277 __bpf_prog_release(prog); 1278 return err; 1279 } 1280 1281 return 0; 1282 } 1283 EXPORT_SYMBOL_GPL(sk_attach_filter); 1284 1285 int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) 1286 { 1287 struct bpf_prog *prog = __get_filter(fprog, sk); 1288 int err; 1289 1290 if (IS_ERR(prog)) 1291 return PTR_ERR(prog); 1292 1293 err = __reuseport_attach_prog(prog, sk); 1294 if (err < 0) { 1295 __bpf_prog_release(prog); 1296 return err; 1297 } 1298 1299 return 0; 1300 } 1301 1302 static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) 1303 { 1304 if (sock_flag(sk, SOCK_FILTER_LOCKED)) 1305 return ERR_PTR(-EPERM); 1306 1307 return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); 1308 } 1309 1310 int sk_attach_bpf(u32 ufd, struct sock *sk) 1311 { 1312 struct bpf_prog *prog = __get_bpf(ufd, sk); 1313 int err; 1314 1315 if (IS_ERR(prog)) 1316 return PTR_ERR(prog); 1317 1318 err = __sk_attach_prog(prog, sk); 1319 if (err < 0) { 1320 bpf_prog_put(prog); 1321 return err; 1322 } 1323 1324 return 0; 1325 } 1326 1327 int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) 1328 { 1329 struct bpf_prog *prog = __get_bpf(ufd, sk); 1330 int err; 1331 1332 if (IS_ERR(prog)) 1333 return PTR_ERR(prog); 1334 1335 err = __reuseport_attach_prog(prog, sk); 1336 if (err < 0) { 1337 bpf_prog_put(prog); 1338 return err; 1339 } 1340 1341 return 0; 1342 } 1343 1344 struct bpf_scratchpad { 1345 union { 1346 __be32 diff[MAX_BPF_STACK / sizeof(__be32)]; 1347 u8 buff[MAX_BPF_STACK]; 1348 }; 1349 }; 1350 1351 static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); 1352 1353 static inline int bpf_try_make_writable(struct sk_buff *skb, 1354 unsigned int write_len) 1355 { 1356 int err; 1357 1358 err = skb_ensure_writable(skb, write_len); 1359 bpf_compute_data_end(skb); 1360 1361 return err; 1362 } 1363 1364 static inline void bpf_push_mac_rcsum(struct sk_buff *skb) 1365 { 1366 if (skb_at_tc_ingress(skb)) 1367 skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len); 1368 } 1369 1370 static inline void bpf_pull_mac_rcsum(struct sk_buff *skb) 1371 { 1372 if (skb_at_tc_ingress(skb)) 1373 skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len); 1374 } 1375 1376 static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) 1377 { 1378 struct sk_buff *skb = (struct sk_buff *) (long) r1; 1379 unsigned int offset = (unsigned int) r2; 1380 void *from = (void *) (long) r3; 1381 unsigned int len = (unsigned int) r4; 1382 void *ptr; 1383 1384 if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) 1385 return -EINVAL; 1386 if (unlikely(offset > 0xffff)) 1387 return -EFAULT; 1388 if (unlikely(bpf_try_make_writable(skb, offset + len))) 1389 return -EFAULT; 1390 1391 ptr = skb->data + offset; 1392 if (flags & BPF_F_RECOMPUTE_CSUM) 1393 __skb_postpull_rcsum(skb, ptr, len, offset); 1394 1395 memcpy(ptr, from, len); 1396 1397 if (flags & BPF_F_RECOMPUTE_CSUM) 1398 __skb_postpush_rcsum(skb, ptr, len, offset); 1399 if (flags & BPF_F_INVALIDATE_HASH) 1400 skb_clear_hash(skb); 1401 1402 return 0; 1403 } 1404 1405 static const struct bpf_func_proto bpf_skb_store_bytes_proto = { 1406 .func = bpf_skb_store_bytes, 1407 .gpl_only = false, 1408 .ret_type = RET_INTEGER, 1409 .arg1_type = ARG_PTR_TO_CTX, 1410 .arg2_type = ARG_ANYTHING, 
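	/* arg3/arg4 below form a (pointer, length) pair: the verifier
	 * requires r3 to point into the program's stack and r4 to be a
	 * constant size that fits within that stack buffer.
	 */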
1411 .arg3_type = ARG_PTR_TO_STACK, 1412 .arg4_type = ARG_CONST_STACK_SIZE, 1413 .arg5_type = ARG_ANYTHING, 1414 }; 1415 1416 static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) 1417 { 1418 const struct sk_buff *skb = (const struct sk_buff *)(unsigned long) r1; 1419 unsigned int offset = (unsigned int) r2; 1420 void *to = (void *)(unsigned long) r3; 1421 unsigned int len = (unsigned int) r4; 1422 void *ptr; 1423 1424 if (unlikely(offset > 0xffff)) 1425 goto err_clear; 1426 1427 ptr = skb_header_pointer(skb, offset, len, to); 1428 if (unlikely(!ptr)) 1429 goto err_clear; 1430 if (ptr != to) 1431 memcpy(to, ptr, len); 1432 1433 return 0; 1434 err_clear: 1435 memset(to, 0, len); 1436 return -EFAULT; 1437 } 1438 1439 static const struct bpf_func_proto bpf_skb_load_bytes_proto = { 1440 .func = bpf_skb_load_bytes, 1441 .gpl_only = false, 1442 .ret_type = RET_INTEGER, 1443 .arg1_type = ARG_PTR_TO_CTX, 1444 .arg2_type = ARG_ANYTHING, 1445 .arg3_type = ARG_PTR_TO_RAW_STACK, 1446 .arg4_type = ARG_CONST_STACK_SIZE, 1447 }; 1448 1449 static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) 1450 { 1451 struct sk_buff *skb = (struct sk_buff *) (long) r1; 1452 unsigned int offset = (unsigned int) r2; 1453 __sum16 *ptr; 1454 1455 if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK))) 1456 return -EINVAL; 1457 if (unlikely(offset > 0xffff || offset & 1)) 1458 return -EFAULT; 1459 if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) 1460 return -EFAULT; 1461 1462 ptr = (__sum16 *)(skb->data + offset); 1463 switch (flags & BPF_F_HDR_FIELD_MASK) { 1464 case 0: 1465 if (unlikely(from != 0)) 1466 return -EINVAL; 1467 1468 csum_replace_by_diff(ptr, to); 1469 break; 1470 case 2: 1471 csum_replace2(ptr, from, to); 1472 break; 1473 case 4: 1474 csum_replace4(ptr, from, to); 1475 break; 1476 default: 1477 return -EINVAL; 1478 } 1479 1480 return 0; 1481 } 1482 1483 static const struct bpf_func_proto bpf_l3_csum_replace_proto = { 1484 .func = bpf_l3_csum_replace, 1485 .gpl_only = false, 1486 .ret_type = RET_INTEGER, 1487 .arg1_type = ARG_PTR_TO_CTX, 1488 .arg2_type = ARG_ANYTHING, 1489 .arg3_type = ARG_ANYTHING, 1490 .arg4_type = ARG_ANYTHING, 1491 .arg5_type = ARG_ANYTHING, 1492 }; 1493 1494 static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) 1495 { 1496 struct sk_buff *skb = (struct sk_buff *) (long) r1; 1497 bool is_pseudo = flags & BPF_F_PSEUDO_HDR; 1498 bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; 1499 unsigned int offset = (unsigned int) r2; 1500 __sum16 *ptr; 1501 1502 if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR | 1503 BPF_F_HDR_FIELD_MASK))) 1504 return -EINVAL; 1505 if (unlikely(offset > 0xffff || offset & 1)) 1506 return -EFAULT; 1507 if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) 1508 return -EFAULT; 1509 1510 ptr = (__sum16 *)(skb->data + offset); 1511 if (is_mmzero && !*ptr) 1512 return 0; 1513 1514 switch (flags & BPF_F_HDR_FIELD_MASK) { 1515 case 0: 1516 if (unlikely(from != 0)) 1517 return -EINVAL; 1518 1519 inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo); 1520 break; 1521 case 2: 1522 inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); 1523 break; 1524 case 4: 1525 inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo); 1526 break; 1527 default: 1528 return -EINVAL; 1529 } 1530 1531 if (is_mmzero && !*ptr) 1532 *ptr = CSUM_MANGLED_0; 1533 return 0; 1534 } 1535 1536 static const struct bpf_func_proto bpf_l4_csum_replace_proto = { 1537 .func = bpf_l4_csum_replace, 1538 .gpl_only = 
false, 1539 .ret_type = RET_INTEGER, 1540 .arg1_type = ARG_PTR_TO_CTX, 1541 .arg2_type = ARG_ANYTHING, 1542 .arg3_type = ARG_ANYTHING, 1543 .arg4_type = ARG_ANYTHING, 1544 .arg5_type = ARG_ANYTHING, 1545 }; 1546 1547 static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed) 1548 { 1549 struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); 1550 u64 diff_size = from_size + to_size; 1551 __be32 *from = (__be32 *) (long) r1; 1552 __be32 *to = (__be32 *) (long) r3; 1553 int i, j = 0; 1554 1555 /* This is quite flexible, some examples: 1556 * 1557 * from_size == 0, to_size > 0, seed := csum --> pushing data 1558 * from_size > 0, to_size == 0, seed := csum --> pulling data 1559 * from_size > 0, to_size > 0, seed := 0 --> diffing data 1560 * 1561 * Even for diffing, from_size and to_size don't need to be equal. 1562 */ 1563 if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) || 1564 diff_size > sizeof(sp->diff))) 1565 return -EINVAL; 1566 1567 for (i = 0; i < from_size / sizeof(__be32); i++, j++) 1568 sp->diff[j] = ~from[i]; 1569 for (i = 0; i < to_size / sizeof(__be32); i++, j++) 1570 sp->diff[j] = to[i]; 1571 1572 return csum_partial(sp->diff, diff_size, seed); 1573 } 1574 1575 static const struct bpf_func_proto bpf_csum_diff_proto = { 1576 .func = bpf_csum_diff, 1577 .gpl_only = false, 1578 .ret_type = RET_INTEGER, 1579 .arg1_type = ARG_PTR_TO_STACK, 1580 .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO, 1581 .arg3_type = ARG_PTR_TO_STACK, 1582 .arg4_type = ARG_CONST_STACK_SIZE_OR_ZERO, 1583 .arg5_type = ARG_ANYTHING, 1584 }; 1585 1586 static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) 1587 { 1588 return dev_forward_skb(dev, skb); 1589 } 1590 1591 static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) 1592 { 1593 int ret; 1594 1595 if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) { 1596 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); 1597 kfree_skb(skb); 1598 return -ENETDOWN; 1599 } 1600 1601 skb->dev = dev; 1602 1603 __this_cpu_inc(xmit_recursion); 1604 ret = dev_queue_xmit(skb); 1605 __this_cpu_dec(xmit_recursion); 1606 1607 return ret; 1608 } 1609 1610 static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) 1611 { 1612 struct sk_buff *skb = (struct sk_buff *) (long) r1; 1613 struct net_device *dev; 1614 1615 if (unlikely(flags & ~(BPF_F_INGRESS))) 1616 return -EINVAL; 1617 1618 dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); 1619 if (unlikely(!dev)) 1620 return -EINVAL; 1621 1622 skb = skb_clone(skb, GFP_ATOMIC); 1623 if (unlikely(!skb)) 1624 return -ENOMEM; 1625 1626 bpf_push_mac_rcsum(skb); 1627 1628 return flags & BPF_F_INGRESS ? 
1629 __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); 1630 } 1631 1632 static const struct bpf_func_proto bpf_clone_redirect_proto = { 1633 .func = bpf_clone_redirect, 1634 .gpl_only = false, 1635 .ret_type = RET_INTEGER, 1636 .arg1_type = ARG_PTR_TO_CTX, 1637 .arg2_type = ARG_ANYTHING, 1638 .arg3_type = ARG_ANYTHING, 1639 }; 1640 1641 struct redirect_info { 1642 u32 ifindex; 1643 u32 flags; 1644 }; 1645 1646 static DEFINE_PER_CPU(struct redirect_info, redirect_info); 1647 1648 static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5) 1649 { 1650 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 1651 1652 if (unlikely(flags & ~(BPF_F_INGRESS))) 1653 return TC_ACT_SHOT; 1654 1655 ri->ifindex = ifindex; 1656 ri->flags = flags; 1657 1658 return TC_ACT_REDIRECT; 1659 } 1660 1661 int skb_do_redirect(struct sk_buff *skb) 1662 { 1663 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 1664 struct net_device *dev; 1665 1666 dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex); 1667 ri->ifindex = 0; 1668 if (unlikely(!dev)) { 1669 kfree_skb(skb); 1670 return -EINVAL; 1671 } 1672 1673 bpf_push_mac_rcsum(skb); 1674 1675 return ri->flags & BPF_F_INGRESS ? 1676 __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); 1677 } 1678 1679 static const struct bpf_func_proto bpf_redirect_proto = { 1680 .func = bpf_redirect, 1681 .gpl_only = false, 1682 .ret_type = RET_INTEGER, 1683 .arg1_type = ARG_ANYTHING, 1684 .arg2_type = ARG_ANYTHING, 1685 }; 1686 1687 static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) 1688 { 1689 return task_get_classid((struct sk_buff *) (unsigned long) r1); 1690 } 1691 1692 static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { 1693 .func = bpf_get_cgroup_classid, 1694 .gpl_only = false, 1695 .ret_type = RET_INTEGER, 1696 .arg1_type = ARG_PTR_TO_CTX, 1697 }; 1698 1699 static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) 1700 { 1701 return dst_tclassid((struct sk_buff *) (unsigned long) r1); 1702 } 1703 1704 static const struct bpf_func_proto bpf_get_route_realm_proto = { 1705 .func = bpf_get_route_realm, 1706 .gpl_only = false, 1707 .ret_type = RET_INTEGER, 1708 .arg1_type = ARG_PTR_TO_CTX, 1709 }; 1710 1711 static u64 bpf_get_hash_recalc(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) 1712 { 1713 /* If skb_clear_hash() was called due to mangling, we can 1714 * trigger SW recalculation here. Later access to hash 1715 * can then use the inline skb->hash via context directly 1716 * instead of calling this helper again. 
1717 */ 1718 return skb_get_hash((struct sk_buff *) (unsigned long) r1); 1719 } 1720 1721 static const struct bpf_func_proto bpf_get_hash_recalc_proto = { 1722 .func = bpf_get_hash_recalc, 1723 .gpl_only = false, 1724 .ret_type = RET_INTEGER, 1725 .arg1_type = ARG_PTR_TO_CTX, 1726 }; 1727 1728 static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5) 1729 { 1730 struct sk_buff *skb = (struct sk_buff *) (long) r1; 1731 __be16 vlan_proto = (__force __be16) r2; 1732 int ret; 1733 1734 if (unlikely(vlan_proto != htons(ETH_P_8021Q) && 1735 vlan_proto != htons(ETH_P_8021AD))) 1736 vlan_proto = htons(ETH_P_8021Q); 1737 1738 bpf_push_mac_rcsum(skb); 1739 ret = skb_vlan_push(skb, vlan_proto, vlan_tci); 1740 bpf_pull_mac_rcsum(skb); 1741 1742 bpf_compute_data_end(skb); 1743 return ret; 1744 } 1745 1746 const struct bpf_func_proto bpf_skb_vlan_push_proto = { 1747 .func = bpf_skb_vlan_push, 1748 .gpl_only = false, 1749 .ret_type = RET_INTEGER, 1750 .arg1_type = ARG_PTR_TO_CTX, 1751 .arg2_type = ARG_ANYTHING, 1752 .arg3_type = ARG_ANYTHING, 1753 }; 1754 EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto); 1755 1756 static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) 1757 { 1758 struct sk_buff *skb = (struct sk_buff *) (long) r1; 1759 int ret; 1760 1761 bpf_push_mac_rcsum(skb); 1762 ret = skb_vlan_pop(skb); 1763 bpf_pull_mac_rcsum(skb); 1764 1765 bpf_compute_data_end(skb); 1766 return ret; 1767 } 1768 1769 const struct bpf_func_proto bpf_skb_vlan_pop_proto = { 1770 .func = bpf_skb_vlan_pop, 1771 .gpl_only = false, 1772 .ret_type = RET_INTEGER, 1773 .arg1_type = ARG_PTR_TO_CTX, 1774 }; 1775 EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto); 1776 1777 static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) 1778 { 1779 /* Caller already did skb_cow() with len as headroom, 1780 * so no need to do it here. 1781 */ 1782 skb_push(skb, len); 1783 memmove(skb->data, skb->data + len, off); 1784 memset(skb->data + off, 0, len); 1785 1786 /* No skb_postpush_rcsum(skb, skb->data + off, len) 1787 * needed here as it does not change the skb->csum 1788 * result for checksum complete when summing over 1789 * zeroed blocks. 1790 */ 1791 return 0; 1792 } 1793 1794 static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len) 1795 { 1796 /* skb_ensure_writable() is not needed here, as we're 1797 * already working on an uncloned skb. 1798 */ 1799 if (unlikely(!pskb_may_pull(skb, off + len))) 1800 return -ENOMEM; 1801 1802 skb_postpull_rcsum(skb, skb->data + off, len); 1803 memmove(skb->data + len, skb->data, off); 1804 __skb_pull(skb, len); 1805 1806 return 0; 1807 } 1808 1809 static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len) 1810 { 1811 bool trans_same = skb->transport_header == skb->network_header; 1812 int ret; 1813 1814 /* There's no need for __skb_push()/__skb_pull() pair to 1815 * get to the start of the mac header as we're guaranteed 1816 * to always start from here under eBPF. 1817 */ 1818 ret = bpf_skb_generic_push(skb, off, len); 1819 if (likely(!ret)) { 1820 skb->mac_header -= len; 1821 skb->network_header -= len; 1822 if (trans_same) 1823 skb->transport_header = skb->network_header; 1824 } 1825 1826 return ret; 1827 } 1828 1829 static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len) 1830 { 1831 bool trans_same = skb->transport_header == skb->network_header; 1832 int ret; 1833 1834 /* Same here, __skb_push()/__skb_pull() pair not needed. 
*/ 1835 ret = bpf_skb_generic_pop(skb, off, len); 1836 if (likely(!ret)) { 1837 skb->mac_header += len; 1838 skb->network_header += len; 1839 if (trans_same) 1840 skb->transport_header = skb->network_header; 1841 } 1842 1843 return ret; 1844 } 1845 1846 static int bpf_skb_proto_4_to_6(struct sk_buff *skb) 1847 { 1848 const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); 1849 u32 off = skb->network_header - skb->mac_header; 1850 int ret; 1851 1852 ret = skb_cow(skb, len_diff); 1853 if (unlikely(ret < 0)) 1854 return ret; 1855 1856 ret = bpf_skb_net_hdr_push(skb, off, len_diff); 1857 if (unlikely(ret < 0)) 1858 return ret; 1859 1860 if (skb_is_gso(skb)) { 1861 /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV4 needs to 1862 * be changed into SKB_GSO_TCPV6. 1863 */ 1864 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { 1865 skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV4; 1866 skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; 1867 } 1868 1869 /* Due to IPv6 header, MSS needs to be downgraded. */ 1870 skb_shinfo(skb)->gso_size -= len_diff; 1871 /* Header must be checked, and gso_segs recomputed. */ 1872 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; 1873 skb_shinfo(skb)->gso_segs = 0; 1874 } 1875 1876 skb->protocol = htons(ETH_P_IPV6); 1877 skb_clear_hash(skb); 1878 1879 return 0; 1880 } 1881 1882 static int bpf_skb_proto_6_to_4(struct sk_buff *skb) 1883 { 1884 const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); 1885 u32 off = skb->network_header - skb->mac_header; 1886 int ret; 1887 1888 ret = skb_unclone(skb, GFP_ATOMIC); 1889 if (unlikely(ret < 0)) 1890 return ret; 1891 1892 ret = bpf_skb_net_hdr_pop(skb, off, len_diff); 1893 if (unlikely(ret < 0)) 1894 return ret; 1895 1896 if (skb_is_gso(skb)) { 1897 /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV6 needs to 1898 * be changed into SKB_GSO_TCPV4. 1899 */ 1900 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) { 1901 skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV6; 1902 skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; 1903 } 1904 1905 /* Due to IPv4 header, MSS can be upgraded. */ 1906 skb_shinfo(skb)->gso_size += len_diff; 1907 /* Header must be checked, and gso_segs recomputed. */ 1908 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; 1909 skb_shinfo(skb)->gso_segs = 0; 1910 } 1911 1912 skb->protocol = htons(ETH_P_IP); 1913 skb_clear_hash(skb); 1914 1915 return 0; 1916 } 1917 1918 static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) 1919 { 1920 __be16 from_proto = skb->protocol; 1921 1922 if (from_proto == htons(ETH_P_IP) && 1923 to_proto == htons(ETH_P_IPV6)) 1924 return bpf_skb_proto_4_to_6(skb); 1925 1926 if (from_proto == htons(ETH_P_IPV6) && 1927 to_proto == htons(ETH_P_IP)) 1928 return bpf_skb_proto_6_to_4(skb); 1929 1930 return -ENOTSUPP; 1931 } 1932 1933 static u64 bpf_skb_change_proto(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) 1934 { 1935 struct sk_buff *skb = (struct sk_buff *) (long) r1; 1936 __be16 proto = (__force __be16) r2; 1937 int ret; 1938 1939 if (unlikely(flags)) 1940 return -EINVAL; 1941 1942 /* General idea is that this helper does the basic groundwork 1943 * needed for changing the protocol, and eBPF program fills the 1944 * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace() 1945 * and other helpers, rather than passing a raw buffer here. 1946 * 1947 * The rationale is to keep this minimal and without a need to 1948 * deal with raw packet data. F.e. even if we would pass buffers 1949 * here, the program still needs to call the bpf_lX_csum_replace() 1950 * helpers anyway. 
Plus, this way we keep also separation of 1951 * concerns, since f.e. bpf_skb_store_bytes() should only take 1952 * care of stores. 1953 * 1954 * Currently, additional options and extension header space are 1955 * not supported, but flags register is reserved so we can adapt 1956 * that. For offloads, we mark packet as dodgy, so that headers 1957 * need to be verified first. 1958 */ 1959 ret = bpf_skb_proto_xlat(skb, proto); 1960 bpf_compute_data_end(skb); 1961 return ret; 1962 } 1963 1964 static const struct bpf_func_proto bpf_skb_change_proto_proto = { 1965 .func = bpf_skb_change_proto, 1966 .gpl_only = false, 1967 .ret_type = RET_INTEGER, 1968 .arg1_type = ARG_PTR_TO_CTX, 1969 .arg2_type = ARG_ANYTHING, 1970 .arg3_type = ARG_ANYTHING, 1971 }; 1972 1973 static u64 bpf_skb_change_type(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) 1974 { 1975 struct sk_buff *skb = (struct sk_buff *) (long) r1; 1976 u32 pkt_type = r2; 1977 1978 /* We only allow a restricted subset to be changed for now. */ 1979 if (unlikely(skb->pkt_type > PACKET_OTHERHOST || 1980 pkt_type > PACKET_OTHERHOST)) 1981 return -EINVAL; 1982 1983 skb->pkt_type = pkt_type; 1984 return 0; 1985 } 1986 1987 static const struct bpf_func_proto bpf_skb_change_type_proto = { 1988 .func = bpf_skb_change_type, 1989 .gpl_only = false, 1990 .ret_type = RET_INTEGER, 1991 .arg1_type = ARG_PTR_TO_CTX, 1992 .arg2_type = ARG_ANYTHING, 1993 }; 1994 1995 bool bpf_helper_changes_skb_data(void *func) 1996 { 1997 if (func == bpf_skb_vlan_push) 1998 return true; 1999 if (func == bpf_skb_vlan_pop) 2000 return true; 2001 if (func == bpf_skb_store_bytes) 2002 return true; 2003 if (func == bpf_skb_change_proto) 2004 return true; 2005 if (func == bpf_l3_csum_replace) 2006 return true; 2007 if (func == bpf_l4_csum_replace) 2008 return true; 2009 2010 return false; 2011 } 2012 2013 static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, 2014 unsigned long off, unsigned long len) 2015 { 2016 void *ptr = skb_header_pointer(skb, off, len, dst_buff); 2017 2018 if (unlikely(!ptr)) 2019 return len; 2020 if (ptr != dst_buff) 2021 memcpy(dst_buff, ptr, len); 2022 2023 return 0; 2024 } 2025 2026 static u64 bpf_skb_event_output(u64 r1, u64 r2, u64 flags, u64 r4, 2027 u64 meta_size) 2028 { 2029 struct sk_buff *skb = (struct sk_buff *)(long) r1; 2030 struct bpf_map *map = (struct bpf_map *)(long) r2; 2031 u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32; 2032 void *meta = (void *)(long) r4; 2033 2034 if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) 2035 return -EINVAL; 2036 if (unlikely(skb_size > skb->len)) 2037 return -EFAULT; 2038 2039 return bpf_event_output(map, flags, meta, meta_size, skb, skb_size, 2040 bpf_skb_copy); 2041 } 2042 2043 static const struct bpf_func_proto bpf_skb_event_output_proto = { 2044 .func = bpf_skb_event_output, 2045 .gpl_only = true, 2046 .ret_type = RET_INTEGER, 2047 .arg1_type = ARG_PTR_TO_CTX, 2048 .arg2_type = ARG_CONST_MAP_PTR, 2049 .arg3_type = ARG_ANYTHING, 2050 .arg4_type = ARG_PTR_TO_STACK, 2051 .arg5_type = ARG_CONST_STACK_SIZE, 2052 }; 2053 2054 static unsigned short bpf_tunnel_key_af(u64 flags) 2055 { 2056 return flags & BPF_F_TUNINFO_IPV6 ? 
static unsigned short bpf_tunnel_key_af(u64 flags)
{
	return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET;
}

static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
{
	struct sk_buff *skb = (struct sk_buff *) (long) r1;
	struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2;
	const struct ip_tunnel_info *info = skb_tunnel_info(skb);
	u8 compat[sizeof(struct bpf_tunnel_key)];
	void *to_orig = to;
	int err;

	if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) {
		err = -EINVAL;
		goto err_clear;
	}
	if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) {
		err = -EPROTO;
		goto err_clear;
	}
	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
		err = -EINVAL;
		switch (size) {
		case offsetof(struct bpf_tunnel_key, tunnel_label):
		case offsetof(struct bpf_tunnel_key, tunnel_ext):
			goto set_compat;
		case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
			/* Fixup deprecated structure layouts here, so we have
			 * a common path later on.
			 */
			if (ip_tunnel_info_af(info) != AF_INET)
				goto err_clear;
set_compat:
			to = (struct bpf_tunnel_key *)compat;
			break;
		default:
			goto err_clear;
		}
	}

	to->tunnel_id = be64_to_cpu(info->key.tun_id);
	to->tunnel_tos = info->key.tos;
	to->tunnel_ttl = info->key.ttl;

	if (flags & BPF_F_TUNINFO_IPV6) {
		memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
		       sizeof(to->remote_ipv6));
		to->tunnel_label = be32_to_cpu(info->key.label);
	} else {
		to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
	}

	if (unlikely(size != sizeof(struct bpf_tunnel_key)))
		memcpy(to_orig, to, size);

	return 0;
err_clear:
	memset(to_orig, 0, size);
	return err;
}

static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
	.func		= bpf_skb_get_tunnel_key,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_RAW_STACK,
	.arg3_type	= ARG_CONST_STACK_SIZE,
	.arg4_type	= ARG_ANYTHING,
};

static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
{
	struct sk_buff *skb = (struct sk_buff *) (long) r1;
	u8 *to = (u8 *) (long) r2;
	const struct ip_tunnel_info *info = skb_tunnel_info(skb);
	int err;

	if (unlikely(!info ||
		     !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) {
		err = -ENOENT;
		goto err_clear;
	}
	if (unlikely(size < info->options_len)) {
		err = -ENOMEM;
		goto err_clear;
	}

	ip_tunnel_info_opts_get(to, info);
	if (size > info->options_len)
		memset(to + info->options_len, 0, size - info->options_len);

	return info->options_len;
err_clear:
	memset(to, 0, size);
	return err;
}

static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
	.func		= bpf_skb_get_tunnel_opt,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_RAW_STACK,
	.arg3_type	= ARG_CONST_STACK_SIZE,
};
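
/* Per-cpu scratch metadata dst for the transmit side: allocated lazily by
 * bpf_get_skb_set_tunnel_proto(), filled in by bpf_skb_set_tunnel_key()
 * below and attached to the skb in place of its current dst.
 */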
static struct metadata_dst __percpu *md_dst;

static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
{
	struct sk_buff *skb = (struct sk_buff *) (long) r1;
	struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2;
	struct metadata_dst *md = this_cpu_ptr(md_dst);
	u8 compat[sizeof(struct bpf_tunnel_key)];
	struct ip_tunnel_info *info;

	if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
			       BPF_F_DONT_FRAGMENT)))
		return -EINVAL;
	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
		switch (size) {
		case offsetof(struct bpf_tunnel_key, tunnel_label):
		case offsetof(struct bpf_tunnel_key, tunnel_ext):
		case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
			/* Fixup deprecated structure layouts here, so we have
			 * a common path later on.
			 */
			memcpy(compat, from, size);
			memset(compat + size, 0, sizeof(compat) - size);
			from = (struct bpf_tunnel_key *)compat;
			break;
		default:
			return -EINVAL;
		}
	}
	if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) ||
		     from->tunnel_ext))
		return -EINVAL;

	skb_dst_drop(skb);
	dst_hold((struct dst_entry *) md);
	skb_dst_set(skb, (struct dst_entry *) md);

	info = &md->u.tun_info;
	info->mode = IP_TUNNEL_INFO_TX;

	info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
	if (flags & BPF_F_DONT_FRAGMENT)
		info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;

	info->key.tun_id = cpu_to_be64(from->tunnel_id);
	info->key.tos = from->tunnel_tos;
	info->key.ttl = from->tunnel_ttl;

	if (flags & BPF_F_TUNINFO_IPV6) {
		info->mode |= IP_TUNNEL_INFO_IPV6;
		memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
		       sizeof(from->remote_ipv6));
		info->key.label = cpu_to_be32(from->tunnel_label) &
				  IPV6_FLOWLABEL_MASK;
	} else {
		info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
		if (flags & BPF_F_ZERO_CSUM_TX)
			info->key.tun_flags &= ~TUNNEL_CSUM;
	}

	return 0;
}

static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
	.func		= bpf_skb_set_tunnel_key,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_STACK,
	.arg3_type	= ARG_CONST_STACK_SIZE,
	.arg4_type	= ARG_ANYTHING,
};

static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
{
	struct sk_buff *skb = (struct sk_buff *) (long) r1;
	u8 *from = (u8 *) (long) r2;
	struct ip_tunnel_info *info = skb_tunnel_info(skb);
	const struct metadata_dst *md = this_cpu_ptr(md_dst);

	if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))
		return -EINVAL;
	if (unlikely(size > IP_TUNNEL_OPTS_MAX))
		return -ENOMEM;

	ip_tunnel_info_opts_set(info, from, size);

	return 0;
}

static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
	.func		= bpf_skb_set_tunnel_opt,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_STACK,
	.arg3_type	= ARG_CONST_STACK_SIZE,
};
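
/* Unlike the get variants, the set_tunnel_{key,opt} protos are handed out
 * through this wrapper, so the per-cpu md_dst scratch area is guaranteed to
 * exist before any program that uses these helpers can run.
 */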
static const struct bpf_func_proto *
bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
{
	if (!md_dst) {
		/* Race is not possible, since it's called from verifier
		 * that is holding verifier mutex.
		 */
		md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
						   GFP_KERNEL);
		if (!md_dst)
			return NULL;
	}

	switch (which) {
	case BPF_FUNC_skb_set_tunnel_key:
		return &bpf_skb_set_tunnel_key_proto;
	case BPF_FUNC_skb_set_tunnel_opt:
		return &bpf_skb_set_tunnel_opt_proto;
	default:
		return NULL;
	}
}

#ifdef CONFIG_SOCK_CGROUP_DATA
static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
	struct sk_buff *skb = (struct sk_buff *)(long)r1;
	struct bpf_map *map = (struct bpf_map *)(long)r2;
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct cgroup *cgrp;
	struct sock *sk;
	u32 i = (u32)r3;

	sk = skb->sk;
	if (!sk || !sk_fullsock(sk))
		return -ENOENT;

	if (unlikely(i >= array->map.max_entries))
		return -E2BIG;

	cgrp = READ_ONCE(array->ptrs[i]);
	if (unlikely(!cgrp))
		return -EAGAIN;

	return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), cgrp);
}

static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
	.func		= bpf_skb_under_cgroup,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
};
#endif

static const struct bpf_func_proto *
sk_filter_func_proto(enum bpf_func_id func_id)
{
	switch (func_id) {
	case BPF_FUNC_map_lookup_elem:
		return &bpf_map_lookup_elem_proto;
	case BPF_FUNC_map_update_elem:
		return &bpf_map_update_elem_proto;
	case BPF_FUNC_map_delete_elem:
		return &bpf_map_delete_elem_proto;
	case BPF_FUNC_get_prandom_u32:
		return &bpf_get_prandom_u32_proto;
	case BPF_FUNC_get_smp_processor_id:
		return &bpf_get_raw_smp_processor_id_proto;
	case BPF_FUNC_tail_call:
		return &bpf_tail_call_proto;
	case BPF_FUNC_ktime_get_ns:
		return &bpf_ktime_get_ns_proto;
	case BPF_FUNC_trace_printk:
		if (capable(CAP_SYS_ADMIN))
			return bpf_get_trace_printk_proto();
	default:
		return NULL;
	}
}
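
/* tc classifier/action programs get the full helper set: everything from
 * sk_filter_func_proto() plus the skb mangling, checksum, tunnel and
 * redirect helpers.
 */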
static const struct bpf_func_proto *
tc_cls_act_func_proto(enum bpf_func_id func_id)
{
	switch (func_id) {
	case BPF_FUNC_skb_store_bytes:
		return &bpf_skb_store_bytes_proto;
	case BPF_FUNC_skb_load_bytes:
		return &bpf_skb_load_bytes_proto;
	case BPF_FUNC_csum_diff:
		return &bpf_csum_diff_proto;
	case BPF_FUNC_l3_csum_replace:
		return &bpf_l3_csum_replace_proto;
	case BPF_FUNC_l4_csum_replace:
		return &bpf_l4_csum_replace_proto;
	case BPF_FUNC_clone_redirect:
		return &bpf_clone_redirect_proto;
	case BPF_FUNC_get_cgroup_classid:
		return &bpf_get_cgroup_classid_proto;
	case BPF_FUNC_skb_vlan_push:
		return &bpf_skb_vlan_push_proto;
	case BPF_FUNC_skb_vlan_pop:
		return &bpf_skb_vlan_pop_proto;
	case BPF_FUNC_skb_change_proto:
		return &bpf_skb_change_proto_proto;
	case BPF_FUNC_skb_change_type:
		return &bpf_skb_change_type_proto;
	case BPF_FUNC_skb_get_tunnel_key:
		return &bpf_skb_get_tunnel_key_proto;
	case BPF_FUNC_skb_set_tunnel_key:
		return bpf_get_skb_set_tunnel_proto(func_id);
	case BPF_FUNC_skb_get_tunnel_opt:
		return &bpf_skb_get_tunnel_opt_proto;
	case BPF_FUNC_skb_set_tunnel_opt:
		return bpf_get_skb_set_tunnel_proto(func_id);
	case BPF_FUNC_redirect:
		return &bpf_redirect_proto;
	case BPF_FUNC_get_route_realm:
		return &bpf_get_route_realm_proto;
	case BPF_FUNC_get_hash_recalc:
		return &bpf_get_hash_recalc_proto;
	case BPF_FUNC_perf_event_output:
		return &bpf_skb_event_output_proto;
	case BPF_FUNC_get_smp_processor_id:
		return &bpf_get_smp_processor_id_proto;
#ifdef CONFIG_SOCK_CGROUP_DATA
	case BPF_FUNC_skb_under_cgroup:
		return &bpf_skb_under_cgroup_proto;
#endif
	default:
		return sk_filter_func_proto(func_id);
	}
}

static const struct bpf_func_proto *
xdp_func_proto(enum bpf_func_id func_id)
{
	return sk_filter_func_proto(func_id);
}

static bool __is_valid_access(int off, int size, enum bpf_access_type type)
{
	if (off < 0 || off >= sizeof(struct __sk_buff))
		return false;
	/* The verifier guarantees that size > 0. */
	if (off % size != 0)
		return false;
	if (size != sizeof(__u32))
		return false;

	return true;
}

static bool sk_filter_is_valid_access(int off, int size,
				      enum bpf_access_type type,
				      enum bpf_reg_type *reg_type)
{
	switch (off) {
	case offsetof(struct __sk_buff, tc_classid):
	case offsetof(struct __sk_buff, data):
	case offsetof(struct __sk_buff, data_end):
		return false;
	}

	if (type == BPF_WRITE) {
		switch (off) {
		case offsetof(struct __sk_buff, cb[0]) ...
		     offsetof(struct __sk_buff, cb[4]):
			break;
		default:
			return false;
		}
	}

	return __is_valid_access(off, size, type);
}
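
/* tc programs are allowed more than socket filters: writes to mark,
 * tc_index, priority, tc_classid and cb[], plus reads of data/data_end,
 * which the verifier then tracks as packet pointers.
 */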
static bool tc_cls_act_is_valid_access(int off, int size,
				       enum bpf_access_type type,
				       enum bpf_reg_type *reg_type)
{
	if (type == BPF_WRITE) {
		switch (off) {
		case offsetof(struct __sk_buff, mark):
		case offsetof(struct __sk_buff, tc_index):
		case offsetof(struct __sk_buff, priority):
		case offsetof(struct __sk_buff, cb[0]) ...
		     offsetof(struct __sk_buff, cb[4]):
		case offsetof(struct __sk_buff, tc_classid):
			break;
		default:
			return false;
		}
	}

	switch (off) {
	case offsetof(struct __sk_buff, data):
		*reg_type = PTR_TO_PACKET;
		break;
	case offsetof(struct __sk_buff, data_end):
		*reg_type = PTR_TO_PACKET_END;
		break;
	}

	return __is_valid_access(off, size, type);
}

static bool __is_valid_xdp_access(int off, int size,
				  enum bpf_access_type type)
{
	if (off < 0 || off >= sizeof(struct xdp_md))
		return false;
	if (off % size != 0)
		return false;
	if (size != 4)
		return false;

	return true;
}

static bool xdp_is_valid_access(int off, int size,
				enum bpf_access_type type,
				enum bpf_reg_type *reg_type)
{
	if (type == BPF_WRITE)
		return false;

	switch (off) {
	case offsetof(struct xdp_md, data):
		*reg_type = PTR_TO_PACKET;
		break;
	case offsetof(struct xdp_md, data_end):
		*reg_type = PTR_TO_PACKET_END;
		break;
	}

	return __is_valid_xdp_access(off, size, type);
}

void bpf_warn_invalid_xdp_action(u32 act)
{
	WARN_ONCE(1, "Illegal XDP return value %u, expect packet loss\n", act);
}
EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
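
/* Rewrite accesses to the virtual struct __sk_buff seen by programs into
 * loads/stores on the real struct sk_buff (or the skb control block for
 * cb[], tc_classid and data_end). Invoked through the verifier's
 * convert_ctx_access callback; returns the number of instructions emitted
 * into insn_buf.
 */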
static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
				      int src_reg, int ctx_off,
				      struct bpf_insn *insn_buf,
				      struct bpf_prog *prog)
{
	struct bpf_insn *insn = insn_buf;

	switch (ctx_off) {
	case offsetof(struct __sk_buff, len):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);

		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sk_buff, len));
		break;

	case offsetof(struct __sk_buff, protocol):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);

		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
				      offsetof(struct sk_buff, protocol));
		break;

	case offsetof(struct __sk_buff, vlan_proto):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);

		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
				      offsetof(struct sk_buff, vlan_proto));
		break;

	case offsetof(struct __sk_buff, priority):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4);

		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
					      offsetof(struct sk_buff, priority));
		else
			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
					      offsetof(struct sk_buff, priority));
		break;

	case offsetof(struct __sk_buff, ingress_ifindex):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4);

		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sk_buff, skb_iif));
		break;

	case offsetof(struct __sk_buff, ifindex):
		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);

		*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)),
				      dst_reg, src_reg,
				      offsetof(struct sk_buff, dev));
		*insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
				      offsetof(struct net_device, ifindex));
		break;

	case offsetof(struct __sk_buff, hash):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);

		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sk_buff, hash));
		break;

	case offsetof(struct __sk_buff, mark):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);

		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
					      offsetof(struct sk_buff, mark));
		else
			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
					      offsetof(struct sk_buff, mark));
		break;

	case offsetof(struct __sk_buff, pkt_type):
		return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn);

	case offsetof(struct __sk_buff, queue_mapping):
		return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn);

	case offsetof(struct __sk_buff, vlan_present):
		return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
					  dst_reg, src_reg, insn);

	case offsetof(struct __sk_buff, vlan_tci):
		return convert_skb_access(SKF_AD_VLAN_TAG,
					  dst_reg, src_reg, insn);

	case offsetof(struct __sk_buff, cb[0]) ...
	     offsetof(struct __sk_buff, cb[4]):
		BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);

		prog->cb_access = 1;
		ctx_off -= offsetof(struct __sk_buff, cb[0]);
		ctx_off += offsetof(struct sk_buff, cb);
		ctx_off += offsetof(struct qdisc_skb_cb, data);
		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
		else
			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
		break;

	case offsetof(struct __sk_buff, tc_classid):
		ctx_off -= offsetof(struct __sk_buff, tc_classid);
		ctx_off += offsetof(struct sk_buff, cb);
		ctx_off += offsetof(struct qdisc_skb_cb, tc_classid);
		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
		else
			*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
		break;

	case offsetof(struct __sk_buff, data):
		*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, data)),
				      dst_reg, src_reg,
				      offsetof(struct sk_buff, data));
		break;

	case offsetof(struct __sk_buff, data_end):
		ctx_off -= offsetof(struct __sk_buff, data_end);
		ctx_off += offsetof(struct sk_buff, cb);
		ctx_off += offsetof(struct bpf_skb_data_end, data_end);
		*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(void *)),
				      dst_reg, src_reg, ctx_off);
		break;

	case offsetof(struct __sk_buff, tc_index):
#ifdef CONFIG_NET_SCHED
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);

		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg,
					      offsetof(struct sk_buff, tc_index));
		else
			*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
					      offsetof(struct sk_buff, tc_index));
		break;
#else
		if (type == BPF_WRITE)
			*insn++ = BPF_MOV64_REG(dst_reg, dst_reg);
		else
			*insn++ = BPF_MOV64_IMM(dst_reg, 0);
		break;
#endif
	}

	return insn - insn_buf;
}

static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
				  int src_reg, int ctx_off,
				  struct bpf_insn *insn_buf,
				  struct bpf_prog *prog)
{
	struct bpf_insn *insn = insn_buf;

	switch (ctx_off) {
	case offsetof(struct xdp_md, data):
		*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data)),
				      dst_reg, src_reg,
				      offsetof(struct xdp_buff, data));
		break;
	case offsetof(struct xdp_md, data_end):
		*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data_end)),
				      dst_reg, src_reg,
				      offsetof(struct xdp_buff, data_end));
		break;
	}

	return insn - insn_buf;
}
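
/* Verifier callbacks for each network program type, tied to their program
 * types and registered with the bpf core at late_initcall time below.
 */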
static const struct bpf_verifier_ops sk_filter_ops = {
	.get_func_proto		= sk_filter_func_proto,
	.is_valid_access	= sk_filter_is_valid_access,
	.convert_ctx_access	= bpf_net_convert_ctx_access,
};

static const struct bpf_verifier_ops tc_cls_act_ops = {
	.get_func_proto		= tc_cls_act_func_proto,
	.is_valid_access	= tc_cls_act_is_valid_access,
	.convert_ctx_access	= bpf_net_convert_ctx_access,
};

static const struct bpf_verifier_ops xdp_ops = {
	.get_func_proto		= xdp_func_proto,
	.is_valid_access	= xdp_is_valid_access,
	.convert_ctx_access	= xdp_convert_ctx_access,
};

static struct bpf_prog_type_list sk_filter_type __read_mostly = {
	.ops	= &sk_filter_ops,
	.type	= BPF_PROG_TYPE_SOCKET_FILTER,
};

static struct bpf_prog_type_list sched_cls_type __read_mostly = {
	.ops	= &tc_cls_act_ops,
	.type	= BPF_PROG_TYPE_SCHED_CLS,
};

static struct bpf_prog_type_list sched_act_type __read_mostly = {
	.ops	= &tc_cls_act_ops,
	.type	= BPF_PROG_TYPE_SCHED_ACT,
};

static struct bpf_prog_type_list xdp_type __read_mostly = {
	.ops	= &xdp_ops,
	.type	= BPF_PROG_TYPE_XDP,
};

static int __init register_sk_filter_ops(void)
{
	bpf_register_prog_type(&sk_filter_type);
	bpf_register_prog_type(&sched_cls_type);
	bpf_register_prog_type(&sched_act_type);
	bpf_register_prog_type(&xdp_type);

	return 0;
}
late_initcall(register_sk_filter_ops);

int sk_detach_filter(struct sock *sk)
{
	int ret = -ENOENT;
	struct sk_filter *filter;

	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return -EPERM;

	filter = rcu_dereference_protected(sk->sk_filter,
					   lockdep_sock_is_held(sk));
	if (filter) {
		RCU_INIT_POINTER(sk->sk_filter, NULL);
		sk_filter_uncharge(sk, filter);
		ret = 0;
	}

	return ret;
}
EXPORT_SYMBOL_GPL(sk_detach_filter);

int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
		  unsigned int len)
{
	struct sock_fprog_kern *fprog;
	struct sk_filter *filter;
	int ret = 0;

	lock_sock(sk);
	filter = rcu_dereference_protected(sk->sk_filter,
					   lockdep_sock_is_held(sk));
	if (!filter)
		goto out;

	/* We're copying the filter that has been originally attached,
	 * so no conversion/decode needed anymore. eBPF programs that
	 * have no original program cannot be dumped through this.
	 */
	ret = -EACCES;
	fprog = filter->prog->orig_prog;
	if (!fprog)
		goto out;

	ret = fprog->len;
	if (!len)
		/* User space only enquires number of filter blocks. */
		goto out;

	ret = -EINVAL;
	if (len < fprog->len)
		goto out;

	ret = -EFAULT;
	if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog)))
		goto out;

	/* Instead of bytes, the API requests to return the number
	 * of filter blocks.
	 */
	ret = fprog->len;
out:
	release_sock(sk);
	return ret;
}