// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011 STRATO. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
#include "transaction.h"
#include "dev-replace.h"

#undef DEBUG

/*
 * This is the implementation for the generic read ahead framework.
 *
 * To trigger a readahead, btrfs_reada_add must be called. It will start
 * a read ahead for the given range [start, end) on tree root. The returned
 * handle can either be used to wait on the readahead to finish
 * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
 *
 * The read ahead works as follows:
 * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
 * reada_start_machine will then search for extents to prefetch and trigger
 * some reads. When a read finishes for a node, all contained node/leaf
 * pointers that lie in the given range will also be enqueued. The reads will
 * be triggered in sequential order, thus giving a big win over a naive
 * enumeration. It will also make use of multi-device layouts. Each disk
 * will have its own read pointer and all disks will be utilized in parallel.
 * Also, no two disks will read both sides of a mirror simultaneously, as this
 * would waste seeking capacity. Instead both disks will read different parts
 * of the filesystem.
 * Any number of readaheads can be started in parallel. The read order will be
 * determined globally, i.e. 2 parallel readaheads will normally finish faster
 * than the 2 started one after another.
 */
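/*
 * Illustrative usage sketch (not an in-tree caller; the root and the
 * whole-tree key range below are assumptions made only for this example):
 *
 *	struct btrfs_key key_start = { 0 };
 *	struct btrfs_key key_end = {
 *		.objectid = (u64)-1,
 *		.type = (u8)-1,
 *		.offset = (u64)-1,
 *	};
 *	struct reada_control *rc;
 *
 *	rc = btrfs_reada_add(root, &key_start, &key_end);
 *	if (!IS_ERR(rc))
 *		btrfs_reada_wait(rc);
 *
 * Instead of waiting, btrfs_reada_detach(rc) drops the handle and lets the
 * readahead finish in the background.
 */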

#define MAX_IN_FLIGHT 6

struct reada_extctl {
	struct list_head list;
	struct reada_control *rc;
	u64 generation;
};

struct reada_extent {
	u64 logical;
	struct btrfs_key top;
	struct list_head extctl;
	int refcnt;
	spinlock_t lock;
	struct reada_zone *zones[BTRFS_MAX_MIRRORS];
	int nzones;
	int scheduled;
};

struct reada_zone {
	u64 start;
	u64 end;
	u64 elems;
	struct list_head list;
	spinlock_t lock;
	int locked;
	struct btrfs_device *device;
	struct btrfs_device *devs[BTRFS_MAX_MIRRORS]; /* full list, incl
						       * self */
	int ndevs;
	struct kref refcnt;
};

struct reada_machine_work {
	struct btrfs_work work;
	struct btrfs_fs_info *fs_info;
};

static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
static void reada_control_release(struct kref *kref);
static void reada_zone_release(struct kref *kref);
static void reada_start_machine(struct btrfs_fs_info *fs_info);
static void __reada_start_machine(struct btrfs_fs_info *fs_info);

static int reada_add_block(struct reada_control *rc, u64 logical,
			   struct btrfs_key *top, u64 generation);

/* recurses */
/* in case of err, eb might be NULL */
static void __readahead_hook(struct btrfs_fs_info *fs_info,
			     struct reada_extent *re, struct extent_buffer *eb,
			     int err)
{
	int nritems;
	int i;
	u64 bytenr;
	u64 generation;
	struct list_head list;

	spin_lock(&re->lock);
	/*
	 * just take the full list from the extent. afterwards we
	 * don't need the lock anymore
	 */
	list_replace_init(&re->extctl, &list);
	re->scheduled = 0;
	spin_unlock(&re->lock);

	/*
	 * this is the error case, the extent buffer has not been
	 * read correctly. We won't access anything from it and
	 * just cleanup our data structures. Effectively this will
	 * cut the branch below this node from read ahead.
	 */
	if (err)
		goto cleanup;

	/*
	 * FIXME: currently we just set nritems to 0 if this is a leaf,
	 * effectively ignoring the content. In a next step we could
	 * trigger more readahead depending on the content, e.g.
	 * fetch the checksums for the extents in the leaf.
	 */
	if (!btrfs_header_level(eb))
		goto cleanup;

	nritems = btrfs_header_nritems(eb);
	generation = btrfs_header_generation(eb);
	for (i = 0; i < nritems; i++) {
		struct reada_extctl *rec;
		u64 n_gen;
		struct btrfs_key key;
		struct btrfs_key next_key;

		btrfs_node_key_to_cpu(eb, &key, i);
		if (i + 1 < nritems)
			btrfs_node_key_to_cpu(eb, &next_key, i + 1);
		else
			next_key = re->top;
		bytenr = btrfs_node_blockptr(eb, i);
		n_gen = btrfs_node_ptr_generation(eb, i);

		list_for_each_entry(rec, &list, list) {
			struct reada_control *rc = rec->rc;

			/*
			 * if the generation doesn't match, just ignore this
			 * extctl. This will probably cut off a branch from
			 * prefetch. Alternatively one could start a new (sub-)
			 * prefetch for this branch, starting again from root.
			 * FIXME: move the generation check out of this loop
			 */
#ifdef DEBUG
			if (rec->generation != generation) {
				btrfs_debug(fs_info,
					    "generation mismatch for (%llu,%d,%llu) %llu != %llu",
					    key.objectid, key.type, key.offset,
					    rec->generation, generation);
			}
#endif
			if (rec->generation == generation &&
			    btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
			    btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
				reada_add_block(rc, bytenr, &next_key, n_gen);
		}
	}

cleanup:
	/*
	 * free extctl records
	 */
	while (!list_empty(&list)) {
		struct reada_control *rc;
		struct reada_extctl *rec;

		rec = list_first_entry(&list, struct reada_extctl, list);
		list_del(&rec->list);
		rc = rec->rc;
		kfree(rec);

		kref_get(&rc->refcnt);
		if (atomic_dec_and_test(&rc->elems)) {
			kref_put(&rc->refcnt, reada_control_release);
			wake_up(&rc->wait);
		}
		kref_put(&rc->refcnt, reada_control_release);

		reada_extent_put(fs_info, re);	/* one ref for each entry */
	}

	return;
}

int btree_readahead_hook(struct extent_buffer *eb, int err)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	int ret = 0;
	struct reada_extent *re;

	/* find extent */
	spin_lock(&fs_info->reada_lock);
	re = radix_tree_lookup(&fs_info->reada_tree,
			       eb->start >> PAGE_SHIFT);
	if (re)
		re->refcnt++;
	spin_unlock(&fs_info->reada_lock);
	if (!re) {
		ret = -1;
		goto start_machine;
	}

	__readahead_hook(fs_info, re, eb, err);
	reada_extent_put(fs_info, re);	/* our ref */

start_machine:
	reada_start_machine(fs_info);
	return ret;
}

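/*
 * Find the readahead zone that covers @logical on @dev, or create one that
 * spans the block group containing @logical. Returns a referenced zone
 * (dropped with kref_put(..., reada_zone_release)) or NULL on failure.
 */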
static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
					  struct btrfs_bio *bbio)
{
	struct btrfs_fs_info *fs_info = dev->fs_info;
	int ret;
	struct reada_zone *zone;
	struct btrfs_block_group_cache *cache = NULL;
	u64 start;
	u64 end;
	int i;

	zone = NULL;
	spin_lock(&fs_info->reada_lock);
	ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
				     logical >> PAGE_SHIFT, 1);
	if (ret == 1 && logical >= zone->start && logical <= zone->end) {
		kref_get(&zone->refcnt);
		spin_unlock(&fs_info->reada_lock);
		return zone;
	}

	spin_unlock(&fs_info->reada_lock);

	cache = btrfs_lookup_block_group(fs_info, logical);
	if (!cache)
		return NULL;

	start = cache->key.objectid;
	end = start + cache->key.offset - 1;
	btrfs_put_block_group(cache);

	zone = kzalloc(sizeof(*zone), GFP_KERNEL);
	if (!zone)
		return NULL;

	ret = radix_tree_preload(GFP_KERNEL);
	if (ret) {
		kfree(zone);
		return NULL;
	}

	zone->start = start;
	zone->end = end;
	INIT_LIST_HEAD(&zone->list);
	spin_lock_init(&zone->lock);
	zone->locked = 0;
	kref_init(&zone->refcnt);
	zone->elems = 0;
	zone->device = dev; /* our device always sits at index 0 */
	for (i = 0; i < bbio->num_stripes; ++i) {
		/* bounds have already been checked */
		zone->devs[i] = bbio->stripes[i].dev;
	}
	zone->ndevs = bbio->num_stripes;

	spin_lock(&fs_info->reada_lock);
	ret = radix_tree_insert(&dev->reada_zones,
				(unsigned long)(zone->end >> PAGE_SHIFT),
				zone);

	if (ret == -EEXIST) {
		kfree(zone);
		ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
					     logical >> PAGE_SHIFT, 1);
		if (ret == 1 && logical >= zone->start && logical <= zone->end)
			kref_get(&zone->refcnt);
		else
			zone = NULL;
	}
	spin_unlock(&fs_info->reada_lock);
	radix_tree_preload_end();

	return zone;
}

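/*
 * Look up the readahead extent for @logical in the fs-wide tree, or allocate
 * one and link it into the fs-wide tree and all per-device trees. Takes a
 * reference on the returned extent; drop it with reada_extent_put().
 */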
static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
					      u64 logical,
					      struct btrfs_key *top)
{
	int ret;
	struct reada_extent *re = NULL;
	struct reada_extent *re_exist = NULL;
	struct btrfs_bio *bbio = NULL;
	struct btrfs_device *dev;
	struct btrfs_device *prev_dev;
	u64 length;
	int real_stripes;
	int nzones = 0;
	unsigned long index = logical >> PAGE_SHIFT;
	int dev_replace_is_ongoing;
	int have_zone = 0;

	spin_lock(&fs_info->reada_lock);
	re = radix_tree_lookup(&fs_info->reada_tree, index);
	if (re)
		re->refcnt++;
	spin_unlock(&fs_info->reada_lock);

	if (re)
		return re;

	re = kzalloc(sizeof(*re), GFP_KERNEL);
	if (!re)
		return NULL;

	re->logical = logical;
	re->top = *top;
	INIT_LIST_HEAD(&re->extctl);
	spin_lock_init(&re->lock);
	re->refcnt = 1;

	/*
	 * map block
	 */
	length = fs_info->nodesize;
	ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
			      &length, &bbio, 0);
	if (ret || !bbio || length < fs_info->nodesize)
		goto error;

	if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
		btrfs_err(fs_info,
			  "readahead: more than %d copies not supported",
			  BTRFS_MAX_MIRRORS);
		goto error;
	}

	real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
	for (nzones = 0; nzones < real_stripes; ++nzones) {
		struct reada_zone *zone;

		dev = bbio->stripes[nzones].dev;

		/* cannot read ahead on missing device. */
		if (!dev->bdev)
			continue;

		zone = reada_find_zone(dev, logical, bbio);
		if (!zone)
			continue;

		re->zones[re->nzones++] = zone;
		spin_lock(&zone->lock);
		if (!zone->elems)
			kref_get(&zone->refcnt);
		++zone->elems;
		spin_unlock(&zone->lock);
		spin_lock(&fs_info->reada_lock);
		kref_put(&zone->refcnt, reada_zone_release);
		spin_unlock(&fs_info->reada_lock);
	}
	if (re->nzones == 0) {
		/* not a single zone found, error and out */
		goto error;
	}

	/* Insert extent in reada tree + all per-device trees, all or nothing */
	down_read(&fs_info->dev_replace.rwsem);
	ret = radix_tree_preload(GFP_KERNEL);
	if (ret) {
		up_read(&fs_info->dev_replace.rwsem);
		goto error;
	}

	spin_lock(&fs_info->reada_lock);
	ret = radix_tree_insert(&fs_info->reada_tree, index, re);
	if (ret == -EEXIST) {
		re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
		re_exist->refcnt++;
		spin_unlock(&fs_info->reada_lock);
		radix_tree_preload_end();
		up_read(&fs_info->dev_replace.rwsem);
		goto error;
	}
	if (ret) {
		spin_unlock(&fs_info->reada_lock);
		radix_tree_preload_end();
		up_read(&fs_info->dev_replace.rwsem);
		goto error;
	}
	radix_tree_preload_end();
	prev_dev = NULL;
	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
			&fs_info->dev_replace);
	for (nzones = 0; nzones < re->nzones; ++nzones) {
		dev = re->zones[nzones]->device;

		if (dev == prev_dev) {
			/*
			 * in case of DUP, just add the first zone. As both
			 * are on the same device, there's nothing to gain
			 * from adding both.
			 * Also, it wouldn't work, as the tree is per device
			 * and adding would fail with EEXIST
			 */
			continue;
		}
		if (!dev->bdev)
			continue;

		if (dev_replace_is_ongoing &&
		    dev == fs_info->dev_replace.tgtdev) {
			/*
			 * as this device is selected for reading only as
			 * a last resort, skip it for read ahead.
			 */
			continue;
		}
		prev_dev = dev;
		ret = radix_tree_insert(&dev->reada_extents, index, re);
		if (ret) {
			while (--nzones >= 0) {
				dev = re->zones[nzones]->device;
				BUG_ON(dev == NULL);
				/* ignore whether the entry was inserted */
				radix_tree_delete(&dev->reada_extents, index);
			}
			radix_tree_delete(&fs_info->reada_tree, index);
			spin_unlock(&fs_info->reada_lock);
			up_read(&fs_info->dev_replace.rwsem);
			goto error;
		}
		have_zone = 1;
	}
	spin_unlock(&fs_info->reada_lock);
	up_read(&fs_info->dev_replace.rwsem);

	if (!have_zone)
		goto error;

	btrfs_put_bbio(bbio);
	return re;

error:
	for (nzones = 0; nzones < re->nzones; ++nzones) {
		struct reada_zone *zone;

		zone = re->zones[nzones];
		kref_get(&zone->refcnt);
		spin_lock(&zone->lock);
		--zone->elems;
		if (zone->elems == 0) {
			/*
			 * no fs_info->reada_lock needed, as this can't be
			 * the last ref
			 */
			kref_put(&zone->refcnt, reada_zone_release);
		}
		spin_unlock(&zone->lock);

		spin_lock(&fs_info->reada_lock);
		kref_put(&zone->refcnt, reada_zone_release);
		spin_unlock(&fs_info->reada_lock);
	}
	btrfs_put_bbio(bbio);
	kfree(re);
	return re_exist;
}

static void reada_extent_put(struct btrfs_fs_info *fs_info,
			     struct reada_extent *re)
{
	int i;
	unsigned long index = re->logical >> PAGE_SHIFT;

	spin_lock(&fs_info->reada_lock);
	if (--re->refcnt) {
		spin_unlock(&fs_info->reada_lock);
		return;
	}

	radix_tree_delete(&fs_info->reada_tree, index);
	for (i = 0; i < re->nzones; ++i) {
		struct reada_zone *zone = re->zones[i];

		radix_tree_delete(&zone->device->reada_extents, index);
	}

	spin_unlock(&fs_info->reada_lock);

	for (i = 0; i < re->nzones; ++i) {
		struct reada_zone *zone = re->zones[i];

		kref_get(&zone->refcnt);
		spin_lock(&zone->lock);
		--zone->elems;
		if (zone->elems == 0) {
			/* no fs_info->reada_lock needed, as this can't be
			 * the last ref */
			kref_put(&zone->refcnt, reada_zone_release);
		}
		spin_unlock(&zone->lock);

		spin_lock(&fs_info->reada_lock);
		kref_put(&zone->refcnt, reada_zone_release);
		spin_unlock(&fs_info->reada_lock);
	}

	kfree(re);
}

static void reada_zone_release(struct kref *kref)
{
	struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);

	radix_tree_delete(&zone->device->reada_zones,
			  zone->end >> PAGE_SHIFT);

	kfree(zone);
}

static void reada_control_release(struct kref *kref)
{
	struct reada_control *rc = container_of(kref, struct reada_control,
						refcnt);

	kfree(rc);
}

static int reada_add_block(struct reada_control *rc, u64 logical,
			   struct btrfs_key *top, u64 generation)
{
	struct btrfs_fs_info *fs_info = rc->fs_info;
	struct reada_extent *re;
	struct reada_extctl *rec;

	/* takes one ref */
	re = reada_find_extent(fs_info, logical, top);
	if (!re)
		return -1;

	rec = kzalloc(sizeof(*rec), GFP_KERNEL);
	if (!rec) {
		reada_extent_put(fs_info, re);
		return -ENOMEM;
	}

	rec->rc = rc;
	rec->generation = generation;
	atomic_inc(&rc->elems);

	spin_lock(&re->lock);
	list_add_tail(&rec->list, &re->extctl);
	spin_unlock(&re->lock);

	/* leave the ref on the extent */

	return 0;
}

/*
 * called with fs_info->reada_lock held
 */
static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
{
	int i;
	unsigned long index = zone->end >> PAGE_SHIFT;

	for (i = 0; i < zone->ndevs; ++i) {
		struct reada_zone *peer;
		peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
		if (peer && peer->device != zone->device)
			peer->locked = lock;
	}
}

/*
 * called with fs_info->reada_lock held
 */
static int reada_pick_zone(struct btrfs_device *dev)
{
	struct reada_zone *top_zone = NULL;
	struct reada_zone *top_locked_zone = NULL;
	u64 top_elems = 0;
	u64 top_locked_elems = 0;
	unsigned long index = 0;
	int ret;

	if (dev->reada_curr_zone) {
		reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
		kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
		dev->reada_curr_zone = NULL;
	}
	/* pick the zone with the most elements */
	while (1) {
		struct reada_zone *zone;

		ret = radix_tree_gang_lookup(&dev->reada_zones,
					     (void **)&zone, index, 1);
		if (ret == 0)
			break;
		index = (zone->end >> PAGE_SHIFT) + 1;
		if (zone->locked) {
			if (zone->elems > top_locked_elems) {
				top_locked_elems = zone->elems;
				top_locked_zone = zone;
			}
		} else {
			if (zone->elems > top_elems) {
				top_elems = zone->elems;
				top_zone = zone;
			}
		}
	}
	if (top_zone)
		dev->reada_curr_zone = top_zone;
	else if (top_locked_zone)
		dev->reada_curr_zone = top_locked_zone;
	else
		return 0;

	dev->reada_next = dev->reada_curr_zone->start;
	kref_get(&dev->reada_curr_zone->refcnt);
	reada_peer_zones_set_lock(dev->reada_curr_zone, 1);

	return 1;
}

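/*
 * Issue at most one readahead read for @dev from its current zone, picking a
 * new zone if necessary. Returns 1 if a read was triggered, 0 if there was
 * nothing (more) to do for this device.
 */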
static int reada_start_machine_dev(struct btrfs_device *dev)
{
	struct btrfs_fs_info *fs_info = dev->fs_info;
	struct reada_extent *re = NULL;
	int mirror_num = 0;
	struct extent_buffer *eb = NULL;
	u64 logical;
	int ret;
	int i;

	spin_lock(&fs_info->reada_lock);
	if (dev->reada_curr_zone == NULL) {
		ret = reada_pick_zone(dev);
		if (!ret) {
			spin_unlock(&fs_info->reada_lock);
			return 0;
		}
	}
	/*
	 * FIXME currently we issue the reads one extent at a time. If we have
	 * a contiguous block of extents, we could also coalesce them or use
	 * plugging to speed things up
	 */
	ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
				     dev->reada_next >> PAGE_SHIFT, 1);
	if (ret == 0 || re->logical > dev->reada_curr_zone->end) {
		ret = reada_pick_zone(dev);
		if (!ret) {
			spin_unlock(&fs_info->reada_lock);
			return 0;
		}
		re = NULL;
		ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
					     dev->reada_next >> PAGE_SHIFT, 1);
	}
	if (ret == 0) {
		spin_unlock(&fs_info->reada_lock);
		return 0;
	}
	dev->reada_next = re->logical + fs_info->nodesize;
	re->refcnt++;

	spin_unlock(&fs_info->reada_lock);

	spin_lock(&re->lock);
	if (re->scheduled || list_empty(&re->extctl)) {
		spin_unlock(&re->lock);
		reada_extent_put(fs_info, re);
		return 0;
	}
	re->scheduled = 1;
	spin_unlock(&re->lock);

	/*
	 * find mirror num
	 */
	for (i = 0; i < re->nzones; ++i) {
		if (re->zones[i]->device == dev) {
			mirror_num = i + 1;
			break;
		}
	}
	logical = re->logical;

	atomic_inc(&dev->reada_in_flight);
	ret = reada_tree_block_flagged(fs_info, logical, mirror_num, &eb);
	if (ret)
		__readahead_hook(fs_info, re, NULL, ret);
	else if (eb)
		__readahead_hook(fs_info, re, eb, ret);

	if (eb)
		free_extent_buffer(eb);

	atomic_dec(&dev->reada_in_flight);
	reada_extent_put(fs_info, re);

	return 1;

}

static void reada_start_machine_worker(struct btrfs_work *work)
{
	struct reada_machine_work *rmw;
	struct btrfs_fs_info *fs_info;
	int old_ioprio;

	rmw = container_of(work, struct reada_machine_work, work);
	fs_info = rmw->fs_info;

	kfree(rmw);

	old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
				       task_nice_ioprio(current));
	set_task_ioprio(current, BTRFS_IOPRIO_READA);
	__reada_start_machine(fs_info);
	set_task_ioprio(current, old_ioprio);

	atomic_dec(&fs_info->reada_works_cnt);
}

static void __reada_start_machine(struct btrfs_fs_info *fs_info)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	u64 enqueued;
	u64 total = 0;
	int i;

again:
	do {
		enqueued = 0;
		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry(device, &fs_devices->devices, dev_list) {
			if (atomic_read(&device->reada_in_flight) <
			    MAX_IN_FLIGHT)
				enqueued += reada_start_machine_dev(device);
		}
		mutex_unlock(&fs_devices->device_list_mutex);
		total += enqueued;
	} while (enqueued && total < 10000);
	if (fs_devices->seed) {
		fs_devices = fs_devices->seed;
		goto again;
	}

	if (enqueued == 0)
		return;

	/*
	 * If everything is already in the cache, this is effectively single
	 * threaded. To a) not hold the caller for too long and b) to utilize
	 * more cores, we broke the loop above after 10000 iterations and now
	 * enqueue to workers to finish it. This will distribute the load to
	 * the cores.
	 */
	for (i = 0; i < 2; ++i) {
		reada_start_machine(fs_info);
		if (atomic_read(&fs_info->reada_works_cnt) >
		    BTRFS_MAX_MIRRORS * 2)
			break;
	}
}

static void reada_start_machine(struct btrfs_fs_info *fs_info)
{
	struct reada_machine_work *rmw;

	rmw = kzalloc(sizeof(*rmw), GFP_KERNEL);
	if (!rmw) {
		/* FIXME we cannot handle this properly right now */
		BUG();
	}
	btrfs_init_work(&rmw->work, btrfs_readahead_helper,
			reada_start_machine_worker, NULL, NULL);
	rmw->fs_info = fs_info;

	btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
	atomic_inc(&fs_info->reada_works_cnt);
}

#ifdef DEBUG
static void dump_devs(struct btrfs_fs_info *fs_info, int all)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	unsigned long index;
	int ret;
	int i;
	int j;
	int cnt;

	spin_lock(&fs_info->reada_lock);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		btrfs_debug(fs_info, "dev %lld has %d in flight", device->devid,
			atomic_read(&device->reada_in_flight));
		index = 0;
		while (1) {
			struct reada_zone *zone;
			ret = radix_tree_gang_lookup(&device->reada_zones,
						     (void **)&zone, index, 1);
			if (ret == 0)
				break;
			pr_debug("  zone %llu-%llu elems %llu locked %d devs",
				zone->start, zone->end, zone->elems,
				zone->locked);
			for (j = 0; j < zone->ndevs; ++j) {
				pr_cont(" %lld",
					zone->devs[j]->devid);
			}
			if (device->reada_curr_zone == zone)
				pr_cont(" curr off %llu",
					device->reada_next - zone->start);
			pr_cont("\n");
			index = (zone->end >> PAGE_SHIFT) + 1;
		}
		cnt = 0;
		index = 0;
		while (all) {
			struct reada_extent *re = NULL;

			ret = radix_tree_gang_lookup(&device->reada_extents,
						     (void **)&re, index, 1);
			if (ret == 0)
				break;
			pr_debug("  re: logical %llu size %u empty %d scheduled %d",
				re->logical, fs_info->nodesize,
				list_empty(&re->extctl), re->scheduled);

			for (i = 0; i < re->nzones; ++i) {
				pr_cont(" zone %llu-%llu devs",
					re->zones[i]->start,
					re->zones[i]->end);
				for (j = 0; j < re->zones[i]->ndevs; ++j) {
					pr_cont(" %lld",
						re->zones[i]->devs[j]->devid);
				}
			}
			pr_cont("\n");
			index = (re->logical >> PAGE_SHIFT) + 1;
			if (++cnt > 15)
				break;
		}
	}

	index = 0;
	cnt = 0;
	while (all) {
		struct reada_extent *re = NULL;

		ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
					     index, 1);
		if (ret == 0)
			break;
		if (!re->scheduled) {
			index = (re->logical >> PAGE_SHIFT) + 1;
			continue;
		}
		pr_debug("re: logical %llu size %u list empty %d scheduled %d",
			re->logical, fs_info->nodesize,
			list_empty(&re->extctl), re->scheduled);
		for (i = 0; i < re->nzones; ++i) {
			pr_cont(" zone %llu-%llu devs",
				re->zones[i]->start,
				re->zones[i]->end);
			for (j = 0; j < re->zones[i]->ndevs; ++j) {
				pr_cont(" %lld",
					re->zones[i]->devs[j]->devid);
			}
		}
		pr_cont("\n");
		index = (re->logical >> PAGE_SHIFT) + 1;
	}
	spin_unlock(&fs_info->reada_lock);
}
#endif

/*
 * interface
 */
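/*
 * Start readahead for the key range [key_start, key_end) of @root. Returns a
 * reada_control handle to pass to btrfs_reada_wait() or btrfs_reada_detach(),
 * or an ERR_PTR on failure.
 */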
struct reada_control *btrfs_reada_add(struct btrfs_root *root,
			struct btrfs_key *key_start, struct btrfs_key *key_end)
{
	struct reada_control *rc;
	u64 start;
	u64 generation;
	int ret;
	struct extent_buffer *node;
	static struct btrfs_key max_key = {
		.objectid = (u64)-1,
		.type = (u8)-1,
		.offset = (u64)-1
	};

	rc = kzalloc(sizeof(*rc), GFP_KERNEL);
	if (!rc)
		return ERR_PTR(-ENOMEM);

	rc->fs_info = root->fs_info;
	rc->key_start = *key_start;
	rc->key_end = *key_end;
	atomic_set(&rc->elems, 0);
	init_waitqueue_head(&rc->wait);
	kref_init(&rc->refcnt);
	kref_get(&rc->refcnt); /* one ref for having elements */

	node = btrfs_root_node(root);
	start = node->start;
	generation = btrfs_header_generation(node);
	free_extent_buffer(node);

	ret = reada_add_block(rc, start, &max_key, generation);
	if (ret) {
		kfree(rc);
		return ERR_PTR(ret);
	}

	reada_start_machine(root->fs_info);

	return rc;
}

#ifdef DEBUG
int btrfs_reada_wait(void *handle)
{
	struct reada_control *rc = handle;
	struct btrfs_fs_info *fs_info = rc->fs_info;

	while (atomic_read(&rc->elems)) {
		if (!atomic_read(&fs_info->reada_works_cnt))
			reada_start_machine(fs_info);
		wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
				   5 * HZ);
		dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
	}

	dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);

	kref_put(&rc->refcnt, reada_control_release);

	return 0;
}
#else
int btrfs_reada_wait(void *handle)
{
	struct reada_control *rc = handle;
	struct btrfs_fs_info *fs_info = rc->fs_info;

	while (atomic_read(&rc->elems)) {
		if (!atomic_read(&fs_info->reada_works_cnt))
			reada_start_machine(fs_info);
		wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
				   (HZ + 9) / 10);
	}

	kref_put(&rc->refcnt, reada_control_release);

	return 0;
}
#endif

void btrfs_reada_detach(void *handle)
{
	struct reada_control *rc = handle;

	kref_put(&rc->refcnt, reada_control_release);
}