/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/mount.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "backref.h"
#include "hash.h"
#include "props.h"
#include "qgroup.h"
#include "dedupe.h"

struct btrfs_iget_args {
	struct btrfs_key *location;
	struct btrfs_root *root;
};

struct btrfs_dio_data {
	u64 outstanding_extents;
	u64 reserve;
	u64 unsubmitted_oe_range_start;
	u64 unsubmitted_oe_range_end;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static const struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;

#define S_SHIFT 12
static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
};

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int
btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, u64 delalloc_end,
				   int *page_started, unsigned long *nr_written,
				   int unlock, struct btrfs_dedupe_hash *hash);
static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
					   u64 len, u64 orig_start,
					   u64 block_start, u64 block_len,
					   u64 orig_block_len, u64 ram_bytes,
					   int type);

static int btrfs_dirty_inode(struct inode *inode);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_inode_set_ops(struct inode *inode)
{
	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
}
#endif

static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct inode *inode, struct inode *dir,
				     const struct qstr *qstr)
{
	int err;

	err = btrfs_init_acl(trans, inode, dir);
	if (!err)
		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
	return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree. The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
				struct btrfs_path *path, int extent_inserted,
				struct btrfs_root *root, struct inode *inode,
				u64 start, size_t size, size_t compressed_size,
				int compress_type,
				struct page **compressed_pages)
{
	struct extent_buffer *leaf;
	struct page *page = NULL;
	char *kaddr;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	int err = 0;
	int ret;
	size_t cur_size = size;
	unsigned long offset;

	if (compressed_size && compressed_pages)
		cur_size = compressed_size;

	inode_add_bytes(inode, size);

	if (!extent_inserted) {
		struct btrfs_key key;
		size_t datasize;

		key.objectid = btrfs_ino(inode);
		key.offset = start;
		key.type = BTRFS_EXTENT_DATA_KEY;

		datasize = btrfs_file_extent_calc_inline_size(cur_size);
		path->leave_spinning = 1;
		ret = btrfs_insert_empty_item(trans, root, path, &key,
					      datasize);
		if (ret) {
			err = ret;
			goto fail;
		}
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
	ptr = btrfs_file_extent_inline_start(ei);

	if (compress_type != BTRFS_COMPRESS_NONE) {
		struct page *cpage;
		int i = 0;
		while (compressed_size > 0) {
			cpage = compressed_pages[i];
			cur_size = min_t(unsigned long, compressed_size,
					 PAGE_SIZE);

			kaddr = kmap_atomic(cpage);
			write_extent_buffer(leaf, kaddr, ptr, cur_size);
			kunmap_atomic(kaddr);

			i++;
			ptr += cur_size;
			compressed_size -= cur_size;
		}
		btrfs_set_file_extent_compression(leaf, ei,
						  compress_type);
	} else {
		page = find_get_page(inode->i_mapping,
				     start >> PAGE_SHIFT);
		btrfs_set_file_extent_compression(leaf, ei, 0);
		kaddr = kmap_atomic(page);
		offset = start & (PAGE_SIZE - 1);
		write_extent_buffer(leaf, kaddr + offset, ptr, size);
		kunmap_atomic(kaddr);
		put_page(page);
	}
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	/*
	 * we're an inline extent, so nobody can
	 * extend the file past i_size without locking
	 * a page we already have locked.
	 *
	 * We must do any isize and inode updates
	 * before we unlock the pages. Otherwise we
	 * could end up racing with unlink.
	 */
	BTRFS_I(inode)->disk_i_size = inode->i_size;
	ret = btrfs_update_inode(trans, root, inode);

	return ret;
fail:
	return err;
}


/*
 * conditionally insert an inline extent into the file. This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct btrfs_root *root,
					  struct inode *inode, u64 start,
					  u64 end, size_t compressed_size,
					  int compress_type,
					  struct page **compressed_pages)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	u64 isize = i_size_read(inode);
	u64 actual_end = min(end + 1, isize);
	u64 inline_len = actual_end - start;
	u64 aligned_end = ALIGN(end, fs_info->sectorsize);
	u64 data_len = inline_len;
	int ret;
	struct btrfs_path *path;
	int extent_inserted = 0;
	u32 extent_item_size;

	if (compressed_size)
		data_len = compressed_size;

	if (start > 0 ||
	    actual_end > fs_info->sectorsize ||
	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
	    (!compressed_size &&
	    (actual_end & (fs_info->sectorsize - 1)) == 0) ||
	    end + 1 < isize ||
	    data_len > fs_info->max_inline) {
		return 1;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	trans->block_rsv = &fs_info->delalloc_block_rsv;

	if (compressed_size && compressed_pages)
		extent_item_size = btrfs_file_extent_calc_inline_size(
		   compressed_size);
	else
		extent_item_size = btrfs_file_extent_calc_inline_size(
		    inline_len);

	ret = __btrfs_drop_extents(trans, root, inode, path,
				   start, aligned_end, NULL,
				   1, 1, extent_item_size, &extent_inserted);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	if (isize > actual_end)
		inline_len = min_t(u64, isize, actual_end);
	ret = insert_inline_extent(trans, path, extent_inserted,
				   root, inode, start,
				   inline_len, compressed_size,
				   compress_type, compressed_pages);
	if (ret && ret != -ENOSPC) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	} else if (ret == -ENOSPC) {
		ret = 1;
		goto out;
	}

	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
	btrfs_delalloc_release_metadata(inode, end + 1 - start);
	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
out:
	/*
	 * Don't forget to free the reserved space, as for inlined extent
	 * it won't count as data extent, free them directly here.
	 * And at reserve time, it's always aligned to page size, so
	 * just free one page here.
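	 * Note that an inline extent can only ever start at file offset 0
	 * (see the checks at the top of this function), which is why the
	 * qgroup range freed below is simply [0, PAGE_SIZE).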
	 */
	btrfs_qgroup_free_data(inode, 0, PAGE_SIZE);
	btrfs_free_path(path);
	btrfs_end_transaction(trans);
	return ret;
}

struct async_extent {
	u64 start;
	u64 ram_size;
	u64 compressed_size;
	struct page **pages;
	unsigned long nr_pages;
	int compress_type;
	struct list_head list;
};

struct async_cow {
	struct inode *inode;
	struct btrfs_root *root;
	struct page *locked_page;
	u64 start;
	u64 end;
	struct list_head extents;
	struct btrfs_work work;
};

static noinline int add_async_extent(struct async_cow *cow,
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct page **pages,
				     unsigned long nr_pages,
				     int compress_type)
{
	struct async_extent *async_extent;

	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
	BUG_ON(!async_extent); /* -ENOMEM */
	async_extent->start = start;
	async_extent->ram_size = ram_size;
	async_extent->compressed_size = compressed_size;
	async_extent->pages = pages;
	async_extent->nr_pages = nr_pages;
	async_extent->compress_type = compress_type;
	list_add_tail(&async_extent->list, &cow->extents);
	return 0;
}

static inline int inode_need_compress(struct inode *inode)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

	/* force compress */
	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
		return 1;
	/* bad compression ratios */
	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
		return 0;
	if (btrfs_test_opt(fs_info, COMPRESS) ||
	    BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
	    BTRFS_I(inode)->force_compress)
		return 1;
	return 0;
}

/*
 * we create compressed extents in two phases. The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus. The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes. This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
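 *
 * compress_file_range() below is phase one; submit_compressed_extents()
 * further down is phase two.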
 */
static noinline void compress_file_range(struct inode *inode,
					 struct page *locked_page,
					 u64 start, u64 end,
					 struct async_cow *async_cow,
					 int *num_added)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 num_bytes;
	u64 blocksize = fs_info->sectorsize;
	u64 actual_end;
	u64 isize = i_size_read(inode);
	int ret = 0;
	struct page **pages = NULL;
	unsigned long nr_pages;
	unsigned long nr_pages_ret = 0;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	unsigned long max_compressed = SZ_128K;
	unsigned long max_uncompressed = SZ_128K;
	int i;
	int will_compress;
	int compress_type = fs_info->compress_type;
	int redirty = 0;

	/* if this is a small write inside eof, kick off a defrag */
	if ((end - start + 1) < SZ_16K &&
	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode);

	actual_end = min_t(u64, isize, end + 1);
again:
	will_compress = 0;
	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
	nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_SIZE);

	/*
	 * we don't want to send crud past the end of i_size through
	 * compression, that's just a waste of CPU time. So, if the
	 * end of the file is before the start of our current
	 * requested range of bytes, we bail out to the uncompressed
	 * cleanup code that can deal with all of this.
	 *
	 * It isn't really the fastest way to fix things, but this is a
	 * very uncommon corner.
	 */
	if (actual_end <= start)
		goto cleanup_and_bail_uncompressed;

	total_compressed = actual_end - start;

	/*
	 * skip compression for a small file range (<= blocksize) that
	 * isn't an inline extent, since it doesn't save disk space at all.
	 */
	if (total_compressed <= blocksize &&
	   (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		goto cleanup_and_bail_uncompressed;

	/* we want to make sure that amount of ram required to uncompress
	 * an extent is reasonable, so we limit the total size in ram
	 * of a compressed extent to 128k. This is a crucial number
	 * because it also controls how easily we can spread reads across
	 * cpus for decompression.
	 *
	 * We also want to make sure the amount of IO required to do
	 * a random read is reasonably small, so we limit the size of
	 * a compressed extent to 128k.
	 */
	total_compressed = min(total_compressed, max_uncompressed);
	num_bytes = ALIGN(end - start + 1, blocksize);
	num_bytes = max(blocksize, num_bytes);
	total_in = 0;
	ret = 0;

	/*
	 * we do compression for mount -o compress and when the
	 * inode has not been flagged as nocompress. This flag can
	 * change at any time if we discover bad compression ratios.
	 */
	if (inode_need_compress(inode)) {
		WARN_ON(pages);
		pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
		if (!pages) {
			/* just bail out to the uncompressed code */
			goto cont;
		}

		if (BTRFS_I(inode)->force_compress)
			compress_type = BTRFS_I(inode)->force_compress;

		/*
		 * we need to call clear_page_dirty_for_io on each
		 * page in the range. Otherwise applications with the file
		 * mmap'd can wander in and change the page contents while
		 * we are compressing them.
		 *
		 * If the compression fails for any reason, we set the pages
		 * dirty again later on.
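		 * The redirty flag below records that we cleared the dirty
		 * bits, so the bail-out paths know to restore them with
		 * extent_range_redirty_for_io().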
		 */
		extent_range_clear_dirty_for_io(inode, start, end);
		redirty = 1;
		ret = btrfs_compress_pages(compress_type,
					   inode->i_mapping, start,
					   total_compressed, pages,
					   nr_pages, &nr_pages_ret,
					   &total_in,
					   &total_compressed,
					   max_compressed);

		if (!ret) {
			unsigned long offset = total_compressed &
				(PAGE_SIZE - 1);
			struct page *page = pages[nr_pages_ret - 1];
			char *kaddr;

			/* zero the tail end of the last page, we might be
			 * sending it down to disk
			 */
			if (offset) {
				kaddr = kmap_atomic(page);
				memset(kaddr + offset, 0,
				       PAGE_SIZE - offset);
				kunmap_atomic(kaddr);
			}
			will_compress = 1;
		}
	}
cont:
	if (start == 0) {
		/* lets try to make an inline extent */
		if (ret || total_in < (actual_end - start)) {
			/* we didn't compress the entire range, try
			 * to make an uncompressed inline extent.
			 */
			ret = cow_file_range_inline(root, inode, start, end,
						    0, 0, NULL);
		} else {
			/* try making a compressed inline extent */
			ret = cow_file_range_inline(root, inode, start, end,
						    total_compressed,
						    compress_type, pages);
		}
		if (ret <= 0) {
			unsigned long clear_flags = EXTENT_DELALLOC |
				EXTENT_DEFRAG;
			unsigned long page_error_op;

			clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
			page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;

			/*
			 * inline extent creation worked or returned error,
			 * we don't need to create any more async work items.
			 * Unlock and free up our temp pages.
			 */
			extent_clear_unlock_delalloc(inode, start, end, end,
						     NULL, clear_flags,
						     PAGE_UNLOCK |
						     PAGE_CLEAR_DIRTY |
						     PAGE_SET_WRITEBACK |
						     page_error_op |
						     PAGE_END_WRITEBACK);
			btrfs_free_reserved_data_space_noquota(inode, start,
						end - start + 1);
			goto free_pages_out;
		}
	}

	if (will_compress) {
		/*
		 * we aren't doing an inline extent, so round the compressed
		 * size up to a block size boundary so the allocator does
		 * sane things
		 */
		total_compressed = ALIGN(total_compressed, blocksize);

		/*
		 * one last check to make sure the compression is really a
		 * win, compare the page count read with the blocks on disk
		 */
		total_in = ALIGN(total_in, PAGE_SIZE);
		if (total_compressed >= total_in) {
			will_compress = 0;
		} else {
			num_bytes = total_in;
			*num_added += 1;

			/*
			 * The async work queues will take care of doing actual
			 * allocation on disk for these compressed pages, and
			 * will submit them to the elevator.
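			 * add_async_extent() only queues the range;
			 * submit_compressed_extents() does the allocation and
			 * write-out later, in queue order.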
			 */
			add_async_extent(async_cow, start, num_bytes,
					total_compressed, pages, nr_pages_ret,
					compress_type);

			if (start + num_bytes < end) {
				start += num_bytes;
				pages = NULL;
				cond_resched();
				goto again;
			}
			return;
		}
	}
	if (pages) {
		/*
		 * the compression code ran but failed to make things smaller,
		 * free any pages it allocated and our page pointer array
		 */
		for (i = 0; i < nr_pages_ret; i++) {
			WARN_ON(pages[i]->mapping);
			put_page(pages[i]);
		}
		kfree(pages);
		pages = NULL;
		total_compressed = 0;
		nr_pages_ret = 0;

		/* flag the file so we don't compress in the future */
		if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
		    !(BTRFS_I(inode)->force_compress)) {
			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
		}
	}
cleanup_and_bail_uncompressed:
	/*
	 * No compression, but we still need to write the pages in the file
	 * we've been given so far. redirty the locked page if it corresponds
	 * to our extent and set things up for the async work queue to run
	 * cow_file_range to do the normal delalloc dance.
	 */
	if (page_offset(locked_page) >= start &&
	    page_offset(locked_page) <= end)
		__set_page_dirty_nobuffers(locked_page);
	/* unlocked later on in the async handlers */

	if (redirty)
		extent_range_redirty_for_io(inode, start, end);
	add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
			 BTRFS_COMPRESS_NONE);
	*num_added += 1;

	return;

free_pages_out:
	for (i = 0; i < nr_pages_ret; i++) {
		WARN_ON(pages[i]->mapping);
		put_page(pages[i]);
	}
	kfree(pages);
}

static void free_async_extent_pages(struct async_extent *async_extent)
{
	int i;

	if (!async_extent->pages)
		return;

	for (i = 0; i < async_extent->nr_pages; i++) {
		WARN_ON(async_extent->pages[i]->mapping);
		put_page(async_extent->pages[i]);
	}
	kfree(async_extent->pages);
	async_extent->nr_pages = 0;
	async_extent->pages = NULL;
}

/*
 * phase two of compressed writeback. This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued. We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline void submit_compressed_extents(struct inode *inode,
					       struct async_cow *async_cow)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct async_extent *async_extent;
	u64 alloc_hint = 0;
	struct btrfs_key ins;
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree;
	int ret = 0;

again:
	while (!list_empty(&async_cow->extents)) {
		async_extent = list_entry(async_cow->extents.next,
					  struct async_extent, list);
		list_del(&async_extent->list);

		io_tree = &BTRFS_I(inode)->io_tree;

retry:
		/* did the compression code fall back to uncompressed IO?
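		 * (extents queued by the cleanup_and_bail_uncompressed path
		 * in compress_file_range() come through with a NULL pages
		 * array)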
		 */
		if (!async_extent->pages) {
			int page_started = 0;
			unsigned long nr_written = 0;

			lock_extent(io_tree, async_extent->start,
				    async_extent->start +
				    async_extent->ram_size - 1);

			/* allocate blocks */
			ret = cow_file_range(inode, async_cow->locked_page,
					     async_extent->start,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     &page_started, &nr_written, 0,
					     NULL);

			/* JDM XXX */

			/*
			 * if page_started, cow_file_range inserted an
			 * inline extent and took care of all the unlocking
			 * and IO for us. Otherwise, we need to submit
			 * all those pages down to the drive.
			 */
			if (!page_started && !ret)
				extent_write_locked_range(io_tree,
						  inode, async_extent->start,
						  async_extent->start +
						  async_extent->ram_size - 1,
						  btrfs_get_extent,
						  WB_SYNC_ALL);
			else if (ret)
				unlock_page(async_cow->locked_page);
			kfree(async_extent);
			cond_resched();
			continue;
		}

		lock_extent(io_tree, async_extent->start,
			    async_extent->start + async_extent->ram_size - 1);

		ret = btrfs_reserve_extent(root, async_extent->ram_size,
					   async_extent->compressed_size,
					   async_extent->compressed_size,
					   0, alloc_hint, &ins, 1, 1);
		if (ret) {
			free_async_extent_pages(async_extent);

			if (ret == -ENOSPC) {
				unlock_extent(io_tree, async_extent->start,
					      async_extent->start +
					      async_extent->ram_size - 1);

				/*
				 * we need to redirty the pages if we decide to
				 * fallback to uncompressed IO, otherwise we
				 * will not submit these pages down to lower
				 * layers.
				 */
				extent_range_redirty_for_io(inode,
						async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1);

				goto retry;
			}
			goto out_free;
		}
		/*
		 * here we're doing allocation and writeback of the
		 * compressed pages
		 */
		btrfs_drop_extent_cache(inode, async_extent->start,
					async_extent->start +
					async_extent->ram_size - 1, 0);

		em = alloc_extent_map();
		if (!em) {
			ret = -ENOMEM;
			goto out_free_reserve;
		}
		em->start = async_extent->start;
		em->len = async_extent->ram_size;
		em->orig_start = em->start;
		em->mod_start = em->start;
		em->mod_len = em->len;

		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->orig_block_len = ins.offset;
		em->ram_bytes = async_extent->ram_size;
		em->bdev = fs_info->fs_devices->latest_bdev;
		em->compress_type = async_extent->compress_type;
		set_bit(EXTENT_FLAG_PINNED, &em->flags);
		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
		em->generation = -1;

		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em, 1);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST) {
				free_extent_map(em);
				break;
			}
			btrfs_drop_extent_cache(inode, async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1, 0);
		}

		if (ret)
			goto out_free_reserve;

		ret = btrfs_add_ordered_extent_compress(inode,
						async_extent->start,
						ins.objectid,
						async_extent->ram_size,
						ins.offset,
						BTRFS_ORDERED_COMPRESSED,
						async_extent->compress_type);
		if (ret) {
			btrfs_drop_extent_cache(inode, async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1, 0);
			goto out_free_reserve;
		}
		btrfs_dec_block_group_reservations(fs_info, ins.objectid);

		/*
		 * clear dirty, set writeback
		 * and unlock the pages.
		 */
		extent_clear_unlock_delalloc(inode, async_extent->start,
				async_extent->start +
				async_extent->ram_size - 1,
				async_extent->start +
				async_extent->ram_size - 1,
				NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
				PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				PAGE_SET_WRITEBACK);
		ret = btrfs_submit_compressed_write(inode,
				    async_extent->start,
				    async_extent->ram_size,
				    ins.objectid,
				    ins.offset, async_extent->pages,
				    async_extent->nr_pages);
		if (ret) {
			struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
			struct page *p = async_extent->pages[0];
			const u64 start = async_extent->start;
			const u64 end = start + async_extent->ram_size - 1;

			p->mapping = inode->i_mapping;
			tree->ops->writepage_end_io_hook(p, start, end,
							 NULL, 0);
			p->mapping = NULL;
			extent_clear_unlock_delalloc(inode, start, end, end,
						     NULL, 0,
						     PAGE_END_WRITEBACK |
						     PAGE_SET_ERROR);
			free_async_extent_pages(async_extent);
		}
		alloc_hint = ins.objectid + ins.offset;
		kfree(async_extent);
		cond_resched();
	}
	return;
out_free_reserve:
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
	extent_clear_unlock_delalloc(inode, async_extent->start,
				     async_extent->start +
				     async_extent->ram_size - 1,
				     async_extent->start +
				     async_extent->ram_size - 1,
				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
				     PAGE_SET_ERROR);
	free_async_extent_pages(async_extent);
	kfree(async_extent);
	goto again;
}

static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
				      u64 num_bytes)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *em;
	u64 alloc_hint = 0;

	read_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, start, num_bytes);
	if (em) {
		/*
		 * if block start isn't an actual block number then find the
		 * first block in this inode and use that as a hint. If that
		 * block is also bogus then just don't worry about it.
		 */
		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
			free_extent_map(em);
			em = search_extent_mapping(em_tree, 0, 0);
			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
				alloc_hint = em->block_start;
			if (em)
				free_extent_map(em);
		} else {
			alloc_hint = em->block_start;
			free_extent_map(em);
		}
	}
	read_unlock(&em_tree->lock);

	return alloc_hint;
}

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the call backs end up in this code. The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already. We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it. It may be clean and already done with
 * IO when we return.
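 *
 * In the current code that only happens when the whole range was written as
 * an inline extent; cow_file_range() then unlocks the pages and ends
 * writeback on them itself.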
 */
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, u64 delalloc_end,
				   int *page_started, unsigned long *nr_written,
				   int unlock, struct btrfs_dedupe_hash *hash)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 alloc_hint = 0;
	u64 num_bytes;
	unsigned long ram_size;
	u64 disk_num_bytes;
	u64 cur_alloc_size;
	u64 blocksize = fs_info->sectorsize;
	struct btrfs_key ins;
	struct extent_map *em;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	int ret = 0;

	if (btrfs_is_free_space_inode(inode)) {
		WARN_ON_ONCE(1);
		ret = -EINVAL;
		goto out_unlock;
	}

	num_bytes = ALIGN(end - start + 1, blocksize);
	num_bytes = max(blocksize, num_bytes);
	disk_num_bytes = num_bytes;

	/* if this is a small write inside eof, kick off defrag */
	if (num_bytes < SZ_64K &&
	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode);

	if (start == 0) {
		/* lets try to make an inline extent */
		ret = cow_file_range_inline(root, inode, start, end, 0, 0,
					    NULL);
		if (ret == 0) {
			extent_clear_unlock_delalloc(inode, start, end,
				     delalloc_end, NULL,
				     EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DEFRAG, PAGE_UNLOCK |
				     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
				     PAGE_END_WRITEBACK);
			btrfs_free_reserved_data_space_noquota(inode, start,
						end - start + 1);
			*nr_written = *nr_written +
			     (end - start + PAGE_SIZE) / PAGE_SIZE;
			*page_started = 1;
			goto out;
		} else if (ret < 0) {
			goto out_unlock;
		}
	}

	BUG_ON(disk_num_bytes >
	       btrfs_super_total_bytes(fs_info->super_copy));

	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);

	while (disk_num_bytes > 0) {
		unsigned long op;

		cur_alloc_size = disk_num_bytes;
		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
					   fs_info->sectorsize, 0, alloc_hint,
					   &ins, 1, 1);
		if (ret < 0)
			goto out_unlock;

		em = alloc_extent_map();
		if (!em) {
			ret = -ENOMEM;
			goto out_reserve;
		}
		em->start = start;
		em->orig_start = em->start;
		ram_size = ins.offset;
		em->len = ins.offset;
		em->mod_start = em->start;
		em->mod_len = em->len;

		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->orig_block_len = ins.offset;
		em->ram_bytes = ram_size;
		em->bdev = fs_info->fs_devices->latest_bdev;
		set_bit(EXTENT_FLAG_PINNED, &em->flags);
		em->generation = -1;

		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em, 1);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST) {
				free_extent_map(em);
				break;
			}
			btrfs_drop_extent_cache(inode, start,
						start + ram_size - 1, 0);
		}
		if (ret)
			goto out_reserve;

		cur_alloc_size = ins.offset;
		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
					       ram_size, cur_alloc_size, 0);
		if (ret)
			goto out_drop_extent_cache;

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			ret = btrfs_reloc_clone_csums(inode, start,
						      cur_alloc_size);
			if (ret)
				goto out_drop_extent_cache;
		}

		btrfs_dec_block_group_reservations(fs_info, ins.objectid);

		if (disk_num_bytes < cur_alloc_size)
			break;

		/* we're not doing compressed IO, don't unlock the first
		 * page (which the caller expects to stay locked), don't
		 * clear any dirty bits and don't set any writeback bits
		 *
		 * Do set the Private2 bit so we know this page was properly
		 * setup for writepage
		 */
		op = unlock ? PAGE_UNLOCK : 0;
		op |= PAGE_SET_PRIVATE2;

		extent_clear_unlock_delalloc(inode, start,
					     start + ram_size - 1,
					     delalloc_end, locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC,
					     op);
		disk_num_bytes -= cur_alloc_size;
		num_bytes -= cur_alloc_size;
		alloc_hint = ins.objectid + ins.offset;
		start += cur_alloc_size;
	}
out:
	return ret;

out_drop_extent_cache:
	btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
out_reserve:
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
	extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
				     locked_page,
				     EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
				     EXTENT_DELALLOC | EXTENT_DEFRAG,
				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
	goto out;
}

/*
 * work queue call back to started compression on a file and pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	int num_added = 0;
	async_cow = container_of(work, struct async_cow, work);

	compress_file_range(async_cow->inode, async_cow->locked_page,
			    async_cow->start, async_cow->end, async_cow,
			    &num_added);
	if (num_added == 0) {
		btrfs_add_delayed_iput(async_cow->inode);
		async_cow->inode = NULL;
	}
}

/*
 * work queue call back to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
	struct btrfs_fs_info *fs_info;
	struct async_cow *async_cow;
	struct btrfs_root *root;
	unsigned long nr_pages;

	async_cow = container_of(work, struct async_cow, work);

	root = async_cow->root;
	fs_info = root->fs_info;
	nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
		PAGE_SHIFT;

	/*
	 * atomic_sub_return implies a barrier for waitqueue_active
	 */
	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
	    5 * SZ_1M &&
	    waitqueue_active(&fs_info->async_submit_wait))
		wake_up(&fs_info->async_submit_wait);

	if (async_cow->inode)
		submit_compressed_extents(async_cow->inode, async_cow);
}

static noinline void async_cow_free(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	async_cow = container_of(work, struct async_cow, work);
	if (async_cow->inode)
		btrfs_add_delayed_iput(async_cow->inode);
	kfree(async_cow);
}

static int cow_file_range_async(struct inode *inode, struct page *locked_page,
				u64 start, u64 end, int *page_started,
				unsigned long *nr_written)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct async_cow *async_cow;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	unsigned long nr_pages;
	u64 cur_end;
	int limit = 10 * SZ_1M;

	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
			 1, 0, NULL, GFP_NOFS);
	while (start < end) {
		async_cow =
			kmalloc(sizeof(*async_cow), GFP_NOFS);
		BUG_ON(!async_cow); /* -ENOMEM */
		async_cow->inode = igrab(inode);
		async_cow->root = root;
		async_cow->locked_page = locked_page;
		async_cow->start = start;

		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
		    !btrfs_test_opt(fs_info, FORCE_COMPRESS))
			cur_end = end;
		else
			cur_end = min(end, start + SZ_512K - 1);

		async_cow->end = cur_end;
		INIT_LIST_HEAD(&async_cow->extents);

		btrfs_init_work(&async_cow->work,
				btrfs_delalloc_helper,
				async_cow_start, async_cow_submit,
				async_cow_free);

		nr_pages = (cur_end - start + PAGE_SIZE) >>
			PAGE_SHIFT;
		atomic_add(nr_pages, &fs_info->async_delalloc_pages);

		btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work);

		if (atomic_read(&fs_info->async_delalloc_pages) > limit) {
			wait_event(fs_info->async_submit_wait,
				   (atomic_read(&fs_info->async_delalloc_pages) <
				    limit));
		}

		while (atomic_read(&fs_info->async_submit_draining) &&
		       atomic_read(&fs_info->async_delalloc_pages)) {
			wait_event(fs_info->async_submit_wait,
				   (atomic_read(&fs_info->async_delalloc_pages) ==
				    0));
		}

		*nr_written += nr_pages;
		start = cur_end + 1;
	}
	*page_started = 1;
	return 0;
}

static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
					u64 bytenr, u64 num_bytes)
{
	int ret;
	struct btrfs_ordered_sum *sums;
	LIST_HEAD(list);

	ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
				       bytenr + num_bytes - 1, &list, 0);
	if (ret == 0 && list_empty(&list))
		return 0;

	while (!list_empty(&list)) {
		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
		list_del(&sums->list);
		kfree(sums);
	}
	return 1;
}

/*
 * when nocow writeback calls back. This checks for snapshots or COW copies
 * of the extents that exist in the file, and COWs the file as required.
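 * Ranges that do need COW are gathered up via cow_start/cur_offset and
 * handed to cow_file_range().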
 *
 * If no cow copies or snapshots exist, we write directly to the existing
 * blocks on disk
 */
static noinline int run_delalloc_nocow(struct inode *inode,
				       struct page *locked_page,
			      u64 start, u64 end, int *page_started, int force,
			      unsigned long *nr_written)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key found_key;
	u64 cow_start;
	u64 cur_offset;
	u64 extent_end;
	u64 extent_offset;
	u64 disk_bytenr;
	u64 num_bytes;
	u64 disk_num_bytes;
	u64 ram_bytes;
	int extent_type;
	int ret, err;
	int type;
	int nocow;
	int check_prev = 1;
	bool nolock;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path) {
		extent_clear_unlock_delalloc(inode, start, end, end,
					     locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC |
					     EXTENT_DO_ACCOUNTING |
					     EXTENT_DEFRAG, PAGE_UNLOCK |
					     PAGE_CLEAR_DIRTY |
					     PAGE_SET_WRITEBACK |
					     PAGE_END_WRITEBACK);
		return -ENOMEM;
	}

	nolock = btrfs_is_free_space_inode(inode);

	if (nolock)
		trans = btrfs_join_transaction_nolock(root);
	else
		trans = btrfs_join_transaction(root);

	if (IS_ERR(trans)) {
		extent_clear_unlock_delalloc(inode, start, end, end,
					     locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC |
					     EXTENT_DO_ACCOUNTING |
					     EXTENT_DEFRAG, PAGE_UNLOCK |
					     PAGE_CLEAR_DIRTY |
					     PAGE_SET_WRITEBACK |
					     PAGE_END_WRITEBACK);
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	trans->block_rsv = &fs_info->delalloc_block_rsv;

	cow_start = (u64)-1;
	cur_offset = start;
	while (1) {
		ret = btrfs_lookup_file_extent(trans, root, path, ino,
					       cur_offset, 0);
		if (ret < 0)
			goto error;
		if (ret > 0 && path->slots[0] > 0 && check_prev) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0] - 1);
			if (found_key.objectid == ino &&
			    found_key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		check_prev = 0;
next_slot:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto error;
			if (ret > 0)
				break;
			leaf = path->nodes[0];
		}

		nocow = 0;
		disk_bytenr = 0;
		num_bytes = 0;
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		if (found_key.objectid > ino)
			break;
		if (WARN_ON_ONCE(found_key.objectid < ino) ||
		    found_key.type < BTRFS_EXTENT_DATA_KEY) {
			path->slots[0]++;
			goto next_slot;
		}
		if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
		    found_key.offset > end)
			break;

		if (found_key.offset > cur_offset) {
			extent_end = found_key.offset;
			extent_type = 0;
			goto out_check;
		}

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end =
				found_key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
			disk_num_bytes =
				btrfs_file_extent_disk_num_bytes(leaf, fi);
			if (extent_end <= start) {
				path->slots[0]++;
				goto next_slot;
			}
			if (disk_bytenr == 0)
				goto out_check;
			if (btrfs_file_extent_compression(leaf, fi) ||
			    btrfs_file_extent_encryption(leaf, fi) ||
			    btrfs_file_extent_other_encoding(leaf, fi))
				goto out_check;
			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
				goto out_check;
			if (btrfs_extent_readonly(fs_info, disk_bytenr))
				goto out_check;
			if (btrfs_cross_ref_exist(trans, root, ino,
						  found_key.offset -
						  extent_offset, disk_bytenr))
				goto out_check;
			disk_bytenr += extent_offset;
			disk_bytenr += cur_offset - found_key.offset;
			num_bytes = min(end + 1, extent_end) - cur_offset;
			/*
			 * if there are pending snapshots for this root,
			 * we fall back to the common COW path.
			 */
			if (!nolock) {
				err = btrfs_start_write_no_snapshoting(root);
				if (!err)
					goto out_check;
			}
			/*
			 * force cow if csum exists in the range.
			 * this ensures that csums for a given extent are
			 * either valid or do not exist.
			 */
			if (csum_exist_in_range(fs_info, disk_bytenr,
						num_bytes))
				goto out_check;
			if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
				goto out_check;
			nocow = 1;
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = found_key.offset +
				btrfs_file_extent_inline_len(leaf,
						     path->slots[0], fi);
			extent_end = ALIGN(extent_end,
					   fs_info->sectorsize);
		} else {
			BUG_ON(1);
		}
out_check:
		if (extent_end <= start) {
			path->slots[0]++;
			if (!nolock && nocow)
				btrfs_end_write_no_snapshoting(root);
			if (nocow)
				btrfs_dec_nocow_writers(fs_info, disk_bytenr);
			goto next_slot;
		}
		if (!nocow) {
			if (cow_start == (u64)-1)
				cow_start = cur_offset;
			cur_offset = extent_end;
			if (cur_offset > end)
				break;
			path->slots[0]++;
			goto next_slot;
		}

		btrfs_release_path(path);
		if (cow_start != (u64)-1) {
			ret = cow_file_range(inode, locked_page,
					     cow_start, found_key.offset - 1,
					     end, page_started, nr_written, 1,
					     NULL);
			if (ret) {
				if (!nolock && nocow)
					btrfs_end_write_no_snapshoting(root);
				if (nocow)
					btrfs_dec_nocow_writers(fs_info,
								disk_bytenr);
				goto error;
			}
			cow_start = (u64)-1;
		}

		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			struct extent_map *em;
			struct extent_map_tree *em_tree;
			em_tree = &BTRFS_I(inode)->extent_tree;
			em = alloc_extent_map();
			BUG_ON(!em); /* -ENOMEM */
			em->start = cur_offset;
			em->orig_start = found_key.offset - extent_offset;
			em->len = num_bytes;
			em->block_len = num_bytes;
			em->block_start = disk_bytenr;
			em->orig_block_len = disk_num_bytes;
			em->ram_bytes = ram_bytes;
			em->bdev = fs_info->fs_devices->latest_bdev;
			em->mod_start = em->start;
			em->mod_len = em->len;
			set_bit(EXTENT_FLAG_PINNED, &em->flags);
			set_bit(EXTENT_FLAG_FILLING, &em->flags);
			em->generation = -1;
			while (1) {
				write_lock(&em_tree->lock);
				ret = add_extent_mapping(em_tree, em, 1);
				write_unlock(&em_tree->lock);
				if (ret != -EEXIST) {
					free_extent_map(em);
					break;
				}
				btrfs_drop_extent_cache(inode, em->start,
						em->start + em->len - 1, 0);
			}
			type =
				BTRFS_ORDERED_PREALLOC;
		} else {
			type = BTRFS_ORDERED_NOCOW;
		}

		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
					       num_bytes, num_bytes, type);
		if (nocow)
			btrfs_dec_nocow_writers(fs_info, disk_bytenr);
		BUG_ON(ret); /* -ENOMEM */

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			ret = btrfs_reloc_clone_csums(inode, cur_offset,
						      num_bytes);
			if (ret) {
				if (!nolock && nocow)
					btrfs_end_write_no_snapshoting(root);
				goto error;
			}
		}

		extent_clear_unlock_delalloc(inode, cur_offset,
					     cur_offset + num_bytes - 1, end,
					     locked_page, EXTENT_LOCKED |
					     EXTENT_DELALLOC |
					     EXTENT_CLEAR_DATA_RESV,
					     PAGE_UNLOCK | PAGE_SET_PRIVATE2);

		if (!nolock && nocow)
			btrfs_end_write_no_snapshoting(root);
		cur_offset = extent_end;
		if (cur_offset > end)
			break;
	}
	btrfs_release_path(path);

	if (cur_offset <= end && cow_start == (u64)-1) {
		cow_start = cur_offset;
		cur_offset = end;
	}

	if (cow_start != (u64)-1) {
		ret = cow_file_range(inode, locked_page, cow_start, end, end,
				     page_started, nr_written, 1, NULL);
		if (ret)
			goto error;
	}

error:
	err = btrfs_end_transaction(trans);
	if (!ret)
		ret = err;

	if (ret && cur_offset < end)
		extent_clear_unlock_delalloc(inode, cur_offset, end, end,
					     locked_page, EXTENT_LOCKED |
					     EXTENT_DELALLOC | EXTENT_DEFRAG |
					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
					     PAGE_CLEAR_DIRTY |
					     PAGE_SET_WRITEBACK |
					     PAGE_END_WRITEBACK);
	btrfs_free_path(path);
	return ret;
}

static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
{

	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
	    !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
		return 0;

	/*
	 * @defrag_bytes is a hint value, no spinlock held here,
	 * if it is not zero, it means the file is defragging.
	 * Force cow if given extent needs to be defragged.
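	 * The EXTENT_DEFRAG bit test below tells us whether this particular
	 * range is one of the ranges queued for defrag.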
	 */
	if (BTRFS_I(inode)->defrag_bytes &&
	    test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
			   EXTENT_DEFRAG, 0, NULL))
		return 1;

	return 0;
}

/*
 * extent_io.c call back to do delayed allocation processing
 */
static int run_delalloc_range(struct inode *inode, struct page *locked_page,
			      u64 start, u64 end, int *page_started,
			      unsigned long *nr_written)
{
	int ret;
	int force_cow = need_force_cow(inode, start, end);

	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 1, nr_written);
	} else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 0, nr_written);
	} else if (!inode_need_compress(inode)) {
		ret = cow_file_range(inode, locked_page, start, end, end,
				      page_started, nr_written, 1, NULL);
	} else {
		set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
			&BTRFS_I(inode)->runtime_flags);
		ret = cow_file_range_async(inode, locked_page, start, end,
					   page_started, nr_written);
	}
	return ret;
}

static void btrfs_split_extent_hook(struct inode *inode,
				    struct extent_state *orig, u64 split)
{
	u64 size;

	/* not delalloc, ignore it */
	if (!(orig->state & EXTENT_DELALLOC))
		return;

	size = orig->end - orig->start + 1;
	if (size > BTRFS_MAX_EXTENT_SIZE) {
		u64 num_extents;
		u64 new_size;

		/*
		 * See the explanation in btrfs_merge_extent_hook, the same
		 * applies here, just in reverse.
		 */
		new_size = orig->end - split + 1;
		num_extents = div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
					BTRFS_MAX_EXTENT_SIZE);
		new_size = split - orig->start;
		num_extents += div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
					 BTRFS_MAX_EXTENT_SIZE);
		if (div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
			      BTRFS_MAX_EXTENT_SIZE) >= num_extents)
			return;
	}

	spin_lock(&BTRFS_I(inode)->lock);
	BTRFS_I(inode)->outstanding_extents++;
	spin_unlock(&BTRFS_I(inode)->lock);
}

/*
 * extent_io.c merge_extent_hook, used to track merged delayed allocation
 * extents so we can keep track of new extents that are just merged onto old
 * extents, such as when we are doing sequential writes, so we can properly
 * account for the metadata space we'll need.
 */
static void btrfs_merge_extent_hook(struct inode *inode,
				    struct extent_state *new,
				    struct extent_state *other)
{
	u64 new_size, old_size;
	u64 num_extents;

	/* not delalloc, ignore it */
	if (!(other->state & EXTENT_DELALLOC))
		return;

	if (new->start > other->start)
		new_size = new->end - other->start + 1;
	else
		new_size = other->end - new->start + 1;

	/* we're not bigger than the max, unreserve the space and go */
	if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
		spin_lock(&BTRFS_I(inode)->lock);
		BTRFS_I(inode)->outstanding_extents--;
		spin_unlock(&BTRFS_I(inode)->lock);
		return;
	}

	/*
	 * We have to add up either side to figure out how many extents were
	 * accounted for before we merged into one big extent. If the number of
	 * extents we accounted for is <= the amount we need for the new range
	 * then we can return, otherwise drop.
	 * Think of it like this
	 *
	 * [ 4k][MAX_SIZE]
	 *
	 * So we've grown the extent by a MAX_SIZE extent, this would mean we
	 * need 2 outstanding extents, on one side we have 1 and the other side
	 * we have 1 so they are == and we can return. But in this case
	 *
	 * [MAX_SIZE+4k][MAX_SIZE+4k]
	 *
	 * Each range on their own accounts for 2 extents, but merged together
	 * they are only 3 extents worth of accounting, so we need to drop in
	 * this case.
	 */
	old_size = other->end - other->start + 1;
	num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
				BTRFS_MAX_EXTENT_SIZE);
	old_size = new->end - new->start + 1;
	num_extents += div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
				 BTRFS_MAX_EXTENT_SIZE);

	if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
		      BTRFS_MAX_EXTENT_SIZE) >= num_extents)
		return;

	spin_lock(&BTRFS_I(inode)->lock);
	BTRFS_I(inode)->outstanding_extents--;
	spin_unlock(&BTRFS_I(inode)->lock);
}

static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
				      struct inode *inode)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

	spin_lock(&root->delalloc_lock);
	if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
		list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
			      &root->delalloc_inodes);
		set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
			&BTRFS_I(inode)->runtime_flags);
		root->nr_delalloc_inodes++;
		if (root->nr_delalloc_inodes == 1) {
			spin_lock(&fs_info->delalloc_root_lock);
			BUG_ON(!list_empty(&root->delalloc_root));
			list_add_tail(&root->delalloc_root,
				      &fs_info->delalloc_roots);
			spin_unlock(&fs_info->delalloc_root_lock);
		}
	}
	spin_unlock(&root->delalloc_lock);
}

static void btrfs_del_delalloc_inode(struct btrfs_root *root,
				     struct inode *inode)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

	spin_lock(&root->delalloc_lock);
	if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
		list_del_init(&BTRFS_I(inode)->delalloc_inodes);
		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
			  &BTRFS_I(inode)->runtime_flags);
		root->nr_delalloc_inodes--;
		if (!root->nr_delalloc_inodes) {
			spin_lock(&fs_info->delalloc_root_lock);
			BUG_ON(list_empty(&root->delalloc_root));
			list_del_init(&root->delalloc_root);
			spin_unlock(&fs_info->delalloc_root_lock);
		}
	}
	spin_unlock(&root->delalloc_lock);
}

/*
 * extent_io.c set_bit_hook, used to track delayed allocation
 * bytes in this file, and to maintain the list of inodes that
 * have pending delalloc work to be done.
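 *
 * btrfs_clear_bit_hook() below is the mirror image and undoes this
 * accounting when the DELALLOC bit is cleared again.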
 */
static void btrfs_set_bit_hook(struct inode *inode,
			       struct extent_state *state, unsigned *bits)
{

	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

	if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
		WARN_ON(1);
	/*
	 * set_bit and clear bit hooks normally require _irqsave/restore
	 * but in this case, we are only testing for the DELALLOC
	 * bit, which is only set or cleared with irqs on
	 */
	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
		struct btrfs_root *root = BTRFS_I(inode)->root;
		u64 len = state->end + 1 - state->start;
		bool do_list = !btrfs_is_free_space_inode(inode);

		if (*bits & EXTENT_FIRST_DELALLOC) {
			*bits &= ~EXTENT_FIRST_DELALLOC;
		} else {
			spin_lock(&BTRFS_I(inode)->lock);
			BTRFS_I(inode)->outstanding_extents++;
			spin_unlock(&BTRFS_I(inode)->lock);
		}

		/* For sanity tests */
		if (btrfs_is_testing(fs_info))
			return;

		__percpu_counter_add(&fs_info->delalloc_bytes, len,
				     fs_info->delalloc_batch);
		spin_lock(&BTRFS_I(inode)->lock);
		BTRFS_I(inode)->delalloc_bytes += len;
		if (*bits & EXTENT_DEFRAG)
			BTRFS_I(inode)->defrag_bytes += len;
		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
					 &BTRFS_I(inode)->runtime_flags))
			btrfs_add_delalloc_inodes(root, inode);
		spin_unlock(&BTRFS_I(inode)->lock);
	}
}

/*
 * extent_io.c clear_bit_hook, see set_bit_hook for why
 */
static void btrfs_clear_bit_hook(struct inode *inode,
				 struct extent_state *state,
				 unsigned *bits)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	u64 len = state->end + 1 - state->start;
	u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE - 1,
				    BTRFS_MAX_EXTENT_SIZE);

	spin_lock(&BTRFS_I(inode)->lock);
	if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
		BTRFS_I(inode)->defrag_bytes -= len;
	spin_unlock(&BTRFS_I(inode)->lock);

	/*
	 * set_bit and clear bit hooks normally require _irqsave/restore
	 * but in this case, we are only testing for the DELALLOC
	 * bit, which is only set or cleared with irqs on
	 */
	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
		struct btrfs_root *root = BTRFS_I(inode)->root;
		bool do_list = !btrfs_is_free_space_inode(inode);

		if (*bits & EXTENT_FIRST_DELALLOC) {
			*bits &= ~EXTENT_FIRST_DELALLOC;
		} else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
			spin_lock(&BTRFS_I(inode)->lock);
			BTRFS_I(inode)->outstanding_extents -= num_extents;
			spin_unlock(&BTRFS_I(inode)->lock);
		}

		/*
		 * We don't reserve metadata space for space cache inodes so we
		 * don't need to call btrfs_delalloc_release_metadata if there
		 * is an error.
		 */
		if (*bits & EXTENT_DO_ACCOUNTING &&
		    root != fs_info->tree_root)
			btrfs_delalloc_release_metadata(inode, len);

		/* For sanity tests.
		 */
		if (btrfs_is_testing(fs_info))
			return;

		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
		    && do_list && !(state->state & EXTENT_NORESERVE)
		    && (*bits & (EXTENT_DO_ACCOUNTING |
		    EXTENT_CLEAR_DATA_RESV)))
			btrfs_free_reserved_data_space_noquota(inode,
					state->start, len);

		__percpu_counter_add(&fs_info->delalloc_bytes, -len,
				     fs_info->delalloc_batch);
		spin_lock(&BTRFS_I(inode)->lock);
		BTRFS_I(inode)->delalloc_bytes -= len;
		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
			     &BTRFS_I(inode)->runtime_flags))
			btrfs_del_delalloc_inode(root, inode);
		spin_unlock(&BTRFS_I(inode)->lock);
	}
}

/*
 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
 * we don't create bios that span stripes or chunks
 *
 * return 1 if page cannot be merged to bio
 * return 0 if page can be merged to bio
 * return error otherwise
 */
int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
			 size_t size, struct bio *bio,
			 unsigned long bio_flags)
{
	struct inode *inode = page->mapping->host;
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
	u64 length = 0;
	u64 map_length;
	int ret;

	if (bio_flags & EXTENT_BIO_COMPRESSED)
		return 0;

	length = bio->bi_iter.bi_size;
	map_length = length;
	ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
			      NULL, 0);
	if (ret < 0)
		return ret;
	if (map_length < length + size)
		return 1;
	return 0;
}

/*
 * in order to insert checksums into the metadata in large chunks,
 * we wait until bio submission time. All the pages in the bio are
 * checksummed and sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record
 * are inserted into the btree
 */
static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
				    int mirror_num, unsigned long bio_flags,
				    u64 bio_offset)
{
	int ret = 0;

	ret = btrfs_csum_one_bio(inode, bio, 0, 0);
	BUG_ON(ret); /* -ENOMEM */
	return 0;
}

/*
 * in order to insert checksums into the metadata in large chunks,
 * we wait until bio submission time. All the pages in the bio are
 * checksummed and sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record
 * are inserted into the btree
 */
static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio,
				   int mirror_num, unsigned long bio_flags,
				   u64 bio_offset)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	int ret;

	ret = btrfs_map_bio(fs_info, bio, mirror_num, 1);
	if (ret) {
		bio->bi_error = ret;
		bio_endio(bio);
	}
	return ret;
}

/*
 * extent_io.c submission hook.
This does the right thing for csum calculation 1934 * on write, or reading the csums from the tree before a read 1935 */ 1936 static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, 1937 int mirror_num, unsigned long bio_flags, 1938 u64 bio_offset) 1939 { 1940 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 1941 struct btrfs_root *root = BTRFS_I(inode)->root; 1942 enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; 1943 int ret = 0; 1944 int skip_sum; 1945 int async = !atomic_read(&BTRFS_I(inode)->sync_writers); 1946 1947 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 1948 1949 if (btrfs_is_free_space_inode(inode)) 1950 metadata = BTRFS_WQ_ENDIO_FREE_SPACE; 1951 1952 if (bio_op(bio) != REQ_OP_WRITE) { 1953 ret = btrfs_bio_wq_end_io(fs_info, bio, metadata); 1954 if (ret) 1955 goto out; 1956 1957 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1958 ret = btrfs_submit_compressed_read(inode, bio, 1959 mirror_num, 1960 bio_flags); 1961 goto out; 1962 } else if (!skip_sum) { 1963 ret = btrfs_lookup_bio_sums(inode, bio, NULL); 1964 if (ret) 1965 goto out; 1966 } 1967 goto mapit; 1968 } else if (async && !skip_sum) { 1969 /* csum items have already been cloned */ 1970 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 1971 goto mapit; 1972 /* we're doing a write, do the async checksumming */ 1973 ret = btrfs_wq_submit_bio(fs_info, inode, bio, mirror_num, 1974 bio_flags, bio_offset, 1975 __btrfs_submit_bio_start, 1976 __btrfs_submit_bio_done); 1977 goto out; 1978 } else if (!skip_sum) { 1979 ret = btrfs_csum_one_bio(inode, bio, 0, 0); 1980 if (ret) 1981 goto out; 1982 } 1983 1984 mapit: 1985 ret = btrfs_map_bio(fs_info, bio, mirror_num, 0); 1986 1987 out: 1988 if (ret < 0) { 1989 bio->bi_error = ret; 1990 bio_endio(bio); 1991 } 1992 return ret; 1993 } 1994 1995 /* 1996 * given a list of ordered sums record them in the inode. This happens 1997 * at IO completion time based on sums calculated at bio submission time. 
1998 */ 1999 static noinline int add_pending_csums(struct btrfs_trans_handle *trans, 2000 struct inode *inode, u64 file_offset, 2001 struct list_head *list) 2002 { 2003 struct btrfs_ordered_sum *sum; 2004 2005 list_for_each_entry(sum, list, list) { 2006 trans->adding_csums = 1; 2007 btrfs_csum_file_blocks(trans, 2008 BTRFS_I(inode)->root->fs_info->csum_root, sum); 2009 trans->adding_csums = 0; 2010 } 2011 return 0; 2012 } 2013 2014 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 2015 struct extent_state **cached_state, int dedupe) 2016 { 2017 WARN_ON((end & (PAGE_SIZE - 1)) == 0); 2018 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, 2019 cached_state); 2020 } 2021 2022 /* see btrfs_writepage_start_hook for details on why this is required */ 2023 struct btrfs_writepage_fixup { 2024 struct page *page; 2025 struct btrfs_work work; 2026 }; 2027 2028 static void btrfs_writepage_fixup_worker(struct btrfs_work *work) 2029 { 2030 struct btrfs_writepage_fixup *fixup; 2031 struct btrfs_ordered_extent *ordered; 2032 struct extent_state *cached_state = NULL; 2033 struct page *page; 2034 struct inode *inode; 2035 u64 page_start; 2036 u64 page_end; 2037 int ret; 2038 2039 fixup = container_of(work, struct btrfs_writepage_fixup, work); 2040 page = fixup->page; 2041 again: 2042 lock_page(page); 2043 if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { 2044 ClearPageChecked(page); 2045 goto out_page; 2046 } 2047 2048 inode = page->mapping->host; 2049 page_start = page_offset(page); 2050 page_end = page_offset(page) + PAGE_SIZE - 1; 2051 2052 lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 2053 &cached_state); 2054 2055 /* already ordered? We're done */ 2056 if (PagePrivate2(page)) 2057 goto out; 2058 2059 ordered = btrfs_lookup_ordered_range(inode, page_start, 2060 PAGE_SIZE); 2061 if (ordered) { 2062 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, 2063 page_end, &cached_state, GFP_NOFS); 2064 unlock_page(page); 2065 btrfs_start_ordered_extent(inode, ordered, 1); 2066 btrfs_put_ordered_extent(ordered); 2067 goto again; 2068 } 2069 2070 ret = btrfs_delalloc_reserve_space(inode, page_start, 2071 PAGE_SIZE); 2072 if (ret) { 2073 mapping_set_error(page->mapping, ret); 2074 end_extent_writepage(page, ret, page_start, page_end); 2075 ClearPageChecked(page); 2076 goto out; 2077 } 2078 2079 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state, 2080 0); 2081 ClearPageChecked(page); 2082 set_page_dirty(page); 2083 out: 2084 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, 2085 &cached_state, GFP_NOFS); 2086 out_page: 2087 unlock_page(page); 2088 put_page(page); 2089 kfree(fixup); 2090 } 2091 2092 /* 2093 * There are a few paths in the higher layers of the kernel that directly 2094 * set the page dirty bit without asking the filesystem if it is a 2095 * good idea. This causes problems because we want to make sure COW 2096 * properly happens and the data=ordered rules are followed. 2097 * 2098 * In our case any range that doesn't have the ORDERED bit set 2099 * hasn't been properly setup for IO. We kick off an async process 2100 * to fix it up. The async helper will wait for ordered extents, set 2101 * the delalloc bit and make it safe to write the page. 
2102 */ 2103 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) 2104 { 2105 struct inode *inode = page->mapping->host; 2106 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2107 struct btrfs_writepage_fixup *fixup; 2108 2109 /* this page is properly in the ordered list */ 2110 if (TestClearPagePrivate2(page)) 2111 return 0; 2112 2113 if (PageChecked(page)) 2114 return -EAGAIN; 2115 2116 fixup = kzalloc(sizeof(*fixup), GFP_NOFS); 2117 if (!fixup) 2118 return -EAGAIN; 2119 2120 SetPageChecked(page); 2121 get_page(page); 2122 btrfs_init_work(&fixup->work, btrfs_fixup_helper, 2123 btrfs_writepage_fixup_worker, NULL, NULL); 2124 fixup->page = page; 2125 btrfs_queue_work(fs_info->fixup_workers, &fixup->work); 2126 return -EBUSY; 2127 } 2128 2129 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, 2130 struct inode *inode, u64 file_pos, 2131 u64 disk_bytenr, u64 disk_num_bytes, 2132 u64 num_bytes, u64 ram_bytes, 2133 u8 compression, u8 encryption, 2134 u16 other_encoding, int extent_type) 2135 { 2136 struct btrfs_root *root = BTRFS_I(inode)->root; 2137 struct btrfs_file_extent_item *fi; 2138 struct btrfs_path *path; 2139 struct extent_buffer *leaf; 2140 struct btrfs_key ins; 2141 int extent_inserted = 0; 2142 int ret; 2143 2144 path = btrfs_alloc_path(); 2145 if (!path) 2146 return -ENOMEM; 2147 2148 /* 2149 * we may be replacing one extent in the tree with another. 2150 * The new extent is pinned in the extent map, and we don't want 2151 * to drop it from the cache until it is completely in the btree. 2152 * 2153 * So, tell btrfs_drop_extents to leave this extent in the cache. 2154 * the caller is expected to unpin it and allow it to be merged 2155 * with the others. 2156 */ 2157 ret = __btrfs_drop_extents(trans, root, inode, path, file_pos, 2158 file_pos + num_bytes, NULL, 0, 2159 1, sizeof(*fi), &extent_inserted); 2160 if (ret) 2161 goto out; 2162 2163 if (!extent_inserted) { 2164 ins.objectid = btrfs_ino(inode); 2165 ins.offset = file_pos; 2166 ins.type = BTRFS_EXTENT_DATA_KEY; 2167 2168 path->leave_spinning = 1; 2169 ret = btrfs_insert_empty_item(trans, root, path, &ins, 2170 sizeof(*fi)); 2171 if (ret) 2172 goto out; 2173 } 2174 leaf = path->nodes[0]; 2175 fi = btrfs_item_ptr(leaf, path->slots[0], 2176 struct btrfs_file_extent_item); 2177 btrfs_set_file_extent_generation(leaf, fi, trans->transid); 2178 btrfs_set_file_extent_type(leaf, fi, extent_type); 2179 btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr); 2180 btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes); 2181 btrfs_set_file_extent_offset(leaf, fi, 0); 2182 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); 2183 btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes); 2184 btrfs_set_file_extent_compression(leaf, fi, compression); 2185 btrfs_set_file_extent_encryption(leaf, fi, encryption); 2186 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); 2187 2188 btrfs_mark_buffer_dirty(leaf); 2189 btrfs_release_path(path); 2190 2191 inode_add_bytes(inode, num_bytes); 2192 2193 ins.objectid = disk_bytenr; 2194 ins.offset = disk_num_bytes; 2195 ins.type = BTRFS_EXTENT_ITEM_KEY; 2196 ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid, 2197 btrfs_ino(inode), file_pos, 2198 ram_bytes, &ins); 2199 /* 2200 * Release the reserved range from inode dirty range map, as it is 2201 * already moved into delayed_ref_head 2202 */ 2203 btrfs_qgroup_release_data(inode, file_pos, ram_bytes); 2204 out: 2205 btrfs_free_path(path); 2206 2207 return ret; 2208 } 
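/*
 * The structures below implement snapshot-aware defrag: new_sa_defrag_extent
 * describes the freshly written (defragged) extent, each old_sa_defrag_extent
 * records a pre-existing extent that overlapped the defragged file range, and
 * sa_defrag_extent_backref entries (kept in an rbtree sorted by root, inode
 * and file position) track the file references to those old extents found by
 * backref walking, so relink_file_extents() can point them at the new extent.
 */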
2209 2210 /* snapshot-aware defrag */ 2211 struct sa_defrag_extent_backref { 2212 struct rb_node node; 2213 struct old_sa_defrag_extent *old; 2214 u64 root_id; 2215 u64 inum; 2216 u64 file_pos; 2217 u64 extent_offset; 2218 u64 num_bytes; 2219 u64 generation; 2220 }; 2221 2222 struct old_sa_defrag_extent { 2223 struct list_head list; 2224 struct new_sa_defrag_extent *new; 2225 2226 u64 extent_offset; 2227 u64 bytenr; 2228 u64 offset; 2229 u64 len; 2230 int count; 2231 }; 2232 2233 struct new_sa_defrag_extent { 2234 struct rb_root root; 2235 struct list_head head; 2236 struct btrfs_path *path; 2237 struct inode *inode; 2238 u64 file_pos; 2239 u64 len; 2240 u64 bytenr; 2241 u64 disk_len; 2242 u8 compress_type; 2243 }; 2244 2245 static int backref_comp(struct sa_defrag_extent_backref *b1, 2246 struct sa_defrag_extent_backref *b2) 2247 { 2248 if (b1->root_id < b2->root_id) 2249 return -1; 2250 else if (b1->root_id > b2->root_id) 2251 return 1; 2252 2253 if (b1->inum < b2->inum) 2254 return -1; 2255 else if (b1->inum > b2->inum) 2256 return 1; 2257 2258 if (b1->file_pos < b2->file_pos) 2259 return -1; 2260 else if (b1->file_pos > b2->file_pos) 2261 return 1; 2262 2263 /* 2264 * [------------------------------] ===> (a range of space) 2265 * |<--->| |<---->| =============> (fs/file tree A) 2266 * |<---------------------------->| ===> (fs/file tree B) 2267 * 2268 * A range of space can refer to two file extents in one tree while 2269 * referring to only one file extent in another tree. 2270 * 2271 * So we may process a disk offset more than once (two extents in A) 2272 * and land on the same extent (one extent in B), then insert two 2273 * identical backrefs (both referring to the extent in B). 2274 */ 2275 return 0; 2276 } 2277 2278 static void backref_insert(struct rb_root *root, 2279 struct sa_defrag_extent_backref *backref) 2280 { 2281 struct rb_node **p = &root->rb_node; 2282 struct rb_node *parent = NULL; 2283 struct sa_defrag_extent_backref *entry; 2284 int ret; 2285 2286 while (*p) { 2287 parent = *p; 2288 entry = rb_entry(parent, struct sa_defrag_extent_backref, node); 2289 2290 ret = backref_comp(backref, entry); 2291 if (ret < 0) 2292 p = &(*p)->rb_left; 2293 else 2294 p = &(*p)->rb_right; 2295 } 2296 2297 rb_link_node(&backref->node, parent, p); 2298 rb_insert_color(&backref->node, root); 2299 } 2300 2301 /* 2302 * Note the backref might have changed, and in this case we just return 0.
2303 */ 2304 static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, 2305 void *ctx) 2306 { 2307 struct btrfs_file_extent_item *extent; 2308 struct old_sa_defrag_extent *old = ctx; 2309 struct new_sa_defrag_extent *new = old->new; 2310 struct btrfs_path *path = new->path; 2311 struct btrfs_key key; 2312 struct btrfs_root *root; 2313 struct sa_defrag_extent_backref *backref; 2314 struct extent_buffer *leaf; 2315 struct inode *inode = new->inode; 2316 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2317 int slot; 2318 int ret; 2319 u64 extent_offset; 2320 u64 num_bytes; 2321 2322 if (BTRFS_I(inode)->root->root_key.objectid == root_id && 2323 inum == btrfs_ino(inode)) 2324 return 0; 2325 2326 key.objectid = root_id; 2327 key.type = BTRFS_ROOT_ITEM_KEY; 2328 key.offset = (u64)-1; 2329 2330 root = btrfs_read_fs_root_no_name(fs_info, &key); 2331 if (IS_ERR(root)) { 2332 if (PTR_ERR(root) == -ENOENT) 2333 return 0; 2334 WARN_ON(1); 2335 btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu", 2336 inum, offset, root_id); 2337 return PTR_ERR(root); 2338 } 2339 2340 key.objectid = inum; 2341 key.type = BTRFS_EXTENT_DATA_KEY; 2342 if (offset > (u64)-1 << 32) 2343 key.offset = 0; 2344 else 2345 key.offset = offset; 2346 2347 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2348 if (WARN_ON(ret < 0)) 2349 return ret; 2350 ret = 0; 2351 2352 while (1) { 2353 cond_resched(); 2354 2355 leaf = path->nodes[0]; 2356 slot = path->slots[0]; 2357 2358 if (slot >= btrfs_header_nritems(leaf)) { 2359 ret = btrfs_next_leaf(root, path); 2360 if (ret < 0) { 2361 goto out; 2362 } else if (ret > 0) { 2363 ret = 0; 2364 goto out; 2365 } 2366 continue; 2367 } 2368 2369 path->slots[0]++; 2370 2371 btrfs_item_key_to_cpu(leaf, &key, slot); 2372 2373 if (key.objectid > inum) 2374 goto out; 2375 2376 if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY) 2377 continue; 2378 2379 extent = btrfs_item_ptr(leaf, slot, 2380 struct btrfs_file_extent_item); 2381 2382 if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr) 2383 continue; 2384 2385 /* 2386 * 'offset' refers to the exact key.offset, 2387 * NOT the 'offset' field in btrfs_extent_data_ref, ie. 2388 * (key.offset - extent_offset). 
*/ 2390 if (key.offset != offset) 2391 continue; 2392 2393 extent_offset = btrfs_file_extent_offset(leaf, extent); 2394 num_bytes = btrfs_file_extent_num_bytes(leaf, extent); 2395 2396 if (extent_offset >= old->extent_offset + old->offset + 2397 old->len || extent_offset + num_bytes <= 2398 old->extent_offset + old->offset) 2399 continue; 2400 break; 2401 } 2402 2403 backref = kmalloc(sizeof(*backref), GFP_NOFS); 2404 if (!backref) { 2405 ret = -ENOMEM; 2406 goto out; 2407 } 2408 2409 backref->root_id = root_id; 2410 backref->inum = inum; 2411 backref->file_pos = offset; 2412 backref->num_bytes = num_bytes; 2413 backref->extent_offset = extent_offset; 2414 backref->generation = btrfs_file_extent_generation(leaf, extent); 2415 backref->old = old; 2416 backref_insert(&new->root, backref); 2417 old->count++; 2418 out: 2419 btrfs_release_path(path); 2420 WARN_ON(ret); 2421 return ret; 2422 } 2423 2424 static noinline bool record_extent_backrefs(struct btrfs_path *path, 2425 struct new_sa_defrag_extent *new) 2426 { 2427 struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb); 2428 struct old_sa_defrag_extent *old, *tmp; 2429 int ret; 2430 2431 new->path = path; 2432 2433 list_for_each_entry_safe(old, tmp, &new->head, list) { 2434 ret = iterate_inodes_from_logical(old->bytenr + 2435 old->extent_offset, fs_info, 2436 path, record_one_backref, 2437 old); 2438 if (ret < 0 && ret != -ENOENT) 2439 return false; 2440 2441 /* no backref to be processed for this extent */ 2442 if (!old->count) { 2443 list_del(&old->list); 2444 kfree(old); 2445 } 2446 } 2447 2448 if (list_empty(&new->head)) 2449 return false; 2450 2451 return true; 2452 } 2453 2454 static int relink_is_mergable(struct extent_buffer *leaf, 2455 struct btrfs_file_extent_item *fi, 2456 struct new_sa_defrag_extent *new) 2457 { 2458 if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr) 2459 return 0; 2460 2461 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) 2462 return 0; 2463 2464 if (btrfs_file_extent_compression(leaf, fi) != new->compress_type) 2465 return 0; 2466 2467 if (btrfs_file_extent_encryption(leaf, fi) || 2468 btrfs_file_extent_other_encoding(leaf, fi)) 2469 return 0; 2470 2471 return 1; 2472 } 2473 2474 /* 2475 * Note the backref might have changed, and in this case we just return 0.
2476 */ 2477 static noinline int relink_extent_backref(struct btrfs_path *path, 2478 struct sa_defrag_extent_backref *prev, 2479 struct sa_defrag_extent_backref *backref) 2480 { 2481 struct btrfs_file_extent_item *extent; 2482 struct btrfs_file_extent_item *item; 2483 struct btrfs_ordered_extent *ordered; 2484 struct btrfs_trans_handle *trans; 2485 struct btrfs_root *root; 2486 struct btrfs_key key; 2487 struct extent_buffer *leaf; 2488 struct old_sa_defrag_extent *old = backref->old; 2489 struct new_sa_defrag_extent *new = old->new; 2490 struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb); 2491 struct inode *inode; 2492 struct extent_state *cached = NULL; 2493 int ret = 0; 2494 u64 start; 2495 u64 len; 2496 u64 lock_start; 2497 u64 lock_end; 2498 bool merge = false; 2499 int index; 2500 2501 if (prev && prev->root_id == backref->root_id && 2502 prev->inum == backref->inum && 2503 prev->file_pos + prev->num_bytes == backref->file_pos) 2504 merge = true; 2505 2506 /* step 1: get root */ 2507 key.objectid = backref->root_id; 2508 key.type = BTRFS_ROOT_ITEM_KEY; 2509 key.offset = (u64)-1; 2510 2511 index = srcu_read_lock(&fs_info->subvol_srcu); 2512 2513 root = btrfs_read_fs_root_no_name(fs_info, &key); 2514 if (IS_ERR(root)) { 2515 srcu_read_unlock(&fs_info->subvol_srcu, index); 2516 if (PTR_ERR(root) == -ENOENT) 2517 return 0; 2518 return PTR_ERR(root); 2519 } 2520 2521 if (btrfs_root_readonly(root)) { 2522 srcu_read_unlock(&fs_info->subvol_srcu, index); 2523 return 0; 2524 } 2525 2526 /* step 2: get inode */ 2527 key.objectid = backref->inum; 2528 key.type = BTRFS_INODE_ITEM_KEY; 2529 key.offset = 0; 2530 2531 inode = btrfs_iget(fs_info->sb, &key, root, NULL); 2532 if (IS_ERR(inode)) { 2533 srcu_read_unlock(&fs_info->subvol_srcu, index); 2534 return 0; 2535 } 2536 2537 srcu_read_unlock(&fs_info->subvol_srcu, index); 2538 2539 /* step 3: relink backref */ 2540 lock_start = backref->file_pos; 2541 lock_end = backref->file_pos + backref->num_bytes - 1; 2542 lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end, 2543 &cached); 2544 2545 ordered = btrfs_lookup_first_ordered_extent(inode, lock_end); 2546 if (ordered) { 2547 btrfs_put_ordered_extent(ordered); 2548 goto out_unlock; 2549 } 2550 2551 trans = btrfs_join_transaction(root); 2552 if (IS_ERR(trans)) { 2553 ret = PTR_ERR(trans); 2554 goto out_unlock; 2555 } 2556 2557 key.objectid = backref->inum; 2558 key.type = BTRFS_EXTENT_DATA_KEY; 2559 key.offset = backref->file_pos; 2560 2561 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2562 if (ret < 0) { 2563 goto out_free_path; 2564 } else if (ret > 0) { 2565 ret = 0; 2566 goto out_free_path; 2567 } 2568 2569 extent = btrfs_item_ptr(path->nodes[0], path->slots[0], 2570 struct btrfs_file_extent_item); 2571 2572 if (btrfs_file_extent_generation(path->nodes[0], extent) != 2573 backref->generation) 2574 goto out_free_path; 2575 2576 btrfs_release_path(path); 2577 2578 start = backref->file_pos; 2579 if (backref->extent_offset < old->extent_offset + old->offset) 2580 start += old->extent_offset + old->offset - 2581 backref->extent_offset; 2582 2583 len = min(backref->extent_offset + backref->num_bytes, 2584 old->extent_offset + old->offset + old->len); 2585 len -= max(backref->extent_offset, old->extent_offset + old->offset); 2586 2587 ret = btrfs_drop_extents(trans, root, inode, start, 2588 start + len, 1); 2589 if (ret) 2590 goto out_free_path; 2591 again: 2592 key.objectid = btrfs_ino(inode); 2593 key.type = BTRFS_EXTENT_DATA_KEY; 2594 key.offset = start; 2595 2596 
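/*
 * If the previous backref we relinked belongs to the same root and inode
 * and is contiguous with this one, try to merge: look up the file extent
 * item that ends at 'start' and simply extend it by 'len'. If that item
 * turns out not to be mergable, clear 'merge', release the path and retry
 * from the again label, falling through to the plain insert below.
 */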
path->leave_spinning = 1; 2597 if (merge) { 2598 struct btrfs_file_extent_item *fi; 2599 u64 extent_len; 2600 struct btrfs_key found_key; 2601 2602 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2603 if (ret < 0) 2604 goto out_free_path; 2605 2606 path->slots[0]--; 2607 leaf = path->nodes[0]; 2608 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2609 2610 fi = btrfs_item_ptr(leaf, path->slots[0], 2611 struct btrfs_file_extent_item); 2612 extent_len = btrfs_file_extent_num_bytes(leaf, fi); 2613 2614 if (extent_len + found_key.offset == start && 2615 relink_is_mergable(leaf, fi, new)) { 2616 btrfs_set_file_extent_num_bytes(leaf, fi, 2617 extent_len + len); 2618 btrfs_mark_buffer_dirty(leaf); 2619 inode_add_bytes(inode, len); 2620 2621 ret = 1; 2622 goto out_free_path; 2623 } else { 2624 merge = false; 2625 btrfs_release_path(path); 2626 goto again; 2627 } 2628 } 2629 2630 ret = btrfs_insert_empty_item(trans, root, path, &key, 2631 sizeof(*extent)); 2632 if (ret) { 2633 btrfs_abort_transaction(trans, ret); 2634 goto out_free_path; 2635 } 2636 2637 leaf = path->nodes[0]; 2638 item = btrfs_item_ptr(leaf, path->slots[0], 2639 struct btrfs_file_extent_item); 2640 btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr); 2641 btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len); 2642 btrfs_set_file_extent_offset(leaf, item, start - new->file_pos); 2643 btrfs_set_file_extent_num_bytes(leaf, item, len); 2644 btrfs_set_file_extent_ram_bytes(leaf, item, new->len); 2645 btrfs_set_file_extent_generation(leaf, item, trans->transid); 2646 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); 2647 btrfs_set_file_extent_compression(leaf, item, new->compress_type); 2648 btrfs_set_file_extent_encryption(leaf, item, 0); 2649 btrfs_set_file_extent_other_encoding(leaf, item, 0); 2650 2651 btrfs_mark_buffer_dirty(leaf); 2652 inode_add_bytes(inode, len); 2653 btrfs_release_path(path); 2654 2655 ret = btrfs_inc_extent_ref(trans, fs_info, new->bytenr, 2656 new->disk_len, 0, 2657 backref->root_id, backref->inum, 2658 new->file_pos); /* start - extent_offset */ 2659 if (ret) { 2660 btrfs_abort_transaction(trans, ret); 2661 goto out_free_path; 2662 } 2663 2664 ret = 1; 2665 out_free_path: 2666 btrfs_release_path(path); 2667 path->leave_spinning = 0; 2668 btrfs_end_transaction(trans); 2669 out_unlock: 2670 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, 2671 &cached, GFP_NOFS); 2672 iput(inode); 2673 return ret; 2674 } 2675 2676 static void free_sa_defrag_extent(struct new_sa_defrag_extent *new) 2677 { 2678 struct old_sa_defrag_extent *old, *tmp; 2679 2680 if (!new) 2681 return; 2682 2683 list_for_each_entry_safe(old, tmp, &new->head, list) { 2684 kfree(old); 2685 } 2686 kfree(new); 2687 } 2688 2689 static void relink_file_extents(struct new_sa_defrag_extent *new) 2690 { 2691 struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb); 2692 struct btrfs_path *path; 2693 struct sa_defrag_extent_backref *backref; 2694 struct sa_defrag_extent_backref *prev = NULL; 2695 struct inode *inode; 2696 struct btrfs_root *root; 2697 struct rb_node *node; 2698 int ret; 2699 2700 inode = new->inode; 2701 root = BTRFS_I(inode)->root; 2702 2703 path = btrfs_alloc_path(); 2704 if (!path) 2705 return; 2706 2707 if (!record_extent_backrefs(path, new)) { 2708 btrfs_free_path(path); 2709 goto out; 2710 } 2711 btrfs_release_path(path); 2712 2713 while (1) { 2714 node = rb_first(&new->root); 2715 if (!node) 2716 break; 2717 rb_erase(node, &new->root); 2718 2719 backref = 
rb_entry(node, struct sa_defrag_extent_backref, node); 2720 2721 ret = relink_extent_backref(path, prev, backref); 2722 WARN_ON(ret < 0); 2723 2724 kfree(prev); 2725 2726 if (ret == 1) 2727 prev = backref; 2728 else 2729 prev = NULL; 2730 cond_resched(); 2731 } 2732 kfree(prev); 2733 2734 btrfs_free_path(path); 2735 out: 2736 free_sa_defrag_extent(new); 2737 2738 atomic_dec(&fs_info->defrag_running); 2739 wake_up(&fs_info->transaction_wait); 2740 } 2741 2742 static struct new_sa_defrag_extent * 2743 record_old_file_extents(struct inode *inode, 2744 struct btrfs_ordered_extent *ordered) 2745 { 2746 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2747 struct btrfs_root *root = BTRFS_I(inode)->root; 2748 struct btrfs_path *path; 2749 struct btrfs_key key; 2750 struct old_sa_defrag_extent *old; 2751 struct new_sa_defrag_extent *new; 2752 int ret; 2753 2754 new = kmalloc(sizeof(*new), GFP_NOFS); 2755 if (!new) 2756 return NULL; 2757 2758 new->inode = inode; 2759 new->file_pos = ordered->file_offset; 2760 new->len = ordered->len; 2761 new->bytenr = ordered->start; 2762 new->disk_len = ordered->disk_len; 2763 new->compress_type = ordered->compress_type; 2764 new->root = RB_ROOT; 2765 INIT_LIST_HEAD(&new->head); 2766 2767 path = btrfs_alloc_path(); 2768 if (!path) 2769 goto out_kfree; 2770 2771 key.objectid = btrfs_ino(inode); 2772 key.type = BTRFS_EXTENT_DATA_KEY; 2773 key.offset = new->file_pos; 2774 2775 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2776 if (ret < 0) 2777 goto out_free_path; 2778 if (ret > 0 && path->slots[0] > 0) 2779 path->slots[0]--; 2780 2781 /* find out all the old extents for the file range */ 2782 while (1) { 2783 struct btrfs_file_extent_item *extent; 2784 struct extent_buffer *l; 2785 int slot; 2786 u64 num_bytes; 2787 u64 offset; 2788 u64 end; 2789 u64 disk_bytenr; 2790 u64 extent_offset; 2791 2792 l = path->nodes[0]; 2793 slot = path->slots[0]; 2794 2795 if (slot >= btrfs_header_nritems(l)) { 2796 ret = btrfs_next_leaf(root, path); 2797 if (ret < 0) 2798 goto out_free_path; 2799 else if (ret > 0) 2800 break; 2801 continue; 2802 } 2803 2804 btrfs_item_key_to_cpu(l, &key, slot); 2805 2806 if (key.objectid != btrfs_ino(inode)) 2807 break; 2808 if (key.type != BTRFS_EXTENT_DATA_KEY) 2809 break; 2810 if (key.offset >= new->file_pos + new->len) 2811 break; 2812 2813 extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item); 2814 2815 num_bytes = btrfs_file_extent_num_bytes(l, extent); 2816 if (key.offset + num_bytes < new->file_pos) 2817 goto next; 2818 2819 disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent); 2820 if (!disk_bytenr) 2821 goto next; 2822 2823 extent_offset = btrfs_file_extent_offset(l, extent); 2824 2825 old = kmalloc(sizeof(*old), GFP_NOFS); 2826 if (!old) 2827 goto out_free_path; 2828 2829 offset = max(new->file_pos, key.offset); 2830 end = min(new->file_pos + new->len, key.offset + num_bytes); 2831 2832 old->bytenr = disk_bytenr; 2833 old->extent_offset = extent_offset; 2834 old->offset = offset - key.offset; 2835 old->len = end - offset; 2836 old->new = new; 2837 old->count = 0; 2838 list_add_tail(&old->list, &new->head); 2839 next: 2840 path->slots[0]++; 2841 cond_resched(); 2842 } 2843 2844 btrfs_free_path(path); 2845 atomic_inc(&fs_info->defrag_running); 2846 2847 return new; 2848 2849 out_free_path: 2850 btrfs_free_path(path); 2851 out_kfree: 2852 free_sa_defrag_extent(new); 2853 return NULL; 2854 } 2855 2856 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, 2857 u64 start, u64 len) 2858 { 2859 struct 
btrfs_block_group_cache *cache; 2860 2861 cache = btrfs_lookup_block_group(fs_info, start); 2862 ASSERT(cache); 2863 2864 spin_lock(&cache->lock); 2865 cache->delalloc_bytes -= len; 2866 spin_unlock(&cache->lock); 2867 2868 btrfs_put_block_group(cache); 2869 } 2870 2871 /* as ordered data IO finishes, this gets called so we can finish 2872 * an ordered extent if the range of bytes in the file it covers are 2873 * fully written. 2874 */ 2875 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) 2876 { 2877 struct inode *inode = ordered_extent->inode; 2878 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2879 struct btrfs_root *root = BTRFS_I(inode)->root; 2880 struct btrfs_trans_handle *trans = NULL; 2881 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2882 struct extent_state *cached_state = NULL; 2883 struct new_sa_defrag_extent *new = NULL; 2884 int compress_type = 0; 2885 int ret = 0; 2886 u64 logical_len = ordered_extent->len; 2887 bool nolock; 2888 bool truncated = false; 2889 2890 nolock = btrfs_is_free_space_inode(inode); 2891 2892 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { 2893 ret = -EIO; 2894 goto out; 2895 } 2896 2897 btrfs_free_io_failure_record(inode, ordered_extent->file_offset, 2898 ordered_extent->file_offset + 2899 ordered_extent->len - 1); 2900 2901 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { 2902 truncated = true; 2903 logical_len = ordered_extent->truncated_len; 2904 /* Truncated the entire extent, don't bother adding */ 2905 if (!logical_len) 2906 goto out; 2907 } 2908 2909 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 2910 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ 2911 2912 /* 2913 * For mwrite(mmap + memset to write) case, we still reserve 2914 * space for NOCOW range. 
2915 * As NOCOW won't cause a new delayed ref, just free the space 2916 */ 2917 btrfs_qgroup_free_data(inode, ordered_extent->file_offset, 2918 ordered_extent->len); 2919 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 2920 if (nolock) 2921 trans = btrfs_join_transaction_nolock(root); 2922 else 2923 trans = btrfs_join_transaction(root); 2924 if (IS_ERR(trans)) { 2925 ret = PTR_ERR(trans); 2926 trans = NULL; 2927 goto out; 2928 } 2929 trans->block_rsv = &fs_info->delalloc_block_rsv; 2930 ret = btrfs_update_inode_fallback(trans, root, inode); 2931 if (ret) /* -ENOMEM or corruption */ 2932 btrfs_abort_transaction(trans, ret); 2933 goto out; 2934 } 2935 2936 lock_extent_bits(io_tree, ordered_extent->file_offset, 2937 ordered_extent->file_offset + ordered_extent->len - 1, 2938 &cached_state); 2939 2940 ret = test_range_bit(io_tree, ordered_extent->file_offset, 2941 ordered_extent->file_offset + ordered_extent->len - 1, 2942 EXTENT_DEFRAG, 1, cached_state); 2943 if (ret) { 2944 u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); 2945 if (0 && last_snapshot >= BTRFS_I(inode)->generation) 2946 /* the inode is shared */ 2947 new = record_old_file_extents(inode, ordered_extent); 2948 2949 clear_extent_bit(io_tree, ordered_extent->file_offset, 2950 ordered_extent->file_offset + ordered_extent->len - 1, 2951 EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); 2952 } 2953 2954 if (nolock) 2955 trans = btrfs_join_transaction_nolock(root); 2956 else 2957 trans = btrfs_join_transaction(root); 2958 if (IS_ERR(trans)) { 2959 ret = PTR_ERR(trans); 2960 trans = NULL; 2961 goto out_unlock; 2962 } 2963 2964 trans->block_rsv = &fs_info->delalloc_block_rsv; 2965 2966 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 2967 compress_type = ordered_extent->compress_type; 2968 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 2969 BUG_ON(compress_type); 2970 ret = btrfs_mark_extent_written(trans, inode, 2971 ordered_extent->file_offset, 2972 ordered_extent->file_offset + 2973 logical_len); 2974 } else { 2975 BUG_ON(root == fs_info->tree_root); 2976 ret = insert_reserved_file_extent(trans, inode, 2977 ordered_extent->file_offset, 2978 ordered_extent->start, 2979 ordered_extent->disk_len, 2980 logical_len, logical_len, 2981 compress_type, 0, 0, 2982 BTRFS_FILE_EXTENT_REG); 2983 if (!ret) 2984 btrfs_release_delalloc_bytes(fs_info, 2985 ordered_extent->start, 2986 ordered_extent->disk_len); 2987 } 2988 unpin_extent_cache(&BTRFS_I(inode)->extent_tree, 2989 ordered_extent->file_offset, ordered_extent->len, 2990 trans->transid); 2991 if (ret < 0) { 2992 btrfs_abort_transaction(trans, ret); 2993 goto out_unlock; 2994 } 2995 2996 add_pending_csums(trans, inode, ordered_extent->file_offset, 2997 &ordered_extent->list); 2998 2999 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 3000 ret = btrfs_update_inode_fallback(trans, root, inode); 3001 if (ret) { /* -ENOMEM or corruption */ 3002 btrfs_abort_transaction(trans, ret); 3003 goto out_unlock; 3004 } 3005 ret = 0; 3006 out_unlock: 3007 unlock_extent_cached(io_tree, ordered_extent->file_offset, 3008 ordered_extent->file_offset + 3009 ordered_extent->len - 1, &cached_state, GFP_NOFS); 3010 out: 3011 if (root != fs_info->tree_root) 3012 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 3013 if (trans) 3014 btrfs_end_transaction(trans); 3015 3016 if (ret || truncated) { 3017 u64 start, end; 3018 3019 if (truncated) 3020 start = ordered_extent->file_offset + logical_len; 3021 else 3022 start = ordered_extent->file_offset; 3023 
end = ordered_extent->file_offset + ordered_extent->len - 1; 3024 clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS); 3025 3026 /* Drop the cache for the part of the extent we didn't write. */ 3027 btrfs_drop_extent_cache(inode, start, end, 0); 3028 3029 /* 3030 * If the ordered extent had an IOERR or something else went 3031 * wrong we need to return the space for this ordered extent 3032 * back to the allocator. We only free the extent in the 3033 * truncated case if we didn't write out the extent at all. 3034 */ 3035 if ((ret || !logical_len) && 3036 !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && 3037 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) 3038 btrfs_free_reserved_extent(fs_info, 3039 ordered_extent->start, 3040 ordered_extent->disk_len, 1); 3041 } 3042 3043 3044 /* 3045 * This needs to be done to make sure anybody waiting knows we are done 3046 * updating everything for this ordered extent. 3047 */ 3048 btrfs_remove_ordered_extent(inode, ordered_extent); 3049 3050 /* for snapshot-aware defrag */ 3051 if (new) { 3052 if (ret) { 3053 free_sa_defrag_extent(new); 3054 atomic_dec(&fs_info->defrag_running); 3055 } else { 3056 relink_file_extents(new); 3057 } 3058 } 3059 3060 /* once for us */ 3061 btrfs_put_ordered_extent(ordered_extent); 3062 /* once for the tree */ 3063 btrfs_put_ordered_extent(ordered_extent); 3064 3065 return ret; 3066 } 3067 3068 static void finish_ordered_fn(struct btrfs_work *work) 3069 { 3070 struct btrfs_ordered_extent *ordered_extent; 3071 ordered_extent = container_of(work, struct btrfs_ordered_extent, work); 3072 btrfs_finish_ordered_io(ordered_extent); 3073 } 3074 3075 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, 3076 struct extent_state *state, int uptodate) 3077 { 3078 struct inode *inode = page->mapping->host; 3079 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3080 struct btrfs_ordered_extent *ordered_extent = NULL; 3081 struct btrfs_workqueue *wq; 3082 btrfs_work_func_t func; 3083 3084 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); 3085 3086 ClearPagePrivate2(page); 3087 if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, 3088 end - start + 1, uptodate)) 3089 return 0; 3090 3091 if (btrfs_is_free_space_inode(inode)) { 3092 wq = fs_info->endio_freespace_worker; 3093 func = btrfs_freespace_write_helper; 3094 } else { 3095 wq = fs_info->endio_write_workers; 3096 func = btrfs_endio_write_helper; 3097 } 3098 3099 btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL, 3100 NULL); 3101 btrfs_queue_work(wq, &ordered_extent->work); 3102 3103 return 0; 3104 } 3105 3106 static int __readpage_endio_check(struct inode *inode, 3107 struct btrfs_io_bio *io_bio, 3108 int icsum, struct page *page, 3109 int pgoff, u64 start, size_t len) 3110 { 3111 char *kaddr; 3112 u32 csum_expected; 3113 u32 csum = ~(u32)0; 3114 3115 csum_expected = *(((u32 *)io_bio->csum) + icsum); 3116 3117 kaddr = kmap_atomic(page); 3118 csum = btrfs_csum_data(kaddr + pgoff, csum, len); 3119 btrfs_csum_final(csum, (u8 *)&csum); 3120 if (csum != csum_expected) 3121 goto zeroit; 3122 3123 kunmap_atomic(kaddr); 3124 return 0; 3125 zeroit: 3126 btrfs_warn_rl(BTRFS_I(inode)->root->fs_info, 3127 "csum failed ino %llu off %llu csum %u expected csum %u", 3128 btrfs_ino(inode), start, csum, csum_expected); 3129 memset(kaddr + pgoff, 1, len); 3130 flush_dcache_page(page); 3131 kunmap_atomic(kaddr); 3132 if (csum_expected == 0) 3133 return 0; 3134 return -EIO; 3135 } 3136 3137 /* 3138 * 
when reads are done, we need to check csums to verify the data is correct 3139 * if there's a match, we allow the bio to finish. If not, the code in 3140 * extent_io.c will try to find good copies for us. 3141 */ 3142 static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, 3143 u64 phy_offset, struct page *page, 3144 u64 start, u64 end, int mirror) 3145 { 3146 size_t offset = start - page_offset(page); 3147 struct inode *inode = page->mapping->host; 3148 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3149 struct btrfs_root *root = BTRFS_I(inode)->root; 3150 3151 if (PageChecked(page)) { 3152 ClearPageChecked(page); 3153 return 0; 3154 } 3155 3156 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) 3157 return 0; 3158 3159 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && 3160 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { 3161 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM); 3162 return 0; 3163 } 3164 3165 phy_offset >>= inode->i_sb->s_blocksize_bits; 3166 return __readpage_endio_check(inode, io_bio, phy_offset, page, offset, 3167 start, (size_t)(end - start + 1)); 3168 } 3169 3170 void btrfs_add_delayed_iput(struct inode *inode) 3171 { 3172 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3173 struct btrfs_inode *binode = BTRFS_I(inode); 3174 3175 if (atomic_add_unless(&inode->i_count, -1, 1)) 3176 return; 3177 3178 spin_lock(&fs_info->delayed_iput_lock); 3179 if (binode->delayed_iput_count == 0) { 3180 ASSERT(list_empty(&binode->delayed_iput)); 3181 list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); 3182 } else { 3183 binode->delayed_iput_count++; 3184 } 3185 spin_unlock(&fs_info->delayed_iput_lock); 3186 } 3187 3188 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) 3189 { 3190 3191 spin_lock(&fs_info->delayed_iput_lock); 3192 while (!list_empty(&fs_info->delayed_iputs)) { 3193 struct btrfs_inode *inode; 3194 3195 inode = list_first_entry(&fs_info->delayed_iputs, 3196 struct btrfs_inode, delayed_iput); 3197 if (inode->delayed_iput_count) { 3198 inode->delayed_iput_count--; 3199 list_move_tail(&inode->delayed_iput, 3200 &fs_info->delayed_iputs); 3201 } else { 3202 list_del_init(&inode->delayed_iput); 3203 } 3204 spin_unlock(&fs_info->delayed_iput_lock); 3205 iput(&inode->vfs_inode); 3206 spin_lock(&fs_info->delayed_iput_lock); 3207 } 3208 spin_unlock(&fs_info->delayed_iput_lock); 3209 } 3210 3211 /* 3212 * This is called in transaction commit time. If there are no orphan 3213 * files in the subvolume, it removes orphan item and frees block_rsv 3214 * structure. 
3215 */ 3216 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 3217 struct btrfs_root *root) 3218 { 3219 struct btrfs_fs_info *fs_info = root->fs_info; 3220 struct btrfs_block_rsv *block_rsv; 3221 int ret; 3222 3223 if (atomic_read(&root->orphan_inodes) || 3224 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) 3225 return; 3226 3227 spin_lock(&root->orphan_lock); 3228 if (atomic_read(&root->orphan_inodes)) { 3229 spin_unlock(&root->orphan_lock); 3230 return; 3231 } 3232 3233 if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) { 3234 spin_unlock(&root->orphan_lock); 3235 return; 3236 } 3237 3238 block_rsv = root->orphan_block_rsv; 3239 root->orphan_block_rsv = NULL; 3240 spin_unlock(&root->orphan_lock); 3241 3242 if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) && 3243 btrfs_root_refs(&root->root_item) > 0) { 3244 ret = btrfs_del_orphan_item(trans, fs_info->tree_root, 3245 root->root_key.objectid); 3246 if (ret) 3247 btrfs_abort_transaction(trans, ret); 3248 else 3249 clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, 3250 &root->state); 3251 } 3252 3253 if (block_rsv) { 3254 WARN_ON(block_rsv->size > 0); 3255 btrfs_free_block_rsv(fs_info, block_rsv); 3256 } 3257 } 3258 3259 /* 3260 * This creates an orphan entry for the given inode in case something goes 3261 * wrong in the middle of an unlink/truncate. 3262 * 3263 * NOTE: caller of this function should reserve 5 units of metadata for 3264 * this function. 3265 */ 3266 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) 3267 { 3268 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3269 struct btrfs_root *root = BTRFS_I(inode)->root; 3270 struct btrfs_block_rsv *block_rsv = NULL; 3271 int reserve = 0; 3272 int insert = 0; 3273 int ret; 3274 3275 if (!root->orphan_block_rsv) { 3276 block_rsv = btrfs_alloc_block_rsv(fs_info, 3277 BTRFS_BLOCK_RSV_TEMP); 3278 if (!block_rsv) 3279 return -ENOMEM; 3280 } 3281 3282 spin_lock(&root->orphan_lock); 3283 if (!root->orphan_block_rsv) { 3284 root->orphan_block_rsv = block_rsv; 3285 } else if (block_rsv) { 3286 btrfs_free_block_rsv(fs_info, block_rsv); 3287 block_rsv = NULL; 3288 } 3289 3290 if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 3291 &BTRFS_I(inode)->runtime_flags)) { 3292 #if 0 3293 /* 3294 * For proper ENOSPC handling, we should do orphan 3295 * cleanup when mounting. But this introduces backward 3296 * compatibility issue. 
3297 */ 3298 if (!xchg(&root->orphan_item_inserted, 1)) 3299 insert = 2; 3300 else 3301 insert = 1; 3302 #endif 3303 insert = 1; 3304 atomic_inc(&root->orphan_inodes); 3305 } 3306 3307 if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 3308 &BTRFS_I(inode)->runtime_flags)) 3309 reserve = 1; 3310 spin_unlock(&root->orphan_lock); 3311 3312 /* grab metadata reservation from transaction handle */ 3313 if (reserve) { 3314 ret = btrfs_orphan_reserve_metadata(trans, inode); 3315 ASSERT(!ret); 3316 if (ret) { 3317 atomic_dec(&root->orphan_inodes); 3318 clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 3319 &BTRFS_I(inode)->runtime_flags); 3320 if (insert) 3321 clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 3322 &BTRFS_I(inode)->runtime_flags); 3323 return ret; 3324 } 3325 } 3326 3327 /* insert an orphan item to track this unlinked/truncated file */ 3328 if (insert >= 1) { 3329 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); 3330 if (ret) { 3331 atomic_dec(&root->orphan_inodes); 3332 if (reserve) { 3333 clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 3334 &BTRFS_I(inode)->runtime_flags); 3335 btrfs_orphan_release_metadata(inode); 3336 } 3337 if (ret != -EEXIST) { 3338 clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 3339 &BTRFS_I(inode)->runtime_flags); 3340 btrfs_abort_transaction(trans, ret); 3341 return ret; 3342 } 3343 } 3344 ret = 0; 3345 } 3346 3347 /* insert an orphan item to track subvolume contains orphan files */ 3348 if (insert >= 2) { 3349 ret = btrfs_insert_orphan_item(trans, fs_info->tree_root, 3350 root->root_key.objectid); 3351 if (ret && ret != -EEXIST) { 3352 btrfs_abort_transaction(trans, ret); 3353 return ret; 3354 } 3355 } 3356 return 0; 3357 } 3358 3359 /* 3360 * We have done the truncate/delete so we can go ahead and remove the orphan 3361 * item for this particular inode. 3362 */ 3363 static int btrfs_orphan_del(struct btrfs_trans_handle *trans, 3364 struct inode *inode) 3365 { 3366 struct btrfs_root *root = BTRFS_I(inode)->root; 3367 int delete_item = 0; 3368 int release_rsv = 0; 3369 int ret = 0; 3370 3371 spin_lock(&root->orphan_lock); 3372 if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 3373 &BTRFS_I(inode)->runtime_flags)) 3374 delete_item = 1; 3375 3376 if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 3377 &BTRFS_I(inode)->runtime_flags)) 3378 release_rsv = 1; 3379 spin_unlock(&root->orphan_lock); 3380 3381 if (delete_item) { 3382 atomic_dec(&root->orphan_inodes); 3383 if (trans) 3384 ret = btrfs_del_orphan_item(trans, root, 3385 btrfs_ino(inode)); 3386 } 3387 3388 if (release_rsv) 3389 btrfs_orphan_release_metadata(inode); 3390 3391 return ret; 3392 } 3393 3394 /* 3395 * this cleans up any orphans that may be left on the list from the last use 3396 * of this root. 
3397 */ 3398 int btrfs_orphan_cleanup(struct btrfs_root *root) 3399 { 3400 struct btrfs_fs_info *fs_info = root->fs_info; 3401 struct btrfs_path *path; 3402 struct extent_buffer *leaf; 3403 struct btrfs_key key, found_key; 3404 struct btrfs_trans_handle *trans; 3405 struct inode *inode; 3406 u64 last_objectid = 0; 3407 int ret = 0, nr_unlink = 0, nr_truncate = 0; 3408 3409 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 3410 return 0; 3411 3412 path = btrfs_alloc_path(); 3413 if (!path) { 3414 ret = -ENOMEM; 3415 goto out; 3416 } 3417 path->reada = READA_BACK; 3418 3419 key.objectid = BTRFS_ORPHAN_OBJECTID; 3420 key.type = BTRFS_ORPHAN_ITEM_KEY; 3421 key.offset = (u64)-1; 3422 3423 while (1) { 3424 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3425 if (ret < 0) 3426 goto out; 3427 3428 /* 3429 * if ret == 0 means we found what we were searching for, which 3430 * is weird, but possible, so only screw with path if we didn't 3431 * find the key and see if we have stuff that matches 3432 */ 3433 if (ret > 0) { 3434 ret = 0; 3435 if (path->slots[0] == 0) 3436 break; 3437 path->slots[0]--; 3438 } 3439 3440 /* pull out the item */ 3441 leaf = path->nodes[0]; 3442 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3443 3444 /* make sure the item matches what we want */ 3445 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) 3446 break; 3447 if (found_key.type != BTRFS_ORPHAN_ITEM_KEY) 3448 break; 3449 3450 /* release the path since we're done with it */ 3451 btrfs_release_path(path); 3452 3453 /* 3454 * this is where we are basically btrfs_lookup, without the 3455 * crossing root thing. we store the inode number in the 3456 * offset of the orphan item. 3457 */ 3458 3459 if (found_key.offset == last_objectid) { 3460 btrfs_err(fs_info, 3461 "Error removing orphan entry, stopping orphan cleanup"); 3462 ret = -EINVAL; 3463 goto out; 3464 } 3465 3466 last_objectid = found_key.offset; 3467 3468 found_key.objectid = found_key.offset; 3469 found_key.type = BTRFS_INODE_ITEM_KEY; 3470 found_key.offset = 0; 3471 inode = btrfs_iget(fs_info->sb, &found_key, root, NULL); 3472 ret = PTR_ERR_OR_ZERO(inode); 3473 if (ret && ret != -ENOENT) 3474 goto out; 3475 3476 if (ret == -ENOENT && root == fs_info->tree_root) { 3477 struct btrfs_root *dead_root; 3478 struct btrfs_fs_info *fs_info = root->fs_info; 3479 int is_dead_root = 0; 3480 3481 /* 3482 * this is an orphan in the tree root. Currently these 3483 * could come from 2 sources: 3484 * a) a snapshot deletion in progress 3485 * b) a free space cache inode 3486 * We need to distinguish those two, as the snapshot 3487 * orphan must not get deleted. 3488 * find_dead_roots already ran before us, so if this 3489 * is a snapshot deletion, we should find the root 3490 * in the dead_roots list 3491 */ 3492 spin_lock(&fs_info->trans_lock); 3493 list_for_each_entry(dead_root, &fs_info->dead_roots, 3494 root_list) { 3495 if (dead_root->root_key.objectid == 3496 found_key.objectid) { 3497 is_dead_root = 1; 3498 break; 3499 } 3500 } 3501 spin_unlock(&fs_info->trans_lock); 3502 if (is_dead_root) { 3503 /* prevent this orphan from being found again */ 3504 key.offset = found_key.objectid - 1; 3505 continue; 3506 } 3507 } 3508 /* 3509 * Inode is already gone but the orphan item is still there, 3510 * kill the orphan item. 
3511 */ 3512 if (ret == -ENOENT) { 3513 trans = btrfs_start_transaction(root, 1); 3514 if (IS_ERR(trans)) { 3515 ret = PTR_ERR(trans); 3516 goto out; 3517 } 3518 btrfs_debug(fs_info, "auto deleting %Lu", 3519 found_key.objectid); 3520 ret = btrfs_del_orphan_item(trans, root, 3521 found_key.objectid); 3522 btrfs_end_transaction(trans); 3523 if (ret) 3524 goto out; 3525 continue; 3526 } 3527 3528 /* 3529 * add this inode to the orphan list so btrfs_orphan_del does 3530 * the proper thing when we hit it 3531 */ 3532 set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 3533 &BTRFS_I(inode)->runtime_flags); 3534 atomic_inc(&root->orphan_inodes); 3535 3536 /* if we have links, this was a truncate, lets do that */ 3537 if (inode->i_nlink) { 3538 if (WARN_ON(!S_ISREG(inode->i_mode))) { 3539 iput(inode); 3540 continue; 3541 } 3542 nr_truncate++; 3543 3544 /* 1 for the orphan item deletion. */ 3545 trans = btrfs_start_transaction(root, 1); 3546 if (IS_ERR(trans)) { 3547 iput(inode); 3548 ret = PTR_ERR(trans); 3549 goto out; 3550 } 3551 ret = btrfs_orphan_add(trans, inode); 3552 btrfs_end_transaction(trans); 3553 if (ret) { 3554 iput(inode); 3555 goto out; 3556 } 3557 3558 ret = btrfs_truncate(inode); 3559 if (ret) 3560 btrfs_orphan_del(NULL, inode); 3561 } else { 3562 nr_unlink++; 3563 } 3564 3565 /* this will do delete_inode and everything for us */ 3566 iput(inode); 3567 if (ret) 3568 goto out; 3569 } 3570 /* release the path since we're done with it */ 3571 btrfs_release_path(path); 3572 3573 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 3574 3575 if (root->orphan_block_rsv) 3576 btrfs_block_rsv_release(fs_info, root->orphan_block_rsv, 3577 (u64)-1); 3578 3579 if (root->orphan_block_rsv || 3580 test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { 3581 trans = btrfs_join_transaction(root); 3582 if (!IS_ERR(trans)) 3583 btrfs_end_transaction(trans); 3584 } 3585 3586 if (nr_unlink) 3587 btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink); 3588 if (nr_truncate) 3589 btrfs_debug(fs_info, "truncated %d orphans", nr_truncate); 3590 3591 out: 3592 if (ret) 3593 btrfs_err(fs_info, "could not do orphan cleanup %d", ret); 3594 btrfs_free_path(path); 3595 return ret; 3596 } 3597 3598 /* 3599 * very simple check to peek ahead in the leaf looking for xattrs. If we 3600 * don't find any xattrs, we know there can't be any acls. 
3601 * 3602 * slot is the slot the inode is in, objectid is the objectid of the inode 3603 */ 3604 static noinline int acls_after_inode_item(struct extent_buffer *leaf, 3605 int slot, u64 objectid, 3606 int *first_xattr_slot) 3607 { 3608 u32 nritems = btrfs_header_nritems(leaf); 3609 struct btrfs_key found_key; 3610 static u64 xattr_access = 0; 3611 static u64 xattr_default = 0; 3612 int scanned = 0; 3613 3614 if (!xattr_access) { 3615 xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS, 3616 strlen(XATTR_NAME_POSIX_ACL_ACCESS)); 3617 xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT, 3618 strlen(XATTR_NAME_POSIX_ACL_DEFAULT)); 3619 } 3620 3621 slot++; 3622 *first_xattr_slot = -1; 3623 while (slot < nritems) { 3624 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3625 3626 /* we found a different objectid, there must not be acls */ 3627 if (found_key.objectid != objectid) 3628 return 0; 3629 3630 /* we found an xattr, assume we've got an acl */ 3631 if (found_key.type == BTRFS_XATTR_ITEM_KEY) { 3632 if (*first_xattr_slot == -1) 3633 *first_xattr_slot = slot; 3634 if (found_key.offset == xattr_access || 3635 found_key.offset == xattr_default) 3636 return 1; 3637 } 3638 3639 /* 3640 * we found a key greater than an xattr key, there can't 3641 * be any acls later on 3642 */ 3643 if (found_key.type > BTRFS_XATTR_ITEM_KEY) 3644 return 0; 3645 3646 slot++; 3647 scanned++; 3648 3649 /* 3650 * it goes inode, inode backrefs, xattrs, extents, 3651 * so if there are a ton of hard links to an inode there can 3652 * be a lot of backrefs. Don't waste time searching too hard, 3653 * this is just an optimization 3654 */ 3655 if (scanned >= 8) 3656 break; 3657 } 3658 /* we hit the end of the leaf before we found an xattr or 3659 * something larger than an xattr. 
We have to assume the inode 3660 * has acls 3661 */ 3662 if (*first_xattr_slot == -1) 3663 *first_xattr_slot = slot; 3664 return 1; 3665 } 3666 3667 /* 3668 * read an inode from the btree into the in-memory inode 3669 */ 3670 static int btrfs_read_locked_inode(struct inode *inode) 3671 { 3672 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3673 struct btrfs_path *path; 3674 struct extent_buffer *leaf; 3675 struct btrfs_inode_item *inode_item; 3676 struct btrfs_root *root = BTRFS_I(inode)->root; 3677 struct btrfs_key location; 3678 unsigned long ptr; 3679 int maybe_acls; 3680 u32 rdev; 3681 int ret; 3682 bool filled = false; 3683 int first_xattr_slot; 3684 3685 ret = btrfs_fill_inode(inode, &rdev); 3686 if (!ret) 3687 filled = true; 3688 3689 path = btrfs_alloc_path(); 3690 if (!path) { 3691 ret = -ENOMEM; 3692 goto make_bad; 3693 } 3694 3695 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); 3696 3697 ret = btrfs_lookup_inode(NULL, root, path, &location, 0); 3698 if (ret) { 3699 if (ret > 0) 3700 ret = -ENOENT; 3701 goto make_bad; 3702 } 3703 3704 leaf = path->nodes[0]; 3705 3706 if (filled) 3707 goto cache_index; 3708 3709 inode_item = btrfs_item_ptr(leaf, path->slots[0], 3710 struct btrfs_inode_item); 3711 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 3712 set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); 3713 i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); 3714 i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); 3715 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); 3716 3717 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime); 3718 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime); 3719 3720 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime); 3721 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime); 3722 3723 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime); 3724 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime); 3725 3726 BTRFS_I(inode)->i_otime.tv_sec = 3727 btrfs_timespec_sec(leaf, &inode_item->otime); 3728 BTRFS_I(inode)->i_otime.tv_nsec = 3729 btrfs_timespec_nsec(leaf, &inode_item->otime); 3730 3731 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 3732 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 3733 BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); 3734 3735 inode->i_version = btrfs_inode_sequence(leaf, inode_item); 3736 inode->i_generation = BTRFS_I(inode)->generation; 3737 inode->i_rdev = 0; 3738 rdev = btrfs_inode_rdev(leaf, inode_item); 3739 3740 BTRFS_I(inode)->index_cnt = (u64)-1; 3741 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); 3742 3743 cache_index: 3744 /* 3745 * If we were modified in the current generation and evicted from memory 3746 * and then re-read we need to do a full sync since we don't have any 3747 * idea about which extents were modified before we were evicted from 3748 * cache. 3749 * 3750 * This is required for both inode re-read from disk and delayed inode 3751 * in delayed_nodes_tree. 3752 */ 3753 if (BTRFS_I(inode)->last_trans == fs_info->generation) 3754 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3755 &BTRFS_I(inode)->runtime_flags); 3756 3757 /* 3758 * We don't persist the id of the transaction where an unlink operation 3759 * against the inode was last made. 
So here we assume the inode might 3760 * have been evicted, and therefore the exact value of last_unlink_trans 3761 * lost, and set it to last_trans to avoid metadata inconsistencies 3762 * between the inode and its parent if the inode is fsync'ed and the log 3763 * replayed. For example, in the scenario: 3764 * 3765 * touch mydir/foo 3766 * ln mydir/foo mydir/bar 3767 * sync 3768 * unlink mydir/bar 3769 * echo 2 > /proc/sys/vm/drop_caches # evicts inode 3770 * xfs_io -c fsync mydir/foo 3771 * <power failure> 3772 * mount fs, triggers fsync log replay 3773 * 3774 * We must make sure that when we fsync our inode foo we also log its 3775 * parent inode, otherwise after log replay the parent still has the 3776 * dentry with the "bar" name but our inode foo has a link count of 1 3777 * and doesn't have an inode ref with the name "bar" anymore. 3778 * 3779 * Setting last_unlink_trans to last_trans is a pessimistic approach, 3780 * but it guarantees correctness at the expense of occasional full 3781 * transaction commits on fsync if our inode is a directory, or if our 3782 * inode is not a directory, logging its parent unnecessarily. 3783 */ 3784 BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; 3785 3786 path->slots[0]++; 3787 if (inode->i_nlink != 1 || 3788 path->slots[0] >= btrfs_header_nritems(leaf)) 3789 goto cache_acl; 3790 3791 btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); 3792 if (location.objectid != btrfs_ino(inode)) 3793 goto cache_acl; 3794 3795 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 3796 if (location.type == BTRFS_INODE_REF_KEY) { 3797 struct btrfs_inode_ref *ref; 3798 3799 ref = (struct btrfs_inode_ref *)ptr; 3800 BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); 3801 } else if (location.type == BTRFS_INODE_EXTREF_KEY) { 3802 struct btrfs_inode_extref *extref; 3803 3804 extref = (struct btrfs_inode_extref *)ptr; 3805 BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, 3806 extref); 3807 } 3808 cache_acl: 3809 /* 3810 * try to precache a NULL acl entry for files that don't have 3811 * any xattrs or acls 3812 */ 3813 maybe_acls = acls_after_inode_item(leaf, path->slots[0], 3814 btrfs_ino(inode), &first_xattr_slot); 3815 if (first_xattr_slot != -1) { 3816 path->slots[0] = first_xattr_slot; 3817 ret = btrfs_load_inode_props(inode, path); 3818 if (ret) 3819 btrfs_err(fs_info, 3820 "error loading props for ino %llu (root %llu): %d", 3821 btrfs_ino(inode), 3822 root->root_key.objectid, ret); 3823 } 3824 btrfs_free_path(path); 3825 3826 if (!maybe_acls) 3827 cache_no_acl(inode); 3828 3829 switch (inode->i_mode & S_IFMT) { 3830 case S_IFREG: 3831 inode->i_mapping->a_ops = &btrfs_aops; 3832 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 3833 inode->i_fop = &btrfs_file_operations; 3834 inode->i_op = &btrfs_file_inode_operations; 3835 break; 3836 case S_IFDIR: 3837 inode->i_fop = &btrfs_dir_file_operations; 3838 inode->i_op = &btrfs_dir_inode_operations; 3839 break; 3840 case S_IFLNK: 3841 inode->i_op = &btrfs_symlink_inode_operations; 3842 inode_nohighmem(inode); 3843 inode->i_mapping->a_ops = &btrfs_symlink_aops; 3844 break; 3845 default: 3846 inode->i_op = &btrfs_special_inode_operations; 3847 init_special_inode(inode, inode->i_mode, rdev); 3848 break; 3849 } 3850 3851 btrfs_update_iflags(inode); 3852 return 0; 3853 3854 make_bad: 3855 btrfs_free_path(path); 3856 make_bad_inode(inode); 3857 return ret; 3858 } 3859 3860 /* 3861 * given a leaf and an inode, copy the inode fields into the leaf 3862 */ 3863 static void 
fill_inode_item(struct btrfs_trans_handle *trans, 3864 struct extent_buffer *leaf, 3865 struct btrfs_inode_item *item, 3866 struct inode *inode) 3867 { 3868 struct btrfs_map_token token; 3869 3870 btrfs_init_map_token(&token); 3871 3872 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); 3873 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); 3874 btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size, 3875 &token); 3876 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); 3877 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); 3878 3879 btrfs_set_token_timespec_sec(leaf, &item->atime, 3880 inode->i_atime.tv_sec, &token); 3881 btrfs_set_token_timespec_nsec(leaf, &item->atime, 3882 inode->i_atime.tv_nsec, &token); 3883 3884 btrfs_set_token_timespec_sec(leaf, &item->mtime, 3885 inode->i_mtime.tv_sec, &token); 3886 btrfs_set_token_timespec_nsec(leaf, &item->mtime, 3887 inode->i_mtime.tv_nsec, &token); 3888 3889 btrfs_set_token_timespec_sec(leaf, &item->ctime, 3890 inode->i_ctime.tv_sec, &token); 3891 btrfs_set_token_timespec_nsec(leaf, &item->ctime, 3892 inode->i_ctime.tv_nsec, &token); 3893 3894 btrfs_set_token_timespec_sec(leaf, &item->otime, 3895 BTRFS_I(inode)->i_otime.tv_sec, &token); 3896 btrfs_set_token_timespec_nsec(leaf, &item->otime, 3897 BTRFS_I(inode)->i_otime.tv_nsec, &token); 3898 3899 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), 3900 &token); 3901 btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, 3902 &token); 3903 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); 3904 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); 3905 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); 3906 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); 3907 btrfs_set_token_inode_block_group(leaf, item, 0, &token); 3908 } 3909 3910 /* 3911 * copy everything in the in-memory inode into the btree. 3912 */ 3913 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, 3914 struct btrfs_root *root, struct inode *inode) 3915 { 3916 struct btrfs_inode_item *inode_item; 3917 struct btrfs_path *path; 3918 struct extent_buffer *leaf; 3919 int ret; 3920 3921 path = btrfs_alloc_path(); 3922 if (!path) 3923 return -ENOMEM; 3924 3925 path->leave_spinning = 1; 3926 ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location, 3927 1); 3928 if (ret) { 3929 if (ret > 0) 3930 ret = -ENOENT; 3931 goto failed; 3932 } 3933 3934 leaf = path->nodes[0]; 3935 inode_item = btrfs_item_ptr(leaf, path->slots[0], 3936 struct btrfs_inode_item); 3937 3938 fill_inode_item(trans, leaf, inode_item, inode); 3939 btrfs_mark_buffer_dirty(leaf); 3940 btrfs_set_inode_last_trans(trans, inode); 3941 ret = 0; 3942 failed: 3943 btrfs_free_path(path); 3944 return ret; 3945 } 3946 3947 /* 3948 * copy everything in the in-memory inode into the btree. 3949 */ 3950 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, 3951 struct btrfs_root *root, struct inode *inode) 3952 { 3953 struct btrfs_fs_info *fs_info = root->fs_info; 3954 int ret; 3955 3956 /* 3957 * If the inode is a free space inode, we can deadlock during commit 3958 * if we put it into the delayed code. 
3959 * 3960 * The data relocation inode should also be directly updated 3961 * without delay 3962 */ 3963 if (!btrfs_is_free_space_inode(inode) 3964 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID 3965 && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { 3966 btrfs_update_root_times(trans, root); 3967 3968 ret = btrfs_delayed_update_inode(trans, root, inode); 3969 if (!ret) 3970 btrfs_set_inode_last_trans(trans, inode); 3971 return ret; 3972 } 3973 3974 return btrfs_update_inode_item(trans, root, inode); 3975 } 3976 3977 noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, 3978 struct btrfs_root *root, 3979 struct inode *inode) 3980 { 3981 int ret; 3982 3983 ret = btrfs_update_inode(trans, root, inode); 3984 if (ret == -ENOSPC) 3985 return btrfs_update_inode_item(trans, root, inode); 3986 return ret; 3987 } 3988 3989 /* 3990 * unlink helper that gets used here in inode.c and in the tree logging 3991 * recovery code. It remove a link in a directory with a given name, and 3992 * also drops the back refs in the inode to the directory 3993 */ 3994 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, 3995 struct btrfs_root *root, 3996 struct inode *dir, struct inode *inode, 3997 const char *name, int name_len) 3998 { 3999 struct btrfs_fs_info *fs_info = root->fs_info; 4000 struct btrfs_path *path; 4001 int ret = 0; 4002 struct extent_buffer *leaf; 4003 struct btrfs_dir_item *di; 4004 struct btrfs_key key; 4005 u64 index; 4006 u64 ino = btrfs_ino(inode); 4007 u64 dir_ino = btrfs_ino(dir); 4008 4009 path = btrfs_alloc_path(); 4010 if (!path) { 4011 ret = -ENOMEM; 4012 goto out; 4013 } 4014 4015 path->leave_spinning = 1; 4016 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 4017 name, name_len, -1); 4018 if (IS_ERR(di)) { 4019 ret = PTR_ERR(di); 4020 goto err; 4021 } 4022 if (!di) { 4023 ret = -ENOENT; 4024 goto err; 4025 } 4026 leaf = path->nodes[0]; 4027 btrfs_dir_item_key_to_cpu(leaf, di, &key); 4028 ret = btrfs_delete_one_dir_name(trans, root, path, di); 4029 if (ret) 4030 goto err; 4031 btrfs_release_path(path); 4032 4033 /* 4034 * If we don't have dir index, we have to get it by looking up 4035 * the inode ref, since we get the inode ref, remove it directly, 4036 * it is unnecessary to do delayed deletion. 4037 * 4038 * But if we have dir index, needn't search inode ref to get it. 4039 * Since the inode ref is close to the inode item, it is better 4040 * that we delay to delete it, and just do this deletion when 4041 * we update the inode item. 
4042 */ 4043 if (BTRFS_I(inode)->dir_index) { 4044 ret = btrfs_delayed_delete_inode_ref(inode); 4045 if (!ret) { 4046 index = BTRFS_I(inode)->dir_index; 4047 goto skip_backref; 4048 } 4049 } 4050 4051 ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, 4052 dir_ino, &index); 4053 if (ret) { 4054 btrfs_info(fs_info, 4055 "failed to delete reference to %.*s, inode %llu parent %llu", 4056 name_len, name, ino, dir_ino); 4057 btrfs_abort_transaction(trans, ret); 4058 goto err; 4059 } 4060 skip_backref: 4061 ret = btrfs_delete_delayed_dir_index(trans, fs_info, dir, index); 4062 if (ret) { 4063 btrfs_abort_transaction(trans, ret); 4064 goto err; 4065 } 4066 4067 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, 4068 inode, dir_ino); 4069 if (ret != 0 && ret != -ENOENT) { 4070 btrfs_abort_transaction(trans, ret); 4071 goto err; 4072 } 4073 4074 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, 4075 dir, index); 4076 if (ret == -ENOENT) 4077 ret = 0; 4078 else if (ret) 4079 btrfs_abort_transaction(trans, ret); 4080 err: 4081 btrfs_free_path(path); 4082 if (ret) 4083 goto out; 4084 4085 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 4086 inode_inc_iversion(inode); 4087 inode_inc_iversion(dir); 4088 inode->i_ctime = dir->i_mtime = 4089 dir->i_ctime = current_time(inode); 4090 ret = btrfs_update_inode(trans, root, dir); 4091 out: 4092 return ret; 4093 } 4094 4095 int btrfs_unlink_inode(struct btrfs_trans_handle *trans, 4096 struct btrfs_root *root, 4097 struct inode *dir, struct inode *inode, 4098 const char *name, int name_len) 4099 { 4100 int ret; 4101 ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 4102 if (!ret) { 4103 drop_nlink(inode); 4104 ret = btrfs_update_inode(trans, root, inode); 4105 } 4106 return ret; 4107 } 4108 4109 /* 4110 * helper to start transaction for unlink and rmdir. 4111 * 4112 * unlink and rmdir are special in btrfs, they do not always free space, so 4113 * if we cannot make our reservations the normal way try and see if there is 4114 * plenty of slack room in the global reserve to migrate, otherwise we cannot 4115 * allow the unlink to occur. 
4116 */ 4117 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) 4118 { 4119 struct btrfs_root *root = BTRFS_I(dir)->root; 4120 4121 /* 4122 * 1 for the possible orphan item 4123 * 1 for the dir item 4124 * 1 for the dir index 4125 * 1 for the inode ref 4126 * 1 for the inode 4127 */ 4128 return btrfs_start_transaction_fallback_global_rsv(root, 5, 5); 4129 } 4130 4131 static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 4132 { 4133 struct btrfs_root *root = BTRFS_I(dir)->root; 4134 struct btrfs_trans_handle *trans; 4135 struct inode *inode = d_inode(dentry); 4136 int ret; 4137 4138 trans = __unlink_start_trans(dir); 4139 if (IS_ERR(trans)) 4140 return PTR_ERR(trans); 4141 4142 btrfs_record_unlink_dir(trans, dir, d_inode(dentry), 0); 4143 4144 ret = btrfs_unlink_inode(trans, root, dir, d_inode(dentry), 4145 dentry->d_name.name, dentry->d_name.len); 4146 if (ret) 4147 goto out; 4148 4149 if (inode->i_nlink == 0) { 4150 ret = btrfs_orphan_add(trans, inode); 4151 if (ret) 4152 goto out; 4153 } 4154 4155 out: 4156 btrfs_end_transaction(trans); 4157 btrfs_btree_balance_dirty(root->fs_info); 4158 return ret; 4159 } 4160 4161 int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, 4162 struct btrfs_root *root, 4163 struct inode *dir, u64 objectid, 4164 const char *name, int name_len) 4165 { 4166 struct btrfs_fs_info *fs_info = root->fs_info; 4167 struct btrfs_path *path; 4168 struct extent_buffer *leaf; 4169 struct btrfs_dir_item *di; 4170 struct btrfs_key key; 4171 u64 index; 4172 int ret; 4173 u64 dir_ino = btrfs_ino(dir); 4174 4175 path = btrfs_alloc_path(); 4176 if (!path) 4177 return -ENOMEM; 4178 4179 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 4180 name, name_len, -1); 4181 if (IS_ERR_OR_NULL(di)) { 4182 if (!di) 4183 ret = -ENOENT; 4184 else 4185 ret = PTR_ERR(di); 4186 goto out; 4187 } 4188 4189 leaf = path->nodes[0]; 4190 btrfs_dir_item_key_to_cpu(leaf, di, &key); 4191 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); 4192 ret = btrfs_delete_one_dir_name(trans, root, path, di); 4193 if (ret) { 4194 btrfs_abort_transaction(trans, ret); 4195 goto out; 4196 } 4197 btrfs_release_path(path); 4198 4199 ret = btrfs_del_root_ref(trans, fs_info, objectid, 4200 root->root_key.objectid, dir_ino, 4201 &index, name, name_len); 4202 if (ret < 0) { 4203 if (ret != -ENOENT) { 4204 btrfs_abort_transaction(trans, ret); 4205 goto out; 4206 } 4207 di = btrfs_search_dir_index_item(root, path, dir_ino, 4208 name, name_len); 4209 if (IS_ERR_OR_NULL(di)) { 4210 if (!di) 4211 ret = -ENOENT; 4212 else 4213 ret = PTR_ERR(di); 4214 btrfs_abort_transaction(trans, ret); 4215 goto out; 4216 } 4217 4218 leaf = path->nodes[0]; 4219 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4220 btrfs_release_path(path); 4221 index = key.offset; 4222 } 4223 btrfs_release_path(path); 4224 4225 ret = btrfs_delete_delayed_dir_index(trans, fs_info, dir, index); 4226 if (ret) { 4227 btrfs_abort_transaction(trans, ret); 4228 goto out; 4229 } 4230 4231 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 4232 inode_inc_iversion(dir); 4233 dir->i_mtime = dir->i_ctime = current_time(dir); 4234 ret = btrfs_update_inode_fallback(trans, root, dir); 4235 if (ret) 4236 btrfs_abort_transaction(trans, ret); 4237 out: 4238 btrfs_free_path(path); 4239 return ret; 4240 } 4241 4242 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) 4243 { 4244 struct inode *inode = d_inode(dentry); 4245 int err = 0; 4246 struct btrfs_root *root = BTRFS_I(dir)->root; 4247 struct 
btrfs_trans_handle *trans; 4248 u64 last_unlink_trans; 4249 4250 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) 4251 return -ENOTEMPTY; 4252 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) 4253 return -EPERM; 4254 4255 trans = __unlink_start_trans(dir); 4256 if (IS_ERR(trans)) 4257 return PTR_ERR(trans); 4258 4259 if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 4260 err = btrfs_unlink_subvol(trans, root, dir, 4261 BTRFS_I(inode)->location.objectid, 4262 dentry->d_name.name, 4263 dentry->d_name.len); 4264 goto out; 4265 } 4266 4267 err = btrfs_orphan_add(trans, inode); 4268 if (err) 4269 goto out; 4270 4271 last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; 4272 4273 /* now the directory is empty */ 4274 err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry), 4275 dentry->d_name.name, dentry->d_name.len); 4276 if (!err) { 4277 btrfs_i_size_write(inode, 0); 4278 /* 4279 * Propagate the last_unlink_trans value of the deleted dir to 4280 * its parent directory. This is to prevent an unrecoverable 4281 * log tree in the case we do something like this: 4282 * 1) create dir foo 4283 * 2) create snapshot under dir foo 4284 * 3) delete the snapshot 4285 * 4) rmdir foo 4286 * 5) mkdir foo 4287 * 6) fsync foo or some file inside foo 4288 */ 4289 if (last_unlink_trans >= trans->transid) 4290 BTRFS_I(dir)->last_unlink_trans = last_unlink_trans; 4291 } 4292 out: 4293 btrfs_end_transaction(trans); 4294 btrfs_btree_balance_dirty(root->fs_info); 4295 4296 return err; 4297 } 4298 4299 static int truncate_space_check(struct btrfs_trans_handle *trans, 4300 struct btrfs_root *root, 4301 u64 bytes_deleted) 4302 { 4303 struct btrfs_fs_info *fs_info = root->fs_info; 4304 int ret; 4305 4306 /* 4307 * This is only used to apply pressure to the enospc system, we don't 4308 * intend to use this reservation at all. 4309 */ 4310 bytes_deleted = btrfs_csum_bytes_to_leaves(fs_info, bytes_deleted); 4311 bytes_deleted *= fs_info->nodesize; 4312 ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv, 4313 bytes_deleted, BTRFS_RESERVE_NO_FLUSH); 4314 if (!ret) { 4315 trace_btrfs_space_reservation(fs_info, "transaction", 4316 trans->transid, 4317 bytes_deleted, 1); 4318 trans->bytes_reserved += bytes_deleted; 4319 } 4320 return ret; 4321 4322 } 4323 4324 static int truncate_inline_extent(struct inode *inode, 4325 struct btrfs_path *path, 4326 struct btrfs_key *found_key, 4327 const u64 item_end, 4328 const u64 new_size) 4329 { 4330 struct extent_buffer *leaf = path->nodes[0]; 4331 int slot = path->slots[0]; 4332 struct btrfs_file_extent_item *fi; 4333 u32 size = (u32)(new_size - found_key->offset); 4334 struct btrfs_root *root = BTRFS_I(inode)->root; 4335 4336 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 4337 4338 if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) { 4339 loff_t offset = new_size; 4340 loff_t page_end = ALIGN(offset, PAGE_SIZE); 4341 4342 /* 4343 * Zero out the remaining of the last page of our inline extent, 4344 * instead of directly truncating our inline extent here - that 4345 * would be much more complex (decompressing all the data, then 4346 * compressing the truncated data, which might be bigger than 4347 * the size of the inline extent, resize the extent, etc). 4348 * We release the path because to get the page we might need to 4349 * read the extent item from disk (data not in the page cache). 
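 *
 * As a worked example (assuming a 4K page size, which is an assumption
 * of this illustration, not something the code requires): truncating a
 * compressed inline extent down to new_size == 1000 leaves the
 * compressed item itself alone and instead zeroes the file range
 * [1000, 4095] through btrfs_truncate_block() below, i.e. a length of
 * page_end - offset == 4096 - 1000 == 3096 bytes starting at the new
 * size.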
4350 */ 4351 btrfs_release_path(path); 4352 return btrfs_truncate_block(inode, offset, page_end - offset, 4353 0); 4354 } 4355 4356 btrfs_set_file_extent_ram_bytes(leaf, fi, size); 4357 size = btrfs_file_extent_calc_inline_size(size); 4358 btrfs_truncate_item(root->fs_info, path, size, 1); 4359 4360 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 4361 inode_sub_bytes(inode, item_end + 1 - new_size); 4362 4363 return 0; 4364 } 4365 4366 /* 4367 * this can truncate away extent items, csum items and directory items. 4368 * It starts at a high offset and removes keys until it can't find 4369 * any higher than new_size 4370 * 4371 * csum items that cross the new i_size are truncated to the new size 4372 * as well. 4373 * 4374 * min_type is the minimum key type to truncate down to. If set to 0, this 4375 * will kill all the items on this inode, including the INODE_ITEM_KEY. 4376 */ 4377 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 4378 struct btrfs_root *root, 4379 struct inode *inode, 4380 u64 new_size, u32 min_type) 4381 { 4382 struct btrfs_fs_info *fs_info = root->fs_info; 4383 struct btrfs_path *path; 4384 struct extent_buffer *leaf; 4385 struct btrfs_file_extent_item *fi; 4386 struct btrfs_key key; 4387 struct btrfs_key found_key; 4388 u64 extent_start = 0; 4389 u64 extent_num_bytes = 0; 4390 u64 extent_offset = 0; 4391 u64 item_end = 0; 4392 u64 last_size = new_size; 4393 u32 found_type = (u8)-1; 4394 int found_extent; 4395 int del_item; 4396 int pending_del_nr = 0; 4397 int pending_del_slot = 0; 4398 int extent_type = -1; 4399 int ret; 4400 int err = 0; 4401 u64 ino = btrfs_ino(inode); 4402 u64 bytes_deleted = 0; 4403 bool be_nice = 0; 4404 bool should_throttle = 0; 4405 bool should_end = 0; 4406 4407 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 4408 4409 /* 4410 * for non-free space inodes and ref cows, we want to back off from 4411 * time to time 4412 */ 4413 if (!btrfs_is_free_space_inode(inode) && 4414 test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 4415 be_nice = 1; 4416 4417 path = btrfs_alloc_path(); 4418 if (!path) 4419 return -ENOMEM; 4420 path->reada = READA_BACK; 4421 4422 /* 4423 * We want to drop from the next block forward in case this new size is 4424 * not block aligned since we will be keeping the last block of the 4425 * extent just the way it is. 4426 */ 4427 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || 4428 root == fs_info->tree_root) 4429 btrfs_drop_extent_cache(inode, ALIGN(new_size, 4430 fs_info->sectorsize), 4431 (u64)-1, 0); 4432 4433 /* 4434 * This function is also used to drop the items in the log tree before 4435 * we relog the inode, so if root != BTRFS_I(inode)->root, it means 4436 * it is used to drop the loged items. So we shouldn't kill the delayed 4437 * items. 4438 */ 4439 if (min_type == 0 && root == BTRFS_I(inode)->root) 4440 btrfs_kill_delayed_inode_items(inode); 4441 4442 key.objectid = ino; 4443 key.offset = (u64)-1; 4444 key.type = (u8)-1; 4445 4446 search_again: 4447 /* 4448 * with a 16K leaf size and 128MB extents, you can actually queue 4449 * up a huge file in a single leaf. 
Most of the time that 4450 * bytes_deleted is > 0, it will be huge by the time we get here 4451 */ 4452 if (be_nice && bytes_deleted > SZ_32M) { 4453 if (btrfs_should_end_transaction(trans)) { 4454 err = -EAGAIN; 4455 goto error; 4456 } 4457 } 4458 4459 4460 path->leave_spinning = 1; 4461 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 4462 if (ret < 0) { 4463 err = ret; 4464 goto out; 4465 } 4466 4467 if (ret > 0) { 4468 /* there are no items in the tree for us to truncate, we're 4469 * done 4470 */ 4471 if (path->slots[0] == 0) 4472 goto out; 4473 path->slots[0]--; 4474 } 4475 4476 while (1) { 4477 fi = NULL; 4478 leaf = path->nodes[0]; 4479 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4480 found_type = found_key.type; 4481 4482 if (found_key.objectid != ino) 4483 break; 4484 4485 if (found_type < min_type) 4486 break; 4487 4488 item_end = found_key.offset; 4489 if (found_type == BTRFS_EXTENT_DATA_KEY) { 4490 fi = btrfs_item_ptr(leaf, path->slots[0], 4491 struct btrfs_file_extent_item); 4492 extent_type = btrfs_file_extent_type(leaf, fi); 4493 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 4494 item_end += 4495 btrfs_file_extent_num_bytes(leaf, fi); 4496 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 4497 item_end += btrfs_file_extent_inline_len(leaf, 4498 path->slots[0], fi); 4499 } 4500 item_end--; 4501 } 4502 if (found_type > min_type) { 4503 del_item = 1; 4504 } else { 4505 if (item_end < new_size) 4506 break; 4507 if (found_key.offset >= new_size) 4508 del_item = 1; 4509 else 4510 del_item = 0; 4511 } 4512 found_extent = 0; 4513 /* FIXME, shrink the extent if the ref count is only 1 */ 4514 if (found_type != BTRFS_EXTENT_DATA_KEY) 4515 goto delete; 4516 4517 if (del_item) 4518 last_size = found_key.offset; 4519 else 4520 last_size = new_size; 4521 4522 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 4523 u64 num_dec; 4524 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); 4525 if (!del_item) { 4526 u64 orig_num_bytes = 4527 btrfs_file_extent_num_bytes(leaf, fi); 4528 extent_num_bytes = ALIGN(new_size - 4529 found_key.offset, 4530 fs_info->sectorsize); 4531 btrfs_set_file_extent_num_bytes(leaf, fi, 4532 extent_num_bytes); 4533 num_dec = (orig_num_bytes - 4534 extent_num_bytes); 4535 if (test_bit(BTRFS_ROOT_REF_COWS, 4536 &root->state) && 4537 extent_start != 0) 4538 inode_sub_bytes(inode, num_dec); 4539 btrfs_mark_buffer_dirty(leaf); 4540 } else { 4541 extent_num_bytes = 4542 btrfs_file_extent_disk_num_bytes(leaf, 4543 fi); 4544 extent_offset = found_key.offset - 4545 btrfs_file_extent_offset(leaf, fi); 4546 4547 /* FIXME blocksize != 4096 */ 4548 num_dec = btrfs_file_extent_num_bytes(leaf, fi); 4549 if (extent_start != 0) { 4550 found_extent = 1; 4551 if (test_bit(BTRFS_ROOT_REF_COWS, 4552 &root->state)) 4553 inode_sub_bytes(inode, num_dec); 4554 } 4555 } 4556 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 4557 /* 4558 * we can't truncate inline items that have had 4559 * special encodings 4560 */ 4561 if (!del_item && 4562 btrfs_file_extent_encryption(leaf, fi) == 0 && 4563 btrfs_file_extent_other_encoding(leaf, fi) == 0) { 4564 4565 /* 4566 * Need to release path in order to truncate a 4567 * compressed extent. So delete any accumulated 4568 * extent items so far. 
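 * (The slots tracked in pending_del_slot/pending_del_nr refer to the
 * leaf currently held by the path, so they have to be deleted before
 * the path is released and re-searched.)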
4569 */ 4570 if (btrfs_file_extent_compression(leaf, fi) != 4571 BTRFS_COMPRESS_NONE && pending_del_nr) { 4572 err = btrfs_del_items(trans, root, path, 4573 pending_del_slot, 4574 pending_del_nr); 4575 if (err) { 4576 btrfs_abort_transaction(trans, 4577 err); 4578 goto error; 4579 } 4580 pending_del_nr = 0; 4581 } 4582 4583 err = truncate_inline_extent(inode, path, 4584 &found_key, 4585 item_end, 4586 new_size); 4587 if (err) { 4588 btrfs_abort_transaction(trans, err); 4589 goto error; 4590 } 4591 } else if (test_bit(BTRFS_ROOT_REF_COWS, 4592 &root->state)) { 4593 inode_sub_bytes(inode, item_end + 1 - new_size); 4594 } 4595 } 4596 delete: 4597 if (del_item) { 4598 if (!pending_del_nr) { 4599 /* no pending yet, add ourselves */ 4600 pending_del_slot = path->slots[0]; 4601 pending_del_nr = 1; 4602 } else if (pending_del_nr && 4603 path->slots[0] + 1 == pending_del_slot) { 4604 /* hop on the pending chunk */ 4605 pending_del_nr++; 4606 pending_del_slot = path->slots[0]; 4607 } else { 4608 BUG(); 4609 } 4610 } else { 4611 break; 4612 } 4613 should_throttle = 0; 4614 4615 if (found_extent && 4616 (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || 4617 root == fs_info->tree_root)) { 4618 btrfs_set_path_blocking(path); 4619 bytes_deleted += extent_num_bytes; 4620 ret = btrfs_free_extent(trans, fs_info, extent_start, 4621 extent_num_bytes, 0, 4622 btrfs_header_owner(leaf), 4623 ino, extent_offset); 4624 BUG_ON(ret); 4625 if (btrfs_should_throttle_delayed_refs(trans, fs_info)) 4626 btrfs_async_run_delayed_refs(fs_info, 4627 trans->delayed_ref_updates * 2, 4628 trans->transid, 0); 4629 if (be_nice) { 4630 if (truncate_space_check(trans, root, 4631 extent_num_bytes)) { 4632 should_end = 1; 4633 } 4634 if (btrfs_should_throttle_delayed_refs(trans, 4635 fs_info)) 4636 should_throttle = 1; 4637 } 4638 } 4639 4640 if (found_type == BTRFS_INODE_ITEM_KEY) 4641 break; 4642 4643 if (path->slots[0] == 0 || 4644 path->slots[0] != pending_del_slot || 4645 should_throttle || should_end) { 4646 if (pending_del_nr) { 4647 ret = btrfs_del_items(trans, root, path, 4648 pending_del_slot, 4649 pending_del_nr); 4650 if (ret) { 4651 btrfs_abort_transaction(trans, ret); 4652 goto error; 4653 } 4654 pending_del_nr = 0; 4655 } 4656 btrfs_release_path(path); 4657 if (should_throttle) { 4658 unsigned long updates = trans->delayed_ref_updates; 4659 if (updates) { 4660 trans->delayed_ref_updates = 0; 4661 ret = btrfs_run_delayed_refs(trans, 4662 fs_info, 4663 updates * 2); 4664 if (ret && !err) 4665 err = ret; 4666 } 4667 } 4668 /* 4669 * if we failed to refill our space rsv, bail out 4670 * and let the transaction restart 4671 */ 4672 if (should_end) { 4673 err = -EAGAIN; 4674 goto error; 4675 } 4676 goto search_again; 4677 } else { 4678 path->slots[0]--; 4679 } 4680 } 4681 out: 4682 if (pending_del_nr) { 4683 ret = btrfs_del_items(trans, root, path, pending_del_slot, 4684 pending_del_nr); 4685 if (ret) 4686 btrfs_abort_transaction(trans, ret); 4687 } 4688 error: 4689 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 4690 ASSERT(last_size >= new_size); 4691 if (!err && last_size > new_size) 4692 last_size = new_size; 4693 btrfs_ordered_update_i_size(inode, last_size, NULL); 4694 } 4695 4696 btrfs_free_path(path); 4697 4698 if (be_nice && bytes_deleted > SZ_32M) { 4699 unsigned long updates = trans->delayed_ref_updates; 4700 if (updates) { 4701 trans->delayed_ref_updates = 0; 4702 ret = btrfs_run_delayed_refs(trans, fs_info, 4703 updates * 2); 4704 if (ret && !err) 4705 err = ret; 4706 } 4707 } 4708 return err; 4709 } 
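/*
 * Illustrative caller sketch, not part of the original file; the
 * function name is made up for this example.  btrfs_truncate_inode_items()
 * deliberately bails out with -EAGAIN or -ENOSPC when it wants the
 * transaction restarted, so callers run it in a loop, ending and
 * re-joining the transaction between attempts.  This mirrors the loop
 * in btrfs_evict_inode() further below.
 */
static int example_truncate_all_items(struct btrfs_root *root,
				      struct inode *inode)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	int ret;

	while (1) {
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

		/* new_size == 0 and min_type == 0 drop every item of the inode */
		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
		if (ret != -ENOSPC && ret != -EAGAIN)
			break;

		/* back off: restart the transaction and flush dirty metadata */
		btrfs_end_transaction(trans);
		btrfs_btree_balance_dirty(fs_info);
	}

	btrfs_end_transaction(trans);
	return ret;
}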
4710 4711 /* 4712 * btrfs_truncate_block - read, zero a chunk and write a block 4713 * @inode - inode that we're zeroing 4714 * @from - the offset to start zeroing 4715 * @len - the length to zero, 0 to zero the entire range respective to the 4716 * offset 4717 * @front - zero up to the offset instead of from the offset on 4718 * 4719 * This will find the block for the "from" offset and cow the block and zero the 4720 * part we want to zero. This is used with truncate and hole punching. 4721 */ 4722 int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, 4723 int front) 4724 { 4725 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 4726 struct address_space *mapping = inode->i_mapping; 4727 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 4728 struct btrfs_ordered_extent *ordered; 4729 struct extent_state *cached_state = NULL; 4730 char *kaddr; 4731 u32 blocksize = fs_info->sectorsize; 4732 pgoff_t index = from >> PAGE_SHIFT; 4733 unsigned offset = from & (blocksize - 1); 4734 struct page *page; 4735 gfp_t mask = btrfs_alloc_write_mask(mapping); 4736 int ret = 0; 4737 u64 block_start; 4738 u64 block_end; 4739 4740 if ((offset & (blocksize - 1)) == 0 && 4741 (!len || ((len & (blocksize - 1)) == 0))) 4742 goto out; 4743 4744 ret = btrfs_delalloc_reserve_space(inode, 4745 round_down(from, blocksize), blocksize); 4746 if (ret) 4747 goto out; 4748 4749 again: 4750 page = find_or_create_page(mapping, index, mask); 4751 if (!page) { 4752 btrfs_delalloc_release_space(inode, 4753 round_down(from, blocksize), 4754 blocksize); 4755 ret = -ENOMEM; 4756 goto out; 4757 } 4758 4759 block_start = round_down(from, blocksize); 4760 block_end = block_start + blocksize - 1; 4761 4762 if (!PageUptodate(page)) { 4763 ret = btrfs_readpage(NULL, page); 4764 lock_page(page); 4765 if (page->mapping != mapping) { 4766 unlock_page(page); 4767 put_page(page); 4768 goto again; 4769 } 4770 if (!PageUptodate(page)) { 4771 ret = -EIO; 4772 goto out_unlock; 4773 } 4774 } 4775 wait_on_page_writeback(page); 4776 4777 lock_extent_bits(io_tree, block_start, block_end, &cached_state); 4778 set_page_extent_mapped(page); 4779 4780 ordered = btrfs_lookup_ordered_extent(inode, block_start); 4781 if (ordered) { 4782 unlock_extent_cached(io_tree, block_start, block_end, 4783 &cached_state, GFP_NOFS); 4784 unlock_page(page); 4785 put_page(page); 4786 btrfs_start_ordered_extent(inode, ordered, 1); 4787 btrfs_put_ordered_extent(ordered); 4788 goto again; 4789 } 4790 4791 clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end, 4792 EXTENT_DIRTY | EXTENT_DELALLOC | 4793 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 4794 0, 0, &cached_state, GFP_NOFS); 4795 4796 ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 4797 &cached_state, 0); 4798 if (ret) { 4799 unlock_extent_cached(io_tree, block_start, block_end, 4800 &cached_state, GFP_NOFS); 4801 goto out_unlock; 4802 } 4803 4804 if (offset != blocksize) { 4805 if (!len) 4806 len = blocksize - offset; 4807 kaddr = kmap(page); 4808 if (front) 4809 memset(kaddr + (block_start - page_offset(page)), 4810 0, offset); 4811 else 4812 memset(kaddr + (block_start - page_offset(page)) + offset, 4813 0, len); 4814 flush_dcache_page(page); 4815 kunmap(page); 4816 } 4817 ClearPageChecked(page); 4818 set_page_dirty(page); 4819 unlock_extent_cached(io_tree, block_start, block_end, &cached_state, 4820 GFP_NOFS); 4821 4822 out_unlock: 4823 if (ret) 4824 btrfs_delalloc_release_space(inode, block_start, 4825 blocksize); 4826 unlock_page(page); 4827 put_page(page); 
4828 out: 4829 return ret; 4830 } 4831 4832 static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, 4833 u64 offset, u64 len) 4834 { 4835 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 4836 struct btrfs_trans_handle *trans; 4837 int ret; 4838 4839 /* 4840 * Still need to make sure the inode looks like it's been updated so 4841 * that any holes get logged if we fsync. 4842 */ 4843 if (btrfs_fs_incompat(fs_info, NO_HOLES)) { 4844 BTRFS_I(inode)->last_trans = fs_info->generation; 4845 BTRFS_I(inode)->last_sub_trans = root->log_transid; 4846 BTRFS_I(inode)->last_log_commit = root->last_log_commit; 4847 return 0; 4848 } 4849 4850 /* 4851 * 1 - for the one we're dropping 4852 * 1 - for the one we're adding 4853 * 1 - for updating the inode. 4854 */ 4855 trans = btrfs_start_transaction(root, 3); 4856 if (IS_ERR(trans)) 4857 return PTR_ERR(trans); 4858 4859 ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1); 4860 if (ret) { 4861 btrfs_abort_transaction(trans, ret); 4862 btrfs_end_transaction(trans); 4863 return ret; 4864 } 4865 4866 ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, 4867 0, 0, len, 0, len, 0, 0, 0); 4868 if (ret) 4869 btrfs_abort_transaction(trans, ret); 4870 else 4871 btrfs_update_inode(trans, root, inode); 4872 btrfs_end_transaction(trans); 4873 return ret; 4874 } 4875 4876 /* 4877 * This function puts in dummy file extents for the area we're creating a hole 4878 * for. So if we are truncating this file to a larger size we need to insert 4879 * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for 4880 * the range between oldsize and size 4881 */ 4882 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) 4883 { 4884 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 4885 struct btrfs_root *root = BTRFS_I(inode)->root; 4886 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 4887 struct extent_map *em = NULL; 4888 struct extent_state *cached_state = NULL; 4889 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 4890 u64 hole_start = ALIGN(oldsize, fs_info->sectorsize); 4891 u64 block_end = ALIGN(size, fs_info->sectorsize); 4892 u64 last_byte; 4893 u64 cur_offset; 4894 u64 hole_size; 4895 int err = 0; 4896 4897 /* 4898 * If our size started in the middle of a block we need to zero out the 4899 * rest of the block before we expand the i_size, otherwise we could 4900 * expose stale data. 
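	 *
	 * For example, with a 4K sectorsize (an assumption of this
	 * illustration), growing a file whose old size is 6000 bytes
	 * first zeroes bytes [6000, 8191] of the partially used block:
	 * btrfs_truncate_block() below is called with len == 0, which
	 * means "zero from the offset to the end of its block", and
	 * only then are hole extents inserted from hole_start onwards.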
4901 */ 4902 err = btrfs_truncate_block(inode, oldsize, 0, 0); 4903 if (err) 4904 return err; 4905 4906 if (size <= hole_start) 4907 return 0; 4908 4909 while (1) { 4910 struct btrfs_ordered_extent *ordered; 4911 4912 lock_extent_bits(io_tree, hole_start, block_end - 1, 4913 &cached_state); 4914 ordered = btrfs_lookup_ordered_range(inode, hole_start, 4915 block_end - hole_start); 4916 if (!ordered) 4917 break; 4918 unlock_extent_cached(io_tree, hole_start, block_end - 1, 4919 &cached_state, GFP_NOFS); 4920 btrfs_start_ordered_extent(inode, ordered, 1); 4921 btrfs_put_ordered_extent(ordered); 4922 } 4923 4924 cur_offset = hole_start; 4925 while (1) { 4926 em = btrfs_get_extent(inode, NULL, 0, cur_offset, 4927 block_end - cur_offset, 0); 4928 if (IS_ERR(em)) { 4929 err = PTR_ERR(em); 4930 em = NULL; 4931 break; 4932 } 4933 last_byte = min(extent_map_end(em), block_end); 4934 last_byte = ALIGN(last_byte, fs_info->sectorsize); 4935 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 4936 struct extent_map *hole_em; 4937 hole_size = last_byte - cur_offset; 4938 4939 err = maybe_insert_hole(root, inode, cur_offset, 4940 hole_size); 4941 if (err) 4942 break; 4943 btrfs_drop_extent_cache(inode, cur_offset, 4944 cur_offset + hole_size - 1, 0); 4945 hole_em = alloc_extent_map(); 4946 if (!hole_em) { 4947 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4948 &BTRFS_I(inode)->runtime_flags); 4949 goto next; 4950 } 4951 hole_em->start = cur_offset; 4952 hole_em->len = hole_size; 4953 hole_em->orig_start = cur_offset; 4954 4955 hole_em->block_start = EXTENT_MAP_HOLE; 4956 hole_em->block_len = 0; 4957 hole_em->orig_block_len = 0; 4958 hole_em->ram_bytes = hole_size; 4959 hole_em->bdev = fs_info->fs_devices->latest_bdev; 4960 hole_em->compress_type = BTRFS_COMPRESS_NONE; 4961 hole_em->generation = fs_info->generation; 4962 4963 while (1) { 4964 write_lock(&em_tree->lock); 4965 err = add_extent_mapping(em_tree, hole_em, 1); 4966 write_unlock(&em_tree->lock); 4967 if (err != -EEXIST) 4968 break; 4969 btrfs_drop_extent_cache(inode, cur_offset, 4970 cur_offset + 4971 hole_size - 1, 0); 4972 } 4973 free_extent_map(hole_em); 4974 } 4975 next: 4976 free_extent_map(em); 4977 em = NULL; 4978 cur_offset = last_byte; 4979 if (cur_offset >= block_end) 4980 break; 4981 } 4982 free_extent_map(em); 4983 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, 4984 GFP_NOFS); 4985 return err; 4986 } 4987 4988 static int btrfs_setsize(struct inode *inode, struct iattr *attr) 4989 { 4990 struct btrfs_root *root = BTRFS_I(inode)->root; 4991 struct btrfs_trans_handle *trans; 4992 loff_t oldsize = i_size_read(inode); 4993 loff_t newsize = attr->ia_size; 4994 int mask = attr->ia_valid; 4995 int ret; 4996 4997 /* 4998 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a 4999 * special case where we need to update the times despite not having 5000 * these flags set. For all other operations the VFS set these flags 5001 * explicitly if it wants a timestamp update. 5002 */ 5003 if (newsize != oldsize) { 5004 inode_inc_iversion(inode); 5005 if (!(mask & (ATTR_CTIME | ATTR_MTIME))) 5006 inode->i_ctime = inode->i_mtime = 5007 current_time(inode); 5008 } 5009 5010 if (newsize > oldsize) { 5011 /* 5012 * Don't do an expanding truncate while snapshoting is ongoing. 5013 * This is to ensure the snapshot captures a fully consistent 5014 * state of this file - if the snapshot captures this expanding 5015 * truncation, it must capture all writes that happened before 5016 * this truncation. 
5017 */ 5018 btrfs_wait_for_snapshot_creation(root); 5019 ret = btrfs_cont_expand(inode, oldsize, newsize); 5020 if (ret) { 5021 btrfs_end_write_no_snapshoting(root); 5022 return ret; 5023 } 5024 5025 trans = btrfs_start_transaction(root, 1); 5026 if (IS_ERR(trans)) { 5027 btrfs_end_write_no_snapshoting(root); 5028 return PTR_ERR(trans); 5029 } 5030 5031 i_size_write(inode, newsize); 5032 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); 5033 pagecache_isize_extended(inode, oldsize, newsize); 5034 ret = btrfs_update_inode(trans, root, inode); 5035 btrfs_end_write_no_snapshoting(root); 5036 btrfs_end_transaction(trans); 5037 } else { 5038 5039 /* 5040 * We're truncating a file that used to have good data down to 5041 * zero. Make sure it gets into the ordered flush list so that 5042 * any new writes get down to disk quickly. 5043 */ 5044 if (newsize == 0) 5045 set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, 5046 &BTRFS_I(inode)->runtime_flags); 5047 5048 /* 5049 * 1 for the orphan item we're going to add 5050 * 1 for the orphan item deletion. 5051 */ 5052 trans = btrfs_start_transaction(root, 2); 5053 if (IS_ERR(trans)) 5054 return PTR_ERR(trans); 5055 5056 /* 5057 * We need to do this in case we fail at _any_ point during the 5058 * actual truncate. Once we do the truncate_setsize we could 5059 * invalidate pages which forces any outstanding ordered io to 5060 * be instantly completed which will give us extents that need 5061 * to be truncated. If we fail to get an orphan inode down we 5062 * could have left over extents that were never meant to live, 5063 * so we need to guarantee from this point on that everything 5064 * will be consistent. 5065 */ 5066 ret = btrfs_orphan_add(trans, inode); 5067 btrfs_end_transaction(trans); 5068 if (ret) 5069 return ret; 5070 5071 /* we don't support swapfiles, so vmtruncate shouldn't fail */ 5072 truncate_setsize(inode, newsize); 5073 5074 /* Disable nonlocked read DIO to avoid the end less truncate */ 5075 btrfs_inode_block_unlocked_dio(inode); 5076 inode_dio_wait(inode); 5077 btrfs_inode_resume_unlocked_dio(inode); 5078 5079 ret = btrfs_truncate(inode); 5080 if (ret && inode->i_nlink) { 5081 int err; 5082 5083 /* 5084 * failed to truncate, disk_i_size is only adjusted down 5085 * as we remove extents, so it should represent the true 5086 * size of the inode, so reset the in memory size and 5087 * delete our orphan entry. 
5088 */ 5089 trans = btrfs_join_transaction(root); 5090 if (IS_ERR(trans)) { 5091 btrfs_orphan_del(NULL, inode); 5092 return ret; 5093 } 5094 i_size_write(inode, BTRFS_I(inode)->disk_i_size); 5095 err = btrfs_orphan_del(trans, inode); 5096 if (err) 5097 btrfs_abort_transaction(trans, err); 5098 btrfs_end_transaction(trans); 5099 } 5100 } 5101 5102 return ret; 5103 } 5104 5105 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) 5106 { 5107 struct inode *inode = d_inode(dentry); 5108 struct btrfs_root *root = BTRFS_I(inode)->root; 5109 int err; 5110 5111 if (btrfs_root_readonly(root)) 5112 return -EROFS; 5113 5114 err = setattr_prepare(dentry, attr); 5115 if (err) 5116 return err; 5117 5118 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 5119 err = btrfs_setsize(inode, attr); 5120 if (err) 5121 return err; 5122 } 5123 5124 if (attr->ia_valid) { 5125 setattr_copy(inode, attr); 5126 inode_inc_iversion(inode); 5127 err = btrfs_dirty_inode(inode); 5128 5129 if (!err && attr->ia_valid & ATTR_MODE) 5130 err = posix_acl_chmod(inode, inode->i_mode); 5131 } 5132 5133 return err; 5134 } 5135 5136 /* 5137 * While truncating the inode pages during eviction, we get the VFS calling 5138 * btrfs_invalidatepage() against each page of the inode. This is slow because 5139 * the calls to btrfs_invalidatepage() result in a huge amount of calls to 5140 * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting 5141 * extent_state structures over and over, wasting lots of time. 5142 * 5143 * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all 5144 * those expensive operations on a per page basis and do only the ordered io 5145 * finishing, while we release here the extent_map and extent_state structures, 5146 * without the excessive merging and splitting. 5147 */ 5148 static void evict_inode_truncate_pages(struct inode *inode) 5149 { 5150 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5151 struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree; 5152 struct rb_node *node; 5153 5154 ASSERT(inode->i_state & I_FREEING); 5155 truncate_inode_pages_final(&inode->i_data); 5156 5157 write_lock(&map_tree->lock); 5158 while (!RB_EMPTY_ROOT(&map_tree->map)) { 5159 struct extent_map *em; 5160 5161 node = rb_first(&map_tree->map); 5162 em = rb_entry(node, struct extent_map, rb_node); 5163 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 5164 clear_bit(EXTENT_FLAG_LOGGING, &em->flags); 5165 remove_extent_mapping(map_tree, em); 5166 free_extent_map(em); 5167 if (need_resched()) { 5168 write_unlock(&map_tree->lock); 5169 cond_resched(); 5170 write_lock(&map_tree->lock); 5171 } 5172 } 5173 write_unlock(&map_tree->lock); 5174 5175 /* 5176 * Keep looping until we have no more ranges in the io tree. 5177 * We can have ongoing bios started by readpages (called from readahead) 5178 * that have their endio callback (extent_io.c:end_bio_extent_readpage) 5179 * still in progress (unlocked the pages in the bio but did not yet 5180 * unlocked the ranges in the io tree). Therefore this means some 5181 * ranges can still be locked and eviction started because before 5182 * submitting those bios, which are executed by a separate task (work 5183 * queue kthread), inode references (inode->i_count) were not taken 5184 * (which would be dropped in the end io callback of each bio). 
5185 * Therefore here we effectively end up waiting for those bios and 5186 * anyone else holding locked ranges without having bumped the inode's 5187 * reference count - if we don't do it, when they access the inode's 5188 * io_tree to unlock a range it may be too late, leading to an 5189 * use-after-free issue. 5190 */ 5191 spin_lock(&io_tree->lock); 5192 while (!RB_EMPTY_ROOT(&io_tree->state)) { 5193 struct extent_state *state; 5194 struct extent_state *cached_state = NULL; 5195 u64 start; 5196 u64 end; 5197 5198 node = rb_first(&io_tree->state); 5199 state = rb_entry(node, struct extent_state, rb_node); 5200 start = state->start; 5201 end = state->end; 5202 spin_unlock(&io_tree->lock); 5203 5204 lock_extent_bits(io_tree, start, end, &cached_state); 5205 5206 /* 5207 * If still has DELALLOC flag, the extent didn't reach disk, 5208 * and its reserved space won't be freed by delayed_ref. 5209 * So we need to free its reserved space here. 5210 * (Refer to comment in btrfs_invalidatepage, case 2) 5211 * 5212 * Note, end is the bytenr of last byte, so we need + 1 here. 5213 */ 5214 if (state->state & EXTENT_DELALLOC) 5215 btrfs_qgroup_free_data(inode, start, end - start + 1); 5216 5217 clear_extent_bit(io_tree, start, end, 5218 EXTENT_LOCKED | EXTENT_DIRTY | 5219 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | 5220 EXTENT_DEFRAG, 1, 1, 5221 &cached_state, GFP_NOFS); 5222 5223 cond_resched(); 5224 spin_lock(&io_tree->lock); 5225 } 5226 spin_unlock(&io_tree->lock); 5227 } 5228 5229 void btrfs_evict_inode(struct inode *inode) 5230 { 5231 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5232 struct btrfs_trans_handle *trans; 5233 struct btrfs_root *root = BTRFS_I(inode)->root; 5234 struct btrfs_block_rsv *rsv, *global_rsv; 5235 int steal_from_global = 0; 5236 u64 min_size; 5237 int ret; 5238 5239 trace_btrfs_inode_evict(inode); 5240 5241 if (!root) { 5242 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 5243 return; 5244 } 5245 5246 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1); 5247 5248 evict_inode_truncate_pages(inode); 5249 5250 if (inode->i_nlink && 5251 ((btrfs_root_refs(&root->root_item) != 0 && 5252 root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || 5253 btrfs_is_free_space_inode(inode))) 5254 goto no_delete; 5255 5256 if (is_bad_inode(inode)) { 5257 btrfs_orphan_del(NULL, inode); 5258 goto no_delete; 5259 } 5260 /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? 
*/ 5261 if (!special_file(inode->i_mode)) 5262 btrfs_wait_ordered_range(inode, 0, (u64)-1); 5263 5264 btrfs_free_io_failure_record(inode, 0, (u64)-1); 5265 5266 if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { 5267 BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 5268 &BTRFS_I(inode)->runtime_flags)); 5269 goto no_delete; 5270 } 5271 5272 if (inode->i_nlink > 0) { 5273 BUG_ON(btrfs_root_refs(&root->root_item) != 0 && 5274 root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID); 5275 goto no_delete; 5276 } 5277 5278 ret = btrfs_commit_inode_delayed_inode(inode); 5279 if (ret) { 5280 btrfs_orphan_del(NULL, inode); 5281 goto no_delete; 5282 } 5283 5284 rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); 5285 if (!rsv) { 5286 btrfs_orphan_del(NULL, inode); 5287 goto no_delete; 5288 } 5289 rsv->size = min_size; 5290 rsv->failfast = 1; 5291 global_rsv = &fs_info->global_block_rsv; 5292 5293 btrfs_i_size_write(inode, 0); 5294 5295 /* 5296 * This is a bit simpler than btrfs_truncate since we've already 5297 * reserved our space for our orphan item in the unlink, so we just 5298 * need to reserve some slack space in case we add bytes and update 5299 * inode item when doing the truncate. 5300 */ 5301 while (1) { 5302 ret = btrfs_block_rsv_refill(root, rsv, min_size, 5303 BTRFS_RESERVE_FLUSH_LIMIT); 5304 5305 /* 5306 * Try and steal from the global reserve since we will 5307 * likely not use this space anyway, we want to try as 5308 * hard as possible to get this to work. 5309 */ 5310 if (ret) 5311 steal_from_global++; 5312 else 5313 steal_from_global = 0; 5314 ret = 0; 5315 5316 /* 5317 * steal_from_global == 0: we reserved stuff, hooray! 5318 * steal_from_global == 1: we didn't reserve stuff, boo! 5319 * steal_from_global == 2: we've committed, still not a lot of 5320 * room but maybe we'll have room in the global reserve this 5321 * time. 5322 * steal_from_global == 3: abandon all hope! 5323 */ 5324 if (steal_from_global > 2) { 5325 btrfs_warn(fs_info, 5326 "Could not get space for a delete, will truncate on mount %d", 5327 ret); 5328 btrfs_orphan_del(NULL, inode); 5329 btrfs_free_block_rsv(fs_info, rsv); 5330 goto no_delete; 5331 } 5332 5333 trans = btrfs_join_transaction(root); 5334 if (IS_ERR(trans)) { 5335 btrfs_orphan_del(NULL, inode); 5336 btrfs_free_block_rsv(fs_info, rsv); 5337 goto no_delete; 5338 } 5339 5340 /* 5341 * We can't just steal from the global reserve, we need to make 5342 * sure there is room to do it, if not we need to commit and try 5343 * again. 5344 */ 5345 if (steal_from_global) { 5346 if (!btrfs_check_space_for_delayed_refs(trans, fs_info)) 5347 ret = btrfs_block_rsv_migrate(global_rsv, rsv, 5348 min_size, 0); 5349 else 5350 ret = -ENOSPC; 5351 } 5352 5353 /* 5354 * Couldn't steal from the global reserve, we have too much 5355 * pending stuff built up, commit the transaction and try it 5356 * again. 
5357 */ 5358 if (ret) { 5359 ret = btrfs_commit_transaction(trans); 5360 if (ret) { 5361 btrfs_orphan_del(NULL, inode); 5362 btrfs_free_block_rsv(fs_info, rsv); 5363 goto no_delete; 5364 } 5365 continue; 5366 } else { 5367 steal_from_global = 0; 5368 } 5369 5370 trans->block_rsv = rsv; 5371 5372 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 5373 if (ret != -ENOSPC && ret != -EAGAIN) 5374 break; 5375 5376 trans->block_rsv = &fs_info->trans_block_rsv; 5377 btrfs_end_transaction(trans); 5378 trans = NULL; 5379 btrfs_btree_balance_dirty(fs_info); 5380 } 5381 5382 btrfs_free_block_rsv(fs_info, rsv); 5383 5384 /* 5385 * Errors here aren't a big deal, it just means we leave orphan items 5386 * in the tree. They will be cleaned up on the next mount. 5387 */ 5388 if (ret == 0) { 5389 trans->block_rsv = root->orphan_block_rsv; 5390 btrfs_orphan_del(trans, inode); 5391 } else { 5392 btrfs_orphan_del(NULL, inode); 5393 } 5394 5395 trans->block_rsv = &fs_info->trans_block_rsv; 5396 if (!(root == fs_info->tree_root || 5397 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 5398 btrfs_return_ino(root, btrfs_ino(inode)); 5399 5400 btrfs_end_transaction(trans); 5401 btrfs_btree_balance_dirty(fs_info); 5402 no_delete: 5403 btrfs_remove_delayed_node(inode); 5404 clear_inode(inode); 5405 } 5406 5407 /* 5408 * this returns the key found in the dir entry in the location pointer. 5409 * If no dir entries were found, location->objectid is 0. 5410 */ 5411 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, 5412 struct btrfs_key *location) 5413 { 5414 const char *name = dentry->d_name.name; 5415 int namelen = dentry->d_name.len; 5416 struct btrfs_dir_item *di; 5417 struct btrfs_path *path; 5418 struct btrfs_root *root = BTRFS_I(dir)->root; 5419 int ret = 0; 5420 5421 path = btrfs_alloc_path(); 5422 if (!path) 5423 return -ENOMEM; 5424 5425 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name, 5426 namelen, 0); 5427 if (IS_ERR(di)) 5428 ret = PTR_ERR(di); 5429 5430 if (IS_ERR_OR_NULL(di)) 5431 goto out_err; 5432 5433 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); 5434 out: 5435 btrfs_free_path(path); 5436 return ret; 5437 out_err: 5438 location->objectid = 0; 5439 goto out; 5440 } 5441 5442 /* 5443 * when we hit a tree root in a directory, the btrfs part of the inode 5444 * needs to be changed to reflect the root directory of the tree root. This 5445 * is kind of like crossing a mount point. 
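 *
 * For example, when a lookup in a directory walks into a subvolume or
 * snapshot, the dir item stores the subvolume's ROOT_ITEM key.  The
 * helper below verifies the matching root ref and rewrites the
 * location to the INODE_ITEM of that subvolume's root directory
 * (btrfs_root_dirid()) in the subvolume's own tree, handing that tree
 * back through sub_root.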
5446 */ 5447 static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, 5448 struct inode *dir, 5449 struct dentry *dentry, 5450 struct btrfs_key *location, 5451 struct btrfs_root **sub_root) 5452 { 5453 struct btrfs_path *path; 5454 struct btrfs_root *new_root; 5455 struct btrfs_root_ref *ref; 5456 struct extent_buffer *leaf; 5457 struct btrfs_key key; 5458 int ret; 5459 int err = 0; 5460 5461 path = btrfs_alloc_path(); 5462 if (!path) { 5463 err = -ENOMEM; 5464 goto out; 5465 } 5466 5467 err = -ENOENT; 5468 key.objectid = BTRFS_I(dir)->root->root_key.objectid; 5469 key.type = BTRFS_ROOT_REF_KEY; 5470 key.offset = location->objectid; 5471 5472 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 5473 if (ret) { 5474 if (ret < 0) 5475 err = ret; 5476 goto out; 5477 } 5478 5479 leaf = path->nodes[0]; 5480 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); 5481 if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) || 5482 btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) 5483 goto out; 5484 5485 ret = memcmp_extent_buffer(leaf, dentry->d_name.name, 5486 (unsigned long)(ref + 1), 5487 dentry->d_name.len); 5488 if (ret) 5489 goto out; 5490 5491 btrfs_release_path(path); 5492 5493 new_root = btrfs_read_fs_root_no_name(fs_info, location); 5494 if (IS_ERR(new_root)) { 5495 err = PTR_ERR(new_root); 5496 goto out; 5497 } 5498 5499 *sub_root = new_root; 5500 location->objectid = btrfs_root_dirid(&new_root->root_item); 5501 location->type = BTRFS_INODE_ITEM_KEY; 5502 location->offset = 0; 5503 err = 0; 5504 out: 5505 btrfs_free_path(path); 5506 return err; 5507 } 5508 5509 static void inode_tree_add(struct inode *inode) 5510 { 5511 struct btrfs_root *root = BTRFS_I(inode)->root; 5512 struct btrfs_inode *entry; 5513 struct rb_node **p; 5514 struct rb_node *parent; 5515 struct rb_node *new = &BTRFS_I(inode)->rb_node; 5516 u64 ino = btrfs_ino(inode); 5517 5518 if (inode_unhashed(inode)) 5519 return; 5520 parent = NULL; 5521 spin_lock(&root->inode_lock); 5522 p = &root->inode_tree.rb_node; 5523 while (*p) { 5524 parent = *p; 5525 entry = rb_entry(parent, struct btrfs_inode, rb_node); 5526 5527 if (ino < btrfs_ino(&entry->vfs_inode)) 5528 p = &parent->rb_left; 5529 else if (ino > btrfs_ino(&entry->vfs_inode)) 5530 p = &parent->rb_right; 5531 else { 5532 WARN_ON(!(entry->vfs_inode.i_state & 5533 (I_WILL_FREE | I_FREEING))); 5534 rb_replace_node(parent, new, &root->inode_tree); 5535 RB_CLEAR_NODE(parent); 5536 spin_unlock(&root->inode_lock); 5537 return; 5538 } 5539 } 5540 rb_link_node(new, parent, p); 5541 rb_insert_color(new, &root->inode_tree); 5542 spin_unlock(&root->inode_lock); 5543 } 5544 5545 static void inode_tree_del(struct inode *inode) 5546 { 5547 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5548 struct btrfs_root *root = BTRFS_I(inode)->root; 5549 int empty = 0; 5550 5551 spin_lock(&root->inode_lock); 5552 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { 5553 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); 5554 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); 5555 empty = RB_EMPTY_ROOT(&root->inode_tree); 5556 } 5557 spin_unlock(&root->inode_lock); 5558 5559 if (empty && btrfs_root_refs(&root->root_item) == 0) { 5560 synchronize_srcu(&fs_info->subvol_srcu); 5561 spin_lock(&root->inode_lock); 5562 empty = RB_EMPTY_ROOT(&root->inode_tree); 5563 spin_unlock(&root->inode_lock); 5564 if (empty) 5565 btrfs_add_dead_root(root); 5566 } 5567 } 5568 5569 void btrfs_invalidate_inodes(struct btrfs_root *root) 5570 { 5571 struct btrfs_fs_info *fs_info 
= root->fs_info; 5572 struct rb_node *node; 5573 struct rb_node *prev; 5574 struct btrfs_inode *entry; 5575 struct inode *inode; 5576 u64 objectid = 0; 5577 5578 if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) 5579 WARN_ON(btrfs_root_refs(&root->root_item) != 0); 5580 5581 spin_lock(&root->inode_lock); 5582 again: 5583 node = root->inode_tree.rb_node; 5584 prev = NULL; 5585 while (node) { 5586 prev = node; 5587 entry = rb_entry(node, struct btrfs_inode, rb_node); 5588 5589 if (objectid < btrfs_ino(&entry->vfs_inode)) 5590 node = node->rb_left; 5591 else if (objectid > btrfs_ino(&entry->vfs_inode)) 5592 node = node->rb_right; 5593 else 5594 break; 5595 } 5596 if (!node) { 5597 while (prev) { 5598 entry = rb_entry(prev, struct btrfs_inode, rb_node); 5599 if (objectid <= btrfs_ino(&entry->vfs_inode)) { 5600 node = prev; 5601 break; 5602 } 5603 prev = rb_next(prev); 5604 } 5605 } 5606 while (node) { 5607 entry = rb_entry(node, struct btrfs_inode, rb_node); 5608 objectid = btrfs_ino(&entry->vfs_inode) + 1; 5609 inode = igrab(&entry->vfs_inode); 5610 if (inode) { 5611 spin_unlock(&root->inode_lock); 5612 if (atomic_read(&inode->i_count) > 1) 5613 d_prune_aliases(inode); 5614 /* 5615 * btrfs_drop_inode will have it removed from 5616 * the inode cache when its usage count 5617 * hits zero. 5618 */ 5619 iput(inode); 5620 cond_resched(); 5621 spin_lock(&root->inode_lock); 5622 goto again; 5623 } 5624 5625 if (cond_resched_lock(&root->inode_lock)) 5626 goto again; 5627 5628 node = rb_next(node); 5629 } 5630 spin_unlock(&root->inode_lock); 5631 } 5632 5633 static int btrfs_init_locked_inode(struct inode *inode, void *p) 5634 { 5635 struct btrfs_iget_args *args = p; 5636 inode->i_ino = args->location->objectid; 5637 memcpy(&BTRFS_I(inode)->location, args->location, 5638 sizeof(*args->location)); 5639 BTRFS_I(inode)->root = args->root; 5640 return 0; 5641 } 5642 5643 static int btrfs_find_actor(struct inode *inode, void *opaque) 5644 { 5645 struct btrfs_iget_args *args = opaque; 5646 return args->location->objectid == BTRFS_I(inode)->location.objectid && 5647 args->root == BTRFS_I(inode)->root; 5648 } 5649 5650 static struct inode *btrfs_iget_locked(struct super_block *s, 5651 struct btrfs_key *location, 5652 struct btrfs_root *root) 5653 { 5654 struct inode *inode; 5655 struct btrfs_iget_args args; 5656 unsigned long hashval = btrfs_inode_hash(location->objectid, root); 5657 5658 args.location = location; 5659 args.root = root; 5660 5661 inode = iget5_locked(s, hashval, btrfs_find_actor, 5662 btrfs_init_locked_inode, 5663 (void *)&args); 5664 return inode; 5665 } 5666 5667 /* Get an inode object given its location and corresponding root