1 /* 2 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 3 * All Rights Reserved. 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public License as 7 * published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope that it would be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write the Free Software Foundation, 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 */ 18 #include "xfs.h" 19 #include "xfs_shared.h" 20 #include "xfs_format.h" 21 #include "xfs_log_format.h" 22 #include "xfs_trans_resv.h" 23 #include "xfs_mount.h" 24 #include "xfs_inode.h" 25 #include "xfs_trans.h" 26 #include "xfs_inode_item.h" 27 #include "xfs_alloc.h" 28 #include "xfs_error.h" 29 #include "xfs_iomap.h" 30 #include "xfs_trace.h" 31 #include "xfs_bmap.h" 32 #include "xfs_bmap_util.h" 33 #include "xfs_bmap_btree.h" 34 #include <linux/gfp.h> 35 #include <linux/mpage.h> 36 #include <linux/pagevec.h> 37 #include <linux/writeback.h> 38 39 void 40 xfs_count_page_state( 41 struct page *page, 42 int *delalloc, 43 int *unwritten) 44 { 45 struct buffer_head *bh, *head; 46 47 *delalloc = *unwritten = 0; 48 49 bh = head = page_buffers(page); 50 do { 51 if (buffer_unwritten(bh)) 52 (*unwritten) = 1; 53 else if (buffer_delay(bh)) 54 (*delalloc) = 1; 55 } while ((bh = bh->b_this_page) != head); 56 } 57 58 STATIC struct block_device * 59 xfs_find_bdev_for_inode( 60 struct inode *inode) 61 { 62 struct xfs_inode *ip = XFS_I(inode); 63 struct xfs_mount *mp = ip->i_mount; 64 65 if (XFS_IS_REALTIME_INODE(ip)) 66 return mp->m_rtdev_targp->bt_bdev; 67 else 68 return mp->m_ddev_targp->bt_bdev; 69 } 70 71 /* 72 * We're now finished for good with this ioend structure. 73 * Update the page state via the associated buffer_heads, 74 * release holds on the inode and bio, and finally free 75 * up memory. Do not use the ioend after this. 76 */ 77 STATIC void 78 xfs_destroy_ioend( 79 xfs_ioend_t *ioend) 80 { 81 struct buffer_head *bh, *next; 82 83 for (bh = ioend->io_buffer_head; bh; bh = next) { 84 next = bh->b_private; 85 bh->b_end_io(bh, !ioend->io_error); 86 } 87 88 mempool_free(ioend, xfs_ioend_pool); 89 } 90 91 /* 92 * Fast and loose check if this write could update the on-disk inode size. 93 */ 94 static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) 95 { 96 return ioend->io_offset + ioend->io_size > 97 XFS_I(ioend->io_inode)->i_d.di_size; 98 } 99 100 STATIC int 101 xfs_setfilesize_trans_alloc( 102 struct xfs_ioend *ioend) 103 { 104 struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; 105 struct xfs_trans *tp; 106 int error; 107 108 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); 109 110 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); 111 if (error) { 112 xfs_trans_cancel(tp); 113 return error; 114 } 115 116 ioend->io_append_trans = tp; 117 118 /* 119 * We may pass freeze protection with a transaction. So tell lockdep 120 * we released it. 121 */ 122 rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], 123 1, _THIS_IP_); 124 /* 125 * We hand off the transaction to the completion thread now, so 126 * clear the flag here. 
 */
	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
	return 0;
}

/*
 * Update on-disk file size now that data has been written to disk.
 */
STATIC int
xfs_setfilesize(
	struct xfs_inode	*ip,
	struct xfs_trans	*tp,
	xfs_off_t		offset,
	size_t			size)
{
	xfs_fsize_t		isize;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	isize = xfs_new_eof(ip, offset + size);
	if (!isize) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		xfs_trans_cancel(tp);
		return 0;
	}

	trace_xfs_setfilesize(ip, offset, size);

	ip->i_d.di_size = isize;
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	return xfs_trans_commit(tp);
}

STATIC int
xfs_setfilesize_ioend(
	struct xfs_ioend	*ioend)
{
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	struct xfs_trans	*tp = ioend->io_append_trans;

	/*
	 * The transaction may have been allocated in the I/O submission thread,
	 * thus we need to mark ourselves as being in a transaction manually.
	 * Similarly for freeze protection.
	 */
	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
	rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
			   0, 1, _THIS_IP_);

	return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
}

/*
 * Schedule IO completion handling on the final put of an ioend.
 *
 * If there is no work to do we might as well call it a day and free the
 * ioend right now.
 */
STATIC void
xfs_finish_ioend(
	struct xfs_ioend	*ioend)
{
	if (atomic_dec_and_test(&ioend->io_remaining)) {
		struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;

		if (ioend->io_type == XFS_IO_UNWRITTEN)
			queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
		else if (ioend->io_append_trans)
			queue_work(mp->m_data_workqueue, &ioend->io_work);
		else
			xfs_destroy_ioend(ioend);
	}
}

/*
 * IO write completion.
 */
STATIC void
xfs_end_io(
	struct work_struct *work)
{
	xfs_ioend_t	*ioend = container_of(work, xfs_ioend_t, io_work);
	struct xfs_inode *ip = XFS_I(ioend->io_inode);
	int		error = 0;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		ioend->io_error = -EIO;
		goto done;
	}
	if (ioend->io_error)
		goto done;

	/*
	 * For unwritten extents we need to issue transactions to convert a
	 * range to normal written extents after the data I/O has finished.
	 */
	if (ioend->io_type == XFS_IO_UNWRITTEN) {
		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
						  ioend->io_size);
	} else if (ioend->io_append_trans) {
		error = xfs_setfilesize_ioend(ioend);
	} else {
		ASSERT(!xfs_ioend_is_append(ioend));
	}

done:
	if (error)
		ioend->io_error = error;
	xfs_destroy_ioend(ioend);
}

/*
 * Allocate and initialise an IO completion structure.
 * We need to track unwritten extent write completion here initially.
 * We'll need to extend this for updating the on-disk inode size later
 * (vs. in-core size).
 */
STATIC xfs_ioend_t *
xfs_alloc_ioend(
	struct inode		*inode,
	unsigned int		type)
{
	xfs_ioend_t		*ioend;

	ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);

	/*
	 * Set the count to 1 initially, which will prevent an I/O
	 * completion callback from happening before we have started
	 * all the I/O, and hence stops the completion routine from
	 * being called too early.
258 */ 259 atomic_set(&ioend->io_remaining, 1); 260 ioend->io_error = 0; 261 ioend->io_list = NULL; 262 ioend->io_type = type; 263 ioend->io_inode = inode; 264 ioend->io_buffer_head = NULL; 265 ioend->io_buffer_tail = NULL; 266 ioend->io_offset = 0; 267 ioend->io_size = 0; 268 ioend->io_append_trans = NULL; 269 270 INIT_WORK(&ioend->io_work, xfs_end_io); 271 return ioend; 272 } 273 274 STATIC int 275 xfs_map_blocks( 276 struct inode *inode, 277 loff_t offset, 278 struct xfs_bmbt_irec *imap, 279 int type, 280 int nonblocking) 281 { 282 struct xfs_inode *ip = XFS_I(inode); 283 struct xfs_mount *mp = ip->i_mount; 284 ssize_t count = 1 << inode->i_blkbits; 285 xfs_fileoff_t offset_fsb, end_fsb; 286 int error = 0; 287 int bmapi_flags = XFS_BMAPI_ENTIRE; 288 int nimaps = 1; 289 290 if (XFS_FORCED_SHUTDOWN(mp)) 291 return -EIO; 292 293 if (type == XFS_IO_UNWRITTEN) 294 bmapi_flags |= XFS_BMAPI_IGSTATE; 295 296 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { 297 if (nonblocking) 298 return -EAGAIN; 299 xfs_ilock(ip, XFS_ILOCK_SHARED); 300 } 301 302 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 303 (ip->i_df.if_flags & XFS_IFEXTENTS)); 304 ASSERT(offset <= mp->m_super->s_maxbytes); 305 306 if (offset + count > mp->m_super->s_maxbytes) 307 count = mp->m_super->s_maxbytes - offset; 308 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); 309 offset_fsb = XFS_B_TO_FSBT(mp, offset); 310 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, 311 imap, &nimaps, bmapi_flags); 312 xfs_iunlock(ip, XFS_ILOCK_SHARED); 313 314 if (error) 315 return error; 316 317 if (type == XFS_IO_DELALLOC && 318 (!nimaps || isnullstartblock(imap->br_startblock))) { 319 error = xfs_iomap_write_allocate(ip, offset, imap); 320 if (!error) 321 trace_xfs_map_blocks_alloc(ip, offset, count, type, imap); 322 return error; 323 } 324 325 #ifdef DEBUG 326 if (type == XFS_IO_UNWRITTEN) { 327 ASSERT(nimaps); 328 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 329 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 330 } 331 #endif 332 if (nimaps) 333 trace_xfs_map_blocks_found(ip, offset, count, type, imap); 334 return 0; 335 } 336 337 STATIC int 338 xfs_imap_valid( 339 struct inode *inode, 340 struct xfs_bmbt_irec *imap, 341 xfs_off_t offset) 342 { 343 offset >>= inode->i_blkbits; 344 345 return offset >= imap->br_startoff && 346 offset < imap->br_startoff + imap->br_blockcount; 347 } 348 349 /* 350 * BIO completion handler for buffered IO. 351 */ 352 STATIC void 353 xfs_end_bio( 354 struct bio *bio, 355 int error) 356 { 357 xfs_ioend_t *ioend = bio->bi_private; 358 359 if (!ioend->io_error && !test_bit(BIO_UPTODATE, &bio->bi_flags)) 360 ioend->io_error = error; 361 362 /* Toss bio and pass work off to an xfsdatad thread */ 363 bio->bi_private = NULL; 364 bio->bi_end_io = NULL; 365 bio_put(bio); 366 367 xfs_finish_ioend(ioend); 368 } 369 370 STATIC void 371 xfs_submit_ioend_bio( 372 struct writeback_control *wbc, 373 xfs_ioend_t *ioend, 374 struct bio *bio) 375 { 376 atomic_inc(&ioend->io_remaining); 377 bio->bi_private = ioend; 378 bio->bi_end_io = xfs_end_bio; 379 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 
		   WRITE_SYNC : WRITE, bio);
}

STATIC struct bio *
xfs_alloc_ioend_bio(
	struct buffer_head	*bh)
{
	int			nvecs = bio_get_nr_vecs(bh->b_bdev);
	struct bio		*bio = bio_alloc(GFP_NOIO, nvecs);

	ASSERT(bio->bi_private == NULL);
	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio->bi_bdev = bh->b_bdev;
	return bio;
}

STATIC void
xfs_start_buffer_writeback(
	struct buffer_head	*bh)
{
	ASSERT(buffer_mapped(bh));
	ASSERT(buffer_locked(bh));
	ASSERT(!buffer_delay(bh));
	ASSERT(!buffer_unwritten(bh));

	mark_buffer_async_write(bh);
	set_buffer_uptodate(bh);
	clear_buffer_dirty(bh);
}

STATIC void
xfs_start_page_writeback(
	struct page		*page,
	int			clear_dirty,
	int			buffers)
{
	ASSERT(PageLocked(page));
	ASSERT(!PageWriteback(page));

	/*
	 * If the page was not fully cleaned, we need to ensure that the higher
	 * layers come back to it correctly. That means we need to keep the page
	 * dirty, and for WB_SYNC_ALL writeback we need to ensure the
	 * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to
	 * write this page in this writeback sweep will be made.
	 */
	if (clear_dirty) {
		clear_page_dirty_for_io(page);
		set_page_writeback(page);
	} else
		set_page_writeback_keepwrite(page);

	unlock_page(page);

	/* If no buffers on the page are to be written, finish it here */
	if (!buffers)
		end_page_writeback(page);
}

static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
{
	return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
}

/*
 * Submit all of the bios for all of the ioends we have saved up, covering the
 * initial writepage page and also any probed pages.
 *
 * Because we may have multiple ioends spanning a page, we need to start
 * writeback on all the buffers before we submit them for I/O. If we mark the
 * buffers as we go, then we can end up with a page that only has buffers
 * marked async write, and I/O completion can occur before we mark the other
 * buffers async write.
 *
 * The end result of this is that we trip a bug in end_page_writeback() because
 * we call it twice for the one page as the code in end_buffer_async_write()
 * assumes that all buffers on the page are started at the same time.
 *
 * The fix is two passes across the ioend list - one to start writeback on the
 * buffer_heads, and then submit them for I/O on the second pass.
 *
 * If @fail is non-zero, it means that we have a situation where some part of
 * the submission process has failed after we have marked pages for writeback
 * and unlocked them. In this situation, we need to fail the ioend chain rather
 * than submit it to IO. This typically only happens on a filesystem shutdown.
464 */ 465 STATIC void 466 xfs_submit_ioend( 467 struct writeback_control *wbc, 468 xfs_ioend_t *ioend, 469 int fail) 470 { 471 xfs_ioend_t *head = ioend; 472 xfs_ioend_t *next; 473 struct buffer_head *bh; 474 struct bio *bio; 475 sector_t lastblock = 0; 476 477 /* Pass 1 - start writeback */ 478 do { 479 next = ioend->io_list; 480 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) 481 xfs_start_buffer_writeback(bh); 482 } while ((ioend = next) != NULL); 483 484 /* Pass 2 - submit I/O */ 485 ioend = head; 486 do { 487 next = ioend->io_list; 488 bio = NULL; 489 490 /* 491 * If we are failing the IO now, just mark the ioend with an 492 * error and finish it. This will run IO completion immediately 493 * as there is only one reference to the ioend at this point in 494 * time. 495 */ 496 if (fail) { 497 ioend->io_error = fail; 498 xfs_finish_ioend(ioend); 499 continue; 500 } 501 502 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { 503 504 if (!bio) { 505 retry: 506 bio = xfs_alloc_ioend_bio(bh); 507 } else if (bh->b_blocknr != lastblock + 1) { 508 xfs_submit_ioend_bio(wbc, ioend, bio); 509 goto retry; 510 } 511 512 if (xfs_bio_add_buffer(bio, bh) != bh->b_size) { 513 xfs_submit_ioend_bio(wbc, ioend, bio); 514 goto retry; 515 } 516 517 lastblock = bh->b_blocknr; 518 } 519 if (bio) 520 xfs_submit_ioend_bio(wbc, ioend, bio); 521 xfs_finish_ioend(ioend); 522 } while ((ioend = next) != NULL); 523 } 524 525 /* 526 * Cancel submission of all buffer_heads so far in this endio. 527 * Toss the endio too. Only ever called for the initial page 528 * in a writepage request, so only ever one page. 529 */ 530 STATIC void 531 xfs_cancel_ioend( 532 xfs_ioend_t *ioend) 533 { 534 xfs_ioend_t *next; 535 struct buffer_head *bh, *next_bh; 536 537 do { 538 next = ioend->io_list; 539 bh = ioend->io_buffer_head; 540 do { 541 next_bh = bh->b_private; 542 clear_buffer_async_write(bh); 543 /* 544 * The unwritten flag is cleared when added to the 545 * ioend. We're not submitting for I/O so mark the 546 * buffer unwritten again for next time around. 547 */ 548 if (ioend->io_type == XFS_IO_UNWRITTEN) 549 set_buffer_unwritten(bh); 550 unlock_buffer(bh); 551 } while ((bh = next_bh) != NULL); 552 553 mempool_free(ioend, xfs_ioend_pool); 554 } while ((ioend = next) != NULL); 555 } 556 557 /* 558 * Test to see if we've been building up a completion structure for 559 * earlier buffers -- if so, we try to append to this ioend if we 560 * can, otherwise we finish off any current ioend and start another. 561 * Return true if we've finished the given ioend. 
562 */ 563 STATIC void 564 xfs_add_to_ioend( 565 struct inode *inode, 566 struct buffer_head *bh, 567 xfs_off_t offset, 568 unsigned int type, 569 xfs_ioend_t **result, 570 int need_ioend) 571 { 572 xfs_ioend_t *ioend = *result; 573 574 if (!ioend || need_ioend || type != ioend->io_type) { 575 xfs_ioend_t *previous = *result; 576 577 ioend = xfs_alloc_ioend(inode, type); 578 ioend->io_offset = offset; 579 ioend->io_buffer_head = bh; 580 ioend->io_buffer_tail = bh; 581 if (previous) 582 previous->io_list = ioend; 583 *result = ioend; 584 } else { 585 ioend->io_buffer_tail->b_private = bh; 586 ioend->io_buffer_tail = bh; 587 } 588 589 bh->b_private = NULL; 590 ioend->io_size += bh->b_size; 591 } 592 593 STATIC void 594 xfs_map_buffer( 595 struct inode *inode, 596 struct buffer_head *bh, 597 struct xfs_bmbt_irec *imap, 598 xfs_off_t offset) 599 { 600 sector_t bn; 601 struct xfs_mount *m = XFS_I(inode)->i_mount; 602 xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff); 603 xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock); 604 605 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 606 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 607 608 bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) + 609 ((offset - iomap_offset) >> inode->i_blkbits); 610 611 ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode))); 612 613 bh->b_blocknr = bn; 614 set_buffer_mapped(bh); 615 } 616 617 STATIC void 618 xfs_map_at_offset( 619 struct inode *inode, 620 struct buffer_head *bh, 621 struct xfs_bmbt_irec *imap, 622 xfs_off_t offset) 623 { 624 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 625 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 626 627 xfs_map_buffer(inode, bh, imap, offset); 628 set_buffer_mapped(bh); 629 clear_buffer_delay(bh); 630 clear_buffer_unwritten(bh); 631 } 632 633 /* 634 * Test if a given page contains at least one buffer of a given @type. 635 * If @check_all_buffers is true, then we walk all the buffers in the page to 636 * try to find one of the type passed in. If it is not set, then the caller only 637 * needs to check the first buffer on the page for a match. 638 */ 639 STATIC bool 640 xfs_check_page_type( 641 struct page *page, 642 unsigned int type, 643 bool check_all_buffers) 644 { 645 struct buffer_head *bh; 646 struct buffer_head *head; 647 648 if (PageWriteback(page)) 649 return false; 650 if (!page->mapping) 651 return false; 652 if (!page_has_buffers(page)) 653 return false; 654 655 bh = head = page_buffers(page); 656 do { 657 if (buffer_unwritten(bh)) { 658 if (type == XFS_IO_UNWRITTEN) 659 return true; 660 } else if (buffer_delay(bh)) { 661 if (type == XFS_IO_DELALLOC) 662 return true; 663 } else if (buffer_dirty(bh) && buffer_mapped(bh)) { 664 if (type == XFS_IO_OVERWRITE) 665 return true; 666 } 667 668 /* If we are only checking the first buffer, we are done now. */ 669 if (!check_all_buffers) 670 break; 671 } while ((bh = bh->b_this_page) != head); 672 673 return false; 674 } 675 676 /* 677 * Allocate & map buffers for page given the extent map. Write it out. 678 * except for the original page of a writepage, this is called on 679 * delalloc/unwritten pages only, for the original page it is possible 680 * that the page has no mapping at all. 
681 */ 682 STATIC int 683 xfs_convert_page( 684 struct inode *inode, 685 struct page *page, 686 loff_t tindex, 687 struct xfs_bmbt_irec *imap, 688 xfs_ioend_t **ioendp, 689 struct writeback_control *wbc) 690 { 691 struct buffer_head *bh, *head; 692 xfs_off_t end_offset; 693 unsigned long p_offset; 694 unsigned int type; 695 int len, page_dirty; 696 int count = 0, done = 0, uptodate = 1; 697 xfs_off_t offset = page_offset(page); 698 699 if (page->index != tindex) 700 goto fail; 701 if (!trylock_page(page)) 702 goto fail; 703 if (PageWriteback(page)) 704 goto fail_unlock_page; 705 if (page->mapping != inode->i_mapping) 706 goto fail_unlock_page; 707 if (!xfs_check_page_type(page, (*ioendp)->io_type, false)) 708 goto fail_unlock_page; 709 710 /* 711 * page_dirty is initially a count of buffers on the page before 712 * EOF and is decremented as we move each into a cleanable state. 713 * 714 * Derivation: 715 * 716 * End offset is the highest offset that this page should represent. 717 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1)) 718 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and 719 * hence give us the correct page_dirty count. On any other page, 720 * it will be zero and in that case we need page_dirty to be the 721 * count of buffers on the page. 722 */ 723 end_offset = min_t(unsigned long long, 724 (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, 725 i_size_read(inode)); 726 727 /* 728 * If the current map does not span the entire page we are about to try 729 * to write, then give up. The only way we can write a page that spans 730 * multiple mappings in a single writeback iteration is via the 731 * xfs_vm_writepage() function. Data integrity writeback requires the 732 * entire page to be written in a single attempt, otherwise the part of 733 * the page we don't write here doesn't get written as part of the data 734 * integrity sync. 735 * 736 * For normal writeback, we also don't attempt to write partial pages 737 * here as it simply means that write_cache_pages() will see it under 738 * writeback and ignore the page until some point in the future, at 739 * which time this will be the only page in the file that needs 740 * writeback. Hence for more optimal IO patterns, we should always 741 * avoid partial page writeback due to multiple mappings on a page here. 742 */ 743 if (!xfs_imap_valid(inode, imap, end_offset)) 744 goto fail_unlock_page; 745 746 len = 1 << inode->i_blkbits; 747 p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), 748 PAGE_CACHE_SIZE); 749 p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; 750 page_dirty = p_offset / len; 751 752 /* 753 * The moment we find a buffer that doesn't match our current type 754 * specification or can't be written, abort the loop and start 755 * writeback. As per the above xfs_imap_valid() check, only 756 * xfs_vm_writepage() can handle partial page writeback fully - we are 757 * limited here to the buffers that are contiguous with the current 758 * ioend, and hence a buffer we can't write breaks that contiguity and 759 * we have to defer the rest of the IO to xfs_vm_writepage(). 
760 */ 761 bh = head = page_buffers(page); 762 do { 763 if (offset >= end_offset) 764 break; 765 if (!buffer_uptodate(bh)) 766 uptodate = 0; 767 if (!(PageUptodate(page) || buffer_uptodate(bh))) { 768 done = 1; 769 break; 770 } 771 772 if (buffer_unwritten(bh) || buffer_delay(bh) || 773 buffer_mapped(bh)) { 774 if (buffer_unwritten(bh)) 775 type = XFS_IO_UNWRITTEN; 776 else if (buffer_delay(bh)) 777 type = XFS_IO_DELALLOC; 778 else 779 type = XFS_IO_OVERWRITE; 780 781 /* 782 * imap should always be valid because of the above 783 * partial page end_offset check on the imap. 784 */ 785 ASSERT(xfs_imap_valid(inode, imap, offset)); 786 787 lock_buffer(bh); 788 if (type != XFS_IO_OVERWRITE) 789 xfs_map_at_offset(inode, bh, imap, offset); 790 xfs_add_to_ioend(inode, bh, offset, type, 791 ioendp, done); 792 793 page_dirty--; 794 count++; 795 } else { 796 done = 1; 797 break; 798 } 799 } while (offset += len, (bh = bh->b_this_page) != head); 800 801 if (uptodate && bh == head) 802 SetPageUptodate(page); 803 804 if (count) { 805 if (--wbc->nr_to_write <= 0 && 806 wbc->sync_mode == WB_SYNC_NONE) 807 done = 1; 808 } 809 xfs_start_page_writeback(page, !page_dirty, count); 810 811 return done; 812 fail_unlock_page: 813 unlock_page(page); 814 fail: 815 return 1; 816 } 817 818 /* 819 * Convert & write out a cluster of pages in the same extent as defined 820 * by mp and following the start page. 821 */ 822 STATIC void 823 xfs_cluster_write( 824 struct inode *inode, 825 pgoff_t tindex, 826 struct xfs_bmbt_irec *imap, 827 xfs_ioend_t **ioendp, 828 struct writeback_control *wbc, 829 pgoff_t tlast) 830 { 831 struct pagevec pvec; 832 int done = 0, i; 833 834 pagevec_init(&pvec, 0); 835 while (!done && tindex <= tlast) { 836 unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1); 837 838 if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len)) 839 break; 840 841 for (i = 0; i < pagevec_count(&pvec); i++) { 842 done = xfs_convert_page(inode, pvec.pages[i], tindex++, 843 imap, ioendp, wbc); 844 if (done) 845 break; 846 } 847 848 pagevec_release(&pvec); 849 cond_resched(); 850 } 851 } 852 853 STATIC void 854 xfs_vm_invalidatepage( 855 struct page *page, 856 unsigned int offset, 857 unsigned int length) 858 { 859 trace_xfs_invalidatepage(page->mapping->host, page, offset, 860 length); 861 block_invalidatepage(page, offset, length); 862 } 863 864 /* 865 * If the page has delalloc buffers on it, we need to punch them out before we 866 * invalidate the page. If we don't, we leave a stale delalloc mapping on the 867 * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read 868 * is done on that same region - the delalloc extent is returned when none is 869 * supposed to be there. 870 * 871 * We prevent this by truncating away the delalloc regions on the page before 872 * invalidating it. Because they are delalloc, we can do this without needing a 873 * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this 874 * truncation without a transaction as there is no space left for block 875 * reservation (typically why we see a ENOSPC in writeback). 876 * 877 * This is not a performance critical path, so for now just do the punching a 878 * buffer head at a time. 
879 */ 880 STATIC void 881 xfs_aops_discard_page( 882 struct page *page) 883 { 884 struct inode *inode = page->mapping->host; 885 struct xfs_inode *ip = XFS_I(inode); 886 struct buffer_head *bh, *head; 887 loff_t offset = page_offset(page); 888 889 if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true)) 890 goto out_invalidate; 891 892 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 893 goto out_invalidate; 894 895 xfs_alert(ip->i_mount, 896 "page discard on page %p, inode 0x%llx, offset %llu.", 897 page, ip->i_ino, offset); 898 899 xfs_ilock(ip, XFS_ILOCK_EXCL); 900 bh = head = page_buffers(page); 901 do { 902 int error; 903 xfs_fileoff_t start_fsb; 904 905 if (!buffer_delay(bh)) 906 goto next_buffer; 907 908 start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); 909 error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1); 910 if (error) { 911 /* something screwed, just bail */ 912 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 913 xfs_alert(ip->i_mount, 914 "page discard unable to remove delalloc mapping."); 915 } 916 break; 917 } 918 next_buffer: 919 offset += 1 << inode->i_blkbits; 920 921 } while ((bh = bh->b_this_page) != head); 922 923 xfs_iunlock(ip, XFS_ILOCK_EXCL); 924 out_invalidate: 925 xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE); 926 return; 927 } 928 929 /* 930 * Write out a dirty page. 931 * 932 * For delalloc space on the page we need to allocate space and flush it. 933 * For unwritten space on the page we need to start the conversion to 934 * regular allocated space. 935 * For any other dirty buffer heads on the page we should flush them. 936 */ 937 STATIC int 938 xfs_vm_writepage( 939 struct page *page, 940 struct writeback_control *wbc) 941 { 942 struct inode *inode = page->mapping->host; 943 struct buffer_head *bh, *head; 944 struct xfs_bmbt_irec imap; 945 xfs_ioend_t *ioend = NULL, *iohead = NULL; 946 loff_t offset; 947 unsigned int type; 948 __uint64_t end_offset; 949 pgoff_t end_index, last_index; 950 ssize_t len; 951 int err, imap_valid = 0, uptodate = 1; 952 int count = 0; 953 int nonblocking = 0; 954 955 trace_xfs_writepage(inode, page, 0, 0); 956 957 ASSERT(page_has_buffers(page)); 958 959 /* 960 * Refuse to write the page out if we are called from reclaim context. 961 * 962 * This avoids stack overflows when called from deeply used stacks in 963 * random callers for direct reclaim or memcg reclaim. We explicitly 964 * allow reclaim from kswapd as the stack usage there is relatively low. 965 * 966 * This should never happen except in the case of a VM regression so 967 * warn about it. 968 */ 969 if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == 970 PF_MEMALLOC)) 971 goto redirty; 972 973 /* 974 * Given that we do not allow direct reclaim to call us, we should 975 * never be called while in a filesystem transaction. 976 */ 977 if (WARN_ON_ONCE(current->flags & PF_FSTRANS)) 978 goto redirty; 979 980 /* Is this page beyond the end of the file? */ 981 offset = i_size_read(inode); 982 end_index = offset >> PAGE_CACHE_SHIFT; 983 last_index = (offset - 1) >> PAGE_CACHE_SHIFT; 984 985 /* 986 * The page index is less than the end_index, adjust the end_offset 987 * to the highest offset that this page should represent. 988 * ----------------------------------------------------- 989 * | file mapping | <EOF> | 990 * ----------------------------------------------------- 991 * | Page ... 
| Page N-2 | Page N-1 | Page N | | 992 * ^--------------------------------^----------|-------- 993 * | desired writeback range | see else | 994 * ---------------------------------^------------------| 995 */ 996 if (page->index < end_index) 997 end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT; 998 else { 999 /* 1000 * Check whether the page to write out is beyond or straddles 1001 * i_size or not. 1002 * ------------------------------------------------------- 1003 * | file mapping | <EOF> | 1004 * ------------------------------------------------------- 1005 * | Page ... | Page N-2 | Page N-1 | Page N | Beyond | 1006 * ^--------------------------------^-----------|--------- 1007 * | | Straddles | 1008 * ---------------------------------^-----------|--------| 1009 */ 1010 unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1); 1011 1012 /* 1013 * Skip the page if it is fully outside i_size, e.g. due to a 1014 * truncate operation that is in progress. We must redirty the 1015 * page so that reclaim stops reclaiming it. Otherwise 1016 * xfs_vm_releasepage() is called on it and gets confused. 1017 * 1018 * Note that the end_index is unsigned long, it would overflow 1019 * if the given offset is greater than 16TB on 32-bit system 1020 * and if we do check the page is fully outside i_size or not 1021 * via "if (page->index >= end_index + 1)" as "end_index + 1" 1022 * will be evaluated to 0. Hence this page will be redirtied 1023 * and be written out repeatedly which would result in an 1024 * infinite loop, the user program that perform this operation 1025 * will hang. Instead, we can verify this situation by checking 1026 * if the page to write is totally beyond the i_size or if it's 1027 * offset is just equal to the EOF. 1028 */ 1029 if (page->index > end_index || 1030 (page->index == end_index && offset_into_page == 0)) 1031 goto redirty; 1032 1033 /* 1034 * The page straddles i_size. It must be zeroed out on each 1035 * and every writepage invocation because it may be mmapped. 1036 * "A file is mapped in multiples of the page size. For a file 1037 * that is not a multiple of the page size, the remaining 1038 * memory is zeroed when mapped, and writes to that region are 1039 * not written out to the file." 1040 */ 1041 zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE); 1042 1043 /* Adjust the end_offset to the end of file */ 1044 end_offset = offset; 1045 } 1046 1047 len = 1 << inode->i_blkbits; 1048 1049 bh = head = page_buffers(page); 1050 offset = page_offset(page); 1051 type = XFS_IO_OVERWRITE; 1052 1053 if (wbc->sync_mode == WB_SYNC_NONE) 1054 nonblocking = 1; 1055 1056 do { 1057 int new_ioend = 0; 1058 1059 if (offset >= end_offset) 1060 break; 1061 if (!buffer_uptodate(bh)) 1062 uptodate = 0; 1063 1064 /* 1065 * set_page_dirty dirties all buffers in a page, independent 1066 * of their state. The dirty state however is entirely 1067 * meaningless for holes (!mapped && uptodate), so skip 1068 * buffers covering holes here. 
		 */
		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
			imap_valid = 0;
			continue;
		}

		if (buffer_unwritten(bh)) {
			if (type != XFS_IO_UNWRITTEN) {
				type = XFS_IO_UNWRITTEN;
				imap_valid = 0;
			}
		} else if (buffer_delay(bh)) {
			if (type != XFS_IO_DELALLOC) {
				type = XFS_IO_DELALLOC;
				imap_valid = 0;
			}
		} else if (buffer_uptodate(bh)) {
			if (type != XFS_IO_OVERWRITE) {
				type = XFS_IO_OVERWRITE;
				imap_valid = 0;
			}
		} else {
			if (PageUptodate(page))
				ASSERT(buffer_mapped(bh));
			/*
			 * This buffer is not uptodate and will not be
			 * written to disk. Ensure that we will put any
			 * subsequent writeable buffers into a new
			 * ioend.
			 */
			imap_valid = 0;
			continue;
		}

		if (imap_valid)
			imap_valid = xfs_imap_valid(inode, &imap, offset);
		if (!imap_valid) {
			/*
			 * If we didn't have a valid mapping then we need to
			 * put the new mapping into a separate ioend structure.
			 * This ensures non-contiguous extents always have
			 * separate ioends, which is particularly important
			 * for unwritten extent conversion at I/O completion
			 * time.
			 */
			new_ioend = 1;
			err = xfs_map_blocks(inode, offset, &imap, type,
					     nonblocking);
			if (err)
				goto error;
			imap_valid = xfs_imap_valid(inode, &imap, offset);
		}
		if (imap_valid) {
			lock_buffer(bh);
			if (type != XFS_IO_OVERWRITE)
				xfs_map_at_offset(inode, bh, &imap, offset);
			xfs_add_to_ioend(inode, bh, offset, type, &ioend,
					 new_ioend);
			count++;
		}

		if (!iohead)
			iohead = ioend;

	} while (offset += len, ((bh = bh->b_this_page) != head));

	if (uptodate && bh == head)
		SetPageUptodate(page);

	xfs_start_page_writeback(page, 1, count);

	/* if there is no IO to be submitted for this page, we are done */
	if (!ioend)
		return 0;

	ASSERT(iohead);

	/*
	 * Any errors from this point onwards need to be reported through the
	 * IO completion path as we have marked the initial page as under
	 * writeback and unlocked it.
	 */
	if (imap_valid) {
		xfs_off_t		end_index;

		end_index = imap.br_startoff + imap.br_blockcount;

		/* to bytes */
		end_index <<= inode->i_blkbits;

		/* to pages */
		end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;

		/* check against file size */
		if (end_index > last_index)
			end_index = last_index;

		xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
				  wbc, end_index);
	}

	/*
	 * Reserve log space if we might write beyond the on-disk inode size.
1173 */ 1174 err = 0; 1175 if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend)) 1176 err = xfs_setfilesize_trans_alloc(ioend); 1177 1178 xfs_submit_ioend(wbc, iohead, err); 1179 1180 return 0; 1181 1182 error: 1183 if (iohead) 1184 xfs_cancel_ioend(iohead); 1185 1186 if (err == -EAGAIN) 1187 goto redirty; 1188 1189 xfs_aops_discard_page(page); 1190 ClearPageUptodate(page); 1191 unlock_page(page); 1192 return err; 1193 1194 redirty: 1195 redirty_page_for_writepage(wbc, page); 1196 unlock_page(page); 1197 return 0; 1198 } 1199 1200 STATIC int 1201 xfs_vm_writepages( 1202 struct address_space *mapping, 1203 struct writeback_control *wbc) 1204 { 1205 xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); 1206 return generic_writepages(mapping, wbc); 1207 } 1208 1209 /* 1210 * Called to move a page into cleanable state - and from there 1211 * to be released. The page should already be clean. We always 1212 * have buffer heads in this call. 1213 * 1214 * Returns 1 if the page is ok to release, 0 otherwise. 1215 */ 1216 STATIC int 1217 xfs_vm_releasepage( 1218 struct page *page, 1219 gfp_t gfp_mask) 1220 { 1221 int delalloc, unwritten; 1222 1223 trace_xfs_releasepage(page->mapping->host, page, 0, 0); 1224 1225 xfs_count_page_state(page, &delalloc, &unwritten); 1226 1227 if (WARN_ON_ONCE(delalloc)) 1228 return 0; 1229 if (WARN_ON_ONCE(unwritten)) 1230 return 0; 1231 1232 return try_to_free_buffers(page); 1233 } 1234 1235 /* 1236 * When we map a DIO buffer, we may need to attach an ioend that describes the 1237 * type of write IO we are doing. This passes to the completion function the 1238 * operations it needs to perform. If the mapping is for an overwrite wholly 1239 * within the EOF then we don't need an ioend and so we don't allocate one. 1240 * This avoids the unnecessary overhead of allocating and freeing ioends for 1241 * workloads that don't require transactions on IO completion. 1242 * 1243 * If we get multiple mappings in a single IO, we might be mapping different 1244 * types. But because the direct IO can only have a single private pointer, we 1245 * need to ensure that: 1246 * 1247 * a) i) the ioend spans the entire region of unwritten mappings; or 1248 * ii) the ioend spans all the mappings that cross or are beyond EOF; and 1249 * b) if it contains unwritten extents, it is *permanently* marked as such 1250 * 1251 * We could do this by chaining ioends like buffered IO does, but we only 1252 * actually get one IO completion callback from the direct IO, and that spans 1253 * the entire IO regardless of how many mappings and IOs are needed to complete 1254 * the DIO. There is only going to be one reference to the ioend and its life 1255 * cycle is constrained by the DIO completion code. hence we don't need 1256 * reference counting here. 
1257 */ 1258 static void 1259 xfs_map_direct( 1260 struct inode *inode, 1261 struct buffer_head *bh_result, 1262 struct xfs_bmbt_irec *imap, 1263 xfs_off_t offset) 1264 { 1265 struct xfs_ioend *ioend; 1266 xfs_off_t size = bh_result->b_size; 1267 int type; 1268 1269 if (ISUNWRITTEN(imap)) 1270 type = XFS_IO_UNWRITTEN; 1271 else 1272 type = XFS_IO_OVERWRITE; 1273 1274 trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap); 1275 1276 if (bh_result->b_private) { 1277 ioend = bh_result->b_private; 1278 ASSERT(ioend->io_size > 0); 1279 ASSERT(offset >= ioend->io_offset); 1280 if (offset + size > ioend->io_offset + ioend->io_size) 1281 ioend->io_size = offset - ioend->io_offset + size; 1282 1283 if (type == XFS_IO_UNWRITTEN && type != ioend->io_type) 1284 ioend->io_type = XFS_IO_UNWRITTEN; 1285 1286 trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset, 1287 ioend->io_size, ioend->io_type, 1288 imap); 1289 } else if (type == XFS_IO_UNWRITTEN || 1290 offset + size > i_size_read(inode)) { 1291 ioend = xfs_alloc_ioend(inode, type); 1292 ioend->io_offset = offset; 1293 ioend->io_size = size; 1294 1295 bh_result->b_private = ioend; 1296 set_buffer_defer_completion(bh_result); 1297 1298 trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type, 1299 imap); 1300 } else { 1301 trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type, 1302 imap); 1303 } 1304 } 1305 1306 /* 1307 * If this is O_DIRECT or the mpage code calling tell them how large the mapping 1308 * is, so that we can avoid repeated get_blocks calls. 1309 * 1310 * If the mapping spans EOF, then we have to break the mapping up as the mapping 1311 * for blocks beyond EOF must be marked new so that sub block regions can be 1312 * correctly zeroed. We can't do this for mappings within EOF unless the mapping 1313 * was just allocated or is unwritten, otherwise the callers would overwrite 1314 * existing data with zeros. Hence we have to split the mapping into a range up 1315 * to and including EOF, and a second mapping for beyond EOF. 
1316 */ 1317 static void 1318 xfs_map_trim_size( 1319 struct inode *inode, 1320 sector_t iblock, 1321 struct buffer_head *bh_result, 1322 struct xfs_bmbt_irec *imap, 1323 xfs_off_t offset, 1324 ssize_t size) 1325 { 1326 xfs_off_t mapping_size; 1327 1328 mapping_size = imap->br_startoff + imap->br_blockcount - iblock; 1329 mapping_size <<= inode->i_blkbits; 1330 1331 ASSERT(mapping_size > 0); 1332 if (mapping_size > size) 1333 mapping_size = size; 1334 if (offset < i_size_read(inode) && 1335 offset + mapping_size >= i_size_read(inode)) { 1336 /* limit mapping to block that spans EOF */ 1337 mapping_size = roundup_64(i_size_read(inode) - offset, 1338 1 << inode->i_blkbits); 1339 } 1340 if (mapping_size > LONG_MAX) 1341 mapping_size = LONG_MAX; 1342 1343 bh_result->b_size = mapping_size; 1344 } 1345 1346 STATIC int 1347 __xfs_get_blocks( 1348 struct inode *inode, 1349 sector_t iblock, 1350 struct buffer_head *bh_result, 1351 int create, 1352 bool direct) 1353 { 1354 struct xfs_inode *ip = XFS_I(inode); 1355 struct xfs_mount *mp = ip->i_mount; 1356 xfs_fileoff_t offset_fsb, end_fsb; 1357 int error = 0; 1358 int lockmode = 0; 1359 struct xfs_bmbt_irec imap; 1360 int nimaps = 1; 1361 xfs_off_t offset; 1362 ssize_t size; 1363 int new = 0; 1364 1365 if (XFS_FORCED_SHUTDOWN(mp)) 1366 return -EIO; 1367 1368 offset = (xfs_off_t)iblock << inode->i_blkbits; 1369 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); 1370 size = bh_result->b_size; 1371 1372 if (!create && direct && offset >= i_size_read(inode)) 1373 return 0; 1374 1375 /* 1376 * Direct I/O is usually done on preallocated files, so try getting 1377 * a block mapping without an exclusive lock first. For buffered 1378 * writes we already have the exclusive iolock anyway, so avoiding 1379 * a lock roundtrip here by taking the ilock exclusive from the 1380 * beginning is a useful micro optimization. 1381 */ 1382 if (create && !direct) { 1383 lockmode = XFS_ILOCK_EXCL; 1384 xfs_ilock(ip, lockmode); 1385 } else { 1386 lockmode = xfs_ilock_data_map_shared(ip); 1387 } 1388 1389 ASSERT(offset <= mp->m_super->s_maxbytes); 1390 if (offset + size > mp->m_super->s_maxbytes) 1391 size = mp->m_super->s_maxbytes - offset; 1392 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); 1393 offset_fsb = XFS_B_TO_FSBT(mp, offset); 1394 1395 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, 1396 &imap, &nimaps, XFS_BMAPI_ENTIRE); 1397 if (error) 1398 goto out_unlock; 1399 1400 if (create && 1401 (!nimaps || 1402 (imap.br_startblock == HOLESTARTBLOCK || 1403 imap.br_startblock == DELAYSTARTBLOCK))) { 1404 if (direct || xfs_get_extsz_hint(ip)) { 1405 /* 1406 * Drop the ilock in preparation for starting the block 1407 * allocation transaction. It will be retaken 1408 * exclusively inside xfs_iomap_write_direct for the 1409 * actual allocation. 1410 */ 1411 xfs_iunlock(ip, lockmode); 1412 error = xfs_iomap_write_direct(ip, offset, size, 1413 &imap, nimaps); 1414 if (error) 1415 return error; 1416 new = 1; 1417 1418 } else { 1419 /* 1420 * Delalloc reservations do not require a transaction, 1421 * we can go on without dropping the lock here. If we 1422 * are allocating a new delalloc block, make sure that 1423 * we set the new flag so that we mark the buffer new so 1424 * that we know that it is newly allocated if the write 1425 * fails. 
1426 */ 1427 if (nimaps && imap.br_startblock == HOLESTARTBLOCK) 1428 new = 1; 1429 error = xfs_iomap_write_delay(ip, offset, size, &imap); 1430 if (error) 1431 goto out_unlock; 1432 1433 xfs_iunlock(ip, lockmode); 1434 } 1435 trace_xfs_get_blocks_alloc(ip, offset, size, 1436 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN 1437 : XFS_IO_DELALLOC, &imap); 1438 } else if (nimaps) { 1439 trace_xfs_get_blocks_found(ip, offset, size, 1440 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN 1441 : XFS_IO_OVERWRITE, &imap); 1442 xfs_iunlock(ip, lockmode); 1443 } else { 1444 trace_xfs_get_blocks_notfound(ip, offset, size); 1445 goto out_unlock; 1446 } 1447 1448 /* trim mapping down to size requested */ 1449 if (direct || size > (1 << inode->i_blkbits)) 1450 xfs_map_trim_size(inode, iblock, bh_result, 1451 &imap, offset, size); 1452 1453 /* 1454 * For unwritten extents do not report a disk address in the buffered 1455 * read case (treat as if we're reading into a hole). 1456 */ 1457 if (imap.br_startblock != HOLESTARTBLOCK && 1458 imap.br_startblock != DELAYSTARTBLOCK && 1459 (create || !ISUNWRITTEN(&imap))) { 1460 xfs_map_buffer(inode, bh_result, &imap, offset); 1461 if (ISUNWRITTEN(&imap)) 1462 set_buffer_unwritten(bh_result); 1463 /* direct IO needs special help */ 1464 if (create && direct) 1465 xfs_map_direct(inode, bh_result, &imap, offset); 1466 } 1467 1468 /* 1469 * If this is a realtime file, data may be on a different device. 1470 * to that pointed to from the buffer_head b_bdev currently. 1471 */ 1472 bh_result->b_bdev = xfs_find_bdev_for_inode(inode); 1473 1474 /* 1475 * If we previously allocated a block out beyond eof and we are now 1476 * coming back to use it then we will need to flag it as new even if it 1477 * has a disk address. 1478 * 1479 * With sub-block writes into unwritten extents we also need to mark 1480 * the buffer as new so that the unwritten parts of the buffer gets 1481 * correctly zeroed. 1482 */ 1483 if (create && 1484 ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || 1485 (offset >= i_size_read(inode)) || 1486 (new || ISUNWRITTEN(&imap)))) 1487 set_buffer_new(bh_result); 1488 1489 if (imap.br_startblock == DELAYSTARTBLOCK) { 1490 BUG_ON(direct); 1491 if (create) { 1492 set_buffer_uptodate(bh_result); 1493 set_buffer_mapped(bh_result); 1494 set_buffer_delay(bh_result); 1495 } 1496 } 1497 1498 return 0; 1499 1500 out_unlock: 1501 xfs_iunlock(ip, lockmode); 1502 return error; 1503 } 1504 1505 int 1506 xfs_get_blocks( 1507 struct inode *inode, 1508 sector_t iblock, 1509 struct buffer_head *bh_result, 1510 int create) 1511 { 1512 return __xfs_get_blocks(inode, iblock, bh_result, create, false); 1513 } 1514 1515 int 1516 xfs_get_blocks_direct( 1517 struct inode *inode, 1518 sector_t iblock, 1519 struct buffer_head *bh_result, 1520 int create) 1521 { 1522 return __xfs_get_blocks(inode, iblock, bh_result, create, true); 1523 } 1524 1525 static void 1526 __xfs_end_io_direct_write( 1527 struct inode *inode, 1528 struct xfs_ioend *ioend, 1529 loff_t offset, 1530 ssize_t size) 1531 { 1532 struct xfs_mount *mp = XFS_I(inode)->i_mount; 1533 1534 if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error) 1535 goto out_end_io; 1536 1537 /* 1538 * dio completion end_io functions are only called on writes if more 1539 * than 0 bytes was written. 1540 */ 1541 ASSERT(size > 0); 1542 1543 /* 1544 * The ioend only maps whole blocks, while the IO may be sector aligned. 1545 * Hence the ioend offset/size may not match the IO offset/size exactly. 
1546 * Because we don't map overwrites within EOF into the ioend, the offset 1547 * may not match, but only if the endio spans EOF. Either way, write 1548 * the IO sizes into the ioend so that completion processing does the 1549 * right thing. 1550 */ 1551 ASSERT(offset + size <= ioend->io_offset + ioend->io_size); 1552 ioend->io_size = size; 1553 ioend->io_offset = offset; 1554 1555 /* 1556 * The ioend tells us whether we are doing unwritten extent conversion 1557 * or an append transaction that updates the on-disk file size. These 1558 * cases are the only cases where we should *potentially* be needing 1559 * to update the VFS inode size. 1560 * 1561 * We need to update the in-core inode size here so that we don't end up 1562 * with the on-disk inode size being outside the in-core inode size. We 1563 * have no other method of updating EOF for AIO, so always do it here 1564 * if necessary. 1565 * 1566 * We need to lock the test/set EOF update as we can be racing with 1567 * other IO completions here to update the EOF. Failing to serialise 1568 * here can result in EOF moving backwards and Bad Things Happen when 1569 * that occurs. 1570 */ 1571 spin_lock(&XFS_I(inode)->i_flags_lock); 1572 if (offset + size > i_size_read(inode)) 1573 i_size_write(inode, offset + size); 1574 spin_unlock(&XFS_I(inode)->i_flags_lock); 1575 1576 /* 1577 * If we are doing an append IO that needs to update the EOF on disk, 1578 * do the transaction reserve now so we can use common end io 1579 * processing. Stashing the error (if there is one) in the ioend will 1580 * result in the ioend processing passing on the error if it is 1581 * possible as we can't return it from here. 1582 */ 1583 if (ioend->io_type == XFS_IO_OVERWRITE) 1584 ioend->io_error = xfs_setfilesize_trans_alloc(ioend); 1585 1586 out_end_io: 1587 xfs_end_io(&ioend->io_work); 1588 return; 1589 } 1590 1591 /* 1592 * Complete a direct I/O write request. 1593 * 1594 * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. 1595 * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite 1596 * wholly within the EOF and so there is nothing for us to do. Note that in this 1597 * case the completion can be called in interrupt context, whereas if we have an 1598 * ioend we will always be called in task context (i.e. from a workqueue). 1599 */ 1600 STATIC void 1601 xfs_end_io_direct_write( 1602 struct kiocb *iocb, 1603 loff_t offset, 1604 ssize_t size, 1605 void *private) 1606 { 1607 struct inode *inode = file_inode(iocb->ki_filp); 1608 struct xfs_ioend *ioend = private; 1609 1610 trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size, 1611 ioend ? ioend->io_type : 0, NULL); 1612 1613 if (!ioend) { 1614 ASSERT(offset + size <= i_size_read(inode)); 1615 return; 1616 } 1617 1618 __xfs_end_io_direct_write(inode, ioend, offset, size); 1619 } 1620 1621 /* 1622 * For DAX we need a mapping buffer callback for unwritten extent conversion 1623 * when page faults allocate blocks and then zero them. Note that in this 1624 * case the mapping indicated by the ioend may extend beyond EOF. We most 1625 * definitely do not want to extend EOF here, so we trim back the ioend size to 1626 * EOF. 
1627 */ 1628 #ifdef CONFIG_FS_DAX 1629 void 1630 xfs_end_io_dax_write( 1631 struct buffer_head *bh, 1632 int uptodate) 1633 { 1634 struct xfs_ioend *ioend = bh->b_private; 1635 struct inode *inode = ioend->io_inode; 1636 ssize_t size = ioend->io_size; 1637 1638 ASSERT(IS_DAX(ioend->io_inode)); 1639 1640 /* if there was an error zeroing, then don't convert it */ 1641 if (!uptodate) 1642 ioend->io_error = -EIO; 1643 1644 /* 1645 * Trim update to EOF, so we don't extend EOF during unwritten extent 1646 * conversion of partial EOF blocks. 1647 */ 1648 spin_lock(&XFS_I(inode)->i_flags_lock); 1649 if (ioend->io_offset + size > i_size_read(inode)) 1650 size = i_size_read(inode) - ioend->io_offset; 1651 spin_unlock(&XFS_I(inode)->i_flags_lock); 1652 1653 __xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size); 1654 1655 } 1656 #else 1657 void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { } 1658 #endif 1659 1660 static inline ssize_t 1661 xfs_vm_do_dio( 1662 struct inode *inode, 1663 struct kiocb *iocb, 1664 struct iov_iter *iter, 1665 loff_t offset, 1666 void (*endio)(struct kiocb *iocb, 1667 loff_t offset, 1668 ssize_t size, 1669 void *private), 1670 int flags) 1671 { 1672 struct block_device *bdev; 1673 1674 if (IS_DAX(inode)) 1675 return dax_do_io(iocb, inode, iter, offset, 1676 xfs_get_blocks_direct, endio, 0); 1677 1678 bdev = xfs_find_bdev_for_inode(inode); 1679 return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, 1680 xfs_get_blocks_direct, endio, NULL, flags); 1681 } 1682 1683 STATIC ssize_t 1684 xfs_vm_direct_IO( 1685 struct kiocb *iocb, 1686 struct iov_iter *iter, 1687 loff_t offset) 1688 { 1689 struct inode *inode = iocb->ki_filp->f_mapping->host; 1690 1691 if (iov_iter_rw(iter) == WRITE) 1692 return xfs_vm_do_dio(inode, iocb, iter, offset, 1693 xfs_end_io_direct_write, DIO_ASYNC_EXTEND); 1694 return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0); 1695 } 1696 1697 /* 1698 * Punch out the delalloc blocks we have already allocated. 1699 * 1700 * Don't bother with xfs_setattr given that nothing can have made it to disk yet 1701 * as the page is still locked at this point. 1702 */ 1703 STATIC void 1704 xfs_vm_kill_delalloc_range( 1705 struct inode *inode, 1706 loff_t start, 1707 loff_t end) 1708 { 1709 struct xfs_inode *ip = XFS_I(inode); 1710 xfs_fileoff_t start_fsb; 1711 xfs_fileoff_t end_fsb; 1712 int error; 1713 1714 start_fsb = XFS_B_TO_FSB(ip->i_mount, start); 1715 end_fsb = XFS_B_TO_FSB(ip->i_mount, end); 1716 if (end_fsb <= start_fsb) 1717 return; 1718 1719 xfs_ilock(ip, XFS_ILOCK_EXCL); 1720 error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1721 end_fsb - start_fsb); 1722 if (error) { 1723 /* something screwed, just bail */ 1724 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 1725 xfs_alert(ip->i_mount, 1726 "xfs_vm_write_failed: unable to clean up ino %lld", 1727 ip->i_ino); 1728 } 1729 } 1730 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1731 } 1732 1733 STATIC void 1734 xfs_vm_write_failed( 1735 struct inode *inode, 1736 struct page *page, 1737 loff_t pos, 1738 unsigned len) 1739 { 1740 loff_t block_offset; 1741 loff_t block_start; 1742 loff_t block_end; 1743 loff_t from = pos & (PAGE_CACHE_SIZE - 1); 1744 loff_t to = from + len; 1745 struct buffer_head *bh, *head; 1746 1747 /* 1748 * The request pos offset might be 32 or 64 bit, this is all fine 1749 * on 64-bit platform. 
However, for 64-bit pos request on 32-bit 1750 * platform, the high 32-bit will be masked off if we evaluate the 1751 * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is 1752 * 0xfffff000 as an unsigned long, hence the result is incorrect 1753 * which could cause the following ASSERT failed in most cases. 1754 * In order to avoid this, we can evaluate the block_offset of the 1755 * start of the page by using shifts rather than masks the mismatch 1756 * problem. 1757 */ 1758 block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT; 1759 1760 ASSERT(block_offset + from == pos); 1761 1762 head = page_buffers(page); 1763 block_start = 0; 1764 for (bh = head; bh != head || !block_start; 1765 bh = bh->b_this_page, block_start = block_end, 1766 block_offset += bh->b_size) { 1767 block_end = block_start + bh->b_size; 1768 1769 /* skip buffers before the write */ 1770 if (block_end <= from) 1771 continue; 1772 1773 /* if the buffer is after the write, we're done */ 1774 if (block_start >= to) 1775 break; 1776 1777 if (!buffer_delay(bh)) 1778 continue; 1779 1780 if (!buffer_new(bh) && block_offset < i_size_read(inode)) 1781 continue; 1782 1783 xfs_vm_kill_delalloc_range(inode, block_offset, 1784 block_offset + bh->b_size); 1785 1786 /* 1787 * This buffer does not contain data anymore. make sure anyone 1788 * who finds it knows that for certain. 1789 */ 1790 clear_buffer_delay(bh); 1791 clear_buffer_uptodate(bh); 1792 clear_buffer_mapped(bh); 1793 clear_buffer_new(bh); 1794 clear_buffer_dirty(bh); 1795 } 1796 1797 } 1798 1799 /* 1800 * This used to call block_write_begin(), but it unlocks and releases the page 1801 * on error, and we need that page to be able to punch stale delalloc blocks out 1802 * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at 1803 * the appropriate point. 1804 */ 1805 STATIC int 1806 xfs_vm_write_begin( 1807 struct file *file, 1808 struct address_space *mapping, 1809 loff_t pos, 1810 unsigned len, 1811 unsigned flags, 1812 struct page **pagep, 1813 void **fsdata) 1814 { 1815 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 1816 struct page *page; 1817 int status; 1818 1819 ASSERT(len <= PAGE_CACHE_SIZE); 1820 1821 page = grab_cache_page_write_begin(mapping, index, flags); 1822 if (!page) 1823 return -ENOMEM; 1824 1825 status = __block_write_begin(page, pos, len, xfs_get_blocks); 1826 if (unlikely(status)) { 1827 struct inode *inode = mapping->host; 1828 size_t isize = i_size_read(inode); 1829 1830 xfs_vm_write_failed(inode, page, pos, len); 1831 unlock_page(page); 1832 1833 /* 1834 * If the write is beyond EOF, we only want to kill blocks 1835 * allocated in this write, not blocks that were previously 1836 * written successfully. 1837 */ 1838 if (pos + len > isize) { 1839 ssize_t start = max_t(ssize_t, pos, isize); 1840 1841 truncate_pagecache_range(inode, start, pos + len); 1842 } 1843 1844 page_cache_release(page); 1845 page = NULL; 1846 } 1847 1848 *pagep = page; 1849 return status; 1850 } 1851 1852 /* 1853 * On failure, we only need to kill delalloc blocks beyond EOF in the range of 1854 * this specific write because they will never be written. Previous writes 1855 * beyond EOF where block allocation succeeded do not need to be trashed, so 1856 * only new blocks from this write should be trashed. For blocks within 1857 * EOF, generic_write_end() zeros them so they are safe to leave alone and be 1858 * written with all the other valid data. 
 */
STATIC int
xfs_vm_write_end(
	struct file		*file,
	struct address_space	*mapping,
	loff_t			pos,
	unsigned		len,
	unsigned		copied,
	struct page		*page,
	void			*fsdata)
{
	int			ret;

	ASSERT(len <= PAGE_CACHE_SIZE);

	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
	if (unlikely(ret < len)) {
		struct inode	*inode = mapping->host;
		size_t		isize = i_size_read(inode);
		loff_t		to = pos + len;

		if (to > isize) {
			/* only kill blocks in this write beyond EOF */
			if (pos > isize)
				isize = pos;
			xfs_vm_kill_delalloc_range(inode, isize, to);
			truncate_pagecache_range(inode, isize, to);
		}
	}
	return ret;
}

STATIC sector_t
xfs_vm_bmap(
	struct address_space	*mapping,
	sector_t		block)
{
	struct inode		*inode = (struct inode *)mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);

	trace_xfs_vm_bmap(XFS_I(inode));
	xfs_ilock(ip, XFS_IOLOCK_SHARED);
	filemap_write_and_wait(mapping);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	return generic_block_bmap(mapping, block, xfs_get_blocks);
}

STATIC int
xfs_vm_readpage(
	struct file		*unused,
	struct page		*page)
{
	return mpage_readpage(page, xfs_get_blocks);
}

STATIC int
xfs_vm_readpages(
	struct file		*unused,
	struct address_space	*mapping,
	struct list_head	*pages,
	unsigned		nr_pages)
{
	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
}

/*
 * This is basically a copy of __set_page_dirty_buffers() with one
 * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
 * dirty, we'll never be able to clean them because we don't write buffers
 * beyond EOF, and that means we can't invalidate pages that span EOF
 * that have been marked dirty. Further, the dirty state can leak into
 * the file interior if the file is extended, resulting in all sorts of
 * bad things happening as the state does not match the underlying data.
 *
 * XXX: this really indicates that bufferheads in XFS need to die. Warts like
 * this only exist because of bufferheads and how the generic code manages them.
 */
STATIC int
xfs_vm_set_page_dirty(
	struct page		*page)
{
	struct address_space	*mapping = page->mapping;
	struct inode		*inode = mapping->host;
	loff_t			end_offset;
	loff_t			offset;
	int			newly_dirty;
	struct mem_cgroup	*memcg;

	if (unlikely(!mapping))
		return !TestSetPageDirty(page);

	end_offset = i_size_read(inode);
	offset = page_offset(page);

	spin_lock(&mapping->private_lock);
	if (page_has_buffers(page)) {
		struct buffer_head *head = page_buffers(page);
		struct buffer_head *bh = head;

		do {
			if (offset < end_offset)
				set_buffer_dirty(bh);
			bh = bh->b_this_page;
			offset += 1 << inode->i_blkbits;
		} while (bh != head);
	}
	/*
	 * Use mem_cgroup_begin_page_stat() to keep PageDirty synchronized with
	 * per-memcg dirty page counters.
1968 */ 1969 memcg = mem_cgroup_begin_page_stat(page); 1970 newly_dirty = !TestSetPageDirty(page); 1971 spin_unlock(&mapping->private_lock); 1972 1973 if (newly_dirty) { 1974 /* sigh - __set_page_dirty() is static, so copy it here, too */ 1975 unsigned long flags; 1976 1977 spin_lock_irqsave(&mapping->tree_lock, flags); 1978 if (page->mapping) { /* Race with truncate? */ 1979 WARN_ON_ONCE(!PageUptodate(page)); 1980 account_page_dirtied(page, mapping, memcg); 1981 radix_tree_tag_set(&mapping->page_tree, 1982 page_index(page), PAGECACHE_TAG_DIRTY); 1983 } 1984 spin_unlock_irqrestore(&mapping->tree_lock, flags); 1985 } 1986 mem_cgroup_end_page_stat(memcg); 1987 if (newly_dirty) 1988 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1989 return newly_dirty; 1990 } 1991 1992 const struct address_space_operations xfs_address_space_operations = { 1993 .readpage = xfs_vm_readpage, 1994 .readpages = xfs_vm_readpages, 1995 .writepage = xfs_vm_writepage, 1996 .writepages = xfs_vm_writepages, 1997 .set_page_dirty = xfs_vm_set_page_dirty, 1998 .releasepage = xfs_vm_releasepage, 1999 .invalidatepage = xfs_vm_invalidatepage, 2000 .write_begin = xfs_vm_write_begin, 2001 .write_end = xfs_vm_write_end, 2002 .bmap = xfs_vm_bmap, 2003 .direct_IO = xfs_vm_direct_IO, 2004 .migratepage = buffer_migrate_page, 2005 .is_partially_uptodate = block_is_partially_uptodate, 2006 .error_remove_page = generic_error_remove_page, 2007 }; 2008