TOMOYO Linux Cross Reference
Linux/fs/btrfs/inode.c

/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/mount.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include <linux/magic.h>
#include <linux/iversion.h>
#include <asm/unaligned.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "backref.h"
#include "hash.h"
#include "props.h"
#include "qgroup.h"
#include "dedupe.h"

struct btrfs_iget_args {
        struct btrfs_key *location;
        struct btrfs_root *root;
};

struct btrfs_dio_data {
        u64 reserve;
        u64 unsubmitted_oe_range_start;
        u64 unsubmitted_oe_range_end;
        int overwrite;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static const struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;

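/*
 * Map the S_IFMT bits of an inode's mode to the file type stored in
 * btrfs directory items (BTRFS_FT_*).
 */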
#define S_SHIFT 12
static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
        [S_IFREG >> S_SHIFT]    = BTRFS_FT_REG_FILE,
        [S_IFDIR >> S_SHIFT]    = BTRFS_FT_DIR,
        [S_IFCHR >> S_SHIFT]    = BTRFS_FT_CHRDEV,
        [S_IFBLK >> S_SHIFT]    = BTRFS_FT_BLKDEV,
        [S_IFIFO >> S_SHIFT]    = BTRFS_FT_FIFO,
        [S_IFSOCK >> S_SHIFT]   = BTRFS_FT_SOCK,
        [S_IFLNK >> S_SHIFT]    = BTRFS_FT_SYMLINK,
};

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, u64 delalloc_end,
                                   int *page_started, unsigned long *nr_written,
                                   int unlock, struct btrfs_dedupe_hash *hash);
static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
                                       u64 orig_start, u64 block_start,
                                       u64 block_len, u64 orig_block_len,
                                       u64 ram_bytes, int compress_type,
                                       int type);

static void __endio_write_update_ordered(struct inode *inode,
                                         const u64 offset, const u64 bytes,
                                         const bool uptodate);

/*
 * Clean up all submitted ordered extents in the specified range to handle
 * errors from the fill_delalloc() callback.
 *
 * NOTE: the caller must ensure that when an error happens, it cannot call
 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
 * to be released, which we want to happen only when finishing the ordered
 * extent (btrfs_finish_ordered_io()). Also note that the caller of the
 * fill_delalloc() callback already does proper cleanup for the first page of
 * the range, that is, it invokes the callback writepage_end_io_hook() for the
 * range of the first page.
 */
static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
                                                 const u64 offset,
                                                 const u64 bytes)
{
        unsigned long index = offset >> PAGE_SHIFT;
        unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
        struct page *page;

        while (index <= end_index) {
                page = find_get_page(inode->i_mapping, index);
                index++;
                if (!page)
                        continue;
                ClearPagePrivate2(page);
                put_page(page);
        }
        return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
                                            bytes - PAGE_SIZE, false);
}

static int btrfs_dirty_inode(struct inode *inode);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_inode_set_ops(struct inode *inode)
{
        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
}
#endif

static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
                                     struct inode *inode, struct inode *dir,
                                     const struct qstr *qstr)
{
        int err;

        err = btrfs_init_acl(trans, inode, dir);
        if (!err)
                err = btrfs_xattr_security_init(trans, inode, dir, qstr);
        return err;
}

/*
 * This does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents() so
 * that no overlapping inline items exist in the btree.
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_path *path, int extent_inserted,
                                struct btrfs_root *root, struct inode *inode,
                                u64 start, size_t size, size_t compressed_size,
                                int compress_type,
                                struct page **compressed_pages)
{
        struct extent_buffer *leaf;
        struct page *page = NULL;
        char *kaddr;
        unsigned long ptr;
        struct btrfs_file_extent_item *ei;
        int ret;
        size_t cur_size = size;
        unsigned long offset;

        if (compressed_size && compressed_pages)
                cur_size = compressed_size;

        inode_add_bytes(inode, size);

        if (!extent_inserted) {
                struct btrfs_key key;
                size_t datasize;

                key.objectid = btrfs_ino(BTRFS_I(inode));
                key.offset = start;
                key.type = BTRFS_EXTENT_DATA_KEY;

                datasize = btrfs_file_extent_calc_inline_size(cur_size);
                path->leave_spinning = 1;
                ret = btrfs_insert_empty_item(trans, root, path, &key,
                                              datasize);
                if (ret)
                        goto fail;
        }
        leaf = path->nodes[0];
        ei = btrfs_item_ptr(leaf, path->slots[0],
                            struct btrfs_file_extent_item);
        btrfs_set_file_extent_generation(leaf, ei, trans->transid);
        btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
        btrfs_set_file_extent_encryption(leaf, ei, 0);
        btrfs_set_file_extent_other_encoding(leaf, ei, 0);
        btrfs_set_file_extent_ram_bytes(leaf, ei, size);
        ptr = btrfs_file_extent_inline_start(ei);

        if (compress_type != BTRFS_COMPRESS_NONE) {
                struct page *cpage;
                int i = 0;
                while (compressed_size > 0) {
                        cpage = compressed_pages[i];
                        cur_size = min_t(unsigned long, compressed_size,
                                       PAGE_SIZE);

                        kaddr = kmap_atomic(cpage);
                        write_extent_buffer(leaf, kaddr, ptr, cur_size);
                        kunmap_atomic(kaddr);

                        i++;
                        ptr += cur_size;
                        compressed_size -= cur_size;
                }
                btrfs_set_file_extent_compression(leaf, ei,
                                                  compress_type);
        } else {
                page = find_get_page(inode->i_mapping,
                                     start >> PAGE_SHIFT);
                btrfs_set_file_extent_compression(leaf, ei, 0);
                kaddr = kmap_atomic(page);
                offset = start & (PAGE_SIZE - 1);
                write_extent_buffer(leaf, kaddr + offset, ptr, size);
                kunmap_atomic(kaddr);
                put_page(page);
        }
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(path);

        /*
         * we're an inline extent, so nobody can
         * extend the file past i_size without locking
         * a page we already have locked.
         *
         * We must do any isize and inode updates
         * before we unlock the pages.  Otherwise we
         * could end up racing with unlink.
         */
        BTRFS_I(inode)->disk_i_size = inode->i_size;
        ret = btrfs_update_inode(trans, root, inode);

fail:
        return ret;
}

/*
 * Conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct btrfs_root *root,
                                          struct inode *inode, u64 start,
                                          u64 end, size_t compressed_size,
                                          int compress_type,
                                          struct page **compressed_pages)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_trans_handle *trans;
        u64 isize = i_size_read(inode);
        u64 actual_end = min(end + 1, isize);
        u64 inline_len = actual_end - start;
        u64 aligned_end = ALIGN(end, fs_info->sectorsize);
        u64 data_len = inline_len;
        int ret;
        struct btrfs_path *path;
        int extent_inserted = 0;
        u32 extent_item_size;

        if (compressed_size)
                data_len = compressed_size;

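        /*
         * Only inline an extent that starts at file offset 0, fits in
         * the first sector, respects both the inline size limit and the
         * max_inline mount option, doesn't end exactly on a sector
         * boundary when uncompressed, and reaches all the way to EOF.
         */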
        if (start > 0 ||
            actual_end > fs_info->sectorsize ||
            data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
            (!compressed_size &&
            (actual_end & (fs_info->sectorsize - 1)) == 0) ||
            end + 1 < isize ||
            data_len > fs_info->max_inline) {
                return 1;
        }

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
                return PTR_ERR(trans);
        }
        trans->block_rsv = &BTRFS_I(inode)->block_rsv;

        if (compressed_size && compressed_pages)
                extent_item_size = btrfs_file_extent_calc_inline_size(
                   compressed_size);
        else
                extent_item_size = btrfs_file_extent_calc_inline_size(
                    inline_len);

        ret = __btrfs_drop_extents(trans, root, inode, path,
                                   start, aligned_end, NULL,
                                   1, 1, extent_item_size, &extent_inserted);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        if (isize > actual_end)
                inline_len = min_t(u64, isize, actual_end);
        ret = insert_inline_extent(trans, path, extent_inserted,
                                   root, inode, start,
                                   inline_len, compressed_size,
                                   compress_type, compressed_pages);
        if (ret && ret != -ENOSPC) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        } else if (ret == -ENOSPC) {
                ret = 1;
                goto out;
        }

        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
        btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
out:
        /*
         * Don't forget to free the reserved space: an inline extent
         * doesn't count as a data extent, so release the reservation
         * directly here.  At reserve time the space is always aligned
         * to the page size, so just free one page here.
         */
        btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
        btrfs_free_path(path);
        btrfs_end_transaction(trans);
        return ret;
}

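/*
 * Bookkeeping for the async (two phase) compressed writeback path: each
 * async_cow describes one delalloc chunk handed to a worker thread, and
 * holds the list of async_extents that compress_file_range() produces
 * for submit_compressed_extents() to write out.
 */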
struct async_extent {
        u64 start;
        u64 ram_size;
        u64 compressed_size;
        struct page **pages;
        unsigned long nr_pages;
        int compress_type;
        struct list_head list;
};

struct async_cow {
        struct inode *inode;
        struct btrfs_root *root;
        struct page *locked_page;
        u64 start;
        u64 end;
        unsigned int write_flags;
        struct list_head extents;
        struct btrfs_work work;
};
387 
388 static noinline int add_async_extent(struct async_cow *cow,
389                                      u64 start, u64 ram_size,
390                                      u64 compressed_size,
391                                      struct page **pages,
392                                      unsigned long nr_pages,
393                                      int compress_type)
394 {
395         struct async_extent *async_extent;
396 
397         async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
398         BUG_ON(!async_extent); /* -ENOMEM */
399         async_extent->start = start;
400         async_extent->ram_size = ram_size;
401         async_extent->compressed_size = compressed_size;
402         async_extent->pages = pages;
403         async_extent->nr_pages = nr_pages;
404         async_extent->compress_type = compress_type;
405         list_add_tail(&async_extent->list, &cow->extents);
406         return 0;
407 }
408 
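/*
 * Decide whether this range should be compressed: a force-compress
 * mount option or a pending defrag request always wins, the per-inode
 * NOCOMPRESS flag always loses, and otherwise the compression
 * heuristic makes the call.
 */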
static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

        /* force compress */
        if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
                return 1;
        /* defrag ioctl */
        if (BTRFS_I(inode)->defrag_compress)
                return 1;
        /* bad compression ratios */
        if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
                return 0;
        if (btrfs_test_opt(fs_info, COMPRESS) ||
            BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
            BTRFS_I(inode)->prop_compress)
                return btrfs_compress_heuristic(inode, start, end);
        return 0;
}

static inline void inode_should_defrag(struct btrfs_inode *inode,
                u64 start, u64 end, u64 num_bytes, u64 small_write)
{
        /* If this is a small write inside eof, kick off a defrag */
        if (num_bytes < small_write &&
            (start > 0 || end + 1 < inode->disk_i_size))
                btrfs_add_inode_defrag(NULL, inode);
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many CPUs.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
 */
static noinline void compress_file_range(struct inode *inode,
                                        struct page *locked_page,
                                        u64 start, u64 end,
                                        struct async_cow *async_cow,
                                        int *num_added)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 blocksize = fs_info->sectorsize;
        u64 actual_end;
        u64 isize = i_size_read(inode);
        int ret = 0;
        struct page **pages = NULL;
        unsigned long nr_pages;
        unsigned long total_compressed = 0;
        unsigned long total_in = 0;
        int i;
        int will_compress;
        int compress_type = fs_info->compress_type;
        int redirty = 0;

        inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
                        SZ_16K);

        actual_end = min_t(u64, isize, end + 1);
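        /*
         * Large ranges are compressed in pieces: each pass below handles
         * at most BTRFS_MAX_UNCOMPRESSED bytes, then jumps back here with
         * an advanced start offset until the whole range is consumed.
         */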
again:
        will_compress = 0;
        nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
        BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
        nr_pages = min_t(unsigned long, nr_pages,
                        BTRFS_MAX_COMPRESSED / PAGE_SIZE);

        /*
         * we don't want to send crud past the end of i_size through
         * compression, that's just a waste of CPU time.  So, if the
         * end of the file is before the start of our current
         * requested range of bytes, we bail out to the uncompressed
         * cleanup code that can deal with all of this.
         *
         * It isn't really the fastest way to fix things, but this is a
         * very uncommon corner.
         */
        if (actual_end <= start)
                goto cleanup_and_bail_uncompressed;

        total_compressed = actual_end - start;

        /*
         * skip compression for a small file range (<= blocksize) that
         * isn't an inline extent, since it doesn't save disk space at all.
         */
        if (total_compressed <= blocksize &&
           (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                goto cleanup_and_bail_uncompressed;

        total_compressed = min_t(unsigned long, total_compressed,
                        BTRFS_MAX_UNCOMPRESSED);
        total_in = 0;
        ret = 0;

        /*
         * we do compression for mount -o compress and when the
         * inode has not been flagged as nocompress.  This flag can
         * change at any time if we discover bad compression ratios.
         */
        if (inode_need_compress(inode, start, end)) {
                WARN_ON(pages);
                pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
                if (!pages) {
                        /* just bail out to the uncompressed code */
                        goto cont;
                }

                if (BTRFS_I(inode)->defrag_compress)
                        compress_type = BTRFS_I(inode)->defrag_compress;
                else if (BTRFS_I(inode)->prop_compress)
                        compress_type = BTRFS_I(inode)->prop_compress;

                /*
                 * we need to call clear_page_dirty_for_io on each
                 * page in the range.  Otherwise applications with the file
                 * mmap'd can wander in and change the page contents while
                 * we are compressing them.
                 *
                 * If the compression fails for any reason, we set the pages
                 * dirty again later on.
                 *
                 * Note that the remaining part is redirtied, the start pointer
                 * has moved, the end is the original one.
                 */
                if (!redirty) {
                        extent_range_clear_dirty_for_io(inode, start, end);
                        redirty = 1;
                }

                /* Compression level is applied here and only here */
                ret = btrfs_compress_pages(
                        compress_type | (fs_info->compress_level << 4),
                                           inode->i_mapping, start,
                                           pages,
                                           &nr_pages,
                                           &total_in,
                                           &total_compressed);

                if (!ret) {
                        unsigned long offset = total_compressed &
                                (PAGE_SIZE - 1);
                        struct page *page = pages[nr_pages - 1];
                        char *kaddr;

                        /* zero the tail end of the last page, we might be
                         * sending it down to disk
                         */
                        if (offset) {
                                kaddr = kmap_atomic(page);
                                memset(kaddr + offset, 0,
                                       PAGE_SIZE - offset);
                                kunmap_atomic(kaddr);
                        }
                        will_compress = 1;
                }
        }
cont:
        if (start == 0) {
                /* let's try to make an inline extent */
                if (ret || total_in < actual_end) {
                        /* we didn't compress the entire range, try
                         * to make an uncompressed inline extent.
                         */
                        ret = cow_file_range_inline(root, inode, start, end,
                                            0, BTRFS_COMPRESS_NONE, NULL);
                } else {
                        /* try making a compressed inline extent */
                        ret = cow_file_range_inline(root, inode, start, end,
                                                    total_compressed,
                                                    compress_type, pages);
                }
                if (ret <= 0) {
                        unsigned long clear_flags = EXTENT_DELALLOC |
                                EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
                                EXTENT_DO_ACCOUNTING;
                        unsigned long page_error_op;

                        page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;

                        /*
                         * inline extent creation worked or returned an error,
                         * so we don't need to create any more async work items.
                         * Unlock and free up our temp pages.
                         *
                         * We use DO_ACCOUNTING here because we need the
                         * delalloc_release_metadata to be done _after_ we drop
                         * our outstanding extent for clearing delalloc for this
                         * range.
                         */
                        extent_clear_unlock_delalloc(inode, start, end, end,
                                                     NULL, clear_flags,
                                                     PAGE_UNLOCK |
                                                     PAGE_CLEAR_DIRTY |
                                                     PAGE_SET_WRITEBACK |
                                                     page_error_op |
                                                     PAGE_END_WRITEBACK);
                        goto free_pages_out;
                }
        }

        if (will_compress) {
                /*
                 * we aren't doing an inline extent, so round the compressed
                 * size up to a block size boundary so the allocator does
                 * sane things
                 */
                total_compressed = ALIGN(total_compressed, blocksize);

                /*
                 * one last check to make sure the compression is really a
                 * win, compare the page count read with the blocks on disk,
                 * compression must free at least one sector size
                 */
                total_in = ALIGN(total_in, PAGE_SIZE);
                if (total_compressed + blocksize <= total_in) {
                        *num_added += 1;

                        /*
                         * The async work queues will take care of doing actual
                         * allocation on disk for these compressed pages, and
                         * will submit them to the elevator.
                         */
                        add_async_extent(async_cow, start, total_in,
                                        total_compressed, pages, nr_pages,
                                        compress_type);

                        if (start + total_in < end) {
                                start += total_in;
                                pages = NULL;
                                cond_resched();
                                goto again;
                        }
                        return;
                }
        }
        if (pages) {
                /*
                 * the compression code ran but failed to make things smaller,
                 * free any pages it allocated and our page pointer array
                 */
                for (i = 0; i < nr_pages; i++) {
                        WARN_ON(pages[i]->mapping);
                        put_page(pages[i]);
                }
                kfree(pages);
                pages = NULL;
                total_compressed = 0;
                nr_pages = 0;

                /* flag the file so we don't compress in the future */
                if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
                    !(BTRFS_I(inode)->prop_compress)) {
                        BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
                }
        }
cleanup_and_bail_uncompressed:
        /*
         * No compression, but we still need to write the pages in the file
         * we've been given so far.  Redirty the locked page if it corresponds
         * to our extent and set things up for the async work queue to run
         * cow_file_range to do the normal delalloc dance.
         */
        if (page_offset(locked_page) >= start &&
            page_offset(locked_page) <= end)
                __set_page_dirty_nobuffers(locked_page);
        /* unlocked later on in the async handlers */

        if (redirty)
                extent_range_redirty_for_io(inode, start, end);
        add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
                         BTRFS_COMPRESS_NONE);
        *num_added += 1;

        return;

free_pages_out:
        for (i = 0; i < nr_pages; i++) {
                WARN_ON(pages[i]->mapping);
                put_page(pages[i]);
        }
        kfree(pages);
}

static void free_async_extent_pages(struct async_extent *async_extent)
{
        int i;

        if (!async_extent->pages)
                return;

        for (i = 0; i < async_extent->nr_pages; i++) {
                WARN_ON(async_extent->pages[i]->mapping);
                put_page(async_extent->pages[i]);
        }
        kfree(async_extent->pages);
        async_extent->nr_pages = 0;
        async_extent->pages = NULL;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline void submit_compressed_extents(struct inode *inode,
                                              struct async_cow *async_cow)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct async_extent *async_extent;
        u64 alloc_hint = 0;
        struct btrfs_key ins;
        struct extent_map *em;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_io_tree *io_tree;
        int ret = 0;

again:
        while (!list_empty(&async_cow->extents)) {
                async_extent = list_entry(async_cow->extents.next,
                                          struct async_extent, list);
                list_del(&async_extent->list);

                io_tree = &BTRFS_I(inode)->io_tree;

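                /*
                 * If the extent reservation below fails with ENOSPC, the
                 * compressed pages are freed and we come back here with
                 * async_extent->pages == NULL, which makes this extent
                 * fall back to plain (uncompressed) cow_file_range().
                 */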
retry:
                /* did the compression code fall back to uncompressed IO? */
                if (!async_extent->pages) {
                        int page_started = 0;
                        unsigned long nr_written = 0;

                        lock_extent(io_tree, async_extent->start,
                                         async_extent->start +
                                         async_extent->ram_size - 1);

                        /* allocate blocks */
                        ret = cow_file_range(inode, async_cow->locked_page,
                                             async_extent->start,
                                             async_extent->start +
                                             async_extent->ram_size - 1,
                                             async_extent->start +
                                             async_extent->ram_size - 1,
                                             &page_started, &nr_written, 0,
                                             NULL);

                        /* JDM XXX */

                        /*
                         * if page_started, cow_file_range inserted an
                         * inline extent and took care of all the unlocking
                         * and IO for us.  Otherwise, we need to submit
                         * all those pages down to the drive.
                         */
                        if (!page_started && !ret)
                                extent_write_locked_range(inode,
                                                  async_extent->start,
                                                  async_extent->start +
                                                  async_extent->ram_size - 1,
                                                  WB_SYNC_ALL);
                        else if (ret)
                                unlock_page(async_cow->locked_page);
                        kfree(async_extent);
                        cond_resched();
                        continue;
                }

                lock_extent(io_tree, async_extent->start,
                            async_extent->start + async_extent->ram_size - 1);

                ret = btrfs_reserve_extent(root, async_extent->ram_size,
                                           async_extent->compressed_size,
                                           async_extent->compressed_size,
                                           0, alloc_hint, &ins, 1, 1);
                if (ret) {
                        free_async_extent_pages(async_extent);

                        if (ret == -ENOSPC) {
                                unlock_extent(io_tree, async_extent->start,
                                              async_extent->start +
                                              async_extent->ram_size - 1);

                                /*
                                 * we need to redirty the pages if we decide to
                                 * fall back to uncompressed IO, otherwise we
                                 * will not submit these pages down to lower
                                 * layers.
                                 */
                                extent_range_redirty_for_io(inode,
                                                async_extent->start,
                                                async_extent->start +
                                                async_extent->ram_size - 1);

                                goto retry;
                        }
                        goto out_free;
                }
                /*
                 * here we're doing allocation and writeback of the
                 * compressed pages
                 */
                em = create_io_em(inode, async_extent->start,
                                  async_extent->ram_size, /* len */
                                  async_extent->start, /* orig_start */
                                  ins.objectid, /* block_start */
                                  ins.offset, /* block_len */
                                  ins.offset, /* orig_block_len */
                                  async_extent->ram_size, /* ram_bytes */
                                  async_extent->compress_type,
                                  BTRFS_ORDERED_COMPRESSED);
                if (IS_ERR(em))
                        /* ret value is not necessary due to void function */
                        goto out_free_reserve;
                free_extent_map(em);

                ret = btrfs_add_ordered_extent_compress(inode,
                                                async_extent->start,
                                                ins.objectid,
                                                async_extent->ram_size,
                                                ins.offset,
                                                BTRFS_ORDERED_COMPRESSED,
                                                async_extent->compress_type);
                if (ret) {
                        btrfs_drop_extent_cache(BTRFS_I(inode),
                                                async_extent->start,
                                                async_extent->start +
                                                async_extent->ram_size - 1, 0);
                        goto out_free_reserve;
                }
                btrfs_dec_block_group_reservations(fs_info, ins.objectid);

                /*
                 * clear dirty, set writeback and unlock the pages.
                 */
                extent_clear_unlock_delalloc(inode, async_extent->start,
                                async_extent->start +
                                async_extent->ram_size - 1,
                                async_extent->start +
                                async_extent->ram_size - 1,
                                NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
                                PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
                                PAGE_SET_WRITEBACK);
                if (btrfs_submit_compressed_write(inode,
                                    async_extent->start,
                                    async_extent->ram_size,
                                    ins.objectid,
                                    ins.offset, async_extent->pages,
                                    async_extent->nr_pages,
                                    async_cow->write_flags)) {
                        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
                        struct page *p = async_extent->pages[0];
                        const u64 start = async_extent->start;
                        const u64 end = start + async_extent->ram_size - 1;

                        p->mapping = inode->i_mapping;
                        tree->ops->writepage_end_io_hook(p, start, end,
                                                         NULL, 0);
                        p->mapping = NULL;
                        extent_clear_unlock_delalloc(inode, start, end, end,
                                                     NULL, 0,
                                                     PAGE_END_WRITEBACK |
                                                     PAGE_SET_ERROR);
                        free_async_extent_pages(async_extent);
                }
                alloc_hint = ins.objectid + ins.offset;
                kfree(async_extent);
                cond_resched();
        }
        return;
out_free_reserve:
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
        extent_clear_unlock_delalloc(inode, async_extent->start,
                                     async_extent->start +
                                     async_extent->ram_size - 1,
                                     async_extent->start +
                                     async_extent->ram_size - 1,
                                     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
                                     EXTENT_DELALLOC_NEW |
                                     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
                                     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
                                     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
                                     PAGE_SET_ERROR);
        free_async_extent_pages(async_extent);
        kfree(async_extent);
        goto again;
}

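/*
 * Pick a disk block number to hint the allocator with, based on either
 * the extent mapped at @start or, failing that, the first mapped extent
 * in the file.
 */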
static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
                                      u64 num_bytes)
{
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_map *em;
        u64 alloc_hint = 0;

        read_lock(&em_tree->lock);
        em = search_extent_mapping(em_tree, start, num_bytes);
        if (em) {
                /*
                 * if block start isn't an actual block number then find the
                 * first block in this inode and use that as a hint.  If that
                 * block is also bogus then just don't worry about it.
                 */
                if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
                        free_extent_map(em);
                        em = search_extent_mapping(em_tree, 0, 0);
                        if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
                                alloc_hint = em->block_start;
                        if (em)
                                free_extent_map(em);
                } else {
                        alloc_hint = em->block_start;
                        free_extent_map(em);
                }
        }
        read_unlock(&em_tree->lock);

        return alloc_hint;
}

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the callbacks end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in RAM to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
 */
static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, u64 delalloc_end,
                                   int *page_started, unsigned long *nr_written,
                                   int unlock, struct btrfs_dedupe_hash *hash)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 alloc_hint = 0;
        u64 num_bytes;
        unsigned long ram_size;
        u64 disk_num_bytes;
        u64 cur_alloc_size = 0;
        u64 blocksize = fs_info->sectorsize;
        struct btrfs_key ins;
        struct extent_map *em;
        unsigned clear_bits;
        unsigned long page_ops;
        bool extent_reserved = false;
        int ret = 0;

        if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
                WARN_ON_ONCE(1);
                ret = -EINVAL;
                goto out_unlock;
        }

        num_bytes = ALIGN(end - start + 1, blocksize);
        num_bytes = max(blocksize, num_bytes);
        disk_num_bytes = num_bytes;

        inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);

        if (start == 0) {
                /* let's try to make an inline extent */
                ret = cow_file_range_inline(root, inode, start, end, 0,
                                        BTRFS_COMPRESS_NONE, NULL);
                if (ret == 0) {
                        /*
                         * We use DO_ACCOUNTING here because we need the
                         * delalloc_release_metadata to be run _after_ we drop
                         * our outstanding extent for clearing delalloc for this
                         * range.
                         */
                        extent_clear_unlock_delalloc(inode, start, end,
                                     delalloc_end, NULL,
                                     EXTENT_LOCKED | EXTENT_DELALLOC |
                                     EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
                                     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
                                     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
                                     PAGE_END_WRITEBACK);
                        *nr_written = *nr_written +
                             (end - start + PAGE_SIZE) / PAGE_SIZE;
                        *page_started = 1;
                        goto out;
                } else if (ret < 0) {
                        goto out_unlock;
                }
        }

        BUG_ON(disk_num_bytes >
               btrfs_super_total_bytes(fs_info->super_copy));

        alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
        btrfs_drop_extent_cache(BTRFS_I(inode), start,
                        start + num_bytes - 1, 0);

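        /*
         * The reservation below may hand back less space than we asked
         * for, so the range can be split across several extents: each
         * loop iteration allocates one extent, creates its ordered
         * extent, and unlocks the pages it covered.
         */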
        while (disk_num_bytes > 0) {
                cur_alloc_size = disk_num_bytes;
                ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
                                           fs_info->sectorsize, 0, alloc_hint,
                                           &ins, 1, 1);
                if (ret < 0)
                        goto out_unlock;
                cur_alloc_size = ins.offset;
                extent_reserved = true;

                ram_size = ins.offset;
                em = create_io_em(inode, start, ins.offset, /* len */
                                  start, /* orig_start */
                                  ins.objectid, /* block_start */
                                  ins.offset, /* block_len */
                                  ins.offset, /* orig_block_len */
                                  ram_size, /* ram_bytes */
                                  BTRFS_COMPRESS_NONE, /* compress_type */
                                  BTRFS_ORDERED_REGULAR /* type */);
                if (IS_ERR(em)) {
                        ret = PTR_ERR(em);
                        goto out_reserve;
                }
                free_extent_map(em);

                ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
                                               ram_size, cur_alloc_size, 0);
                if (ret)
                        goto out_drop_extent_cache;

                if (root->root_key.objectid ==
                    BTRFS_DATA_RELOC_TREE_OBJECTID) {
                        ret = btrfs_reloc_clone_csums(inode, start,
                                                      cur_alloc_size);
                        /*
                         * Only drop the cache here, and process as normal.
                         *
                         * We must not allow extent_clear_unlock_delalloc()
                         * at the out_unlock label to free the meta of this
                         * ordered extent, as its meta should be freed by
                         * btrfs_finish_ordered_io().
                         *
                         * So we must continue until @start is increased to
                         * skip the current ordered extent.
                         */
                        if (ret)
                                btrfs_drop_extent_cache(BTRFS_I(inode), start,
                                                start + ram_size - 1, 0);
                }

                btrfs_dec_block_group_reservations(fs_info, ins.objectid);

                /* we're not doing compressed IO, don't unlock the first
                 * page (which the caller expects to stay locked), don't
                 * clear any dirty bits and don't set any writeback bits
                 *
                 * Do set the Private2 bit so we know this page was properly
                 * set up for writepage
                 */
                page_ops = unlock ? PAGE_UNLOCK : 0;
                page_ops |= PAGE_SET_PRIVATE2;

                extent_clear_unlock_delalloc(inode, start,
                                             start + ram_size - 1,
                                             delalloc_end, locked_page,
                                             EXTENT_LOCKED | EXTENT_DELALLOC,
                                             page_ops);
                if (disk_num_bytes < cur_alloc_size)
                        disk_num_bytes = 0;
                else
                        disk_num_bytes -= cur_alloc_size;
                num_bytes -= cur_alloc_size;
                alloc_hint = ins.objectid + ins.offset;
                start += cur_alloc_size;
                extent_reserved = false;

                /*
                 * On a btrfs_reloc_clone_csums() error: since start was
                 * increased, extent_clear_unlock_delalloc() at the out_unlock
                 * label won't free the metadata of the current ordered
                 * extent, so we're OK to exit.
                 */
                if (ret)
                        goto out_unlock;
        }
out:
        return ret;

out_drop_extent_cache:
        btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0);
out_reserve:
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
        clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
                EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
        page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
                PAGE_END_WRITEBACK;
        /*
         * If we reserved an extent for our delalloc range (or a subrange) and
         * failed to create the respective ordered extent, then it means that
         * when we reserved the extent we decremented the extent's size from
         * the data space_info's bytes_may_use counter and incremented the
         * space_info's bytes_reserved counter by the same amount. We must make
         * sure extent_clear_unlock_delalloc() does not try to decrement again
         * the data space_info's bytes_may_use counter, therefore we do not pass
         * it the flag EXTENT_CLEAR_DATA_RESV.
         */
        if (extent_reserved) {
                extent_clear_unlock_delalloc(inode, start,
                                             start + cur_alloc_size,
                                             start + cur_alloc_size,
                                             locked_page,
                                             clear_bits,
                                             page_ops);
                start += cur_alloc_size;
                if (start >= end)
                        goto out;
        }
        extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
                                     locked_page,
                                     clear_bits | EXTENT_CLEAR_DATA_RESV,
                                     page_ops);
        goto out;
}

/*
 * work queue callback to start compression on a file's pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
        struct async_cow *async_cow;
        int num_added = 0;

        async_cow = container_of(work, struct async_cow, work);

        compress_file_range(async_cow->inode, async_cow->locked_page,
                            async_cow->start, async_cow->end, async_cow,
                            &num_added);
        if (num_added == 0) {
                btrfs_add_delayed_iput(async_cow->inode);
                async_cow->inode = NULL;
        }
}

/*
 * work queue callback to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
        struct btrfs_fs_info *fs_info;
        struct async_cow *async_cow;
        struct btrfs_root *root;
        unsigned long nr_pages;

        async_cow = container_of(work, struct async_cow, work);

        root = async_cow->root;
        fs_info = root->fs_info;
        nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
                PAGE_SHIFT;

        /*
         * atomic_sub_return implies a barrier for waitqueue_active
         */
        if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
            5 * SZ_1M &&
            waitqueue_active(&fs_info->async_submit_wait))
                wake_up(&fs_info->async_submit_wait);

        if (async_cow->inode)
                submit_compressed_extents(async_cow->inode, async_cow);
}

static noinline void async_cow_free(struct btrfs_work *work)
{
        struct async_cow *async_cow;

        async_cow = container_of(work, struct async_cow, work);
        if (async_cow->inode)
                btrfs_add_delayed_iput(async_cow->inode);
        kfree(async_cow);
}

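/*
 * Kick off the async (possibly compressed) delalloc path: split the
 * range into chunks of at most 512K and queue one async_cow work item
 * per chunk, so that compression can run across multiple CPUs.
 */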
1202 static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1203                                 u64 start, u64 end, int *page_started,
1204                                 unsigned long *nr_written,
1205                                 unsigned int write_flags)
1206 {
1207         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1208         struct async_cow *async_cow;
1209         struct btrfs_root *root = BTRFS_I(inode)->root;
1210         unsigned long nr_pages;
1211         u64 cur_end;
1212 
1213         clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
1214                          1, 0, NULL);
1215         while (start < end) {
1216                 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
1217                 BUG_ON(!async_cow); /* -ENOMEM */
1218                 async_cow->inode = igrab(inode);
1219                 async_cow->root = root;
1220                 async_cow->locked_page = locked_page;
1221                 async_cow->start = start;
1222                 async_cow->write_flags = write_flags;
1223 
1224                 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
1225                     !btrfs_test_opt(fs_info, FORCE_COMPRESS))
1226                         cur_end = end;
1227                 else
1228                         cur_end = min(end, start + SZ_512K - 1);
1229 
1230                 async_cow->end = cur_end;
1231                 INIT_LIST_HEAD(&async_cow->extents);
1232 
1233                 btrfs_init_work(&async_cow->work,
1234                                 btrfs_delalloc_helper,
1235                                 async_cow_start, async_cow_submit,
1236                                 async_cow_free);
1237 
1238                 nr_pages = (cur_end - start + PAGE_SIZE) >>
1239                         PAGE_SHIFT;
1240                 atomic_add(nr_pages, &fs_info->async_delalloc_pages);
1241 
1242                 btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work);
1243 
1244                 *nr_written += nr_pages;
1245                 start = cur_end + 1;
1246         }
1247         *page_started = 1;
1248         return 0;
1249 }
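     /*
      * Editor's note -- a worked example of the chunking arithmetic above,
      * not part of the original source.  Assuming 4K pages (PAGE_SHIFT == 12),
      * a 1 MiB delalloc range [0, 1048575] on a compressible inode is queued
      * as two async_cow items:
      *
      *   chunk 1: start = 0,      cur_end = 524287   (start + SZ_512K - 1)
      *   chunk 2: start = 524288, cur_end = 1048575
      *
      * For each chunk, size = cur_end - start + 1 = 524288 bytes, and
      *
      *   nr_pages = (cur_end - start + PAGE_SIZE) >> PAGE_SHIFT
      *            = (524287 + 4096) >> 12 = 128,
      *
      * i.e. DIV_ROUND_UP(size, PAGE_SIZE) given the inclusive end offset, so
      * an unaligned tail rounds up to a whole page.  On NOCOMPRESS inodes
      * (without the force-compress mount option) cur_end = end and the whole
      * range becomes a single work item.
      */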
1250 
1251 static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
1252                                         u64 bytenr, u64 num_bytes)
1253 {
1254         int ret;
1255         struct btrfs_ordered_sum *sums;
1256         LIST_HEAD(list);
1257 
1258         ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
1259                                        bytenr + num_bytes - 1, &list, 0);
1260         if (ret == 0 && list_empty(&list))
1261                 return 0;
1262 
1263         while (!list_empty(&list)) {
1264                 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1265                 list_del(&sums->list);
1266                 kfree(sums);
1267         }
1268         if (ret < 0)
1269                 return ret;
1270         return 1;
1271 }
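     /*
      * Editor's note -- a minimal caller sketch (editorial, not original
      * source) showing the three-way return convention of
      * csum_exist_in_range():
      *
      *     ret = csum_exist_in_range(fs_info, disk_bytenr, num_bytes);
      *     if (ret < 0)
      *             goto error;     // the lookup itself failed, e.g. -EIO
      *     else if (ret)
      *             goto out_check; // csums exist: fall back to COW
      *     // ret == 0: no csums recorded, the nocow path may proceed
      *
      * run_delalloc_nocow() below consumes the result in essentially this
      * way.
      */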
1272 
1273 /*
1274  * called during nocow writeback.  This checks for snapshots or COW copies
1275  * of the extents that exist in the file, and COWs the file as required.
1276  *
1277  * If no cow copies or snapshots exist, we write directly to the existing
1278  * blocks on disk
1279  */
1280 static noinline int run_delalloc_nocow(struct inode *inode,
1281                                        struct page *locked_page,
1282                                        u64 start, u64 end, int *page_started,
1283                                        int force, unsigned long *nr_written)
1284 {
1285         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1286         struct btrfs_root *root = BTRFS_I(inode)->root;
1287         struct extent_buffer *leaf;
1288         struct btrfs_path *path;
1289         struct btrfs_file_extent_item *fi;
1290         struct btrfs_key found_key;
1291         struct extent_map *em;
1292         u64 cow_start;
1293         u64 cur_offset;
1294         u64 extent_end;
1295         u64 extent_offset;
1296         u64 disk_bytenr;
1297         u64 num_bytes;
1298         u64 disk_num_bytes;
1299         u64 ram_bytes;
1300         int extent_type;
1301         int ret, err;
1302         int type;
1303         int nocow;
1304         int check_prev = 1;
1305         bool nolock;
1306         u64 ino = btrfs_ino(BTRFS_I(inode));
1307 
1308         path = btrfs_alloc_path();
1309         if (!path) {
1310                 extent_clear_unlock_delalloc(inode, start, end, end,
1311                                              locked_page,
1312                                              EXTENT_LOCKED | EXTENT_DELALLOC |
1313                                              EXTENT_DO_ACCOUNTING |
1314                                              EXTENT_DEFRAG, PAGE_UNLOCK |
1315                                              PAGE_CLEAR_DIRTY |
1316                                              PAGE_SET_WRITEBACK |
1317                                              PAGE_END_WRITEBACK);
1318                 return -ENOMEM;
1319         }
1320 
1321         nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
1322 
1323         cow_start = (u64)-1;
1324         cur_offset = start;
1325         while (1) {
1326                 ret = btrfs_lookup_file_extent(NULL, root, path, ino,
1327                                                cur_offset, 0);
1328                 if (ret < 0)
1329                         goto error;
1330                 if (ret > 0 && path->slots[0] > 0 && check_prev) {
1331                         leaf = path->nodes[0];
1332                         btrfs_item_key_to_cpu(leaf, &found_key,
1333                                               path->slots[0] - 1);
1334                         if (found_key.objectid == ino &&
1335                             found_key.type == BTRFS_EXTENT_DATA_KEY)
1336                                 path->slots[0]--;
1337                 }
1338                 check_prev = 0;
1339 next_slot:
1340                 leaf = path->nodes[0];
1341                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1342                         ret = btrfs_next_leaf(root, path);
1343                         if (ret < 0) {
1344                                 if (cow_start != (u64)-1)
1345                                         cur_offset = cow_start;
1346                                 goto error;
1347                         }
1348                         if (ret > 0)
1349                                 break;
1350                         leaf = path->nodes[0];
1351                 }
1352 
1353                 nocow = 0;
1354                 disk_bytenr = 0;
1355                 num_bytes = 0;
1356                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1357 
1358                 if (found_key.objectid > ino)
1359                         break;
1360                 if (WARN_ON_ONCE(found_key.objectid < ino) ||
1361                     found_key.type < BTRFS_EXTENT_DATA_KEY) {
1362                         path->slots[0]++;
1363                         goto next_slot;
1364                 }
1365                 if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
1366                     found_key.offset > end)
1367                         break;
1368 
1369                 if (found_key.offset > cur_offset) {
1370                         extent_end = found_key.offset;
1371                         extent_type = 0;
1372                         goto out_check;
1373                 }
1374 
1375                 fi = btrfs_item_ptr(leaf, path->slots[0],
1376                                     struct btrfs_file_extent_item);
1377                 extent_type = btrfs_file_extent_type(leaf, fi);
1378 
1379                 ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1380                 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1381                     extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1382                         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1383                         extent_offset = btrfs_file_extent_offset(leaf, fi);
1384                         extent_end = found_key.offset +
1385                                 btrfs_file_extent_num_bytes(leaf, fi);
1386                         disk_num_bytes =
1387                                 btrfs_file_extent_disk_num_bytes(leaf, fi);
1388                         if (extent_end <= start) {
1389                                 path->slots[0]++;
1390                                 goto next_slot;
1391                         }
1392                         if (disk_bytenr == 0)
1393                                 goto out_check;
1394                         if (btrfs_file_extent_compression(leaf, fi) ||
1395                             btrfs_file_extent_encryption(leaf, fi) ||
1396                             btrfs_file_extent_other_encoding(leaf, fi))
1397                                 goto out_check;
1398                         if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1399                                 goto out_check;
1400                         if (btrfs_extent_readonly(fs_info, disk_bytenr))
1401                                 goto out_check;
1402                         ret = btrfs_cross_ref_exist(root, ino,
1403                                                     found_key.offset -
1404                                                     extent_offset, disk_bytenr);
1405                         if (ret) {
1406                                 /*
1407                                  * ret could be -EIO if the above fails to read
1408                                  * metadata.
1409                                  */
1410                                 if (ret < 0) {
1411                                         if (cow_start != (u64)-1)
1412                                                 cur_offset = cow_start;
1413                                         goto error;
1414                                 }
1415 
1416                                 WARN_ON_ONCE(nolock);
1417                                 goto out_check;
1418                         }
1419                         disk_bytenr += extent_offset;
1420                         disk_bytenr += cur_offset - found_key.offset;
1421                         num_bytes = min(end + 1, extent_end) - cur_offset;
1422                         /*
1423                          * if there are pending snapshots for this root,
1424                          * we fall back to the common COW path.
1425                          */
1426                         if (!nolock) {
1427                                 err = btrfs_start_write_no_snapshotting(root);
1428                                 if (!err)
1429                                         goto out_check;
1430                         }
1431                         /*
1432                          * Force COW if csums exist in the range.
1433                          * This ensures that the csums for a given extent are
1434                          * either valid or do not exist.
1435                          */
1436                         ret = csum_exist_in_range(fs_info, disk_bytenr,
1437                                                   num_bytes);
1438                         if (ret) {
1439                                 if (!nolock)
1440                                         btrfs_end_write_no_snapshotting(root);
1441 
1442                                 /*
1443                                  * ret could be -EIO if the above fails to read
1444                                  * metadata.
1445                                  */
1446                                 if (ret < 0) {
1447                                         if (cow_start != (u64)-1)
1448                                                 cur_offset = cow_start;
1449                                         goto error;
1450                                 }
1451                                 WARN_ON_ONCE(nolock);
1452                                 goto out_check;
1453                         }
1454                         if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) {
1455                                 if (!nolock)
1456                                         btrfs_end_write_no_snapshotting(root);
1457                                 goto out_check;
1458                         }
1459                         nocow = 1;
1460                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1461                         extent_end = found_key.offset +
1462                                 btrfs_file_extent_inline_len(leaf,
1463                                                      path->slots[0], fi);
1464                         extent_end = ALIGN(extent_end,
1465                                            fs_info->sectorsize);
1466                 } else {
1467                         BUG_ON(1);
1468                 }
1469 out_check:
1470                 if (extent_end <= start) {
1471                         path->slots[0]++;
1472                         if (!nolock && nocow)
1473                                 btrfs_end_write_no_snapshotting(root);
1474                         if (nocow)
1475                                 btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1476                         goto next_slot;
1477                 }
1478                 if (!nocow) {
1479                         if (cow_start == (u64)-1)
1480                                 cow_start = cur_offset;
1481                         cur_offset = extent_end;
1482                         if (cur_offset > end)
1483                                 break;
1484                         path->slots[0]++;
1485                         goto next_slot;
1486                 }
1487 
1488                 btrfs_release_path(path);
1489                 if (cow_start != (u64)-1) {
1490                         ret = cow_file_range(inode, locked_page,
1491                                              cow_start, found_key.offset - 1,
1492                                              end, page_started, nr_written, 1,
1493                                              NULL);
1494                         if (ret) {
1495                                 if (!nolock && nocow)
1496                                         btrfs_end_write_no_snapshotting(root);
1497                                 if (nocow)
1498                                         btrfs_dec_nocow_writers(fs_info,
1499                                                                 disk_bytenr);
1500                                 goto error;
1501                         }
1502                         cow_start = (u64)-1;
1503                 }
1504 
1505                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1506                         u64 orig_start = found_key.offset - extent_offset;
1507 
1508                         em = create_io_em(inode, cur_offset, num_bytes,
1509                                           orig_start,
1510                                           disk_bytenr, /* block_start */
1511                                           num_bytes, /* block_len */
1512                                           disk_num_bytes, /* orig_block_len */
1513                                           ram_bytes, BTRFS_COMPRESS_NONE,
1514                                           BTRFS_ORDERED_PREALLOC);
1515                         if (IS_ERR(em)) {
1516                                 if (!nolock && nocow)
1517                                         btrfs_end_write_no_snapshotting(root);
1518                                 if (nocow)
1519                                         btrfs_dec_nocow_writers(fs_info,
1520                                                                 disk_bytenr);
1521                                 ret = PTR_ERR(em);
1522                                 goto error;
1523                         }
1524                         free_extent_map(em);
1525                 }
1526 
1527                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1528                         type = BTRFS_ORDERED_PREALLOC;
1529                 } else {
1530                         type = BTRFS_ORDERED_NOCOW;
1531                 }
1532 
1533                 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1534                                                num_bytes, num_bytes, type);
1535                 if (nocow)
1536                         btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1537                 BUG_ON(ret); /* -ENOMEM */
1538 
1539                 if (root->root_key.objectid ==
1540                     BTRFS_DATA_RELOC_TREE_OBJECTID)
1541                         /*
1542                          * Error handled later, as we must prevent
1543                          * extent_clear_unlock_delalloc() in error handler
1544                          * from freeing metadata of created ordered extent.
1545                          */
1546                         ret = btrfs_reloc_clone_csums(inode, cur_offset,
1547                                                       num_bytes);
1548 
1549                 extent_clear_unlock_delalloc(inode, cur_offset,
1550                                              cur_offset + num_bytes - 1, end,
1551                                              locked_page, EXTENT_LOCKED |
1552                                              EXTENT_DELALLOC |
1553                                              EXTENT_CLEAR_DATA_RESV,
1554                                              PAGE_UNLOCK | PAGE_SET_PRIVATE2);
1555 
1556                 if (!nolock && nocow)
1557                         btrfs_end_write_no_snapshotting(root);
1558                 cur_offset = extent_end;
1559 
1560                 /*
1561                  * btrfs_reloc_clone_csums() error, now we're OK to call error
1562                  * handler, as metadata for created ordered extent will only
1563                  * be freed by btrfs_finish_ordered_io().
1564                  */
1565                 if (ret)
1566                         goto error;
1567                 if (cur_offset > end)
1568                         break;
1569         }
1570         btrfs_release_path(path);
1571 
1572         if (cur_offset <= end && cow_start == (u64)-1) {
1573                 cow_start = cur_offset;
1574                 cur_offset = end;
1575         }
1576 
1577         if (cow_start != (u64)-1) {
1578                 ret = cow_file_range(inode, locked_page, cow_start, end, end,
1579                                      page_started, nr_written, 1, NULL);
1580                 if (ret)
1581                         goto error;
1582         }
1583 
1584 error:
1585         if (ret && cur_offset < end)
1586                 extent_clear_unlock_delalloc(inode, cur_offset, end, end,
1587                                              locked_page, EXTENT_LOCKED |
1588                                              EXTENT_DELALLOC | EXTENT_DEFRAG |
1589                                              EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1590                                              PAGE_CLEAR_DIRTY |
1591                                              PAGE_SET_WRITEBACK |
1592                                              PAGE_END_WRITEBACK);
1593         btrfs_free_path(path);
1594         return ret;
1595 }
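     /*
      * Editor's note -- a condensed, editorial summary of the per-extent
      * nocow decision made in the loop above.  An existing extent is
      * written in place only if all of the following hold:
      *
      *   - it is REG or PREALLOC (inline extents always COW);
      *   - it is not a hole (disk_bytenr != 0) and is not compressed,
      *     encrypted or otherwise encoded;
      *   - REG extents additionally require "force" (a NODATACOW inode);
      *   - the extent is not in a read-only block group;
      *   - no other root references it (btrfs_cross_ref_exist());
      *   - no snapshot is pending (btrfs_start_write_no_snapshotting());
      *   - no csums exist for the range (csum_exist_in_range());
      *   - a nocow writer reference is taken (btrfs_inc_nocow_writers()).
      *
      * Any failed check jumps to out_check, which folds the range into the
      * pending COW span later handed to cow_file_range().
      */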
1596 
1597 static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1598 {
1599 
1600         if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
1601             !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
1602                 return 0;
1603 
1604         /*
1605          * @defrag_bytes is a hint value, read without holding a spinlock;
1606          * if it is not zero, the file is being defragged.
1607          * Force COW if the given extent needs to be defragged.
1608          */
1609         if (BTRFS_I(inode)->defrag_bytes &&
1610             test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1611                            EXTENT_DEFRAG, 0, NULL))
1612                 return 1;
1613 
1614         return 0;
1615 }
1616 
1617 /*
1618  * extent_io.c call back to do delayed allocation processing
1619  */
1620 static int run_delalloc_range(void *private_data, struct page *locked_page,
1621                               u64 start, u64 end, int *page_started,
1622                               unsigned long *nr_written,
1623                               struct writeback_control *wbc)
1624 {
1625         struct inode *inode = private_data;
1626         int ret;
1627         int force_cow = need_force_cow(inode, start, end);
1628         unsigned int write_flags = wbc_to_write_flags(wbc);
1629 
1630         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
1631                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1632                                          page_started, 1, nr_written);
1633         } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
1634                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1635                                          page_started, 0, nr_written);
1636         } else if (!inode_need_compress(inode, start, end)) {
1637                 ret = cow_file_range(inode, locked_page, start, end, end,
1638                                       page_started, nr_written, 1, NULL);
1639         } else {
1640                 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1641                         &BTRFS_I(inode)->runtime_flags);
1642                 ret = cow_file_range_async(inode, locked_page, start, end,
1643                                            page_started, nr_written,
1644                                            write_flags);
1645         }
1646         if (ret)
1647                 btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
1648         return ret;
1649 }
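     /*
      * Editor's note -- the dispatch above, tabulated (editorial summary):
      *
      *   inode flags / state                     delalloc path
      *   --------------------------------------  ---------------------------
      *   NODATACOW, no forced COW                run_delalloc_nocow(force=1)
      *   PREALLOC, no forced COW                 run_delalloc_nocow(force=0)
      *   compression not wanted                  cow_file_range()
      *   otherwise (compression candidate)       cow_file_range_async()
      *
      * need_force_cow() overrides the first two rows while a range carries
      * EXTENT_DEFRAG, since defrag only makes progress if extents are
      * actually rewritten.
      */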
1650 
1651 static void btrfs_split_extent_hook(void *private_data,
1652                                     struct extent_state *orig, u64 split)
1653 {
1654         struct inode *inode = private_data;
1655         u64 size;
1656 
1657         /* not delalloc, ignore it */
1658         if (!(orig->state & EXTENT_DELALLOC))
1659                 return;
1660 
1661         size = orig->end - orig->start + 1;
1662         if (size > BTRFS_MAX_EXTENT_SIZE) {
1663                 u32 num_extents;
1664                 u64 new_size;
1665 
1666                 /*
1667                  * See the explanation in btrfs_merge_extent_hook, the same
1668                  * applies here, just in reverse.
1669                  */
1670                 new_size = orig->end - split + 1;
1671                 num_extents = count_max_extents(new_size);
1672                 new_size = split - orig->start;
1673                 num_extents += count_max_extents(new_size);
1674                 if (count_max_extents(size) >= num_extents)
1675                         return;
1676         }
1677 
1678         spin_lock(&BTRFS_I(inode)->lock);
1679         btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
1680         spin_unlock(&BTRFS_I(inode)->lock);
1681 }
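     /*
      * Editor's note -- a worked example for the split accounting above,
      * assuming BTRFS_MAX_EXTENT_SIZE == SZ_128M and count_max_extents(n)
      * == DIV_ROUND_UP(n, BTRFS_MAX_EXTENT_SIZE), which is what this era
      * of the code uses (editorial, not original source):
      *
      *   - Splitting a 64M delalloc extent (1 outstanding extent) always
      *     yields two pieces needing 1 each, so one extra outstanding
      *     extent is added below.
      *   - Splitting a 128M+4K extent (2 outstanding extents) into pieces
      *     of at most 128M each still needs 2 in total, so the early
      *     return fires and nothing is added.
      *   - Splitting that same extent into 128M+2K and 2K needs 2 + 1 = 3,
      *     so the early return is skipped and one extent is added.
      */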
1682 
1683 /*
1684  * extent_io.c merge_extent_hook, used to track merged delayed allocation
1685  * extents so we can keep track of new extents that are just merged onto old
1686  * extents, such as when we are doing sequential writes, so we can properly
1687  * account for the metadata space we'll need.
1688  */
1689 static void btrfs_merge_extent_hook(void *private_data,
1690                                     struct extent_state *new,
1691                                     struct extent_state *other)
1692 {
1693         struct inode *inode = private_data;
1694         u64 new_size, old_size;
1695         u32 num_extents;
1696 
1697         /* not delalloc, ignore it */
1698         if (!(other->state & EXTENT_DELALLOC))
1699                 return;
1700 
1701         if (new->start > other->start)
1702                 new_size = new->end - other->start + 1;
1703         else
1704                 new_size = other->end - new->start + 1;
1705 
1706         /* we're not bigger than the max, unreserve the space and go */
1707         if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
1708                 spin_lock(&BTRFS_I(inode)->lock);
1709                 btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
1710                 spin_unlock(&BTRFS_I(inode)->lock);
1711                 return;
1712         }
1713 
1714         /*
1715          * We have to add up either side to figure out how many extents were
1716          * accounted for before we merged into one big extent.  If the number of
1717          * extents we accounted for is <= the amount we need for the new range
1718          * then we can return, otherwise drop.  Think of it like this
1719          *
1720          * [ 4k][MAX_SIZE]
1721          *
1722          * So we've grown the extent by a MAX_SIZE extent, this would mean we
1723          * need 2 outstanding extents, on one side we have 1 and the other side
1724          * we have 1 so they are == and we can return.  But in this case
1725          *
1726          * [MAX_SIZE+4k][MAX_SIZE+4k]
1727          *
1728          * Each range on their own accounts for 2 extents, but merged together
1729          * they are only 3 extents worth of accounting, so we need to drop in
1730          * this case.
1731          */
1732         old_size = other->end - other->start + 1;
1733         num_extents = count_max_extents(old_size);
1734         old_size = new->end - new->start + 1;
1735         num_extents += count_max_extents(old_size);
1736         if (count_max_extents(new_size) >= num_extents)
1737                 return;
1738 
1739         spin_lock(&BTRFS_I(inode)->lock);
1740         btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
1741         spin_unlock(&BTRFS_I(inode)->lock);
1742 }
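     /*
      * Editor's note -- the comment above works in symbols; here is the
      * same arithmetic with concrete numbers, again assuming
      * BTRFS_MAX_EXTENT_SIZE == SZ_128M and count_max_extents(n) ==
      * DIV_ROUND_UP(n, SZ_128M) (editorial example):
      *
      *   [128M+4K][128M+4K]  ->  merged size 256M+8K
      *
      *   per-side accounting:  count_max_extents(128M+4K) = 2, twice -> 4
      *   merged requirement:   count_max_extents(256M+8K) = 3
      *
      * 3 < 4, so exactly one outstanding extent is dropped here, bringing
      * the account back to the 3 that the merged range needs.
      */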
1743 
1744 static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1745                                       struct inode *inode)
1746 {
1747         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1748 
1749         spin_lock(&root->delalloc_lock);
1750         if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1751                 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1752                               &root->delalloc_inodes);
1753                 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1754                         &BTRFS_I(inode)->runtime_flags);
1755                 root->nr_delalloc_inodes++;
1756                 if (root->nr_delalloc_inodes == 1) {
1757                         spin_lock(&fs_info->delalloc_root_lock);
1758                         BUG_ON(!list_empty(&root->delalloc_root));
1759                         list_add_tail(&root->delalloc_root,
1760                                       &fs_info->delalloc_roots);
1761                         spin_unlock(&fs_info->delalloc_root_lock);
1762                 }
1763         }
1764         spin_unlock(&root->delalloc_lock);
1765 }
1766 
1767 
1768 void __btrfs_del_delalloc_inode(struct btrfs_root *root,
1769                                 struct btrfs_inode *inode)
1770 {
1771         struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1772 
1773         if (!list_empty(&inode->delalloc_inodes)) {
1774                 list_del_init(&inode->delalloc_inodes);
1775                 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1776                           &inode->runtime_flags);
1777                 root->nr_delalloc_inodes--;
1778                 if (!root->nr_delalloc_inodes) {
1779                         spin_lock(&fs_info->delalloc_root_lock);
1780                         BUG_ON(list_empty(&root->delalloc_root));
1781                         list_del_init(&root->delalloc_root);
1782                         spin_unlock(&fs_info->delalloc_root_lock);
1783                 }
1784         }
1785 }
1786 
1787 static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1788                                      struct btrfs_inode *inode)
1789 {
1790         spin_lock(&root->delalloc_lock);
1791         __btrfs_del_delalloc_inode(root, inode);
1792         spin_unlock(&root->delalloc_lock);
1793 }
1794 
1795 /*
1796  * extent_io.c set_bit_hook, used to track delayed allocation
1797  * bytes in this file, and to maintain the list of inodes that
1798  * have pending delalloc work to be done.
1799  */
1800 static void btrfs_set_bit_hook(void *private_data,
1801                                struct extent_state *state, unsigned *bits)
1802 {
1803         struct inode *inode = private_data;
1804 
1805         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1806 
1807         if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
1808                 WARN_ON(1);
1809         /*
1810          * set_bit and clear bit hooks normally require _irqsave/restore
1811          * but in this case, we are only testing for the DELALLOC
1812          * bit, which is only set or cleared with irqs on
1813          */
1814         if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1815                 struct btrfs_root *root = BTRFS_I(inode)->root;
1816                 u64 len = state->end + 1 - state->start;
1817                 u32 num_extents = count_max_extents(len);
1818                 bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
1819 
1820                 spin_lock(&BTRFS_I(inode)->lock);
1821                 btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents);
1822                 spin_unlock(&BTRFS_I(inode)->lock);
1823 
1824                 /* For sanity tests */
1825                 if (btrfs_is_testing(fs_info))
1826                         return;
1827 
1828                 percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
1829                                          fs_info->delalloc_batch);
1830                 spin_lock(&BTRFS_I(inode)->lock);
1831                 BTRFS_I(inode)->delalloc_bytes += len;
1832                 if (*bits & EXTENT_DEFRAG)
1833                         BTRFS_I(inode)->defrag_bytes += len;
1834                 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1835                                          &BTRFS_I(inode)->runtime_flags))
1836                         btrfs_add_delalloc_inodes(root, inode);
1837                 spin_unlock(&BTRFS_I(inode)->lock);
1838         }
1839 
1840         if (!(state->state & EXTENT_DELALLOC_NEW) &&
1841             (*bits & EXTENT_DELALLOC_NEW)) {
1842                 spin_lock(&BTRFS_I(inode)->lock);
1843                 BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
1844                         state->start;
1845                 spin_unlock(&BTRFS_I(inode)->lock);
1846         }
1847 }
1848 
1849 /*
1850  * extent_io.c clear_bit_hook, see set_bit_hook for why
1851  */
1852 static void btrfs_clear_bit_hook(void *private_data,
1853                                  struct extent_state *state,
1854                                  unsigned *bits)
1855 {
1856         struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data);
1857         struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1858         u64 len = state->end + 1 - state->start;
1859         u32 num_extents = count_max_extents(len);
1860 
1861         if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
1862                 spin_lock(&inode->lock);
1863                 inode->defrag_bytes -= len;
1864                 spin_unlock(&inode->lock);
1865         }
1866 
1867         /*
1868          * set_bit and clear bit hooks normally require _irqsave/restore
1869          * but in this case, we are only testing for the DELALLOC
1870          * bit, which is only set or cleared with irqs on
1871          */
1872         if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1873                 struct btrfs_root *root = inode->root;
1874                 bool do_list = !btrfs_is_free_space_inode(inode);
1875 
1876                 spin_lock(&inode->lock);
1877                 btrfs_mod_outstanding_extents(inode, -num_extents);
1878                 spin_unlock(&inode->lock);
1879 
1880                 /*
1881                  * We don't reserve metadata space for space cache inodes so we
1882                  * don't need to call btrfs_delalloc_release_metadata if there is an
1883                  * error.
1884                  */
1885                 if (*bits & EXTENT_CLEAR_META_RESV &&
1886                     root != fs_info->tree_root)
1887                         btrfs_delalloc_release_metadata(inode, len);
1888 
1889                 /* For sanity tests. */
1890                 if (btrfs_is_testing(fs_info))
1891                         return;
1892 
1893                 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
1894                     do_list && !(state->state & EXTENT_NORESERVE) &&
1895                     (*bits & EXTENT_CLEAR_DATA_RESV))
1896                         btrfs_free_reserved_data_space_noquota(
1897                                         &inode->vfs_inode,
1898                                         state->start, len);
1899 
1900                 percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
1901                                          fs_info->delalloc_batch);
1902                 spin_lock(&inode->lock);
1903                 inode->delalloc_bytes -= len;
1904                 if (do_list && inode->delalloc_bytes == 0 &&
1905                     test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1906                                         &inode->runtime_flags))
1907                         btrfs_del_delalloc_inode(root, inode);
1908                 spin_unlock(&inode->lock);
1909         }
1910 
1911         if ((state->state & EXTENT_DELALLOC_NEW) &&
1912             (*bits & EXTENT_DELALLOC_NEW)) {
1913                 spin_lock(&inode->lock);
1914                 ASSERT(inode->new_delalloc_bytes >= len);
1915                 inode->new_delalloc_bytes -= len;
1916                 spin_unlock(&inode->lock);
1917         }
1918 }
1919 
1920 /*
1921  * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1922  * we don't create bios that span stripes or chunks
1923  *
1924  * return 1 if page cannot be merged to bio
1925  * return 0 if page can be merged to bio
1926  * return error otherwise
1927  */
1928 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1929                          size_t size, struct bio *bio,
1930                          unsigned long bio_flags)
1931 {
1932         struct inode *inode = page->mapping->host;
1933         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1934         u64 logical = (u64)bio->bi_iter.bi_sector << 9;
1935         u64 length = 0;
1936         u64 map_length;
1937         int ret;
1938 
1939         if (bio_flags & EXTENT_BIO_COMPRESSED)
1940                 return 0;
1941 
1942         length = bio->bi_iter.bi_size;
1943         map_length = length;
1944         ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
1945                               NULL, 0);
1946         if (ret < 0)
1947                 return ret;
1948         if (map_length < length + size)
1949                 return 1;
1950         return 0;
1951 }
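     /*
      * Editor's note -- a concrete illustration of the check above, with
      * made-up numbers (editorial).  Suppose the bio starts at a 64K RAID0
      * stripe boundary, so btrfs_map_block() reports map_length == 64K,
      * and the bio currently covers 60K (length == 60K).  Merging one more
      * 4K page gives length + size == 64K; map_length < 64K is false, so
      * the page is merged (return 0).  A further 4K page would make
      * length + size == 68K > map_length, and the hook returns 1, forcing
      * a new bio at the stripe boundary.
      */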
1952 
1953 /*
1954  * in order to insert checksums into the metadata in large chunks,
1955  * we wait until bio submission time.   All the pages in the bio are
1956  * checksummed and sums are attached onto the ordered extent record.
1957  *
1958  * At IO completion time the csums attached to the ordered extent record
1959  * are inserted into the btree
1960  */
1961 static blk_status_t __btrfs_submit_bio_start(void *private_data, struct bio *bio,
1962                                     int mirror_num, unsigned long bio_flags,
1963                                     u64 bio_offset)
1964 {
1965         struct inode *inode = private_data;
1966         blk_status_t ret = 0;
1967 
1968         ret = btrfs_csum_one_bio(inode, bio, 0, 0);
1969         BUG_ON(ret); /* -ENOMEM */
1970         return 0;
1971 }
1972 
1973 /*
1974  * in order to insert checksums into the metadata in large chunks,
1975  * we wait until bio submission time.   All the pages in the bio are
1976  * checksummed and sums are attached onto the ordered extent record.
1977  *
1978  * At IO completion time the csums attached to the ordered extent record
1979  * are inserted into the btree
1980  */
1981 static blk_status_t __btrfs_submit_bio_done(void *private_data, struct bio *bio,
1982                           int mirror_num, unsigned long bio_flags,
1983                           u64 bio_offset)
1984 {
1985         struct inode *inode = private_data;
1986         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1987         blk_status_t ret;
1988 
1989         ret = btrfs_map_bio(fs_info, bio, mirror_num, 1);
1990         if (ret) {
1991                 bio->bi_status = ret;
1992                 bio_endio(bio);
1993         }
1994         return ret;
1995 }
1996 
1997 /*
1998  * extent_io.c submission hook. This does the right thing for csum calculation
1999  * on write, or reading the csums from the tree before a read.
2000  *
2001  * Rules about async/sync submit,
2002  * a) read:                             sync submit
2003  *
2004  * b) write without checksum:           sync submit
2005  *
2006  * c) write with checksum:
2007  *    c-1) if bio is issued by fsync:   sync submit
2008  *         (sync_writers != 0)
2009  *
2010  *    c-2) if root is reloc root:       sync submit
2011  *         (only in case of buffered IO)
2012  *
2013  *    c-3) otherwise:                   async submit
2014  */
2015 static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio,
2016                                  int mirror_num, unsigned long bio_flags,
2017                                  u64 bio_offset)
2018 {
2019         struct inode *inode = private_data;
2020         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2021         struct btrfs_root *root = BTRFS_I(inode)->root;
2022         enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
2023         blk_status_t ret = 0;
2024         int skip_sum;
2025         int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
2026 
2027         skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
2028 
2029         if (btrfs_is_free_space_inode(BTRFS_I(inode)))
2030                 metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
2031 
2032         if (bio_op(bio) != REQ_OP_WRITE) {
2033                 ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
2034                 if (ret)
2035                         goto out;
2036 
2037                 if (bio_flags & EXTENT_BIO_COMPRESSED) {
2038                         ret = btrfs_submit_compressed_read(inode, bio,
2039                                                            mirror_num,
2040                                                            bio_flags);
2041                         goto out;
2042                 } else if (!skip_sum) {
2043                         ret = btrfs_lookup_bio_sums(inode, bio, NULL);
2044                         if (ret)
2045                                 goto out;
2046                 }
2047                 goto mapit;
2048         } else if (async && !skip_sum) {
2049                 /* csum items have already been cloned */
2050                 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2051                         goto mapit;
2052                 /* we're doing a write, do the async checksumming */
2053                 ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
2054                                           bio_offset, inode,
2055                                           __btrfs_submit_bio_start,
2056                                           __btrfs_submit_bio_done);
2057                 goto out;
2058         } else if (!skip_sum) {
2059                 ret = btrfs_csum_one_bio(inode, bio, 0, 0);
2060                 if (ret)
2061                         goto out;
2062         }
2063 
2064 mapit:
2065         ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
2066 
2067 out:
2068         if (ret) {
2069                 bio->bi_status = ret;
2070                 bio_endio(bio);
2071         }
2072         return ret;
2073 }
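     /*
      * Editor's note -- tracing the rules from the comment above through
      * the code (editorial summary):
      *
      *   READ:  queue end_io to a workqueue (btrfs_bio_wq_end_io), then
      *          hand compressed bios to btrfs_submit_compressed_read(),
      *          look up csums for ordinary reads, or go straight to mapit
      *          for NODATASUM inodes.
      *   WRITE: async && !skip_sum -> btrfs_wq_submit_bio(), which runs
      *          __btrfs_submit_bio_start() (csum) and
      *          __btrfs_submit_bio_done() (map) off the submitting thread;
      *          reloc-root writes skip to mapit since their csums were
      *          already cloned (rule c-2);
      *          !skip_sum          -> csum inline, then mapit;
      *          skip_sum           -> mapit directly.
      *
      * "async" is just !atomic_read(&sync_writers); fsync bumps
      * sync_writers, which is how rule c-1 (sync submit for fsync) falls
      * out.
      */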
2074 
2075 /*
2076  * given a list of ordered sums record them in the inode.  This happens
2077  * at IO completion time based on sums calculated at bio submission time.
2078  */
2079 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
2080                              struct inode *inode, struct list_head *list)
2081 {
2082         struct btrfs_ordered_sum *sum;
2083         int ret;
2084 
2085         list_for_each_entry(sum, list, list) {
2086                 trans->adding_csums = true;
2087                 ret = btrfs_csum_file_blocks(trans,
2088                        BTRFS_I(inode)->root->fs_info->csum_root, sum);
2089                 trans->adding_csums = false;
2090                 if (ret)
2091                         return ret;
2092         }
2093         return 0;
2094 }
2095 
2096 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2097                               unsigned int extra_bits,
2098                               struct extent_state **cached_state, int dedupe)
2099 {
2100         WARN_ON((end & (PAGE_SIZE - 1)) == 0);
2101         return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
2102                                    extra_bits, cached_state);
2103 }
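     /*
      * Editor's note on the WARN_ON above (editorial): "end" is an
      * inclusive offset, so a page-aligned range must end on the last byte
      * of a page.  With 4K pages, end == 4095 gives
      * (end & (PAGE_SIZE - 1)) == 4095 and passes; end == 4096 would give
      * 0 and trip the warning, because it names the first byte of the
      * next page rather than the last byte of this one.
      */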
2104 
2105 /* see btrfs_writepage_start_hook for details on why this is required */
2106 struct btrfs_writepage_fixup {
2107         struct page *page;
2108         struct btrfs_work work;
2109 };
2110 
2111 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2112 {
2113         struct btrfs_writepage_fixup *fixup;
2114         struct btrfs_ordered_extent *ordered;
2115         struct extent_state *cached_state = NULL;
2116         struct extent_changeset *data_reserved = NULL;
2117         struct page *page;
2118         struct inode *inode;
2119         u64 page_start;
2120         u64 page_end;
2121         int ret;
2122 
2123         fixup = container_of(work, struct btrfs_writepage_fixup, work);
2124         page = fixup->page;
2125 again:
2126         lock_page(page);
2127         if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2128                 ClearPageChecked(page);
2129                 goto out_page;
2130         }
2131 
2132         inode = page->mapping->host;
2133         page_start = page_offset(page);
2134         page_end = page_offset(page) + PAGE_SIZE - 1;
2135 
2136         lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
2137                          &cached_state);
2138 
2139         /* already ordered? We're done */
2140         if (PagePrivate2(page))
2141                 goto out;
2142 
2143         ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
2144                                         PAGE_SIZE);
2145         if (ordered) {
2146                 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
2147                                      page_end, &cached_state);
2148                 unlock_page(page);
2149                 btrfs_start_ordered_extent(inode, ordered, 1);
2150                 btrfs_put_ordered_extent(ordered);
2151                 goto again;
2152         }
2153 
2154         ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2155                                            PAGE_SIZE);
2156         if (ret) {
2157                 mapping_set_error(page->mapping, ret);
2158                 end_extent_writepage(page, ret, page_start, page_end);
2159                 ClearPageChecked(page);
2160                 goto out;
2161         }
2162 
2163         ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
2164                                         &cached_state, 0);
2165         if (ret) {
2166                 mapping_set_error(page->mapping, ret);
2167                 end_extent_writepage(page, ret, page_start, page_end);
2168                 ClearPageChecked(page);
2169                 goto out;
2170         }
2171 
2172         ClearPageChecked(page);
2173         set_page_dirty(page);
2174         btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
2175 out:
2176         unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
2177                              &cached_state);
2178 out_page:
2179         unlock_page(page);
2180         put_page(page);
2181         kfree(fixup);
2182         extent_changeset_free(data_reserved);
2183 }
2184 
2185 /*
2186  * There are a few paths in the higher layers of the kernel that directly
2187  * set the page dirty bit without asking the filesystem if it is a
2188  * good idea.  This causes problems because we want to make sure COW
2189  * properly happens and the data=ordered rules are followed.
2190  *
2191  * In our case any range that doesn't have the ORDERED bit set
2192  * hasn't been properly set up for IO.  We kick off an async process
2193  * to fix it up.  The async helper will wait for ordered extents, set
2194  * the delalloc bit and make it safe to write the page.
2195  */
2196 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
2197 {
2198         struct inode *inode = page->mapping->host;
2199         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2200         struct btrfs_writepage_fixup *fixup;
2201 
2202         /* this page is properly in the ordered list */
2203         if (TestClearPagePrivate2(page))
2204                 return 0;
2205 
2206         if (PageChecked(page))
2207                 return -EAGAIN;
2208 
2209         fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2210         if (!fixup)
2211                 return -EAGAIN;
2212 
2213         SetPageChecked(page);
2214         get_page(page);
2215         btrfs_init_work(&fixup->work, btrfs_fixup_helper,
2216                         btrfs_writepage_fixup_worker, NULL, NULL);
2217         fixup->page = page;
2218         btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2219         return -EBUSY;
2220 }
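     /*
      * Editor's note (editorial summary): the fixup path is two-stage.
      * btrfs_writepage_start_hook() runs during writepage: if Private2 is
      * set the page is already covered by an ordered extent and writeback
      * proceeds (return 0).  Otherwise it marks the page Checked, grabs a
      * page reference, queues btrfs_writepage_fixup_worker() and returns
      * -EBUSY so the caller backs off.  The worker then re-locks the page,
      * waits out any ordered extent, reserves delalloc space and
      * re-dirties the page so a later writeback pass can handle it
      * normally.
      */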
2221 
2222 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2223                                        struct inode *inode, u64 file_pos,
2224                                        u64 disk_bytenr, u64 disk_num_bytes,
2225                                        u64 num_bytes, u64 ram_bytes,
2226                                        u8 compression, u8 encryption,
2227                                        u16 other_encoding, int extent_type)
2228 {
2229         struct btrfs_root *root = BTRFS_I(inode)->root;
2230         struct btrfs_file_extent_item *fi;
2231         struct btrfs_path *path;
2232         struct extent_buffer *leaf;
2233         struct btrfs_key ins;
2234         u64 qg_released;
2235         int extent_inserted = 0;
2236         int ret;
2237 
2238         path = btrfs_alloc_path();
2239         if (!path)
2240                 return -ENOMEM;
2241 
2242         /*
2243          * we may be replacing one extent in the tree with another.
2244          * The new extent is pinned in the extent map, and we don't want
2245          * to drop it from the cache until it is completely in the btree.
2246          *
2247          * So, tell btrfs_drop_extents to leave this extent in the cache.
2248          * The caller is expected to unpin it and allow it to be merged
2249          * with the others.
2250          */
2251         ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
2252                                    file_pos + num_bytes, NULL, 0,
2253                                    1, sizeof(*fi), &extent_inserted);
2254         if (ret)
2255                 goto out;
2256 
2257         if (!extent_inserted) {
2258                 ins.objectid = btrfs_ino(BTRFS_I(inode));
2259                 ins.offset = file_pos;
2260                 ins.type = BTRFS_EXTENT_DATA_KEY;
2261 
2262                 path->leave_spinning = 1;
2263                 ret = btrfs_insert_empty_item(trans, root, path, &ins,
2264                                               sizeof(*fi));
2265                 if (ret)
2266                         goto out;
2267         }
2268         leaf = path->nodes[0];
2269         fi = btrfs_item_ptr(leaf, path->slots[0],
2270                             struct btrfs_file_extent_item);
2271         btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2272         btrfs_set_file_extent_type(leaf, fi, extent_type);
2273         btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
2274         btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
2275         btrfs_set_file_extent_offset(leaf, fi, 0);
2276         btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2277         btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
2278         btrfs_set_file_extent_compression(leaf, fi, compression);
2279         btrfs_set_file_extent_encryption(leaf, fi, encryption);
2280         btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
2281 
2282         btrfs_mark_buffer_dirty(leaf);
2283         btrfs_release_path(path);
2284 
2285         inode_add_bytes(inode, num_bytes);
2286 
2287         ins.objectid = disk_bytenr;
2288         ins.offset = disk_num_bytes;
2289         ins.type = BTRFS_EXTENT_ITEM_KEY;
2290 
2291         /*
2292          * Release the reserved range from inode dirty range map, as it is
2293          * already moved into delayed_ref_head
2294          */
2295         ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
2296         if (ret < 0)
2297                 goto out;
2298         qg_released = ret;
2299         ret = btrfs_alloc_reserved_file_extent(trans, root,
2300                                                btrfs_ino(BTRFS_I(inode)),
2301                                                file_pos, qg_released, &ins);
2302 out:
2303         btrfs_free_path(path);
2304 
2305         return ret;
2306 }
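     /*
      * Editor's note (editorial summary): insert_reserved_file_extent()
      * is three steps.  (1) __btrfs_drop_extents() removes whatever file
      * extent items overlap [file_pos, file_pos + num_bytes), possibly
      * leaving a slot the new item can reuse (extent_inserted).  (2) The
      * new btrfs_file_extent_item is filled in and the leaf marked dirty.
      * (3) The qgroup reservation for the range is released and
      * btrfs_alloc_reserved_file_extent() adds the delayed ref that pins
      * down ownership of the newly written extent.
      */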
2307 
2308 /* snapshot-aware defrag */
2309 struct sa_defrag_extent_backref {
2310         struct rb_node node;
2311         struct old_sa_defrag_extent *old;
2312         u64 root_id;
2313         u64 inum;
2314         u64 file_pos;
2315         u64 extent_offset;
2316         u64 num_bytes;
2317         u64 generation;
2318 };
2319 
2320 struct old_sa_defrag_extent {
2321         struct list_head list;
2322         struct new_sa_defrag_extent *new;
2323 
2324         u64 extent_offset;
2325         u64 bytenr;
2326         u64 offset;
2327         u64 len;
2328         int count;
2329 };
2330 
2331 struct new_sa_defrag_extent {
2332         struct rb_root root;
2333         struct list_head head;
2334         struct btrfs_path *path;
2335         struct inode *inode;
2336         u64 file_pos;
2337         u64 len;
2338         u64 bytenr;
2339         u64 disk_len;
2340         u8 compress_type;
2341 };
2342 
2343 static int backref_comp(struct sa_defrag_extent_backref *b1,
2344                         struct sa_defrag_extent_backref *b2)
2345 {
2346         if (b1->root_id < b2->root_id)
2347                 return -1;
2348         else if (b1->root_id > b2->root_id)
2349                 return 1;
2350 
2351         if (b1->inum < b2->inum)
2352                 return -1;
2353         else if (b1->inum > b2->inum)
2354                 return 1;
2355 
2356         if (b1->file_pos < b2->file_pos)
2357                 return -1;
2358         else if (b1->file_pos > b2->file_pos)
2359                 return 1;
2360 
2361         /*
2362          * [------------------------------] ===> (a range of space)
2363          *     |<--->|   |<---->| =============> (fs/file tree A)
2364          * |<---------------------------->| ===> (fs/file tree B)
2365          *
2366          * A range of space can refer to two file extents in one tree while
2367          * referring to only one file extent in another tree.
2368          *
2369          * So we may process a disk offset more than once (two extents in A)
2370          * and land on the same extent (one extent in B), then insert two
2371          * identical backrefs (both referring to the extent in B).
2372          */
2373         return 0;
2374 }
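     /*
      * Editor's note (editorial): backref_comp() imposes a lexicographic
      * order on (root_id, inum, file_pos).  With made-up values:
      *
      *   {root_id=5, inum=257, file_pos=0} < {root_id=5, inum=257, file_pos=4096}
      *   {root_id=5, inum=258, file_pos=0} > {root_id=5, inum=257, file_pos=4096}
      *
      * Equal keys compare as 0, and backref_insert() below deliberately
      * tolerates such duplicates by always descending (there is no
      * equality bail-out), for the reason drawn in the comment above.
      */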
2375 
2376 static void backref_insert(struct rb_root *root,
2377                            struct sa_defrag_extent_backref *backref)
2378 {
2379         struct rb_node **p = &root->rb_node;
2380         struct rb_node *parent = NULL;
2381         struct sa_defrag_extent_backref *entry;
2382         int ret;
2383 
2384         while (*p) {
2385                 parent = *p;
2386                 entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2387 
2388                 ret = backref_comp(backref, entry);
2389                 if (ret < 0)
2390                         p = &(*p)->rb_left;
2391                 else
2392                         p = &(*p)->rb_right;
2393         }
2394 
2395         rb_link_node(&backref->node, parent, p);
2396         rb_insert_color(&backref->node, root);
2397 }
2398 
2399 /*
2400  * Note the backref might have changed, and in this case we just return 0.
2401  */
2402 static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2403                                        void *ctx)
2404 {
2405         struct btrfs_file_extent_item *extent;
2406         struct old_sa_defrag_extent *old = ctx;
2407         struct new_sa_defrag_extent *new = old->new;
2408         struct btrfs_path *path = new->path;
2409         struct btrfs_key key;
2410         struct btrfs_root *root;
2411         struct sa_defrag_extent_backref *backref;
2412         struct extent_buffer *leaf;
2413         struct inode *inode = new->inode;
2414         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2415         int slot;
2416         int ret;
2417         u64 extent_offset;
2418         u64 num_bytes;
2419 
2420         if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2421             inum == btrfs_ino(BTRFS_I(inode)))
2422                 return 0;
2423 
2424         key.objectid = root_id;
2425         key.type = BTRFS_ROOT_ITEM_KEY;
2426         key.offset = (u64)-1;
2427 
2428         root = btrfs_read_fs_root_no_name(fs_info, &key);
2429         if (IS_ERR(root)) {
2430                 if (PTR_ERR(root) == -ENOENT)
2431                         return 0;
2432                 WARN_ON(1);
2433                 btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu",
2434                          inum, offset, root_id);
2435                 return PTR_ERR(root);
2436         }
2437 
2438         key.objectid = inum;
2439         key.type = BTRFS_EXTENT_DATA_KEY;
2440         if (offset > (u64)-1 << 32)
2441                 key.offset = 0;
2442         else
2443                 key.offset = offset;
2444 
2445         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2446         if (WARN_ON(ret < 0))
2447                 return ret;
2448         ret = 0;
2449 
2450         while (1) {
2451                 cond_resched();
2452 
2453                 leaf = path->nodes[0];
2454                 slot = path->slots[0];
2455 
2456                 if (slot >= btrfs_header_nritems(leaf)) {
2457                         ret = btrfs_next_leaf(root, path);
2458                         if (ret < 0) {
2459                                 goto out;
2460                         } else if (ret > 0) {
2461                                 ret = 0;
2462                                 goto out;
2463                         }
2464                         continue;
2465                 }
2466 
2467                 path->slots[0]++;
2468 
2469                 btrfs_item_key_to_cpu(leaf, &key, slot);
2470 
2471                 if (key.objectid > inum)
2472                         goto out;
2473 
2474                 if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2475                         continue;
2476 
2477                 extent = btrfs_item_ptr(leaf, slot,
2478                                         struct btrfs_file_extent_item);
2479 
2480                 if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2481                         continue;
2482 
2483                 /*
2484                  * 'offset' refers to the exact key.offset,
2485                  * NOT the 'offset' field in btrfs_extent_data_ref, i.e.
2486                  * (key.offset - extent_offset).
2487                  */
2488                 if (key.offset != offset)
2489                         continue;
2490 
2491                 extent_offset = btrfs_file_extent_offset(leaf, extent);
2492                 num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2493 
2494                 if (extent_offset >= old->extent_offset + old->offset +
2495                     old->len || extent_offset + num_bytes <=
2496                     old->extent_offset + old->offset)
2497                         continue;
2498                 break;
2499         }
2500 
2501         backref = kmalloc(sizeof(*backref), GFP_NOFS);
2502         if (!backref) {
2503                 ret = -ENOENT;
2504                 goto out;
2505         }
2506 
2507         backref->root_id = root_id;
2508         backref->inum = inum;
2509         backref->file_pos = offset;
2510         backref->num_bytes = num_bytes;
2511         backref->extent_offset = extent_offset;
2512         backref->generation = btrfs_file_extent_generation(leaf, extent);
2513         backref->old = old;
2514         backref_insert(&new->root, backref);
2515         old->count++;
2516 out:
2517         btrfs_release_path(path);
2518         WARN_ON(ret);
2519         return ret;
2520 }
2521 
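     /*
      * Walk the backrefs of every old extent via the backref walker and
      * record each file extent that still references it.  Old extents with
      * no backrefs left are dropped.  Returns true if there is anything
      * left to relink, false on error or when nothing remains.
      */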
2522 static noinline bool record_extent_backrefs(struct btrfs_path *path,
2523                                    struct new_sa_defrag_extent *new)
2524 {
2525         struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2526         struct old_sa_defrag_extent *old, *tmp;
2527         int ret;
2528 
2529         new->path = path;
2530 
2531         list_for_each_entry_safe(old, tmp, &new->head, list) {
2532                 ret = iterate_inodes_from_logical(old->bytenr +
2533                                                   old->extent_offset, fs_info,
2534                                                   path, record_one_backref,
2535                                                   old, false);
2536                 if (ret < 0 && ret != -ENOENT)
2537                         return false;
2538 
2539                 /* no backref to be processed for this extent */
2540                 if (!old->count) {
2541                         list_del(&old->list);
2542                         kfree(old);
2543                 }
2544         }
2545 
2546         if (list_empty(&new->head))
2547                 return false;
2548 
2549         return true;
2550 }
2551 
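     /*
      * A file extent can only be merged with the defragged extent if it
      * points at the same disk bytenr, is a regular extent, uses the same
      * compression, and has no encryption or other encoding.
      */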
2552 static int relink_is_mergable(struct extent_buffer *leaf,
2553                               struct btrfs_file_extent_item *fi,
2554                               struct new_sa_defrag_extent *new)
2555 {
2556         if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2557                 return 0;
2558 
2559         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2560                 return 0;
2561 
2562         if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2563                 return 0;
2564 
2565         if (btrfs_file_extent_encryption(leaf, fi) ||
2566             btrfs_file_extent_other_encoding(leaf, fi))
2567                 return 0;
2568 
2569         return 1;
2570 }
2571 
2572 /*
2573  * Note the backref might have changed, and in this case we just return 0.
2574  */
2575 static noinline int relink_extent_backref(struct btrfs_path *path,
2576                                  struct sa_defrag_extent_backref *prev,
2577                                  struct sa_defrag_extent_backref *backref)
2578 {
2579         struct btrfs_file_extent_item *extent;
2580         struct btrfs_file_extent_item *item;
2581         struct btrfs_ordered_extent *ordered;
2582         struct btrfs_trans_handle *trans;
2583         struct btrfs_root *root;
2584         struct btrfs_key key;
2585         struct extent_buffer *leaf;
2586         struct old_sa_defrag_extent *old = backref->old;
2587         struct new_sa_defrag_extent *new = old->new;
2588         struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2589         struct inode *inode;
2590         struct extent_state *cached = NULL;
2591         int ret = 0;
2592         u64 start;
2593         u64 len;
2594         u64 lock_start;
2595         u64 lock_end;
2596         bool merge = false;
2597         int index;
2598 
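             /*
              * Backrefs are processed in sorted order; if the previous one
              * is contiguous with this one in the same file, the resulting
              * file extent items can be merged.
              */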
2599         if (prev && prev->root_id == backref->root_id &&
2600             prev->inum == backref->inum &&
2601             prev->file_pos + prev->num_bytes == backref->file_pos)
2602                 merge = true;
2603 
2604         /* step 1: get root */
2605         key.objectid = backref->root_id;
2606         key.type = BTRFS_ROOT_ITEM_KEY;
2607         key.offset = (u64)-1;
2608 
2609         index = srcu_read_lock(&fs_info->subvol_srcu);
2610 
2611         root = btrfs_read_fs_root_no_name(fs_info, &key);
2612         if (IS_ERR(root)) {
2613                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2614                 if (PTR_ERR(root) == -ENOENT)
2615                         return 0;
2616                 return PTR_ERR(root);
2617         }
2618 
2619         if (btrfs_root_readonly(root)) {
2620                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2621                 return 0;
2622         }
2623 
2624         /* step 2: get inode */
2625         key.objectid = backref->inum;
2626         key.type = BTRFS_INODE_ITEM_KEY;
2627         key.offset = 0;
2628 
2629         inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2630         if (IS_ERR(inode)) {
2631                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2632                 return 0;
2633         }
2634 
2635         srcu_read_unlock(&fs_info->subvol_srcu, index);
2636 
2637         /* step 3: relink backref */
2638         lock_start = backref->file_pos;
2639         lock_end = backref->file_pos + backref->num_bytes - 1;
2640         lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2641                          &cached);
2642 
2643         ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2644         if (ordered) {
2645                 btrfs_put_ordered_extent(ordered);
2646                 goto out_unlock;
2647         }
2648 
2649         trans = btrfs_join_transaction(root);
2650         if (IS_ERR(trans)) {
2651                 ret = PTR_ERR(trans);
2652                 goto out_unlock;
2653         }
2654 
2655         key.objectid = backref->inum;
2656         key.type = BTRFS_EXTENT_DATA_KEY;
2657         key.offset = backref->file_pos;
2658 
2659         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2660         if (ret < 0) {
2661                 goto out_free_path;
2662         } else if (ret > 0) {
2663                 ret = 0;
2664                 goto out_free_path;
2665         }
2666 
2667         extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2668                                 struct btrfs_file_extent_item);
2669 
2670         if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2671             backref->generation)
2672                 goto out_free_path;
2673 
2674         btrfs_release_path(path);
2675 
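             /*
              * Compute the file range to relink: the intersection of this
              * backref's extent with the old extent's byte range.
              */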
2676         start = backref->file_pos;
2677         if (backref->extent_offset < old->extent_offset + old->offset)
2678                 start += old->extent_offset + old->offset -
2679                          backref->extent_offset;
2680 
2681         len = min(backref->extent_offset + backref->num_bytes,
2682                   old->extent_offset + old->offset + old->len);
2683         len -= max(backref->extent_offset, old->extent_offset + old->offset);
2684 
2685         ret = btrfs_drop_extents(trans, root, inode, start,
2686                                  start + len, 1);
2687         if (ret)
2688                 goto out_free_path;
2689 again:
2690         key.objectid = btrfs_ino(BTRFS_I(inode));
2691         key.type = BTRFS_EXTENT_DATA_KEY;
2692         key.offset = start;
2693 
2694         path->leave_spinning = 1;
2695         if (merge) {
2696                 struct btrfs_file_extent_item *fi;
2697                 u64 extent_len;
2698                 struct btrfs_key found_key;
2699 
2700                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2701                 if (ret < 0)
2702                         goto out_free_path;
2703 
2704                 path->slots[0]--;
2705                 leaf = path->nodes[0];
2706                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2707 
2708                 fi = btrfs_item_ptr(leaf, path->slots[0],
2709                                     struct btrfs_file_extent_item);
2710                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2711 
2712                 if (extent_len + found_key.offset == start &&
2713                     relink_is_mergable(leaf, fi, new)) {
2714                         btrfs_set_file_extent_num_bytes(leaf, fi,
2715                                                         extent_len + len);
2716                         btrfs_mark_buffer_dirty(leaf);
2717                         inode_add_bytes(inode, len);
2718 
2719                         ret = 1;
2720                         goto out_free_path;
2721                 } else {
2722                         merge = false;
2723                         btrfs_release_path(path);
2724                         goto again;
2725                 }
2726         }
2727 
2728         ret = btrfs_insert_empty_item(trans, root, path, &key,
2729                                         sizeof(*extent));
2730         if (ret) {
2731                 btrfs_abort_transaction(trans, ret);
2732                 goto out_free_path;
2733         }
2734 
2735         leaf = path->nodes[0];
2736         item = btrfs_item_ptr(leaf, path->slots[0],
2737                                 struct btrfs_file_extent_item);
2738         btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2739         btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2740         btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2741         btrfs_set_file_extent_num_bytes(leaf, item, len);
2742         btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2743         btrfs_set_file_extent_generation(leaf, item, trans->transid);
2744         btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2745         btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2746         btrfs_set_file_extent_encryption(leaf, item, 0);
2747         btrfs_set_file_extent_other_encoding(leaf, item, 0);
2748 
2749         btrfs_mark_buffer_dirty(leaf);
2750         inode_add_bytes(inode, len);
2751         btrfs_release_path(path);
2752 
2753         ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2754                         new->disk_len, 0,
2755                         backref->root_id, backref->inum,
2756                         new->file_pos); /* start - extent_offset */
2757         if (ret) {
2758                 btrfs_abort_transaction(trans, ret);
2759                 goto out_free_path;
2760         }
2761 
2762         ret = 1;
2763 out_free_path:
2764         btrfs_release_path(path);
2765         path->leave_spinning = 0;
2766         btrfs_end_transaction(trans);
2767 out_unlock:
2768         unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2769                              &cached);
2770         iput(inode);
2771         return ret;
2772 }
2773 
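     /* Free a snapshot-aware defrag record and all of its old extents. */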
2774 static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2775 {
2776         struct old_sa_defrag_extent *old, *tmp;
2777 
2778         if (!new)
2779                 return;
2780 
2781         list_for_each_entry_safe(old, tmp, &new->head, list) {
2782                 kfree(old);
2783         }
2784         kfree(new);
2785 }
2786 
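     /*
      * Walk the recorded backrefs in sorted order and relink each of them
      * to the new (defragged) extent, merging consecutive backrefs of the
      * same file where possible, then drop the defrag_running count.
      */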
2787 static void relink_file_extents(struct new_sa_defrag_extent *new)
2788 {
2789         struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2790         struct btrfs_path *path;
2791         struct sa_defrag_extent_backref *backref;
2792         struct sa_defrag_extent_backref *prev = NULL;
2793         struct inode *inode;
2794         struct btrfs_root *root;
2795         struct rb_node *node;
2796         int ret;
2797 
2798         inode = new->inode;
2799         root = BTRFS_I(inode)->root;
2800 
2801         path = btrfs_alloc_path();
2802         if (!path)
2803                 return;
2804 
2805         if (!record_extent_backrefs(path, new)) {
2806                 btrfs_free_path(path);
2807                 goto out;
2808         }
2809         btrfs_release_path(path);
2810 
2811         while (1) {
2812                 node = rb_first(&new->root);
2813                 if (!node)
2814                         break;
2815                 rb_erase(node, &new->root);
2816 
2817                 backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2818 
2819                 ret = relink_extent_backref(path, prev, backref);
2820                 WARN_ON(ret < 0);
2821 
2822                 kfree(prev);
2823 
2824                 if (ret == 1)
2825                         prev = backref;
2826                 else
2827                         prev = NULL;
2828                 cond_resched();
2829         }
2830         kfree(prev);
2831 
2832         btrfs_free_path(path);
2833 out:
2834         free_sa_defrag_extent(new);
2835 
2836         atomic_dec(&fs_info->defrag_running);
2837         wake_up(&fs_info->transaction_wait);
2838 }
2839 
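     /*
      * For snapshot-aware defrag: record every pre-existing file extent
      * overlapping the range covered by @ordered so it can later be
      * relinked to the new extent.  Returns NULL on failure.
      */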
2840 static struct new_sa_defrag_extent *
2841 record_old_file_extents(struct inode *inode,
2842                         struct btrfs_ordered_extent *ordered)
2843 {
2844         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2845         struct btrfs_root *root = BTRFS_I(inode)->root;
2846         struct btrfs_path *path;
2847         struct btrfs_key key;
2848         struct old_sa_defrag_extent *old;
2849         struct new_sa_defrag_extent *new;
2850         int ret;
2851 
2852         new = kmalloc(sizeof(*new), GFP_NOFS);
2853         if (!new)
2854                 return NULL;
2855 
2856         new->inode = inode;
2857         new->file_pos = ordered->file_offset;
2858         new->len = ordered->len;
2859         new->bytenr = ordered->start;
2860         new->disk_len = ordered->disk_len;
2861         new->compress_type = ordered->compress_type;
2862         new->root = RB_ROOT;
2863         INIT_LIST_HEAD(&new->head);
2864 
2865         path = btrfs_alloc_path();
2866         if (!path)
2867                 goto out_kfree;
2868 
2869         key.objectid = btrfs_ino(BTRFS_I(inode));
2870         key.type = BTRFS_EXTENT_DATA_KEY;
2871         key.offset = new->file_pos;
2872 
2873         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2874         if (ret < 0)
2875                 goto out_free_path;
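             /* the previous item may still overlap the start of our range */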
2876         if (ret > 0 && path->slots[0] > 0)
2877                 path->slots[0]--;
2878 
2879         /* find out all the old extents for the file range */
2880         while (1) {
2881                 struct btrfs_file_extent_item *extent;
2882                 struct extent_buffer *l;
2883                 int slot;
2884                 u64 num_bytes;
2885                 u64 offset;
2886                 u64 end;
2887                 u64 disk_bytenr;
2888                 u64 extent_offset;
2889 
2890                 l = path->nodes[0];
2891                 slot = path->slots[0];
2892 
2893                 if (slot >= btrfs_header_nritems(l)) {
2894                         ret = btrfs_next_leaf(root, path);
2895                         if (ret < 0)
2896                                 goto out_free_path;
2897                         else if (ret > 0)
2898                                 break;
2899                         continue;
2900                 }
2901 
2902                 btrfs_item_key_to_cpu(l, &key, slot);
2903 
2904                 if (key.objectid != btrfs_ino(BTRFS_I(inode)))
2905                         break;
2906                 if (key.type != BTRFS_EXTENT_DATA_KEY)
2907                         break;
2908                 if (key.offset >= new->file_pos + new->len)
2909                         break;
2910 
2911                 extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2912 
2913                 num_bytes = btrfs_file_extent_num_bytes(l, extent);
2914                 if (key.offset + num_bytes < new->file_pos)
2915                         goto next;
2916 
2917                 disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2918                 if (!disk_bytenr)
2919                         goto next;
2920 
2921                 extent_offset = btrfs_file_extent_offset(l, extent);
2922 
2923                 old = kmalloc(sizeof(*old), GFP_NOFS);
2924                 if (!old)
2925                         goto out_free_path;
2926 
2927                 offset = max(new->file_pos, key.offset);
2928                 end = min(new->file_pos + new->len, key.offset + num_bytes);
2929 
2930                 old->bytenr = disk_bytenr;
2931                 old->extent_offset = extent_offset;
2932                 old->offset = offset - key.offset;
2933                 old->len = end - offset;
2934                 old->new = new;
2935                 old->count = 0;
2936                 list_add_tail(&old->list, &new->head);
2937 next:
2938                 path->slots[0]++;
2939                 cond_resched();
2940         }
2941 
2942         btrfs_free_path(path);
2943         atomic_inc(&fs_info->defrag_running);
2944 
2945         return new;
2946 
2947 out_free_path:
2948         btrfs_free_path(path);
2949 out_kfree:
2950         free_sa_defrag_extent(new);
2951         return NULL;
2952 }
2953 
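     /*
      * The ordered extent has been written out, so drop its length from
      * the owning block group's delalloc_bytes counter.
      */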
2954 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
2955                                          u64 start, u64 len)
2956 {
2957         struct btrfs_block_group_cache *cache;
2958 
2959         cache = btrfs_lookup_block_group(fs_info, start);
2960         ASSERT(cache);
2961 
2962         spin_lock(&cache->lock);
2963         cache->delalloc_bytes -= len;
2964         spin_unlock(&cache->lock);
2965 
2966         btrfs_put_block_group(cache);
2967 }
2968 
2969 /* as ordered data IO finishes, this gets called so we can finish
2970  * an ordered extent if the range of bytes in the file it covers is
2971  * fully written.
2972  */
2973 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2974 {
2975         struct inode *inode = ordered_extent->inode;
2976         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2977         struct btrfs_root *root = BTRFS_I(inode)->root;
2978         struct btrfs_trans_handle *trans = NULL;
2979         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2980         struct extent_state *cached_state = NULL;
2981         struct new_sa_defrag_extent *new = NULL;
2982         int compress_type = 0;
2983         int ret = 0;
2984         u64 logical_len = ordered_extent->len;
2985         bool nolock;
2986         bool truncated = false;
2987         bool range_locked = false;
2988         bool clear_new_delalloc_bytes = false;
2989 
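             /*
              * Decide up front whether the EXTENT_DELALLOC_NEW bit must be
              * cleared at the end; only plain COW writes (not NOCOW,
              * prealloc or direct IO) need that.
              */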
2990         if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2991             !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
2992             !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
2993                 clear_new_delalloc_bytes = true;
2994 
2995         nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
2996 
2997         if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2998                 ret = -EIO;
2999                 goto out;
3000         }
3001 
3002         btrfs_free_io_failure_record(BTRFS_I(inode),
3003                         ordered_extent->file_offset,
3004                         ordered_extent->file_offset +
3005                         ordered_extent->len - 1);
3006 
3007         if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
3008                 truncated = true;
3009                 logical_len = ordered_extent->truncated_len;
3010                 /* Truncated the entire extent, don't bother adding */
3011                 if (!logical_len)
3012                         goto out;
3013         }
3014 
3015         if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
3016                 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
3017 
3018                 /*
3019                  * For the mwrite (mmap + memset to write) case, we still
3020                  * reserve space for the NOCOW range.
3021                  * As NOCOW won't cause a new delayed ref, just free the space.
3022                  */
3023                 btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
3024                                        ordered_extent->len);
3025                 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
3026                 if (nolock)
3027                         trans = btrfs_join_transaction_nolock(root);
3028                 else
3029                         trans = btrfs_join_transaction(root);
3030                 if (IS_ERR(trans)) {
3031                         ret = PTR_ERR(trans);
3032                         trans = NULL;
3033                         goto out;
3034                 }
3035                 trans->block_rsv = &BTRFS_I(inode)->block_rsv;
3036                 ret = btrfs_update_inode_fallback(trans, root, inode);
3037                 if (ret) /* -ENOMEM or corruption */
3038                         btrfs_abort_transaction(trans, ret);
3039                 goto out;
3040         }
3041 
3042         range_locked = true;
3043         lock_extent_bits(io_tree, ordered_extent->file_offset,
3044                          ordered_extent->file_offset + ordered_extent->len - 1,
3045                          &cached_state);
3046 
3047         ret = test_range_bit(io_tree, ordered_extent->file_offset,
3048                         ordered_extent->file_offset + ordered_extent->len - 1,
3049                         EXTENT_DEFRAG, 0, cached_state);
3050         if (ret) {
3051                 u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
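                     /*
                      * Snapshot-aware defrag is deliberately disabled: the
                      * "0 &&" short-circuits the check, so
                      * record_old_file_extents() is never called.
                      */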
3052                 if (0 && last_snapshot >= BTRFS_I(inode)->generation)
3053                         /* the inode is shared */
3054                         new = record_old_file_extents(inode, ordered_extent);
3055 
3056                 clear_extent_bit(io_tree, ordered_extent->file_offset,
3057                         ordered_extent->file_offset + ordered_extent->len - 1,
3058                         EXTENT_DEFRAG, 0, 0, &cached_state);
3059         }
3060 
3061         if (nolock)
3062                 trans = btrfs_join_transaction_nolock(root);
3063         else
3064                 trans = btrfs_join_transaction(root);
3065         if (IS_ERR(trans)) {
3066                 ret = PTR_ERR(trans);
3067                 trans = NULL;
3068                 goto out;
3069         }
3070 
3071         trans->block_rsv = &BTRFS_I(inode)->block_rsv;
3072 
3073         if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3074                 compress_type = ordered_extent->compress_type;
3075         if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3076                 BUG_ON(compress_type);
3077                 btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
3078                                        ordered_extent->len);
3079                 ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
3080                                                 ordered_extent->file_offset,
3081                                                 ordered_extent->file_offset +
3082                                                 logical_len);
3083         } else {
3084                 BUG_ON(root == fs_info->tree_root);
3085                 ret = insert_reserved_file_extent(trans, inode,
3086                                                 ordered_extent->file_offset,
3087                                                 ordered_extent->start,
3088                                                 ordered_extent->disk_len,
3089                                                 logical_len, logical_len,
3090                                                 compress_type, 0, 0,
3091                                                 BTRFS_FILE_EXTENT_REG);
3092                 if (!ret)
3093                         btrfs_release_delalloc_bytes(fs_info,
3094                                                      ordered_extent->start,
3095                                                      ordered_extent->disk_len);
3096         }
3097         unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
3098                            ordered_extent->file_offset, ordered_extent->len,
3099                            trans->transid);
3100         if (ret < 0) {
3101                 btrfs_abort_transaction(trans, ret);
3102                 goto out;
3103         }
3104 
3105         ret = add_pending_csums(trans, inode, &ordered_extent->list);
3106         if (ret) {
3107                 btrfs_abort_transaction(trans, ret);
3108                 goto out;
3109         }
3110 
3111         btrfs_ordered_update_i_size(inode, 0, ordered_extent);
3112         ret = btrfs_update_inode_fallback(trans, root, inode);
3113         if (ret) { /* -ENOMEM or corruption */
3114                 btrfs_abort_transaction(trans, ret);
3115                 goto out;
3116         }
3117         ret = 0;
3118 out:
3119         if (range_locked || clear_new_delalloc_bytes) {
3120                 unsigned int clear_bits = 0;
3121 
3122                 if (range_locked)
3123                         clear_bits |= EXTENT_LOCKED;
3124                 if (clear_new_delalloc_bytes)
3125                         clear_bits |= EXTENT_DELALLOC_NEW;
3126                 clear_extent_bit(&BTRFS_I(inode)->io_tree,
3127                                  ordered_extent->file_offset,
3128                                  ordered_extent->file_offset +
3129                                  ordered_extent->len - 1,
3130                                  clear_bits,
3131                                  (clear_bits & EXTENT_LOCKED) ? 1 : 0,
3132                                  0, &cached_state);
3133         }
3134 
3135         if (trans)
3136                 btrfs_end_transaction(trans);
3137 
3138         if (ret || truncated) {
3139                 u64 start, end;
3140 
3141                 if (truncated)
3142                         start = ordered_extent->file_offset + logical_len;
3143                 else
3144                         start = ordered_extent->file_offset;
3145                 end = ordered_extent->file_offset + ordered_extent->len - 1;
3146                 clear_extent_uptodate(io_tree, start, end, NULL);
3147 
3148                 /* Drop the cache for the part of the extent we didn't write. */
3149                 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
3150 
3151                 /*
3152                  * If the ordered extent had an IOERR or something else went
3153                  * wrong we need to return the space for this ordered extent
3154                  * back to the allocator.  We only free the extent in the
3155                  * truncated case if we didn't write out the extent at all.
3156                  */
3157                 if ((ret || !logical_len) &&
3158                     !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3159                     !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
3160                         btrfs_free_reserved_extent(fs_info,
3161                                                    ordered_extent->start,
3162                                                    ordered_extent->disk_len, 1);
3163         }
3164 
3165 
3166         /*
3167          * This needs to be done to make sure anybody waiting knows we are done
3168          * updating everything for this ordered extent.
3169          */
3170         btrfs_remove_ordered_extent(inode, ordered_extent);
3171 
3172         /* for snapshot-aware defrag */
3173         if (new) {
3174                 if (ret) {
3175                         free_sa_defrag_extent(new);
3176                         atomic_dec(&fs_info->defrag_running);
3177                 } else {
3178                         relink_file_extents(new);
3179                 }
3180         }
3181 
3182         /* once for us */
3183         btrfs_put_ordered_extent(ordered_extent);
3184         /* once for the tree */
3185         btrfs_put_ordered_extent(ordered_extent);
3186 
3187         return ret;
3188 }
3189 
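     /* Work item wrapper: finish the ordered extent embedding this work. */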
3190 static void finish_ordered_fn(struct btrfs_work *work)
3191 {
3192         struct btrfs_ordered_extent *ordered_extent;
3193         ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
3194         btrfs_finish_ordered_io(ordered_extent);
3195 }
3196 
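     /*
      * Writepage end-io hook: if this IO completed the whole ordered
      * extent, queue btrfs_finish_ordered_io() on the right workqueue.
      */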
3197 static void btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
3198                                 struct extent_state *state, int uptodate)
3199 {
3200         struct inode *inode = page->mapping->host;
3201         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3202         struct btrfs_ordered_extent *ordered_extent = NULL;
3203         struct btrfs_workqueue *wq;
3204         btrfs_work_func_t func;
3205 
3206         trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
3207 
3208         ClearPagePrivate2(page);
3209         if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
3210                                             end - start + 1, uptodate))
3211                 return;
3212 
3213         if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
3214                 wq = fs_info->endio_freespace_worker;
3215                 func = btrfs_freespace_write_helper;
3216         } else {
3217                 wq = fs_info->endio_write_workers;
3218                 func = btrfs_endio_write_helper;
3219         }
3220 
3221         btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
3222                         NULL);
3223         btrfs_queue_work(wq, &ordered_extent->work);
3224 }
3225 
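     /*
      * Verify the checksum of one block.  On mismatch the block's page
      * contents are poisoned and -EIO is returned so that a good mirror
      * can be tried.
      */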
3226 static int __readpage_endio_check(struct inode *inode,
3227                                   struct btrfs_io_bio *io_bio,
3228                                   int icsum, struct page *page,
3229                                   int pgoff, u64 start, size_t len)
3230 {
3231         char *kaddr;
3232         u32 csum_expected;
3233         u32 csum = ~(u32)0;
3234 
3235         csum_expected = *(((u32 *)io_bio->csum) + icsum);
3236 
3237         kaddr = kmap_atomic(page);
3238         csum = btrfs_csum_data(kaddr + pgoff, csum,  len);
3239         btrfs_csum_final(csum, (u8 *)&csum);
3240         if (csum != csum_expected)
3241                 goto zeroit;
3242 
3243         kunmap_atomic(kaddr);
3244         return 0;
3245 zeroit:
3246         btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
3247                                     io_bio->mirror_num);
3248         memset(kaddr + pgoff, 1, len);
3249         flush_dcache_page(page);
3250         kunmap_atomic(kaddr);
3251         return -EIO;
3252 }
3253 
3254 /*
3255  * When reads are done, we need to check csums to verify the data is correct.
3256  * If there's a match, we allow the bio to finish.  If not, the code in
3257  * extent_io.c will try to find good copies for us.
3258  */
3259 static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
3260                                       u64 phy_offset, struct page *page,
3261                                       u64 start, u64 end, int mirror)
3262 {
3263         size_t offset = start - page_offset(page);
3264         struct inode *inode = page->mapping->host;
3265         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3266         struct btrfs_root *root = BTRFS_I(inode)->root;
3267 
3268         if (PageChecked(page)) {
3269                 ClearPageChecked(page);
3270                 return 0;
3271         }
3272 
3273         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
3274                 return 0;
3275 
3276         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
3277             test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
3278                 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
3279                 return 0;
3280         }
3281 
3282         phy_offset >>= inode->i_sb->s_blocksize_bits;
3283         return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
3284                                       start, (size_t)(end - start + 1));
3285 }
3286 
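     /*
      * If this would be the final iput, defer it to the per-fs delayed
      * iput list so the last reference is dropped from a safe context by
      * btrfs_run_delayed_iputs().
      */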
3287 void btrfs_add_delayed_iput(struct inode *inode)
3288 {
3289         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3290         struct btrfs_inode *binode = BTRFS_I(inode);
3291 
3292         if (atomic_add_unless(&inode->i_count, -1, 1))
3293                 return;
3294 
3295         spin_lock(&fs_info->delayed_iput_lock);
3296         if (binode->delayed_iput_count == 0) {
3297                 ASSERT(list_empty(&binode->delayed_iput));
3298                 list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
3299         } else {
3300                 binode->delayed_iput_count++;
3301         }
3302         spin_unlock(&fs_info->delayed_iput_lock);
3303 }
3304 
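     /* Run every iput that was deferred via btrfs_add_delayed_iput(). */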
3305 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
3306 {
3307 
3308         spin_lock(&fs_info->delayed_iput_lock);
3309         while (!list_empty(&fs_info->delayed_iputs)) {
3310                 struct btrfs_inode *inode;
3311 
3312                 inode = list_first_entry(&fs_info->delayed_iputs,
3313                                 struct btrfs_inode, delayed_iput);
3314                 if (inode->delayed_iput_count) {
3315                         inode->delayed_iput_count--;
3316                         list_move_tail(&inode->delayed_iput,
3317                                         &fs_info->delayed_iputs);
3318                 } else {
3319                         list_del_init(&inode->delayed_iput);
3320                 }
3321                 spin_unlock(&fs_info->delayed_iput_lock);
3322                 iput(&inode->vfs_inode);
3323                 spin_lock(&fs_info->delayed_iput_lock);
3324         }
3325         spin_unlock(&fs_info->delayed_iput_lock);
3326 }
3327 
3328 /*
3329  * This is called at transaction commit time. If there are no orphan
3330  * files in the subvolume, it removes the orphan item and frees the
3331  * block_rsv structure.
3332  */
3333 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
3334                               struct btrfs_root *root)
3335 {
3336         struct btrfs_fs_info *fs_info = root->fs_info;
3337         struct btrfs_block_rsv *block_rsv;
3338         int ret;
3339 
3340         if (atomic_read(&root->orphan_inodes) ||
3341             root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
3342                 return;
3343 
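             /*
              * Re-check both conditions under orphan_lock, they may have
              * changed since the unlocked checks above.
              */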
3344         spin_lock(&root->orphan_lock);
3345         if (atomic_read(&root->orphan_inodes)) {
3346                 spin_unlock(&root->orphan_lock);
3347                 return;
3348         }
3349 
3350         if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
3351                 spin_unlock(&root->orphan_lock);
3352                 return;
3353         }
3354 
3355         block_rsv = root->orphan_block_rsv;
3356         root->orphan_block_rsv = NULL;
3357         spin_unlock(&root->orphan_lock);
3358 
3359         if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
3360             btrfs_root_refs(&root->root_item) > 0) {
3361                 ret = btrfs_del_orphan_item(trans, fs_info->tree_root,
3362                                             root->root_key.objectid);
3363                 if (ret)
3364                         btrfs_abort_transaction(trans, ret);
3365                 else
3366                         clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
3367                                   &root->state);
3368         }
3369 
3370         if (block_rsv) {
3371                 WARN_ON(block_rsv->size > 0);
3372                 btrfs_free_block_rsv(fs_info, block_rsv);
3373         }
3374 }
3375 
3376 /*
3377  * This creates an orphan entry for the given inode in case something goes
3378  * wrong in the middle of an unlink/truncate.
3379  *
3380  * NOTE: the caller of this function should reserve 5 units of metadata
3381  *       for it.
3382  */
3383 int btrfs_orphan_add(struct btrfs_trans_handle *trans,
3384                 struct btrfs_inode *inode)
3385 {
3386         struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
3387         struct btrfs_root *root = inode->root;
3388         struct btrfs_block_rsv *block_rsv = NULL;
3389         int reserve = 0;
3390         int insert = 0;
3391         int ret;
3392 
3393         if (!root->orphan_block_rsv) {
3394                 block_rsv = btrfs_alloc_block_rsv(fs_info,
3395                                                   BTRFS_BLOCK_RSV_TEMP);
3396                 if (!block_rsv)
3397                         return -ENOMEM;
3398         }
3399 
3400         spin_lock(&root->orphan_lock);
3401         if (!root->orphan_block_rsv) {
3402                 root->orphan_block_rsv = block_rsv;
3403         } else if (block_rsv) {
3404                 btrfs_free_block_rsv(fs_info, block_rsv);
3405                 block_rsv = NULL;
3406         }
3407 
3408         if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3409                               &inode->runtime_flags)) {
3410 #if 0
3411                 /*
3412                  * For proper ENOSPC handling, we should do orphan
3413                  * cleanup when mounting. But this introduces backward
3414                  * compatibility issue.
3415                  */
3416                 if (!xchg(&root->orphan_item_inserted, 1))
3417                         insert = 2;
3418                 else
3419                         insert = 1;
3420 #endif
3421                 insert = 1;
3422                 atomic_inc(&root->orphan_inodes);
3423         }
3424 
3425         if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3426                               &inode->runtime_flags))
3427                 reserve = 1;
3428         spin_unlock(&root->orphan_lock);
3429 
3430         /* grab metadata reservation from transaction handle */
3431         if (reserve) {
3432                 ret = btrfs_orphan_reserve_metadata(trans, inode);
3433                 ASSERT(!ret);
3434                 if (ret) {
3435                         /*
3436                          * the dec doesn't need orphan_lock, as
3437                          * ->orphan_block_rsv is released only once
3438                          * ->orphan_inodes reaches zero.
3439                          */
3440                         atomic_dec(&root->orphan_inodes);
3441                         clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3442                                   &inode->runtime_flags);
3443                         if (insert)
3444                                 clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3445                                           &inode->runtime_flags);
3446                         return ret;
3447                 }
3448         }
3449 
3450         /* insert an orphan item to track this unlinked/truncated file */
3451         if (insert >= 1) {
3452                 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
3453                 if (ret) {
3454                         if (reserve) {
3455                                 clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3456                                           &inode->runtime_flags);
3457                                 btrfs_orphan_release_metadata(inode);
3458                         }
3459                         /*
3460                          * btrfs_orphan_commit_root may race with us and set
3461                          * ->orphan_block_rsv to zero; to avoid that,
3462                          * decrease ->orphan_inodes after everything is done.
3463                          */
3464                         atomic_dec(&root->orphan_inodes);
3465                         if (ret != -EEXIST) {
3466                                 clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3467                                           &inode->runtime_flags);
3468                                 btrfs_abort_transaction(trans, ret);
3469                                 return ret;
3470                         }
3471                 }
3472                 ret = 0;
3473         }
3474 
3475         /* insert an orphan item to record that the subvolume contains orphan files */
3476         if (insert >= 2) {
3477                 ret = btrfs_insert_orphan_item(trans, fs_info->tree_root,
3478                                                root->root_key.objectid);
3479                 if (ret && ret != -EEXIST) {
3480                         btrfs_abort_transaction(trans, ret);
3481                         return ret;
3482                 }
3483         }
3484         return 0;
3485 }
3486 
3487 /*
3488  * We have done the truncate/delete so we can go ahead and remove the orphan
3489  * item for this particular inode.
3490  */
3491 static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3492                             struct btrfs_inode *inode)
3493 {
3494         struct btrfs_root *root = inode->root;
3495         int delete_item = 0;
3496         int ret = 0;
3497 
3498         if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3499                                &inode->runtime_flags))
3500                 delete_item = 1;
3501 
3502         if (delete_item && trans)
3503                 ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
3504 
3505         if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3506                                &inode->runtime_flags))
3507                 btrfs_orphan_release_metadata(inode);
3508 
3509         /*
3510          * btrfs_orphan_commit_root may race with us and set ->orphan_block_rsv
3511          * to zero; to avoid that, decrease ->orphan_inodes after
3512          * everything is done.
3513          */
3514         if (delete_item)
3515                 atomic_dec(&root->orphan_inodes);
3516 
3517         return ret;
3518 }
3519 
3520 /*
3521  * this cleans up any orphans that may be left on the list from the last use
3522  * of this root.
3523  */
3524 int btrfs_orphan_cleanup(struct btrfs_root *root)
3525 {
3526         struct btrfs_fs_info *fs_info = root->fs_info;
3527         struct btrfs_path *path;
3528         struct extent_buffer *leaf;
3529         struct btrfs_key key, found_key;
3530         struct btrfs_trans_handle *trans;
3531         struct inode *inode;
3532         u64 last_objectid = 0;
3533         int ret = 0, nr_unlink = 0, nr_truncate = 0;
3534 
3535         if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
3536                 return 0;
3537 
3538         path = btrfs_alloc_path();
3539         if (!path) {
3540                 ret = -ENOMEM;
3541                 goto out;
3542         }
3543         path->reada = READA_BACK;
3544 
3545         key.objectid = BTRFS_ORPHAN_OBJECTID;
3546         key.type = BTRFS_ORPHAN_ITEM_KEY;
3547         key.offset = (u64)-1;
3548 
3549         while (1) {
3550                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3551                 if (ret < 0)
3552                         goto out;
3553 
3554                 /*
3555                  * ret == 0 means we found what we were searching for, which
3556                  * is weird, but possible, so only screw with the path if we
3557                  * didn't find the key and see if we have stuff that matches
3558                  */
3559                 if (ret > 0) {
3560                         ret = 0;
3561                         if (path->slots[0] == 0)
3562                                 break;
3563                         path->slots[0]--;
3564                 }
3565 
3566                 /* pull out the item */
3567                 leaf = path->nodes[0];
3568                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3569 
3570                 /* make sure the item matches what we want */
3571                 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3572                         break;
3573                 if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
3574                         break;
3575 
3576                 /* release the path since we're done with it */
3577                 btrfs_release_path(path);
3578 
3579                 /*
3580                  * this is basically btrfs_lookup, minus the crossing-root
3581                  * handling.  we store the inode number in the offset of the
3582                  * orphan item.
3583                  */
3584 
3585                 if (found_key.offset == last_objectid) {
3586                         btrfs_err(fs_info,
3587                                   "Error removing orphan entry, stopping orphan cleanup");
3588                         ret = -EINVAL;
3589                         goto out;
3590                 }
3591 
3592                 last_objectid = found_key.offset;
3593 
3594                 found_key.objectid = found_key.offset;
3595                 found_key.type = BTRFS_INODE_ITEM_KEY;
3596                 found_key.offset = 0;
3597                 inode = btrfs_iget(fs_info->sb, &found_key, root, NULL);
3598                 ret = PTR_ERR_OR_ZERO(inode);
3599                 if (ret && ret != -ENOENT)
3600                         goto out;
3601 
3602                 if (ret == -ENOENT && root == fs_info->tree_root) {
3603                         struct btrfs_root *dead_root;
3604                         struct btrfs_fs_info *fs_info = root->fs_info;
3605                         int is_dead_root = 0;
3606 
3607                         /*
3608                          * this is an orphan in the tree root. Currently these
3609                          * could come from 2 sources:
3610                          *  a) a snapshot deletion in progress
3611                          *  b) a free space cache inode
3612                          * We need to distinguish those two, as the snapshot
3613                          * orphan must not get deleted.
3614                          * find_dead_roots already ran before us, so if this
3615                          * is a snapshot deletion, we should find the root
3616                          * in the dead_roots list
3617                          */
3618                         spin_lock(&fs_info->trans_lock);
3619                         list_for_each_entry(dead_root, &fs_info->dead_roots,
3620                                             root_list) {
3621                                 if (dead_root->root_key.objectid ==
3622                                     found_key.objectid) {
3623                                         is_dead_root = 1;
3624                                         break;
3625                                 }
3626                         }
3627                         spin_unlock(&fs_info->trans_lock);
3628                         if (is_dead_root) {
3629                                 /* prevent this orphan from being found again */
3630                                 key.offset = found_key.objectid - 1;
3631                                 continue;
3632                         }
3633                 }
3634                 /*
3635                  * Inode is already gone but the orphan item is still there,
3636                  * kill the orphan item.
3637                  */
3638                 if (ret == -ENOENT) {
3639                         trans = btrfs_start_transaction(root, 1);
3640                         if (IS_ERR(trans)) {
3641                                 ret = PTR_ERR(trans);
3642                                 goto out;
3643                         }
3644                         btrfs_debug(fs_info, "auto deleting %Lu",
3645                                     found_key.objectid);
3646                         ret = btrfs_del_orphan_item(trans, root,
3647                                                     found_key.objectid);
3648                         btrfs_end_transaction(trans);
3649                         if (ret)
3650                                 goto out;
3651                         continue;
3652                 }
3653 
3654                 /*
3655                  * add this inode to the orphan list so btrfs_orphan_del does
3656                  * the proper thing when we hit it
3657                  */
3658                 set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3659                         &BTRFS_I(inode)->runtime_flags);
3660                 atomic_inc(&root->orphan_inodes);
3661 
3662                 /* if we have links, this was a truncate, let's do that */
3663                 if (inode->i_nlink) {
3664                         if (WARN_ON(!S_ISREG(inode->i_mode))) {
3665                                 iput(inode);
3666                                 continue;
3667                         }
3668                         nr_truncate++;
3669 
3670                         /* 1 for the orphan item deletion. */
3671                         trans = btrfs_start_transaction(root, 1);
3672                         if (IS_ERR(trans)) {
3673                                 iput(inode);
3674                                 ret = PTR_ERR(trans);
3675                                 goto out;
3676                         }
3677                         ret = btrfs_orphan_add(trans, BTRFS_I(inode));
3678                         btrfs_end_transaction(trans);
3679                         if (ret) {
3680                                 iput(inode);
3681                                 goto out;
3682                         }
3683 
3684                         ret = btrfs_truncate(inode);
3685                         if (ret)
3686                                 btrfs_orphan_del(NULL, BTRFS_I(inode));
3687                 } else {
3688                         nr_unlink++;
3689                 }
3690 
3691                 /* this will do delete_inode and everything for us */
3692                 iput(inode);
3693                 if (ret)
3694                         goto out;
3695         }
3696         /* release the path since we're done with it */
3697         btrfs_release_path(path);
3698 
3699         root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
3700 
3701         if (root->orphan_block_rsv)
3702                 btrfs_block_rsv_release(fs_info, root->orphan_block_rsv,
3703                                         (u64)-1);
3704 
3705         if (root->orphan_block_rsv ||
3706             test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3707                 trans = btrfs_join_transaction(root);
3708                 if (!IS_ERR(trans))
3709                         btrfs_end_transaction(trans);
3710         }
3711 
3712         if (nr_unlink)
3713                 btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
3714         if (nr_truncate)
3715                 btrfs_debug(fs_info, "truncated %d orphans", nr_truncate);
3716 
3717 out:
3718         if (ret)
3719                 btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
3720         btrfs_free_path(path);
3721         return ret;
3722 }
3723 
3724 /*
3725  * very simple check to peek ahead in the leaf looking for xattrs.  If we
3726  * don't find any xattrs, we know there can't be any acls.
3727  *
3728  * slot is the slot the inode is in, objectid is the objectid of the inode
3729  */
3730 static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3731                                           int slot, u64 objectid,
3732                                           int *first_xattr_slot)
3733 {
3734         u32 nritems = btrfs_header_nritems(leaf);
3735         struct btrfs_key found_key;
3736         static u64 xattr_access = 0;
3737         static u64 xattr_default = 0;
3738         int scanned = 0;
3739 
3740         if (!xattr_access) {
3741                 xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
3742                                         strlen(XATTR_NAME_POSIX_ACL_ACCESS));
3743                 xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
3744                                         strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
3745         }
3746 
3747         slot++;
3748         *first_xattr_slot = -1;
3749         while (slot < nritems) {
3750                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3751 
3752                 /* we found a different objectid, there must not be acls */
3753                 if (found_key.objectid != objectid)
3754                         return 0;
3755 
3756                 /* we found an xattr, assume we've got an acl */
3757                 if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3758                         if (*first_xattr_slot == -1)
3759                                 *first_xattr_slot = slot;
3760                         if (found_key.offset == xattr_access ||
3761                             found_key.offset == xattr_default)
3762                                 return 1;
3763                 }
3764 
3765                 /*
3766                  * we found a key greater than an xattr key, there can't
3767                  * be any acls later on
3768                  */
3769                 if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3770                         return 0;
3771 
3772                 slot++;
3773                 scanned++;
3774 
3775                 /*
3776                  * it goes inode, inode backrefs, xattrs, extents,
3777                  * so if there are a ton of hard links to an inode there can
3778                  * be a lot of backrefs.  Don't waste time searching too hard,
3779                  * this is just an optimization
3780                  */
3781                 if (scanned >= 8)
3782                         break;
3783         }
3784         /* we hit the end of the leaf before we found an xattr or
3785          * something larger than an xattr.  We have to assume the inode
3786          * has acls
3787          */
3788         if (*first_xattr_slot == -1)
3789                 *first_xattr_slot = slot;
3790         return 1;
3791 }
3792 
3793 /*
3794  * read an inode from the btree into the in-memory inode
3795  */
3796 static int btrfs_read_locked_inode(struct inode *inode)
3797 {
3798         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3799         struct btrfs_path *path;
3800         struct extent_buffer *leaf;
3801         struct btrfs_inode_item *inode_item;
3802         struct btrfs_root *root = BTRFS_I(inode)->root;
3803         struct btrfs_key location;
3804         unsigned long ptr;
3805         int maybe_acls;
3806         u32 rdev;
3807         int ret;
3808         bool filled = false;
3809         int first_xattr_slot;
3810 
3811         ret = btrfs_fill_inode(inode, &rdev);
3812         if (!ret)
3813                 filled = true;
3814 
3815         path = btrfs_alloc_path();
3816         if (!path) {
3817                 ret = -ENOMEM;
3818                 goto make_bad;
3819         }
3820 
3821         memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
3822 
3823         ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3824         if (ret) {
3825                 if (ret > 0)
3826                         ret = -ENOENT;
3827                 goto make_bad;
3828         }
3829 
3830         leaf = path->nodes[0];
3831 
3832         if (filled)
3833                 goto cache_index;
3834 
3835         inode_item = btrfs_item_ptr(leaf, path->slots[0],
3836                                     struct btrfs_inode_item);
3837         inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3838         set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
3839         i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
3840         i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
3841         btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
3842 
3843         inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
3844         inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
3845 
3846         inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
3847         inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
3848 
3849         inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
3850         inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
3851 
3852         BTRFS_I(inode)->i_otime.tv_sec =
3853                 btrfs_timespec_sec(leaf, &inode_item->otime);
3854         BTRFS_I(inode)->i_otime.tv_nsec =
3855                 btrfs_timespec_nsec(leaf, &inode_item->otime);
3856 
3857         inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
3858         BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
3859         BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
3860 
3861         inode_set_iversion_queried(inode,
3862                                    btrfs_inode_sequence(leaf, inode_item));
3863         inode->i_generation = BTRFS_I(inode)->generation;
3864         inode->i_rdev = 0;
3865         rdev = btrfs_inode_rdev(leaf, inode_item);
3866 
3867         BTRFS_I(inode)->index_cnt = (u64)-1;
3868         BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
3869 
3870 cache_index:
3871         /*
3872          * If we were modified in the current generation and evicted from memory
3873          * and then re-read we need to do a full sync since we don't have any
3874          * idea about which extents were modified before we were evicted from
3875          * cache.
3876          *
3877          * This is required for both inode re-read from disk and delayed inode
3878          * in delayed_nodes_tree.
3879          */
3880         if (BTRFS_I(inode)->last_trans == fs_info->generation)
3881                 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3882                         &BTRFS_I(inode)->runtime_flags);
3883 
3884         /*
3885          * We don't persist the id of the transaction where an unlink operation
3886          * against the inode was last made. So here we assume the inode might
3887          * have been evicted, and therefore the exact value of last_unlink_trans
3888          * was lost, and we set it to last_trans to avoid metadata inconsistencies
3889          * between the inode and its parent if the inode is fsync'ed and the log
3890          * replayed. For example, in the scenario:
3891          *
3892          * touch mydir/foo
3893          * ln mydir/foo mydir/bar
3894          * sync
3895          * unlink mydir/bar
3896          * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
3897          * xfs_io -c fsync mydir/foo
3898          * <power failure>
3899          * mount fs, triggers fsync log replay
3900          *
3901          * We must make sure that when we fsync our inode foo we also log its
3902          * parent inode, otherwise after log replay the parent still has the
3903          * dentry with the "bar" name but our inode foo has a link count of 1
3904          * and doesn't have an inode ref with the name "bar" anymore.
3905          *
3906          * Setting last_unlink_trans to last_trans is a pessimistic approach,
3907          * but it guarantees correctness at the expense of occasional full
3908          * transaction commits on fsync if our inode is a directory, or, if our
3909          * inode is not a directory, of logging its parent unnecessarily.
3910          */
3911         BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
3912 
3913         path->slots[0]++;
3914         if (inode->i_nlink != 1 ||
3915             path->slots[0] >= btrfs_header_nritems(leaf))
3916                 goto cache_acl;
3917 
3918         btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
3919         if (location.objectid != btrfs_ino(BTRFS_I(inode)))
3920                 goto cache_acl;
3921 
3922         ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3923         if (location.type == BTRFS_INODE_REF_KEY) {
3924                 struct btrfs_inode_ref *ref;
3925 
3926                 ref = (struct btrfs_inode_ref *)ptr;
3927                 BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
3928         } else if (location.type == BTRFS_INODE_EXTREF_KEY) {
3929                 struct btrfs_inode_extref *extref;
3930 
3931                 extref = (struct btrfs_inode_extref *)ptr;
3932                 BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
3933                                                                      extref);
3934         }
3935 cache_acl:
3936         /*
3937          * try to precache a NULL acl entry for files that don't have
3938          * any xattrs or acls
3939          */
3940         maybe_acls = acls_after_inode_item(leaf, path->slots[0],
3941                         btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
3942         if (first_xattr_slot != -1) {
3943                 path->slots[0] = first_xattr_slot;
3944                 ret = btrfs_load_inode_props(inode, path);
3945                 if (ret)
3946                         btrfs_err(fs_info,
3947                                   "error loading props for ino %llu (root %llu): %d",
3948                                   btrfs_ino(BTRFS_I(inode)),
3949                                   root->root_key.objectid, ret);
3950         }
3951         btrfs_free_path(path);
3952 
3953         if (!maybe_acls)
3954                 cache_no_acl(inode);
3955 
3956         switch (inode->i_mode & S_IFMT) {
3957         case S_IFREG:
3958                 inode->i_mapping->a_ops = &btrfs_aops;
3959                 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3960                 inode->i_fop = &btrfs_file_operations;
3961                 inode->i_op = &btrfs_file_inode_operations;
3962                 break;
3963         case S_IFDIR:
3964                 inode->i_fop = &btrfs_dir_file_operations;
3965                 inode->i_op = &btrfs_dir_inode_operations;
3966                 break;
3967         case S_IFLNK:
3968                 inode->i_op = &btrfs_symlink_inode_operations;
3969                 inode_nohighmem(inode);
3970                 inode->i_mapping->a_ops = &btrfs_symlink_aops;
3971                 break;
3972         default:
3973                 inode->i_op = &btrfs_special_inode_operations;
3974                 init_special_inode(inode, inode->i_mode, rdev);
3975                 break;
3976         }
3977 
3978         btrfs_update_iflags(inode);
3979         return 0;
3980 
3981 make_bad:
3982         btrfs_free_path(path);
3983         make_bad_inode(inode);
3984         return ret;
3985 }
3986 
3987 /*
3988  * given a leaf and an inode, copy the inode fields into the leaf
3989  */
3990 static void fill_inode_item(struct btrfs_trans_handle *trans,
3991                             struct extent_buffer *leaf,
3992                             struct btrfs_inode_item *item,
3993                             struct inode *inode)
3994 {
3995         struct btrfs_map_token token;
3996 
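             /*
              * The map token caches the most recently mapped extent buffer
              * page for the btrfs_set_token_*() setters below, so a run of
              * writes into the same inode item doesn't redo the page lookup
              * for every field.
              */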
3997         btrfs_init_map_token(&token);
3998 
3999         btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
4000         btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
4001         btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
4002                                    &token);
4003         btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
4004         btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
4005 
4006         btrfs_set_token_timespec_sec(leaf, &item->atime,
4007                                      inode->i_atime.tv_sec, &token);
4008         btrfs_set_token_timespec_nsec(leaf, &item->atime,
4009                                       inode->i_atime.tv_nsec, &token);
4010 
4011         btrfs_set_token_timespec_sec(leaf, &item->mtime,
4012                                      inode->i_mtime.tv_sec, &token);
4013         btrfs_set_token_timespec_nsec(leaf, &item->mtime,
4014                                       inode->i_mtime.tv_nsec, &token);
4015 
4016         btrfs_set_token_timespec_sec(leaf, &item->ctime,
4017                                      inode->i_ctime.tv_sec, &token);
4018         btrfs_set_token_timespec_nsec(leaf, &item->ctime,
4019                                       inode->i_ctime.tv_nsec, &token);
4020 
4021         btrfs_set_token_timespec_sec(leaf, &item->otime,
4022                                      BTRFS_I(inode)->i_otime.tv_sec, &token);
4023         btrfs_set_token_timespec_nsec(leaf, &item->otime,
4024                                       BTRFS_I(inode)->i_otime.tv_nsec, &token);
4025 
4026         btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
4027                                      &token);
4028         btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
4029                                          &token);
4030         btrfs_set_token_inode_sequence(leaf, item, inode_peek_iversion(inode),
4031                                        &token);
4032         btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
4033         btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
4034         btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
4035         btrfs_set_token_inode_block_group(leaf, item, 0, &token);
4036 }
4037 
4038 /*
4039  * copy everything in the in-memory inode into the btree.
4040  */
4041 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
4042                                 struct btrfs_root *root, struct inode *inode)
4043 {
4044         struct btrfs_inode_item *inode_item;
4045         struct btrfs_path *path;
4046         struct extent_buffer *leaf;
4047         int ret;
4048 
4049         path = btrfs_alloc_path();
4050         if (!path)
4051                 return -ENOMEM;
4052 
4053         path->leave_spinning = 1;
4054         ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
4055                                  1);
4056         if (ret) {
4057                 if (ret > 0)
4058                         ret = -ENOENT;
4059                 goto failed;
4060         }
4061 
4062         leaf = path->nodes[0];
4063         inode_item = btrfs_item_ptr(leaf, path->slots[0],
4064                                     struct btrfs_inode_item);
4065 
4066         fill_inode_item(trans, leaf, inode_item, inode);
4067         btrfs_mark_buffer_dirty(leaf);
4068         btrfs_set_inode_last_trans(trans, inode);
4069         ret = 0;
4070 failed:
4071         btrfs_free_path(path);
4072         return ret;
4073 }
4074 
4075 /*
4076  * copy everything in the in-memory inode into the btree.
4077  */
4078 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
4079                                 struct btrfs_root *root, struct inode *inode)
4080 {
4081         struct btrfs_fs_info *fs_info = root->fs_info;
4082         int ret;
4083 
4084         /*
4085          * If the inode is a free space inode, we can deadlock during commit
4086          * if we put it into the delayed code.
4087          *
4088          * The data relocation inode should also be directly updated
4089          * without delay.
4090          */
4091         if (!btrfs_is_free_space_inode(BTRFS_I(inode))
4092             && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
4093             && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
4094                 btrfs_update_root_times(trans, root);
4095 
4096                 ret = btrfs_delayed_update_inode(trans, root, inode);
4097                 if (!ret)
4098                         btrfs_set_inode_last_trans(trans, inode);
4099                 return ret;
4100         }
4101 
4102         return btrfs_update_inode_item(trans, root, inode);
4103 }
4104 
4105 noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
4106                                          struct btrfs_root *root,
4107                                          struct inode *inode)
4108 {
4109         int ret;
4110 
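             /*
              * btrfs_update_inode() may take the delayed-inode path, which
              * can fail with -ENOSPC if it cannot reserve space for the
              * delayed item; fall back to updating the inode item directly,
              * which uses the transaction's existing reservation.
              */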
4111         ret = btrfs_update_inode(trans, root, inode);
4112         if (ret == -ENOSPC)
4113                 return btrfs_update_inode_item(trans, root, inode);
4114         return ret;
4115 }
4116 
4117 /*
4118  * unlink helper that gets used here in inode.c and in the tree logging
4119  * recovery code.  It removes a link in a directory with a given name, and
4120  * also drops the back refs in the inode to the directory.
4121  */
4122 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4123                                 struct btrfs_root *root,
4124                                 struct btrfs_inode *dir,
4125                                 struct btrfs_inode *inode,
4126                                 const char *name, int name_len)
4127 {
4128         struct btrfs_fs_info *fs_info = root->fs_info;
4129         struct btrfs_path *path;
4130         int ret = 0;
4131         struct extent_buffer *leaf;
4132         struct btrfs_dir_item *di;
4133         struct btrfs_key key;
4134         u64 index;
4135         u64 ino = btrfs_ino(inode);
4136         u64 dir_ino = btrfs_ino(dir);
4137 
4138         path = btrfs_alloc_path();
4139         if (!path) {
4140                 ret = -ENOMEM;
4141                 goto out;
4142         }
4143 
4144         path->leave_spinning = 1;
4145         di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4146                                     name, name_len, -1);
4147         if (IS_ERR(di)) {
4148                 ret = PTR_ERR(di);
4149                 goto err;
4150         }
4151         if (!di) {
4152                 ret = -ENOENT;
4153                 goto err;
4154         }
4155         leaf = path->nodes[0];
4156         btrfs_dir_item_key_to_cpu(leaf, di, &key);
4157         ret = btrfs_delete_one_dir_name(trans, root, path, di);
4158         if (ret)
4159                 goto err;
4160         btrfs_release_path(path);
4161 
4162         /*
4163          * If we don't have a dir index, we have to look up the inode ref
4164          * to get it; and since we would then have the inode ref in hand,
4165          * we remove it directly instead of doing a delayed deletion.
4166          *
4167          * But if we do have a dir index, there is no need to search the
4168          * inode ref to get it.  Since the inode ref is close to the inode
4169          * item, it is better to delay its deletion and do it when we
4170          * update the inode item.
4171          */
4172         if (inode->dir_index) {
4173                 ret = btrfs_delayed_delete_inode_ref(inode);
4174                 if (!ret) {
4175                         index = inode->dir_index;
4176                         goto skip_backref;
4177                 }
4178         }
4179 
4180         ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
4181                                   dir_ino, &index);
4182         if (ret) {
4183                 btrfs_info(fs_info,
4184                         "failed to delete reference to %.*s, inode %llu parent %llu",
4185                         name_len, name, ino, dir_ino);
4186                 btrfs_abort_transaction(trans, ret);
4187                 goto err;
4188         }
4189 skip_backref:
4190         ret = btrfs_delete_delayed_dir_index(trans, fs_info, dir, index);
4191         if (ret) {
4192                 btrfs_abort_transaction(trans, ret);
4193                 goto err;
4194         }
4195 
4196         ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
4197                         dir_ino);
4198         if (ret != 0 && ret != -ENOENT) {
4199                 btrfs_abort_transaction(trans, ret);
4200                 goto err;
4201         }
4202 
4203         ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
4204                         index);
4205         if (ret == -ENOENT)
4206                 ret = 0;
4207         else if (ret)
4208                 btrfs_abort_transaction(trans, ret);
4209 err:
4210         btrfs_free_path(path);
4211         if (ret)
4212                 goto out;
4213 
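             /*
              * A directory's i_size in btrfs is the sum of the name lengths
              * of its entries, and each name is counted twice (once for the
              * dir item and once for the dir index item), hence name_len * 2.
              */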
4214         btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2);
4215         inode_inc_iversion(&inode->vfs_inode);
4216         inode_inc_iversion(&dir->vfs_inode);
4217         inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime =
4218                 dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
4219         ret = btrfs_update_inode(trans, root, &dir->vfs_inode);
4220 out:
4221         return ret;
4222 }
4223 
4224 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4225                        struct btrfs_root *root,
4226                        struct btrfs_inode *dir, struct btrfs_inode *inode,
4227                        const char *name, int name_len)
4228 {
4229         int ret;
4230         ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
4231         if (!ret) {
4232                 drop_nlink(&inode->vfs_inode);
4233                 ret = btrfs_update_inode(trans, root, &inode->vfs_inode);
4234         }
4235         return ret;
4236 }
4237 
4238 /*
4239  * helper to start transaction for unlink and rmdir.
4240  *
4241  * unlink and rmdir are special in btrfs: they do not always free space, so
4242  * if we cannot make our reservations the normal way, try to see if there is
4243  * plenty of slack room in the global reserve to migrate from; otherwise we
4244  * cannot allow the unlink to occur.
4245  */
4246 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
4247 {
4248         struct btrfs_root *root = BTRFS_I(dir)->root;
4249 
4250         /*
4251          * 1 for the possible orphan item
4252          * 1 for the dir item
4253          * 1 for the dir index
4254          * 1 for the inode ref
4255          * 1 for the inode
4256          */
4257         return btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
4258 }
4259 
4260 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
4261 {
4262         struct btrfs_root *root = BTRFS_I(dir)->root;
4263         struct btrfs_trans_handle *trans;
4264         struct inode *inode = d_inode(dentry);
4265         int ret;
4266 
4267         trans = __unlink_start_trans(dir);
4268         if (IS_ERR(trans))
4269                 return PTR_ERR(trans);
4270 
4271         btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4272                         0);
4273 
4274         ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
4275                         BTRFS_I(d_inode(dentry)), dentry->d_name.name,
4276                         dentry->d_name.len);
4277         if (ret)
4278                 goto out;
4279 
4280         if (inode->i_nlink == 0) {
4281                 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
4282                 if (ret)
4283                         goto out;
4284         }
4285 
4286 out:
4287         btrfs_end_transaction(trans);
4288         btrfs_btree_balance_dirty(root->fs_info);
4289         return ret;
4290 }
4291 
4292 int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4293                         struct btrfs_root *root,
4294                         struct inode *dir, u64 objectid,
4295                         const char *name, int name_len)
4296 {
4297         struct btrfs_fs_info *fs_info = root->fs_info;
4298         struct btrfs_path *path;
4299         struct extent_buffer *leaf;
4300         struct btrfs_dir_item *di;
4301         struct btrfs_key key;
4302         u64 index;
4303         int ret;
4304         u64 dir_ino = btrfs_ino(BTRFS_I(dir));
4305 
4306         path = btrfs_alloc_path();
4307         if (!path)
4308                 return -ENOMEM;
4309 
4310         di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4311                                    name, name_len, -1);
4312         if (IS_ERR_OR_NULL(di)) {
4313                 if (!di)
4314                         ret = -ENOENT;
4315                 else
4316                         ret = PTR_ERR(di);
4317                 goto out;
4318         }
4319 
4320         leaf = path->nodes[0];
4321         btrfs_dir_item_key_to_cpu(leaf, di, &key);
4322         WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
4323         ret = btrfs_delete_one_dir_name(trans, root, path, di);
4324         if (ret) {
4325                 btrfs_abort_transaction(trans, ret);
4326                 goto out;
4327         }
4328         btrfs_release_path(path);
4329 
4330         ret = btrfs_del_root_ref(trans, fs_info, objectid,
4331                                  root->root_key.objectid, dir_ino,
4332                                  &index, name, name_len);
4333         if (ret < 0) {
4334                 if (ret != -ENOENT) {
4335                         btrfs_abort_transaction(trans, ret);
4336                         goto out;
4337                 }
4338                 di = btrfs_search_dir_index_item(root, path, dir_ino,
4339                                                  name, name_len);
4340                 if (IS_ERR_OR_NULL(di)) {
4341                         if (!di)
4342                                 ret = -ENOENT;
4343                         else
4344                                 ret = PTR_ERR(di);
4345                         btrfs_abort_transaction(trans, ret);
4346                         goto out;
4347                 }
4348 
4349                 leaf = path->nodes[0];
4350                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4351                 btrfs_release_path(path);
4352                 index = key.offset;
4353         }
4354         btrfs_release_path(path);
4355 
4356         ret = btrfs_delete_delayed_dir_index(trans, fs_info, BTRFS_I(dir), index);
4357         if (ret) {
4358                 btrfs_abort_transaction(trans, ret);
4359                 goto out;
4360         }
4361 
4362         btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2);
4363         inode_inc_iversion(dir);
4364         dir->i_mtime = dir->i_ctime = current_time(dir);
4365         ret = btrfs_update_inode_fallback(trans, root, dir);
4366         if (ret)
4367                 btrfs_abort_transaction(trans, ret);
4368 out:
4369         btrfs_free_path(path);
4370         return ret;
4371 }
4372 
4373 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4374 {
4375         struct inode *inode = d_inode(dentry);
4376         int err = 0;
4377         struct btrfs_root *root = BTRFS_I(dir)->root;
4378         struct btrfs_trans_handle *trans;
4379         u64 last_unlink_trans;
4380 
4381         if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
4382                 return -ENOTEMPTY;
4383         if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID)
4384                 return -EPERM;
4385 
4386         trans = __unlink_start_trans(dir);
4387         if (IS_ERR(trans))
4388                 return PTR_ERR(trans);
4389 
4390         if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
4391                 err = btrfs_unlink_subvol(trans, root, dir,
4392                                           BTRFS_I(inode)->location.objectid,
4393                                           dentry->d_name.name,
4394                                           dentry->d_name.len);
4395                 goto out;
4396         }
4397 
4398         err = btrfs_orphan_add(trans, BTRFS_I(inode));
4399         if (err)
4400                 goto out;
4401 
4402         last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
4403 
4404         /* now the directory is empty */
4405         err = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
4406                         BTRFS_I(d_inode(dentry)), dentry->d_name.name,
4407                         dentry->d_name.len);
4408         if (!err) {
4409                 btrfs_i_size_write(BTRFS_I(inode), 0);
4410                 /*
4411                  * Propagate the last_unlink_trans value of the deleted dir to
4412                  * its parent directory. This is to prevent an unrecoverable
4413                  * log tree in the case we do something like this:
4414                  * 1) create dir foo
4415                  * 2) create snapshot under dir foo
4416                  * 3) delete the snapshot
4417                  * 4) rmdir foo
4418                  * 5) mkdir foo
4419                  * 6) fsync foo or some file inside foo
4420                  */
4421                 if (last_unlink_trans >= trans->transid)
4422                         BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
4423         }
4424 out:
4425         btrfs_end_transaction(trans);
4426         btrfs_btree_balance_dirty(root->fs_info);
4427 
4428         return err;
4429 }
4430 
4431 static int truncate_space_check(struct btrfs_trans_handle *trans,
4432                                 struct btrfs_root *root,
4433                                 u64 bytes_deleted)
4434 {
4435         struct btrfs_fs_info *fs_info = root->fs_info;
4436         int ret;
4437 
4438         /*
4439          * This is only used to apply pressure to the enospc system; we don't
4440          * intend to use this reservation at all.
4441          */
4442         bytes_deleted = btrfs_csum_bytes_to_leaves(fs_info, bytes_deleted);
4443         bytes_deleted *= fs_info->nodesize;
4444         ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv,
4445                                   bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
4446         if (!ret) {
4447                 trace_btrfs_space_reservation(fs_info, "transaction",
4448                                               trans->transid,
4449                                               bytes_deleted, 1);
4450                 trans->bytes_reserved += bytes_deleted;
4451         }
4452         return ret;
4453 
4454 }
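
     /*
      * A rough worked example of the pressure computation above, assuming
      * 4K sectors, 4-byte crc32c checksums and a 16K nodesize: deleting
      * 1GiB of extents means 262144 sectors worth of csums.  With roughly
      * 4000 csums fitting in one leaf, btrfs_csum_bytes_to_leaves()
      * returns about 65 leaves, and multiplying by the nodesize makes us
      * reserve about 1MiB, purely to generate flushing pressure and never
      * to be consumed.
      */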
4455 
4456 /*
4457  * Return this if we need to call truncate_block for the last bit of the
4458  * truncate.
4459  */
4460 #define NEED_TRUNCATE_BLOCK 1
4461 
4462 /*
4463  * this can truncate away extent items, csum items and directory items.
4464  * It starts at a high offset and removes keys until it can't find
4465  * any higher than new_size
4466  *
4467  * csum items that cross the new i_size are truncated to the new size
4468  * as well.
4469  *
4470  * min_type is the minimum key type to truncate down to.  If set to 0, this
4471  * will kill all the items on this inode, including the INODE_ITEM_KEY.
4472  */
4473 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
4474                                struct btrfs_root *root,
4475                                struct inode *inode,
4476                                u64 new_size, u32 min_type)
4477 {
4478         struct btrfs_fs_info *fs_info = root->fs_info;
4479         struct btrfs_path *path;
4480         struct extent_buffer *leaf;
4481         struct btrfs_file_extent_item *fi;
4482         struct btrfs_key key;
4483         struct btrfs_key found_key;
4484         u64 extent_start = 0;
4485         u64 extent_num_bytes = 0;
4486         u64 extent_offset = 0;
4487         u64 item_end = 0;
4488         u64 last_size = new_size;
4489         u32 found_type = (u8)-1;
4490         int found_extent;
4491         int del_item;
4492         int pending_del_nr = 0;
4493         int pending_del_slot = 0;
4494         int extent_type = -1;
4495         int ret;
4496         int err = 0;
4497         u64 ino = btrfs_ino(BTRFS_I(inode));
4498         u64 bytes_deleted = 0;
4499         bool be_nice = false;
4500         bool should_throttle = false;
4501         bool should_end = false;
4502 
4503         BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
4504 
4505         /*
4506          * for non-free space inodes and ref cows, we want to back off from
4507          * time to time
4508          */
4509         if (!btrfs_is_free_space_inode(BTRFS_I(inode)) &&
4510             test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4511                 be_nice = true;
4512 
4513         path = btrfs_alloc_path();
4514         if (!path)
4515                 return -ENOMEM;
4516         path->reada = READA_BACK;
4517 
4518         /*
4519          * We want to drop from the next block forward in case this new size is
4520          * not block aligned since we will be keeping the last block of the
4521          * extent just the way it is.
4522          */
4523         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4524             root == fs_info->tree_root)
4525                 btrfs_drop_extent_cache(BTRFS_I(inode), ALIGN(new_size,
4526                                         fs_info->sectorsize),
4527                                         (u64)-1, 0);
4528 
4529         /*
4530          * This function is also used to drop the items in the log tree before
4531          * we relog the inode, so if root != BTRFS_I(inode)->root, it means
4532  * it is used to drop the logged items. So we shouldn't kill the delayed
4533          * items.
4534          */
4535         if (min_type == 0 && root == BTRFS_I(inode)->root)
4536                 btrfs_kill_delayed_inode_items(BTRFS_I(inode));
4537 
4538         key.objectid = ino;
4539         key.offset = (u64)-1;
4540         key.type = (u8)-1;
4541 
4542 search_again:
4543         /*
4544          * with a 16K leaf size and 128MB extents, you can actually queue
4545          * up a huge file in a single leaf.  Most of the time, when
4546          * bytes_deleted is > 0, it will be huge by the time we get here.
4547          */
4548         if (be_nice && bytes_deleted > SZ_32M) {
4549                 if (btrfs_should_end_transaction(trans)) {
4550                         err = -EAGAIN;
4551                         goto error;
4552                 }
4553         }
4554 
4555 
4556         path->leave_spinning = 1;
4557         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
4558         if (ret < 0) {
4559                 err = ret;
4560                 goto out;
4561         }
4562 
4563         if (ret > 0) {
4564                 /* there are no items in the tree for us to truncate, we're
4565                  * done
4566                  */
4567                 if (path->slots[0] == 0)
4568                         goto out;
4569                 path->slots[0]--;
4570         }
4571 
4572         while (1) {
4573                 fi = NULL;
4574                 leaf = path->nodes[0];
4575                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4576                 found_type = found_key.type;
4577 
4578                 if (found_key.objectid != ino)
4579                         break;
4580 
4581                 if (found_type < min_type)
4582                         break;
4583 
4584                 item_end = found_key.offset;
4585                 if (found_type == BTRFS_EXTENT_DATA_KEY) {
4586                         fi = btrfs_item_ptr(leaf, path->slots[0],
4587                                             struct btrfs_file_extent_item);
4588                         extent_type = btrfs_file_extent_type(leaf, fi);
4589                         if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4590                                 item_end +=
4591                                     btrfs_file_extent_num_bytes(leaf, fi);
4592 
4593                                 trace_btrfs_truncate_show_fi_regular(
4594                                         BTRFS_I(inode), leaf, fi,
4595                                         found_key.offset);
4596                         } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
4597                                 item_end += btrfs_file_extent_inline_len(leaf,
4598                                                          path->slots[0], fi);
4599 
4600                                 trace_btrfs_truncate_show_fi_inline(
4601                                         BTRFS_I(inode), leaf, fi, path->slots[0],
4602                                         found_key.offset);
4603                         }
4604                         item_end--;
4605                 }
4606                 if (found_type > min_type) {
4607                         del_item = 1;
4608                 } else {
4609                         if (item_end < new_size)
4610                                 break;
4611                         if (found_key.offset >= new_size)
4612                                 del_item = 1;
4613                         else
4614                                 del_item = 0;
4615                 }
4616                 found_extent = 0;
4617                 /* FIXME, shrink the extent if the ref count is only 1 */
4618                 if (found_type != BTRFS_EXTENT_DATA_KEY)
4619                         goto delete;
4620 
4621                 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4622                         u64 num_dec;
4623                         extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
4624                         if (!del_item) {
4625                                 u64 orig_num_bytes =
4626                                         btrfs_file_extent_num_bytes(leaf, fi);
4627                                 extent_num_bytes = ALIGN(new_size -
4628                                                 found_key.offset,
4629                                                 fs_info->sectorsize);
4630                                 btrfs_set_file_extent_num_bytes(leaf, fi,
4631                                                          extent_num_bytes);
4632                                 num_dec = (orig_num_bytes -
4633                                            extent_num_bytes);
4634                                 if (test_bit(BTRFS_ROOT_REF_COWS,
4635                                              &root->state) &&
4636                                     extent_start != 0)
4637                                         inode_sub_bytes(inode, num_dec);
4638                                 btrfs_mark_buffer_dirty(leaf);
4639                         } else {
4640                                 extent_num_bytes =
4641                                         btrfs_file_extent_disk_num_bytes(leaf,
4642                                                                          fi);
4643                                 extent_offset = found_key.offset -
4644                                         btrfs_file_extent_offset(leaf, fi);
4645 
4646                                 /* FIXME blocksize != 4096 */
4647                                 num_dec = btrfs_file_extent_num_bytes(leaf, fi);
4648                                 if (extent_start != 0) {
4649                                         found_extent = 1;
4650                                         if (test_bit(BTRFS_ROOT_REF_COWS,
4651                                                      &root->state))
4652                                                 inode_sub_bytes(inode, num_dec);
4653                                 }
4654                         }
4655                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
4656                         /*
4657                          * we can't truncate inline items that have had
4658                          * special encodings
4659                          */
4660                         if (!del_item &&
4661                             btrfs_file_extent_encryption(leaf, fi) == 0 &&
4662                             btrfs_file_extent_other_encoding(leaf, fi) == 0 &&
4663                             btrfs_file_extent_compression(leaf, fi) == 0) {
4664                                 u32 size = (u32)(new_size - found_key.offset);
4665 
4666                                 btrfs_set_file_extent_ram_bytes(leaf, fi, size);
4667                                 size = btrfs_file_extent_calc_inline_size(size);
4668                                 btrfs_truncate_item(root->fs_info, path, size, 1);
4669                         } else if (!del_item) {
4670                                 /*
4671                                  * We have to bail so the last_size is set to
4672                                  * just before this extent.
4673                                  */
4674                                 err = NEED_TRUNCATE_BLOCK;
4675                                 break;
4676                         }
4677 
4678                         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4679                                 inode_sub_bytes(inode, item_end + 1 - new_size);
4680                 }
4681 delete:
4682                 if (del_item)
4683                         last_size = found_key.offset;
4684                 else
4685                         last_size = new_size;
4686                 if (del_item) {
4687                         if (!pending_del_nr) {
4688                                 /* no pending yet, add ourselves */
4689                                 pending_del_slot = path->slots[0];
4690                                 pending_del_nr = 1;
4691                         } else if (pending_del_nr &&
4692                                    path->slots[0] + 1 == pending_del_slot) {
4693                                 /* hop on the pending chunk */
4694                                 pending_del_nr++;
4695                                 pending_del_slot = path->slots[0];
4696                         } else {
4697                                 BUG();
4698                         }
4699                 } else {
4700                         break;
4701                 }
4702                 should_throttle = false;
4703 
4704                 if (found_extent &&
4705                     (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4706                      root == fs_info->tree_root)) {
4707                         btrfs_set_path_blocking(path);
4708                         bytes_deleted += extent_num_bytes;
4709                         ret = btrfs_free_extent(trans, root, extent_start,
4710                                                 extent_num_bytes, 0,
4711                                                 btrfs_header_owner(leaf),
4712                                                 ino, extent_offset);
4713                         BUG_ON(ret);
4714                         if (btrfs_should_throttle_delayed_refs(trans, fs_info))
4715                                 btrfs_async_run_delayed_refs(fs_info,
4716                                         trans->delayed_ref_updates * 2,
4717                                         trans->transid, 0);
4718                         if (be_nice) {
4719                                 if (truncate_space_check(trans, root,
4720                                                          extent_num_bytes)) {
4721                                         should_end = true;
4722                                 }
4723                                 if (btrfs_should_throttle_delayed_refs(trans,
4724                                                                        fs_info))
4725                                         should_throttle = true;
4726                         }
4727                 }
4728 
4729                 if (found_type == BTRFS_INODE_ITEM_KEY)
4730                         break;
4731 
4732                 if (path->slots[0] == 0 ||
4733                     path->slots[0] != pending_del_slot ||
4734                     should_throttle || should_end) {
4735                         if (pending_del_nr) {
4736                                 ret = btrfs_del_items(trans, root, path,
4737                                                 pending_del_slot,
4738                                                 pending_del_nr);
4739                                 if (ret) {
4740                                         btrfs_abort_transaction(trans, ret);
4741                                         goto error;
4742                                 }
4743                                 pending_del_nr = 0;
4744                         }
4745                         btrfs_release_path(path);
4746                         if (should_throttle) {
4747                                 unsigned long updates = trans->delayed_ref_updates;
4748                                 if (updates) {
4749                                         trans->delayed_ref_updates = 0;
4750                                         ret = btrfs_run_delayed_refs(trans,
4751                                                                    fs_info,
4752                                                                    updates * 2);
4753                                         if (ret && !err)
4754                                                 err = ret;
4755                                 }
4756                         }
4757                         /*
4758                          * if we failed to refill our space rsv, bail out
4759                          * and let the transaction restart
4760                          */
4761                         if (should_end) {
4762                                 err = -EAGAIN;
4763                                 goto error;
4764                         }
4765                         goto search_again;
4766                 } else {
4767                         path->slots[0]--;
4768                 }
4769         }
4770 out:
4771         if (pending_del_nr) {
4772                 ret = btrfs_del_items(trans, root, path, pending_del_slot,
4773                                       pending_del_nr);
4774                 if (ret)
4775                         btrfs_abort_transaction(trans, ret);
4776         }
4777 error:
4778         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4779                 ASSERT(last_size >= new_size);
4780                 if (!err && last_size > new_size)
4781                         last_size = new_size;
4782                 btrfs_ordered_update_i_size(inode, last_size, NULL);
4783         }
4784 
4785         btrfs_free_path(path);
4786 
4787         if (be_nice && bytes_deleted > SZ_32M) {
4788                 unsigned long updates = trans->delayed_ref_updates;
4789                 if (updates) {
4790                         trans->delayed_ref_updates = 0;
4791                         ret = btrfs_run_delayed_refs(trans, fs_info,
4792                                                      updates * 2);
4793                         if (ret && !err)
4794                                 err = ret;
4795                 }
4796         }
4797         return err;
4798 }
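
     /*
      * For reference, a sketch of the two min_type values used by the
      * callers in this file: a regular truncate keeps the inode item and
      * xattrs and only drops file extents beyond the new size, while
      * inode eviction passes 0 to delete every item the inode owns:
      *
      *   err = btrfs_truncate_inode_items(trans, root, inode,
      *                                    inode->i_size,
      *                                    BTRFS_EXTENT_DATA_KEY);
      *   err = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
      *
      * A return of -EAGAIN asks the caller to end the transaction and
      * retry, and NEED_TRUNCATE_BLOCK asks it to zero the last partial
      * block via btrfs_truncate_block().
      */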
4799 
4800 /*
4801  * btrfs_truncate_block - read, zero a chunk and write a block
4802  * @inode - inode that we're zeroing
4803  * @from - the offset to start zeroing
4804  * @len - the length to zero, 0 to zero the entire range relative to the
4805  *      offset
4806  * @front - zero up to the offset instead of from the offset on
4807  *
4808  * This will find the block for the "from" offset, cow the block and zero the
4809  * part we want to zero.  This is used with truncate and hole punching.
4810  */
4811 int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
4812                         int front)
4813 {
4814         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4815         struct address_space *mapping = inode->i_mapping;
4816         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4817         struct btrfs_ordered_extent *ordered;
4818         struct extent_state *cached_state = NULL;
4819         struct extent_changeset *data_reserved = NULL;
4820         char *kaddr;
4821         u32 blocksize = fs_info->sectorsize;
4822         pgoff_t index = from >> PAGE_SHIFT;
4823         unsigned offset = from & (blocksize - 1);
4824         struct page *page;
4825         gfp_t mask = btrfs_alloc_write_mask(mapping);
4826         int ret = 0;
4827         u64 block_start;
4828         u64 block_end;
4829 
4830         if (IS_ALIGNED(offset, blocksize) &&
4831             (!len || IS_ALIGNED(len, blocksize)))
4832                 goto out;
4833 
4834         block_start = round_down(from, blocksize);
4835         block_end = block_start + blocksize - 1;
4836 
4837         ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
4838                                            block_start, blocksize);
4839         if (ret)
4840                 goto out;
4841 
4842 again:
4843         page = find_or_create_page(mapping, index, mask);
4844         if (!page) {
4845                 btrfs_delalloc_release_space(inode, data_reserved,
4846                                              block_start, blocksize);
4847                 btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
4848                 ret = -ENOMEM;
4849                 goto out;
4850         }
4851 
4852         if (!PageUptodate(page)) {
4853                 ret = btrfs_readpage(NULL, page);
4854                 lock_page(page);
4855                 if (page->mapping != mapping) {
4856                         unlock_page(page);
4857                         put_page(page);
4858                         goto again;
4859                 }
4860                 if (!PageUptodate(page)) {
4861                         ret = -EIO;
4862                         goto out_unlock;
4863                 }
4864         }
4865         wait_on_page_writeback(page);
4866 
4867         lock_extent_bits(io_tree, block_start, block_end, &cached_state);
4868         set_page_extent_mapped(page);
4869 
4870         ordered = btrfs_lookup_ordered_extent(inode, block_start);
4871         if (ordered) {
4872                 unlock_extent_cached(io_tree, block_start, block_end,
4873                                      &cached_state);
4874                 unlock_page(page);
4875                 put_page(page);
4876                 btrfs_start_ordered_extent(inode, ordered, 1);
4877                 btrfs_put_ordered_extent(ordered);
4878                 goto again;
4879         }
4880 
4881         clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end,
4882                           EXTENT_DIRTY | EXTENT_DELALLOC |
4883                           EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4884                           0, 0, &cached_state);
4885 
4886         ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
4887                                         &cached_state, 0);
4888         if (ret) {
4889                 unlock_extent_cached(io_tree, block_start, block_end,
4890                                      &cached_state);
4891                 goto out_unlock;
4892         }
4893 
4894         if (offset != blocksize) {
4895                 if (!len)
4896                         len = blocksize - offset;
4897                 kaddr = kmap(page);
4898                 if (front)
4899                         memset(kaddr + (block_start - page_offset(page)),
4900                                 0, offset);
4901                 else
4902                         memset(kaddr + (block_start - page_offset(page)) +  offset,
4903                                 0, len);
4904                 flush_dcache_page(page);
4905                 kunmap(page);
4906         }
4907         ClearPageChecked(page);
4908         set_page_dirty(page);
4909         unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
4910 
4911 out_unlock:
4912         if (ret)
4913                 btrfs_delalloc_release_space(inode, data_reserved, block_start,
4914                                              blocksize);
4915         btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
4916         unlock_page(page);
4917         put_page(page);
4918 out:
4919         extent_changeset_free(data_reserved);
4920         return ret;
4921 }
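
     /*
      * For reference, a sketch of typical calls: an expanding truncate
      * zeroes from the old EOF to the end of its block,
      *
      *   btrfs_truncate_block(inode, oldsize, 0, 0);
      *
      * while hole punching zeroes the partial blocks at both ends of the
      * hole, passing front = 1 for the block where the hole ends:
      *
      *   btrfs_truncate_block(inode, offset, 0, 0);
      *   btrfs_truncate_block(inode, offset + len, 0, 1);
      */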
4922 
4923 static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
4924                              u64 offset, u64 len)
4925 {
4926         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4927         struct btrfs_trans_handle *trans;
4928         int ret;
4929 
4930         /*
4931          * Still need to make sure the inode looks like it's been updated so
4932          * that any holes get logged if we fsync.
4933          */
4934         if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
4935                 BTRFS_I(inode)->last_trans = fs_info->generation;
4936                 BTRFS_I(inode)->last_sub_trans = root->log_transid;
4937                 BTRFS_I(inode)->last_log_commit = root->last_log_commit;
4938                 return 0;
4939         }
4940 
4941         /*
4942          * 1 - for the one we're dropping
4943          * 1 - for the one we're adding
4944          * 1 - for updating the inode.
4945          */
4946         trans = btrfs_start_transaction(root, 3);
4947         if (IS_ERR(trans))
4948                 return PTR_ERR(trans);
4949 
4950         ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
4951         if (ret) {
4952                 btrfs_abort_transaction(trans, ret);
4953                 btrfs_end_transaction(trans);
4954                 return ret;
4955         }
4956 
4957         ret = btrfs_insert_file_extent(trans, root, btrfs_ino(BTRFS_I(inode)),
4958                         offset, 0, 0, len, 0, len, 0, 0, 0);
4959         if (ret)
4960                 btrfs_abort_transaction(trans, ret);
4961         else
4962                 btrfs_update_inode(trans, root, inode);
4963         btrfs_end_transaction(trans);
4964         return ret;
4965 }
4966 
4967 /*
4968  * This function puts in dummy file extents for the area we're creating a hole
4969  * for.  So if we are truncating this file to a larger size we need to insert
4970  * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE
4971  * for the range between oldsize and size.
4972  */
4973 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4974 {
4975         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4976         struct btrfs_root *root = BTRFS_I(inode)->root;
4977         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4978         struct extent_map *em = NULL;
4979         struct extent_state *cached_state = NULL;
4980         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4981         u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
4982         u64 block_end = ALIGN(size, fs_info->sectorsize);
4983         u64 last_byte;
4984         u64 cur_offset;
4985         u64 hole_size;
4986         int err = 0;
4987 
4988         /*
4989          * If our size started in the middle of a block we need to zero out the
4990          * rest of the block before we expand the i_size, otherwise we could
4991          * expose stale data.
4992          */
4993         err = btrfs_truncate_block(inode, oldsize, 0, 0);
4994         if (err)
4995                 return err;
4996 
4997         if (size <= hole_start)
4998                 return 0;
4999 
5000         while (1) {
5001                 struct btrfs_ordered_extent *ordered;
5002 
5003                 lock_extent_bits(io_tree, hole_start, block_end - 1,
5004                                  &cached_state);
5005                 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), hole_start,
5006                                                      block_end - hole_start);
5007                 if (!ordered)
5008                         break;
5009                 unlock_extent_cached(io_tree, hole_start, block_end - 1,
5010                                      &cached_state);
5011                 btrfs_start_ordered_extent(inode, ordered, 1);
5012                 btrfs_put_ordered_extent(ordered);
5013         }
5014 
5015         cur_offset = hole_start;
5016         while (1) {
5017                 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
5018                                 block_end - cur_offset, 0);
5019                 if (IS_ERR(em)) {
5020                         err = PTR_ERR(em);
5021                         em = NULL;
5022                         break;
5023                 }
5024                 last_byte = min(extent_map_end(em), block_end);
5025                 last_byte = ALIGN(last_byte, fs_info->sectorsize);
5026                 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
5027                         struct extent_map *hole_em;
5028                         hole_size = last_byte - cur_offset;
5029 
5030                         err = maybe_insert_hole(root, inode, cur_offset,
5031                                                 hole_size);
5032                         if (err)
5033                                 break;
5034                         btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
5035                                                 cur_offset + hole_size - 1, 0);
5036                         hole_em = alloc_extent_map();
5037                         if (!hole_em) {
5038                                 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5039                                         &BTRFS_I(inode)->runtime_flags);
5040                                 goto next;
5041                         }
5042                         hole_em->start = cur_offset;
5043                         hole_em->len = hole_size;
5044                         hole_em->orig_start = cur_offset;
5045 
5046                         hole_em->block_start = EXTENT_MAP_HOLE;
5047                         hole_em->block_len = 0;
5048                         hole_em->orig_block_len = 0;
5049                         hole_em->ram_bytes = hole_size;
5050                         hole_em->bdev = fs_info->fs_devices->latest_bdev;
5051                         hole_em->compress_type = BTRFS_COMPRESS_NONE;
5052                         hole_em->generation = fs_info->generation;
5053 
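                             /*
                              * add_extent_mapping() fails with -EEXIST if
                              * another extent map still overlaps this range;
                              * drop the cached range again and retry until
                              * the hole extent map can be inserted.
                              */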
5054                         while (1) {
5055                                 write_lock(&em_tree->lock);
5056                                 err = add_extent_mapping(em_tree, hole_em, 1);
5057                                 write_unlock(&em_tree->lock);
5058                                 if (err != -EEXIST)
5059                                         break;
5060                                 btrfs_drop_extent_cache(BTRFS_I(inode),
5061                                                         cur_offset,
5062                                                         cur_offset +
5063                                                         hole_size - 1, 0);
5064                         }
5065                         free_extent_map(hole_em);
5066                 }
5067 next:
5068                 free_extent_map(em);
5069                 em = NULL;
5070                 cur_offset = last_byte;
5071                 if (cur_offset >= block_end)
5072                         break;
5073         }
5074         free_extent_map(em);
5075         unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state);
5076         return err;
5077 }
5078 
5079 static int btrfs_setsize(struct inode *inode, struct iattr *attr)
5080 {
5081         struct btrfs_root *root = BTRFS_I(inode)->root;
5082         struct btrfs_trans_handle *trans;
5083         loff_t oldsize = i_size_read(inode);
5084         loff_t newsize = attr->ia_size;
5085         int mask = attr->ia_valid;
5086         int ret;
5087 
5088         /*
5089          * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
5090          * special case where we need to update the times despite not having
5091          * these flags set.  For all other operations the VFS set these flags
5092          * explicitly if it wants a timestamp update.
5093          */
5094         if (newsize != oldsize) {
5095                 inode_inc_iversion(inode);
5096                 if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
5097                         inode->i_ctime = inode->i_mtime =
5098                                 current_time(inode);
5099         }
5100 
5101         if (newsize > oldsize) {
5102                 /*
5103                  * Don't do an expanding truncate while snapshotting is ongoing.
5104                  * This is to ensure the snapshot captures a fully consistent
5105                  * state of this file - if the snapshot captures this expanding
5106                  * truncation, it must capture all writes that happened before
5107                  * this truncation.
5108                  */
5109                 btrfs_wait_for_snapshot_creation(root);
5110                 ret = btrfs_cont_expand(inode, oldsize, newsize);
5111                 if (ret) {
5112                         btrfs_end_write_no_snapshotting(root);
5113                         return ret;
5114                 }
5115 
5116                 trans = btrfs_start_transaction(root, 1);
5117                 if (IS_ERR(trans)) {
5118                         btrfs_end_write_no_snapshotting(root);
5119                         return PTR_ERR(trans);
5120                 }
5121 
5122                 i_size_write(inode, newsize);
5123                 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
5124                 pagecache_isize_extended(inode, oldsize, newsize);
5125                 ret = btrfs_update_inode(trans, root, inode);
5126                 btrfs_end_write_no_snapshotting(root);
5127                 btrfs_end_transaction(trans);
5128         } else {
5129 
5130                 /*
5131                  * We're truncating a file that used to have good data down to
5132                  * zero. Make sure it gets into the ordered flush list so that
5133                  * any new writes get down to disk quickly.
5134                  */
5135                 if (newsize == 0)
5136                         set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
5137                                 &BTRFS_I(inode)->runtime_flags);
5138 
5139                 /*
5140                  * 1 for the orphan item we're going to add
5141                  * 1 for the orphan item deletion.
5142                  */
5143                 trans = btrfs_start_transaction(root, 2);
5144                 if (IS_ERR(trans))
5145                         return PTR_ERR(trans);
5146 
5147                 /*
5148                  * We need to do this in case we fail at _any_ point during
5149                  * the actual truncate.  Once we do the truncate_setsize we
5150                  * could invalidate pages, which forces any outstanding
5151                  * ordered io to be instantly completed, handing us extents
5152                  * that need to be truncated.  If we then fail to add the
5153                  * orphan item, we could have leftover extents that were
5154                  * never meant to live, so we need to guarantee from this
5155                  * point on that everything will be consistent.
5156                  */
5157                 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
5158                 btrfs_end_transaction(trans);
5159                 if (ret)
5160                         return ret;
5161 
5162                 /* we don't support swapfiles, so truncate_setsize() shouldn't fail */
5163                 truncate_setsize(inode, newsize);
5164 
5165                 /* Disable nonlocked read DIO to avoid the endless truncate */
5166                 btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
5167                 inode_dio_wait(inode);
5168                 btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
5169 
5170                 ret = btrfs_truncate(inode);
5171                 if (ret && inode->i_nlink) {
5172                         int err;
5173 
5174                         /* To get a stable disk_i_size */
5175                         err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
5176                         if (err) {
5177                                 btrfs_orphan_del(NULL, BTRFS_I(inode));
5178                                 return err;
5179                         }
5180 
5181                         /*
5182                          * Truncate failed. disk_i_size is only adjusted
5183                          * down as we remove extents, so it should represent
5184                          * the true size of the inode. Reset the in-memory
5185                          * size and delete our orphan entry.
5186                          */
5187                         trans = btrfs_join_transaction(root);
5188                         if (IS_ERR(trans)) {
5189                                 btrfs_orphan_del(NULL, BTRFS_I(inode));
5190                                 return ret;
5191                         }
5192                         i_size_write(inode, BTRFS_I(inode)->disk_i_size);
5193                         err = btrfs_orphan_del(trans, BTRFS_I(inode));
5194                         if (err)
5195                                 btrfs_abort_transaction(trans, err);
5196                         btrfs_end_transaction(trans);
5197                 }
5198         }
5199 
5200         return ret;
5201 }
5202 
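     /*
      * btrfs_setattr - the ->setattr() inode operation for btrfs.
      *
      * (Descriptive summary added for readability.)  Rejects changes on
      * read-only subvolumes, validates the request with setattr_prepare(),
      * routes size changes through btrfs_setsize(), then copies the remaining
      * attributes into the inode and reapplies the POSIX ACLs when the mode
      * changes.
      */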
5203 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
5204 {
5205         struct inode *inode = d_inode(dentry);
5206         struct btrfs_root *root = BTRFS_I(inode)->root;
5207         int err;
5208 
5209         if (btrfs_root_readonly(root))
5210                 return -EROFS;
5211 
5212         err = setattr_prepare(dentry, attr);
5213         if (err)
5214                 return err;
5215 
5216         if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
5217                 err = btrfs_setsize(inode, attr);
5218                 if (err)
5219                         return err;
5220         }
5221 
5222         if (attr->ia_valid) {
5223                 setattr_copy(inode, attr);
5224                 inode_inc_iversion(inode);
5225                 err = btrfs_dirty_inode(inode);
5226 
5227                 if (!err && attr->ia_valid & ATTR_MODE)
5228                         err = posix_acl_chmod(inode, inode->i_mode);
5229         }
5230 
5231         return err;
5232 }
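
     /*
      * For illustration: a size change from userspace reaches the two
      * functions above roughly as
      *
      *   truncate(2) / ftruncate(2)
      *     -> do_truncate()
      *       -> notify_change()
      *         -> btrfs_setattr()
      *           -> btrfs_setsize()
      *             -> btrfs_truncate()    (shrinking case)
      */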
5233 
5234 /*
5235  * While truncating the inode pages during eviction, we get the VFS calling
5236  * btrfs_invalidatepage() against each page of the inode. This is slow because
5237  * the calls to btrfs_invalidatepage() result in a huge number of calls to
5238  * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
5239  * extent_state structures over and over, wasting lots of time.
5240  *
5241  * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
5242  * those expensive operations on a per-page basis and do only the ordered io
5243  * finishing, while we release here the extent_map and extent_state structures,
5244  * without the excessive merging and splitting.
5245  */
5246 static void evict_inode_truncate_pages(struct inode *inode)
5247 {
5248         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5249         struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
5250         struct rb_node *node;
5251 
5252         ASSERT(inode->i_state & I_FREEING);
5253         truncate_inode_pages_final(&inode->i_data);
5254 
5255         write_lock(&map_tree->lock);
5256         while (!RB_EMPTY_ROOT(&map_tree->map)) {
5257                 struct extent_map *em;
5258 
5259                 node = rb_first(&map_tree->map);
5260                 em = rb_entry(node, struct extent_map, rb_node);
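                     /*
                      * The inode is going away; clear PINNED and LOGGING first,
                      * since remove_extent_mapping() warns about pinned extent
                      * maps and nothing can rely on these flags anymore.
                      */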
5261                 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
5262                 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
5263                 remove_extent_mapping(map_tree, em);
5264                 free_extent_map(em);
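                     /*
                      * An inode can carry a large number of extent maps, so
                      * drop the write lock and reschedule when needed instead
                      * of hogging the CPU for the whole teardown.
                      */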
5265                 if (need_resched()) {
5266                         write_unlock(&map_tree->lock);
5267                         cond_resched();
5268                         write_lock(&map_tree->lock);
5269                 }
5270         }
5271         write_unlock(&map_tree->lock);
5272 
5273         /*
5274          * Keep looping until we have no more ranges in the io tree.
5275          * We can have ongoing bios started by readpages (called from readahead)
5276          * whose endio callback (extent_io.c:end_bio_extent_readpage) is
5277          * still in progress (it has unlocked the pages in the bio but not
5278          * yet unlocked the ranges in the io tree). This means some ranges
5279          * can still be locked while eviction has started, because those
5280          * bios are executed by a separate task (a work queue kthread) and
5281          * were submitted without taking inode references (inode->i_count)
5282          * that would otherwise be dropped in the end io callback of each
5283          * bio. Therefore here we effectively end up waiting for those
5284          * bios and for anyone else holding locked ranges without having
5285          * bumped the inode's reference count - if we don't, by the time
5286          * they access the inode's io_tree to unlock a range it may be too
5287          * late, leading to a use-after-free issue.
5288          */
5289         spin_lock(&io_tree->lock);
5290         while (!RB_EMPTY_ROOT(&io_tree->state)) {