Linux/fs/btrfs/inode.c

/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/aio.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/mount.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "backref.h"
#include "hash.h"
#include "props.h"

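/*
 * args passed through iget5_locked() so btrfs_iget() can match an
 * inode by its key and the root it belongs to
 */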
struct btrfs_iget_args {
        struct btrfs_key *location;
        struct btrfs_root *root;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
static struct kmem_cache *btrfs_delalloc_work_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;

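/*
 * map the S_IFMT bits of an inode's mode to the BTRFS_FT_* file type
 * stored in directory items
 */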
#define S_SHIFT 12
static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
        [S_IFREG >> S_SHIFT]    = BTRFS_FT_REG_FILE,
        [S_IFDIR >> S_SHIFT]    = BTRFS_FT_DIR,
        [S_IFCHR >> S_SHIFT]    = BTRFS_FT_CHRDEV,
        [S_IFBLK >> S_SHIFT]    = BTRFS_FT_BLKDEV,
        [S_IFIFO >> S_SHIFT]    = BTRFS_FT_FIFO,
        [S_IFSOCK >> S_SHIFT]   = BTRFS_FT_SOCK,
        [S_IFLNK >> S_SHIFT]    = BTRFS_FT_SYMLINK,
};

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, int *page_started,
                                   unsigned long *nr_written, int unlock);
static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
                                           u64 len, u64 orig_start,
                                           u64 block_start, u64 block_len,
                                           u64 orig_block_len, u64 ram_bytes,
                                           int type);

static int btrfs_dirty_inode(struct inode *inode);

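/*
 * initialize the security bits of a new inode: inherit ACLs from the
 * parent directory, then set up the initial security xattr
 */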
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
                                     struct inode *inode, struct inode *dir,
                                     const struct qstr *qstr)
{
        int err;

        err = btrfs_init_acl(trans, inode, dir);
        if (!err)
                err = btrfs_xattr_security_init(trans, inode, dir, qstr);
        return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_path *path, int extent_inserted,
                                struct btrfs_root *root, struct inode *inode,
                                u64 start, size_t size, size_t compressed_size,
                                int compress_type,
                                struct page **compressed_pages)
{
        struct extent_buffer *leaf;
        struct page *page = NULL;
        char *kaddr;
        unsigned long ptr;
        struct btrfs_file_extent_item *ei;
        int err = 0;
        int ret;
        size_t cur_size = size;
        unsigned long offset;

        if (compressed_size && compressed_pages)
                cur_size = compressed_size;

        inode_add_bytes(inode, size);

        if (!extent_inserted) {
                struct btrfs_key key;
                size_t datasize;

                key.objectid = btrfs_ino(inode);
                key.offset = start;
                btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);

                datasize = btrfs_file_extent_calc_inline_size(cur_size);
                path->leave_spinning = 1;
                ret = btrfs_insert_empty_item(trans, root, path, &key,
                                              datasize);
                if (ret) {
                        err = ret;
                        goto fail;
                }
        }
        leaf = path->nodes[0];
        ei = btrfs_item_ptr(leaf, path->slots[0],
                            struct btrfs_file_extent_item);
        btrfs_set_file_extent_generation(leaf, ei, trans->transid);
        btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
        btrfs_set_file_extent_encryption(leaf, ei, 0);
        btrfs_set_file_extent_other_encoding(leaf, ei, 0);
        btrfs_set_file_extent_ram_bytes(leaf, ei, size);
        ptr = btrfs_file_extent_inline_start(ei);

        if (compress_type != BTRFS_COMPRESS_NONE) {
                struct page *cpage;
                int i = 0;
                while (compressed_size > 0) {
                        cpage = compressed_pages[i];
                        cur_size = min_t(unsigned long, compressed_size,
                                       PAGE_CACHE_SIZE);

                        kaddr = kmap_atomic(cpage);
                        write_extent_buffer(leaf, kaddr, ptr, cur_size);
                        kunmap_atomic(kaddr);

                        i++;
                        ptr += cur_size;
                        compressed_size -= cur_size;
                }
                btrfs_set_file_extent_compression(leaf, ei,
                                                  compress_type);
        } else {
                page = find_get_page(inode->i_mapping,
                                     start >> PAGE_CACHE_SHIFT);
                btrfs_set_file_extent_compression(leaf, ei, 0);
                kaddr = kmap_atomic(page);
                offset = start & (PAGE_CACHE_SIZE - 1);
                write_extent_buffer(leaf, kaddr + offset, ptr, size);
                kunmap_atomic(kaddr);
                page_cache_release(page);
        }
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(path);

        /*
         * we're an inline extent, so nobody can
         * extend the file past i_size without locking
         * a page we already have locked.
         *
         * We must do any isize and inode updates
         * before we unlock the pages.  Otherwise we
         * could end up racing with unlink.
         */
        BTRFS_I(inode)->disk_i_size = inode->i_size;
        ret = btrfs_update_inode(trans, root, inode);

        return ret;
fail:
        return err;
}


/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct btrfs_root *root,
                                          struct inode *inode, u64 start,
                                          u64 end, size_t compressed_size,
                                          int compress_type,
                                          struct page **compressed_pages)
{
        struct btrfs_trans_handle *trans;
        u64 isize = i_size_read(inode);
        u64 actual_end = min(end + 1, isize);
        u64 inline_len = actual_end - start;
        u64 aligned_end = ALIGN(end, root->sectorsize);
        u64 data_len = inline_len;
        int ret;
        struct btrfs_path *path;
        int extent_inserted = 0;
        u32 extent_item_size;

        if (compressed_size)
                data_len = compressed_size;

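        /*
         * an extent can only be inlined if it starts at offset zero,
         * fits within one page and the max_inline mount limit, and does
         * not end short of i_size; uncompressed data must also not end
         * on a sector boundary (a full sector is better written as a
         * regular extent)
         */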
        if (start > 0 ||
            actual_end >= PAGE_CACHE_SIZE ||
            data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
            (!compressed_size &&
            (actual_end & (root->sectorsize - 1)) == 0) ||
            end + 1 < isize ||
            data_len > root->fs_info->max_inline) {
                return 1;
        }

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
                return PTR_ERR(trans);
        }
        trans->block_rsv = &root->fs_info->delalloc_block_rsv;

        if (compressed_size && compressed_pages)
                extent_item_size = btrfs_file_extent_calc_inline_size(
                   compressed_size);
        else
                extent_item_size = btrfs_file_extent_calc_inline_size(
                    inline_len);

        ret = __btrfs_drop_extents(trans, root, inode, path,
                                   start, aligned_end, NULL,
                                   1, 1, extent_item_size, &extent_inserted);
        if (ret) {
                btrfs_abort_transaction(trans, root, ret);
                goto out;
        }

        if (isize > actual_end)
                inline_len = min_t(u64, isize, actual_end);
        ret = insert_inline_extent(trans, path, extent_inserted,
                                   root, inode, start,
                                   inline_len, compressed_size,
                                   compress_type, compressed_pages);
        if (ret && ret != -ENOSPC) {
                btrfs_abort_transaction(trans, root, ret);
                goto out;
        } else if (ret == -ENOSPC) {
                ret = 1;
                goto out;
        }

        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
        btrfs_delalloc_release_metadata(inode, end + 1 - start);
        btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
out:
        btrfs_free_path(path);
        btrfs_end_transaction(trans, root);
        return ret;
}

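/*
 * one contiguous extent produced by the compression phase, queued
 * until the submission phase allocates disk space and writes it out
 */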
struct async_extent {
        u64 start;
        u64 ram_size;
        u64 compressed_size;
        struct page **pages;
        unsigned long nr_pages;
        int compress_type;
        struct list_head list;
};

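/*
 * a delalloc range handed to the async workers, along with the list
 * of async_extents the compression phase builds for it
 */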
struct async_cow {
        struct inode *inode;
        struct btrfs_root *root;
        struct page *locked_page;
        u64 start;
        u64 end;
        struct list_head extents;
        struct btrfs_work work;
};

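/*
 * queue a compressed extent (or an uncompressed one, when pages is
 * NULL) on the async_cow list for the submission phase
 */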
static noinline int add_async_extent(struct async_cow *cow,
                                     u64 start, u64 ram_size,
                                     u64 compressed_size,
                                     struct page **pages,
                                     unsigned long nr_pages,
                                     int compress_type)
{
        struct async_extent *async_extent;

        async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
        BUG_ON(!async_extent); /* -ENOMEM */
        async_extent->start = start;
        async_extent->ram_size = ram_size;
        async_extent->compressed_size = compressed_size;
        async_extent->pages = pages;
        async_extent->nr_pages = nr_pages;
        async_extent->compress_type = compress_type;
        list_add_tail(&async_extent->list, &cow->extents);
        return 0;
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
 */
static noinline int compress_file_range(struct inode *inode,
                                        struct page *locked_page,
                                        u64 start, u64 end,
                                        struct async_cow *async_cow,
                                        int *num_added)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 num_bytes;
        u64 blocksize = root->sectorsize;
        u64 actual_end;
        u64 isize = i_size_read(inode);
        int ret = 0;
        struct page **pages = NULL;
        unsigned long nr_pages;
        unsigned long nr_pages_ret = 0;
        unsigned long total_compressed = 0;
        unsigned long total_in = 0;
        unsigned long max_compressed = 128 * 1024;
        unsigned long max_uncompressed = 128 * 1024;
        int i;
        int will_compress;
        int compress_type = root->fs_info->compress_type;
        int redirty = 0;

        /* if this is a small write inside eof, kick off a defrag */
        if ((end - start + 1) < 16 * 1024 &&
            (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                btrfs_add_inode_defrag(NULL, inode);

        /*
         * skip compression for a small file range (<= blocksize) that
         * isn't an inline extent, since it doesn't save disk space at all.
         */
        if ((end - start + 1) <= blocksize &&
            (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                goto cleanup_and_bail_uncompressed;

        actual_end = min_t(u64, isize, end + 1);
again:
        will_compress = 0;
        nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
        nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);

        /*
         * we don't want to send crud past the end of i_size through
         * compression, that's just a waste of CPU time.  So, if the
         * end of the file is before the start of our current
         * requested range of bytes, we bail out to the uncompressed
         * cleanup code that can deal with all of this.
         *
         * It isn't really the fastest way to fix things, but this is a
         * very uncommon corner.
         */
        if (actual_end <= start)
                goto cleanup_and_bail_uncompressed;

        total_compressed = actual_end - start;

        /* we want to make sure that the amount of ram required to uncompress
         * an extent is reasonable, so we limit the total size in ram
         * of a compressed extent to 128k.  This is a crucial number
         * because it also controls how easily we can spread reads across
         * cpus for decompression.
         *
         * We also want to make sure the amount of IO required to do
         * a random read is reasonably small, so we limit the size of
         * a compressed extent to 128k.
         */
        total_compressed = min(total_compressed, max_uncompressed);
        num_bytes = ALIGN(end - start + 1, blocksize);
        num_bytes = max(blocksize, num_bytes);
        total_in = 0;
        ret = 0;

        /*
         * we do compression for mount -o compress and when the
         * inode has not been flagged as nocompress.  This flag can
         * change at any time if we discover bad compression ratios.
         */
        if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
            (btrfs_test_opt(root, COMPRESS) ||
             (BTRFS_I(inode)->force_compress) ||
             (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
                WARN_ON(pages);
                pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
                if (!pages) {
                        /* just bail out to the uncompressed code */
                        goto cont;
                }

                if (BTRFS_I(inode)->force_compress)
                        compress_type = BTRFS_I(inode)->force_compress;

                /*
                 * we need to call clear_page_dirty_for_io on each
                 * page in the range.  Otherwise applications with the file
                 * mmap'd can wander in and change the page contents while
                 * we are compressing them.
                 *
                 * If the compression fails for any reason, we set the pages
                 * dirty again later on.
                 */
                extent_range_clear_dirty_for_io(inode, start, end);
                redirty = 1;
                ret = btrfs_compress_pages(compress_type,
                                           inode->i_mapping, start,
                                           total_compressed, pages,
                                           nr_pages, &nr_pages_ret,
                                           &total_in,
                                           &total_compressed,
                                           max_compressed);

                if (!ret) {
                        unsigned long offset = total_compressed &
                                (PAGE_CACHE_SIZE - 1);
                        struct page *page = pages[nr_pages_ret - 1];
                        char *kaddr;

                        /* zero the tail end of the last page, we might be
                         * sending it down to disk
                         */
                        if (offset) {
                                kaddr = kmap_atomic(page);
                                memset(kaddr + offset, 0,
                                       PAGE_CACHE_SIZE - offset);
                                kunmap_atomic(kaddr);
                        }
                        will_compress = 1;
                }
        }
cont:
        if (start == 0) {
                /* let's try to make an inline extent */
                if (ret || total_in < (actual_end - start)) {
                        /* we didn't compress the entire range, try
                         * to make an uncompressed inline extent.
                         */
                        ret = cow_file_range_inline(root, inode, start, end,
                                                    0, 0, NULL);
                } else {
                        /* try making a compressed inline extent */
                        ret = cow_file_range_inline(root, inode, start, end,
                                                    total_compressed,
                                                    compress_type, pages);
                }
                if (ret <= 0) {
                        unsigned long clear_flags = EXTENT_DELALLOC |
                                EXTENT_DEFRAG;
                        clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;

                        /*
                         * inline extent creation worked or returned error,
                         * we don't need to create any more async work items.
                         * Unlock and free up our temp pages.
                         */
                        extent_clear_unlock_delalloc(inode, start, end, NULL,
                                                     clear_flags, PAGE_UNLOCK |
                                                     PAGE_CLEAR_DIRTY |
                                                     PAGE_SET_WRITEBACK |
                                                     PAGE_END_WRITEBACK);
                        goto free_pages_out;
                }
        }

        if (will_compress) {
                /*
                 * we aren't doing an inline extent, so round the compressed
                 * size up to a block size boundary so the allocator does
                 * sane things
                 */
                total_compressed = ALIGN(total_compressed, blocksize);

                /*
                 * one last check to make sure the compression is really a
                 * win, compare the page count read with the blocks on disk
                 */
                total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
                if (total_compressed >= total_in) {
                        will_compress = 0;
                } else {
                        num_bytes = total_in;
                }
        }
        if (!will_compress && pages) {
                /*
                 * the compression code ran but failed to make things smaller,
                 * free any pages it allocated and our page pointer array
                 */
                for (i = 0; i < nr_pages_ret; i++) {
                        WARN_ON(pages[i]->mapping);
                        page_cache_release(pages[i]);
                }
                kfree(pages);
                pages = NULL;
                total_compressed = 0;
                nr_pages_ret = 0;

                /* flag the file so we don't compress in the future */
                if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
                    !(BTRFS_I(inode)->force_compress)) {
                        BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
                }
        }
        if (will_compress) {
                *num_added += 1;

                /* the async work queues will take care of doing actual
                 * allocation on disk for these compressed pages,
                 * and will submit them to the elevator.
                 */
                add_async_extent(async_cow, start, num_bytes,
                                 total_compressed, pages, nr_pages_ret,
                                 compress_type);

                if (start + num_bytes < end) {
                        start += num_bytes;
                        pages = NULL;
                        cond_resched();
                        goto again;
                }
        } else {
cleanup_and_bail_uncompressed:
                /*
                 * No compression, but we still need to write the pages in
                 * the file we've been given so far.  redirty the locked
                 * page if it corresponds to our extent and set things up
                 * for the async work queue to run cow_file_range to do
                 * the normal delalloc dance
                 */
                if (page_offset(locked_page) >= start &&
                    page_offset(locked_page) <= end) {
                        __set_page_dirty_nobuffers(locked_page);
                        /* unlocked later on in the async handlers */
                }
                if (redirty)
                        extent_range_redirty_for_io(inode, start, end);
                add_async_extent(async_cow, start, end - start + 1,
                                 0, NULL, 0, BTRFS_COMPRESS_NONE);
                *num_added += 1;
        }

out:
        return ret;

free_pages_out:
        for (i = 0; i < nr_pages_ret; i++) {
                WARN_ON(pages[i]->mapping);
                page_cache_release(pages[i]);
        }
        kfree(pages);

        goto out;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline int submit_compressed_extents(struct inode *inode,
                                              struct async_cow *async_cow)
{
        struct async_extent *async_extent;
        u64 alloc_hint = 0;
        struct btrfs_key ins;
        struct extent_map *em;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_io_tree *io_tree;
        int ret = 0;

        if (list_empty(&async_cow->extents))
                return 0;

again:
        while (!list_empty(&async_cow->extents)) {
                async_extent = list_entry(async_cow->extents.next,
                                          struct async_extent, list);
                list_del(&async_extent->list);

                io_tree = &BTRFS_I(inode)->io_tree;

retry:
                /* did the compression code fall back to uncompressed IO? */
                if (!async_extent->pages) {
                        int page_started = 0;
                        unsigned long nr_written = 0;

                        lock_extent(io_tree, async_extent->start,
                                         async_extent->start +
                                         async_extent->ram_size - 1);

                        /* allocate blocks */
                        ret = cow_file_range(inode, async_cow->locked_page,
                                             async_extent->start,
                                             async_extent->start +
                                             async_extent->ram_size - 1,
                                             &page_started, &nr_written, 0);

                        /* JDM XXX */

                        /*
                         * if page_started, cow_file_range inserted an
                         * inline extent and took care of all the unlocking
                         * and IO for us.  Otherwise, we need to submit
                         * all those pages down to the drive.
                         */
                        if (!page_started && !ret)
                                extent_write_locked_range(io_tree,
                                                  inode, async_extent->start,
                                                  async_extent->start +
                                                  async_extent->ram_size - 1,
                                                  btrfs_get_extent,
                                                  WB_SYNC_ALL);
                        else if (ret)
                                unlock_page(async_cow->locked_page);
                        kfree(async_extent);
                        cond_resched();
                        continue;
                }

                lock_extent(io_tree, async_extent->start,
                            async_extent->start + async_extent->ram_size - 1);

                ret = btrfs_reserve_extent(root,
                                           async_extent->compressed_size,
                                           async_extent->compressed_size,
                                           0, alloc_hint, &ins, 1, 1);
                if (ret) {
                        int i;

                        for (i = 0; i < async_extent->nr_pages; i++) {
                                WARN_ON(async_extent->pages[i]->mapping);
                                page_cache_release(async_extent->pages[i]);
                        }
                        kfree(async_extent->pages);
                        async_extent->nr_pages = 0;
                        async_extent->pages = NULL;

                        if (ret == -ENOSPC) {
                                unlock_extent(io_tree, async_extent->start,
                                              async_extent->start +
                                              async_extent->ram_size - 1);

                                /*
                                 * we need to redirty the pages if we decide to
                                 * fall back to uncompressed IO, otherwise we
                                 * will not submit these pages down to lower
                                 * layers.
                                 */
                                extent_range_redirty_for_io(inode,
                                                async_extent->start,
                                                async_extent->start +
                                                async_extent->ram_size - 1);

                                goto retry;
                        }
                        goto out_free;
                }

                /*
                 * here we're doing allocation and writeback of the
                 * compressed pages
                 */
                btrfs_drop_extent_cache(inode, async_extent->start,
                                        async_extent->start +
                                        async_extent->ram_size - 1, 0);

                em = alloc_extent_map();
                if (!em) {
                        ret = -ENOMEM;
                        goto out_free_reserve;
                }
                em->start = async_extent->start;
                em->len = async_extent->ram_size;
                em->orig_start = em->start;
                em->mod_start = em->start;
                em->mod_len = em->len;

                em->block_start = ins.objectid;
                em->block_len = ins.offset;
                em->orig_block_len = ins.offset;
                em->ram_bytes = async_extent->ram_size;
                em->bdev = root->fs_info->fs_devices->latest_bdev;
                em->compress_type = async_extent->compress_type;
                set_bit(EXTENT_FLAG_PINNED, &em->flags);
                set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
                em->generation = -1;

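                /*
                 * insert the new mapping; on -EEXIST, drop whatever
                 * cached extents overlap the range and try again
                 */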
                while (1) {
                        write_lock(&em_tree->lock);
                        ret = add_extent_mapping(em_tree, em, 1);
                        write_unlock(&em_tree->lock);
                        if (ret != -EEXIST) {
                                free_extent_map(em);
                                break;
                        }
                        btrfs_drop_extent_cache(inode, async_extent->start,
                                                async_extent->start +
                                                async_extent->ram_size - 1, 0);
                }

                if (ret)
                        goto out_free_reserve;

                ret = btrfs_add_ordered_extent_compress(inode,
                                                async_extent->start,
                                                ins.objectid,
                                                async_extent->ram_size,
                                                ins.offset,
                                                BTRFS_ORDERED_COMPRESSED,
                                                async_extent->compress_type);
                if (ret) {
                        btrfs_drop_extent_cache(inode, async_extent->start,
                                                async_extent->start +
                                                async_extent->ram_size - 1, 0);
                        goto out_free_reserve;
                }

                /*
                 * clear dirty, set writeback and unlock the pages.
                 */
                extent_clear_unlock_delalloc(inode, async_extent->start,
                                async_extent->start +
                                async_extent->ram_size - 1,
                                NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
                                PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
                                PAGE_SET_WRITEBACK);
                ret = btrfs_submit_compressed_write(inode,
                                    async_extent->start,
                                    async_extent->ram_size,
                                    ins.objectid,
                                    ins.offset, async_extent->pages,
                                    async_extent->nr_pages);
                alloc_hint = ins.objectid + ins.offset;
                kfree(async_extent);
                if (ret)
                        goto out;
                cond_resched();
        }
        ret = 0;
out:
        return ret;
out_free_reserve:
        btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_free:
        extent_clear_unlock_delalloc(inode, async_extent->start,
                                     async_extent->start +
                                     async_extent->ram_size - 1,
                                     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
                                     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
                                     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
                                     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
        kfree(async_extent);
        goto again;
}

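/*
 * look for an existing mapping near this range and use its disk block
 * as the allocation hint for the next extent we reserve
 */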
static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
                                      u64 num_bytes)
{
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_map *em;
        u64 alloc_hint = 0;

        read_lock(&em_tree->lock);
        em = search_extent_mapping(em_tree, start, num_bytes);
        if (em) {
                /*
                 * if block start isn't an actual block number then find the
                 * first block in this inode and use that as a hint.  If that
                 * block is also bogus then just don't worry about it.
                 */
                if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
                        free_extent_map(em);
                        em = search_extent_mapping(em_tree, 0, 0);
                        if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
                                alloc_hint = em->block_start;
                        if (em)
                                free_extent_map(em);
                } else {
                        alloc_hint = em->block_start;
                        free_extent_map(em);
                }
        }
        read_unlock(&em_tree->lock);

        return alloc_hint;
}

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the callbacks end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
 */
static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, int *page_started,
                                   unsigned long *nr_written,
                                   int unlock)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 alloc_hint = 0;
        u64 num_bytes;
        unsigned long ram_size;
        u64 disk_num_bytes;
        u64 cur_alloc_size;
        u64 blocksize = root->sectorsize;
        struct btrfs_key ins;
        struct extent_map *em;
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        int ret = 0;

        if (btrfs_is_free_space_inode(inode)) {
                WARN_ON_ONCE(1);
                ret = -EINVAL;
                goto out_unlock;
        }

        num_bytes = ALIGN(end - start + 1, blocksize);
        num_bytes = max(blocksize, num_bytes);
        disk_num_bytes = num_bytes;

        /* if this is a small write inside eof, kick off defrag */
        if (num_bytes < 64 * 1024 &&
            (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                btrfs_add_inode_defrag(NULL, inode);

        if (start == 0) {
                /* let's try to make an inline extent */
                ret = cow_file_range_inline(root, inode, start, end, 0, 0,
                                            NULL);
                if (ret == 0) {
                        extent_clear_unlock_delalloc(inode, start, end, NULL,
                                     EXTENT_LOCKED | EXTENT_DELALLOC |
                                     EXTENT_DEFRAG, PAGE_UNLOCK |
                                     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
                                     PAGE_END_WRITEBACK);

                        *nr_written = *nr_written +
                             (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
                        *page_started = 1;
                        goto out;
                } else if (ret < 0) {
                        goto out_unlock;
                }
        }

        BUG_ON(disk_num_bytes >
               btrfs_super_total_bytes(root->fs_info->super_copy));

        alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
        btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);

        while (disk_num_bytes > 0) {
                unsigned long op;

                cur_alloc_size = disk_num_bytes;
                ret = btrfs_reserve_extent(root, cur_alloc_size,
                                           root->sectorsize, 0, alloc_hint,
                                           &ins, 1, 1);
                if (ret < 0)
                        goto out_unlock;

                em = alloc_extent_map();
                if (!em) {
                        ret = -ENOMEM;
                        goto out_reserve;
                }
                em->start = start;
                em->orig_start = em->start;
                ram_size = ins.offset;
                em->len = ins.offset;
                em->mod_start = em->start;
                em->mod_len = em->len;

                em->block_start = ins.objectid;
                em->block_len = ins.offset;
                em->orig_block_len = ins.offset;
                em->ram_bytes = ram_size;
                em->bdev = root->fs_info->fs_devices->latest_bdev;
                set_bit(EXTENT_FLAG_PINNED, &em->flags);
                em->generation = -1;

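                /*
                 * insert the new mapping; on -EEXIST, drop whatever
                 * cached extents overlap the range and try again
                 */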
                while (1) {
                        write_lock(&em_tree->lock);
                        ret = add_extent_mapping(em_tree, em, 1);
                        write_unlock(&em_tree->lock);
                        if (ret != -EEXIST) {
                                free_extent_map(em);
                                break;
                        }
                        btrfs_drop_extent_cache(inode, start,
                                                start + ram_size - 1, 0);
                }
                if (ret)
                        goto out_reserve;

                cur_alloc_size = ins.offset;
                ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
                                               ram_size, cur_alloc_size, 0);
                if (ret)
                        goto out_drop_extent_cache;

                if (root->root_key.objectid ==
                    BTRFS_DATA_RELOC_TREE_OBJECTID) {
                        ret = btrfs_reloc_clone_csums(inode, start,
                                                      cur_alloc_size);
                        if (ret)
                                goto out_drop_extent_cache;
                }

                if (disk_num_bytes < cur_alloc_size)
                        break;

                /* we're not doing compressed IO, don't unlock the first
                 * page (which the caller expects to stay locked), don't
                 * clear any dirty bits and don't set any writeback bits
                 *
                 * Do set the Private2 bit so we know this page was properly
                 * set up for writepage
                 */
                op = unlock ? PAGE_UNLOCK : 0;
                op |= PAGE_SET_PRIVATE2;

                extent_clear_unlock_delalloc(inode, start,
                                             start + ram_size - 1, locked_page,
                                             EXTENT_LOCKED | EXTENT_DELALLOC,
                                             op);
                disk_num_bytes -= cur_alloc_size;
                num_bytes -= cur_alloc_size;
                alloc_hint = ins.objectid + ins.offset;
                start += cur_alloc_size;
        }
out:
        return ret;

out_drop_extent_cache:
        btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
out_reserve:
        btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_unlock:
        extent_clear_unlock_delalloc(inode, start, end, locked_page,
                                     EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
                                     EXTENT_DELALLOC | EXTENT_DEFRAG,
                                     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
                                     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
        goto out;
}

/*
 * work queue callback to start compression on a file's pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
        struct async_cow *async_cow;
        int num_added = 0;
        async_cow = container_of(work, struct async_cow, work);

        compress_file_range(async_cow->inode, async_cow->locked_page,
                            async_cow->start, async_cow->end, async_cow,
                            &num_added);
        if (num_added == 0) {
                btrfs_add_delayed_iput(async_cow->inode);
                async_cow->inode = NULL;
        }
}

/*
 * work queue callback to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
        struct async_cow *async_cow;
        struct btrfs_root *root;
        unsigned long nr_pages;

        async_cow = container_of(work, struct async_cow, work);

        root = async_cow->root;
        nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
                PAGE_CACHE_SHIFT;

        if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
            5 * 1024 * 1024 &&
            waitqueue_active(&root->fs_info->async_submit_wait))
                wake_up(&root->fs_info->async_submit_wait);

        if (async_cow->inode)
                submit_compressed_extents(async_cow->inode, async_cow);
}

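/*
 * free an async_cow record, releasing the inode reference taken when
 * the work was queued (via a delayed iput)
 */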
static noinline void async_cow_free(struct btrfs_work *work)
{
        struct async_cow *async_cow;
        async_cow = container_of(work, struct async_cow, work);
        if (async_cow->inode)
                btrfs_add_delayed_iput(async_cow->inode);
        kfree(async_cow);
}

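/*
 * split a delalloc range into 512k chunks, queue each one on the
 * delalloc workers, and throttle the caller once too many async
 * delalloc pages are outstanding
 */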
static int cow_file_range_async(struct inode *inode, struct page *locked_page,
                                u64 start, u64 end, int *page_started,
                                unsigned long *nr_written)
{
        struct async_cow *async_cow;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        unsigned long nr_pages;
        u64 cur_end;
        int limit = 10 * 1024 * 1024;

        clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
                         1, 0, NULL, GFP_NOFS);
        while (start < end) {
                async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
                BUG_ON(!async_cow); /* -ENOMEM */
                async_cow->inode = igrab(inode);
                async_cow->root = root;
                async_cow->locked_page = locked_page;
                async_cow->start = start;

                if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
                        cur_end = end;
                else
                        cur_end = min(end, start + 512 * 1024 - 1);

                async_cow->end = cur_end;
                INIT_LIST_HEAD(&async_cow->extents);

                btrfs_init_work(&async_cow->work,
                                btrfs_delalloc_helper,
                                async_cow_start, async_cow_submit,
                                async_cow_free);

                nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
                        PAGE_CACHE_SHIFT;
                atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);

                btrfs_queue_work(root->fs_info->delalloc_workers,
                                 &async_cow->work);

                if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
                        wait_event(root->fs_info->async_submit_wait,
                           (atomic_read(&root->fs_info->async_delalloc_pages) <
                            limit));
                }

                while (atomic_read(&root->fs_info->async_submit_draining) &&
                      atomic_read(&root->fs_info->async_delalloc_pages)) {
                        wait_event(root->fs_info->async_submit_wait,
                          (atomic_read(&root->fs_info->async_delalloc_pages) ==
                           0));
                }

                *nr_written += nr_pages;
                start = cur_end + 1;
        }
        *page_started = 1;
        return 0;
}

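/*
 * return 1 if any checksums exist for the given byte range, 0 if none
 * do; used to decide whether a nocow write must fall back to COW
 */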
static noinline int csum_exist_in_range(struct btrfs_root *root,
                                        u64 bytenr, u64 num_bytes)
{
        int ret;
        struct btrfs_ordered_sum *sums;
        LIST_HEAD(list);

        ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
                                       bytenr + num_bytes - 1, &list, 0);
        if (ret == 0 && list_empty(&list))
                return 0;

        while (!list_empty(&list)) {
                sums = list_entry(list.next, struct btrfs_ordered_sum, list);
                list_del(&sums->list);
                kfree(sums);
        }
        return 1;
}

/*
 * the nocow writeback path calls back into this code.  This checks for
 * snapshots or COW copies of the extents that exist in the file, and
 * COWs the file as required.
 *
 * If no cow copies or snapshots exist, we write directly to the existing
 * blocks on disk
 */
1164 static noinline int run_delalloc_nocow(struct inode *inode,
1165                                        struct page *locked_page,
1166                               u64 start, u64 end, int *page_started, int force,
1167                               unsigned long *nr_written)
1168 {
1169         struct btrfs_root *root = BTRFS_I(inode)->root;
1170         struct btrfs_trans_handle *trans;
1171         struct extent_buffer *leaf;
1172         struct btrfs_path *path;
1173         struct btrfs_file_extent_item *fi;
1174         struct btrfs_key found_key;
1175         u64 cow_start;
1176         u64 cur_offset;
1177         u64 extent_end;
1178         u64 extent_offset;
1179         u64 disk_bytenr;
1180         u64 num_bytes;
1181         u64 disk_num_bytes;
1182         u64 ram_bytes;
1183         int extent_type;
1184         int ret, err;
1185         int type;
1186         int nocow;
1187         int check_prev = 1;
1188         bool nolock;
1189         u64 ino = btrfs_ino(inode);
1190 
1191         path = btrfs_alloc_path();
1192         if (!path) {
1193                 extent_clear_unlock_delalloc(inode, start, end, locked_page,
1194                                              EXTENT_LOCKED | EXTENT_DELALLOC |
1195                                              EXTENT_DO_ACCOUNTING |
1196                                              EXTENT_DEFRAG, PAGE_UNLOCK |
1197                                              PAGE_CLEAR_DIRTY |
1198                                              PAGE_SET_WRITEBACK |
1199                                              PAGE_END_WRITEBACK);
1200                 return -ENOMEM;
1201         }
1202 
1203         nolock = btrfs_is_free_space_inode(inode);
1204 
1205         if (nolock)
1206                 trans = btrfs_join_transaction_nolock(root);
1207         else
1208                 trans = btrfs_join_transaction(root);
1209 
1210         if (IS_ERR(trans)) {
1211                 extent_clear_unlock_delalloc(inode, start, end, locked_page,
1212                                              EXTENT_LOCKED | EXTENT_DELALLOC |
1213                                              EXTENT_DO_ACCOUNTING |
1214                                              EXTENT_DEFRAG, PAGE_UNLOCK |
1215                                              PAGE_CLEAR_DIRTY |
1216                                              PAGE_SET_WRITEBACK |
1217                                              PAGE_END_WRITEBACK);
1218                 btrfs_free_path(path);
1219                 return PTR_ERR(trans);
1220         }
1221 
1222         trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1223 
1224         cow_start = (u64)-1;
1225         cur_offset = start;
1226         while (1) {
1227                 ret = btrfs_lookup_file_extent(trans, root, path, ino,
1228                                                cur_offset, 0);
1229                 if (ret < 0)
1230                         goto error;
1231                 if (ret > 0 && path->slots[0] > 0 && check_prev) {
1232                         leaf = path->nodes[0];
1233                         btrfs_item_key_to_cpu(leaf, &found_key,
1234                                               path->slots[0] - 1);
1235                         if (found_key.objectid == ino &&
1236                             found_key.type == BTRFS_EXTENT_DATA_KEY)
1237                                 path->slots[0]--;
1238                 }
1239                 check_prev = 0;
1240 next_slot:
1241                 leaf = path->nodes[0];
1242                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1243                         ret = btrfs_next_leaf(root, path);
1244                         if (ret < 0)
1245                                 goto error;
1246                         if (ret > 0)
1247                                 break;
1248                         leaf = path->nodes[0];
1249                 }
1250 
1251                 nocow = 0;
1252                 disk_bytenr = 0;
1253                 num_bytes = 0;
1254                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1255 
1256                 if (found_key.objectid > ino ||
1257                     found_key.type > BTRFS_EXTENT_DATA_KEY ||
1258                     found_key.offset > end)
1259                         break;
1260 
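                /*
                 * The next extent item starts beyond cur_offset, i.e. there
                 * is a hole in front of us; extent_type 0 sends out_check
                 * down the COW path below.
                 */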
1261                 if (found_key.offset > cur_offset) {
1262                         extent_end = found_key.offset;
1263                         extent_type = 0;
1264                         goto out_check;
1265                 }
1266 
1267                 fi = btrfs_item_ptr(leaf, path->slots[0],
1268                                     struct btrfs_file_extent_item);
1269                 extent_type = btrfs_file_extent_type(leaf, fi);
1270 
1271                 ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1272                 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1273                     extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1274                         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1275                         extent_offset = btrfs_file_extent_offset(leaf, fi);
1276                         extent_end = found_key.offset +
1277                                 btrfs_file_extent_num_bytes(leaf, fi);
1278                         disk_num_bytes =
1279                                 btrfs_file_extent_disk_num_bytes(leaf, fi);
1280                         if (extent_end <= start) {
1281                                 path->slots[0]++;
1282                                 goto next_slot;
1283                         }
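                        /*
                         * The remaining checks decide nocow eligibility: the
                         * extent must not be a hole, must carry no
                         * compression, encryption or other encoding, must be
                         * preallocated unless nocow is forced, must not sit
                         * in a readonly block group, and must not be shared
                         * via cross references.
                         */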
1284                         if (disk_bytenr == 0)
1285                                 goto out_check;
1286                         if (btrfs_file_extent_compression(leaf, fi) ||
1287                             btrfs_file_extent_encryption(leaf, fi) ||
1288                             btrfs_file_extent_other_encoding(leaf, fi))
1289                                 goto out_check;
1290                         if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1291                                 goto out_check;
1292                         if (btrfs_extent_readonly(root, disk_bytenr))
1293                                 goto out_check;
1294                         if (btrfs_cross_ref_exist(trans, root, ino,
1295                                                   found_key.offset -
1296                                                   extent_offset, disk_bytenr))
1297                                 goto out_check;
1298                         disk_bytenr += extent_offset;
1299                         disk_bytenr += cur_offset - found_key.offset;
1300                         num_bytes = min(end + 1, extent_end) - cur_offset;
1301                         /*
1302                          * if there are pending snapshots for this root,
1303                          * fall back to the common COW path.
1304                          */
1305                         if (!nolock) {
1306                                 err = btrfs_start_nocow_write(root);
1307                                 if (!err)
1308                                         goto out_check;
1309                         }
1310                         /*
1311                          * force COW if a csum exists in the range.
1312                          * this ensures that the csums for a given extent
1313                          * are either all valid or do not exist.
1314                          */
1315                         if (csum_exist_in_range(root, disk_bytenr, num_bytes))
1316                                 goto out_check;
1317                         nocow = 1;
1318                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1319                         extent_end = found_key.offset +
1320                                 btrfs_file_extent_inline_len(leaf,
1321                                                      path->slots[0], fi);
1322                         extent_end = ALIGN(extent_end, root->sectorsize);
1323                 } else {
1324                         BUG_ON(1);
1325                 }
1326 out_check:
1327                 if (extent_end <= start) {
1328                         path->slots[0]++;
1329                         if (!nolock && nocow)
1330                                 btrfs_end_nocow_write(root);
1331                         goto next_slot;
1332                 }
1333                 if (!nocow) {
1334                         if (cow_start == (u64)-1)
1335                                 cow_start = cur_offset;
1336                         cur_offset = extent_end;
1337                         if (cur_offset > end)
1338                                 break;
1339                         path->slots[0]++;
1340                         goto next_slot;
1341                 }
1342 
1343                 btrfs_release_path(path);
1344                 if (cow_start != (u64)-1) {
1345                         ret = cow_file_range(inode, locked_page,
1346                                              cow_start, found_key.offset - 1,
1347                                              page_started, nr_written, 1);
1348                         if (ret) {
1349                                 if (!nolock && nocow)
1350                                         btrfs_end_nocow_write(root);
1351                                 goto error;
1352                         }
1353                         cow_start = (u64)-1;
1354                 }
1355 
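                /*
                 * Writing into a preallocated extent: pin an extent map for
                 * the range so lookups see the pending IO, dropping any
                 * stale cached mapping (-EEXIST) and retrying the insert.
                 */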
1356                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1357                         struct extent_map *em;
1358                         struct extent_map_tree *em_tree;
1359                         em_tree = &BTRFS_I(inode)->extent_tree;
1360                         em = alloc_extent_map();
1361                         BUG_ON(!em); /* -ENOMEM */
1362                         em->start = cur_offset;
1363                         em->orig_start = found_key.offset - extent_offset;
1364                         em->len = num_bytes;
1365                         em->block_len = num_bytes;
1366                         em->block_start = disk_bytenr;
1367                         em->orig_block_len = disk_num_bytes;
1368                         em->ram_bytes = ram_bytes;
1369                         em->bdev = root->fs_info->fs_devices->latest_bdev;
1370                         em->mod_start = em->start;
1371                         em->mod_len = em->len;
1372                         set_bit(EXTENT_FLAG_PINNED, &em->flags);
1373                         set_bit(EXTENT_FLAG_FILLING, &em->flags);
1374                         em->generation = -1;
1375                         while (1) {
1376                                 write_lock(&em_tree->lock);
1377                                 ret = add_extent_mapping(em_tree, em, 1);
1378                                 write_unlock(&em_tree->lock);
1379                                 if (ret != -EEXIST) {
1380                                         free_extent_map(em);
1381                                         break;
1382                                 }
1383                                 btrfs_drop_extent_cache(inode, em->start,
1384                                                 em->start + em->len - 1, 0);
1385                         }
1386                         type = BTRFS_ORDERED_PREALLOC;
1387                 } else {
1388                         type = BTRFS_ORDERED_NOCOW;
1389                 }
1390 
1391                 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1392                                                num_bytes, num_bytes, type);
1393                 BUG_ON(ret); /* -ENOMEM */
1394 
1395                 if (root->root_key.objectid ==
1396                     BTRFS_DATA_RELOC_TREE_OBJECTID) {
1397                         ret = btrfs_reloc_clone_csums(inode, cur_offset,
1398                                                       num_bytes);
1399                         if (ret) {
1400                                 if (!nolock && nocow)
1401                                         btrfs_end_nocow_write(root);
1402                                 goto error;
1403                         }
1404                 }
1405 
1406                 extent_clear_unlock_delalloc(inode, cur_offset,
1407                                              cur_offset + num_bytes - 1,
1408                                              locked_page, EXTENT_LOCKED |
1409                                              EXTENT_DELALLOC, PAGE_UNLOCK |
1410                                              PAGE_SET_PRIVATE2);
1411                 if (!nolock && nocow)
1412                         btrfs_end_nocow_write(root);
1413                 cur_offset = extent_end;
1414                 if (cur_offset > end)
1415                         break;
1416         }
1417         btrfs_release_path(path);
1418 
1419         if (cur_offset <= end && cow_start == (u64)-1) {
1420                 cow_start = cur_offset;
1421                 cur_offset = end;
1422         }
1423 
1424         if (cow_start != (u64)-1) {
1425                 ret = cow_file_range(inode, locked_page, cow_start, end,
1426                                      page_started, nr_written, 1);
1427                 if (ret)
1428                         goto error;
1429         }
1430 
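        /*
         * The success path falls through here as well; end the transaction
         * and fold its status into ret.
         */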
1431 error:
1432         err = btrfs_end_transaction(trans, root);
1433         if (!ret)
1434                 ret = err;
1435 
1436         if (ret && cur_offset < end)
1437                 extent_clear_unlock_delalloc(inode, cur_offset, end,
1438                                              locked_page, EXTENT_LOCKED |
1439                                              EXTENT_DELALLOC | EXTENT_DEFRAG |
1440                                              EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1441                                              PAGE_CLEAR_DIRTY |
1442                                              PAGE_SET_WRITEBACK |
1443                                              PAGE_END_WRITEBACK);
1444         btrfs_free_path(path);
1445         return ret;
1446 }
1447 
1448 /*
1449  * extent_io.c callback to do delayed allocation processing
1450  */
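/*
 * Dispatch: NODATACOW inodes attempt nocow with force=1, inodes with the
 * PREALLOC flag attempt it with force=0 (only preallocated extents avoid
 * COW), plain uncompressed inodes take the synchronous COW path, and
 * anything that may compress goes through the async COW machinery.
 */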
1451 static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1452                               u64 start, u64 end, int *page_started,
1453                               unsigned long *nr_written)
1454 {
1455         int ret;
1456         struct btrfs_root *root = BTRFS_I(inode)->root;
1457 
1458         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
1459                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1460                                          page_started, 1, nr_written);
1461         } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) {
1462                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1463                                          page_started, 0, nr_written);
1464         } else if (!btrfs_test_opt(root, COMPRESS) &&
1465                    !(BTRFS_I(inode)->force_compress) &&
1466                    !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {
1467                 ret = cow_file_range(inode, locked_page, start, end,
1468                                       page_started, nr_written, 1);
1469         } else {
1470                 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1471                         &BTRFS_I(inode)->runtime_flags);
1472                 ret = cow_file_range_async(inode, locked_page, start, end,
1473                                            page_started, nr_written);
1474         }
1475         return ret;
1476 }
1477 
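/*
 * extent_io.c split_extent_hook: splitting one delalloc extent state in two
 * leaves one more outstanding extent to reserve metadata for.
 */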
1478 static void btrfs_split_extent_hook(struct inode *inode,
1479                                     struct extent_state *orig, u64 split)
1480 {
1481         /* not delalloc, ignore it */
1482         if (!(orig->state & EXTENT_DELALLOC))
1483                 return;
1484 
1485         spin_lock(&BTRFS_I(inode)->lock);
1486         BTRFS_I(inode)->outstanding_extents++;
1487         spin_unlock(&BTRFS_I(inode)->lock);
1488 }
1489 
1490 /*
1491  * extent_io.c merge_extent_hook, used to track merged delayed allocation
1492  * extents so we can keep track of new extents that are just merged onto old
1493  * extents, such as when we are doing sequential writes, so we can properly
1494  * account for the metadata space we'll need.
1495  */
1496 static void btrfs_merge_extent_hook(struct inode *inode,
1497                                     struct extent_state *new,
1498                                     struct extent_state *other)
1499 {
1500         /* not delalloc, ignore it */
1501         if (!(other->state & EXTENT_DELALLOC))
1502                 return;
1503 
1504         spin_lock(&BTRFS_I(inode)->lock);
1505         BTRFS_I(inode)->outstanding_extents--;
1506         spin_unlock(&BTRFS_I(inode)->lock);
1507 }
1508 
1509 static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1510                                       struct inode *inode)
1511 {
1512         spin_lock(&root->delalloc_lock);
1513         if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1514                 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1515                               &root->delalloc_inodes);
1516                 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1517                         &BTRFS_I(inode)->runtime_flags);
1518                 root->nr_delalloc_inodes++;
1519                 if (root->nr_delalloc_inodes == 1) {
1520                         spin_lock(&root->fs_info->delalloc_root_lock);
1521                         BUG_ON(!list_empty(&root->delalloc_root));
1522                         list_add_tail(&root->delalloc_root,
1523                                       &root->fs_info->delalloc_roots);
1524                         spin_unlock(&root->fs_info->delalloc_root_lock);
1525                 }
1526         }
1527         spin_unlock(&root->delalloc_lock);
1528 }
1529 
1530 static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1531                                      struct inode *inode)
1532 {
1533         spin_lock(&root->delalloc_lock);
1534         if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1535                 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1536                 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1537                           &BTRFS_I(inode)->runtime_flags);
1538                 root->nr_delalloc_inodes--;
1539                 if (!root->nr_delalloc_inodes) {
1540                         spin_lock(&root->fs_info->delalloc_root_lock);
1541                         BUG_ON(list_empty(&root->delalloc_root));
1542                         list_del_init(&root->delalloc_root);
1543                         spin_unlock(&root->fs_info->delalloc_root_lock);
1544                 }
1545         }
1546         spin_unlock(&root->delalloc_lock);
1547 }
1548 
1549 /*
1550  * extent_io.c set_bit_hook, used to track delayed allocation
1551  * bytes in this file, and to maintain the list of inodes that
1552  * have pending delalloc work to be done.
1553  */
1554 static void btrfs_set_bit_hook(struct inode *inode,
1555                                struct extent_state *state, unsigned long *bits)
1556 {
1557 
1558         /*
1559          * set_bit and clear_bit hooks normally require _irqsave/restore,
1560          * but in this case we are only testing for the DELALLOC
1561          * bit, which is only set or cleared with irqs on.
1562          */
1563         if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1564                 struct btrfs_root *root = BTRFS_I(inode)->root;
1565                 u64 len = state->end + 1 - state->start;
1566                 bool do_list = !btrfs_is_free_space_inode(inode);
1567 
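                /*
                 * EXTENT_FIRST_DELALLOC means the reservation path already
                 * accounted this outstanding extent, so consume the bit
                 * rather than counting the extent twice.
                 */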
1568                 if (*bits & EXTENT_FIRST_DELALLOC) {
1569                         *bits &= ~EXTENT_FIRST_DELALLOC;
1570                 } else {
1571                         spin_lock(&BTRFS_I(inode)->lock);
1572                         BTRFS_I(inode)->outstanding_extents++;
1573                         spin_unlock(&BTRFS_I(inode)->lock);
1574                 }
1575 
1576                 __percpu_counter_add(&root->fs_info->delalloc_bytes, len,
1577                                      root->fs_info->delalloc_batch);
1578                 spin_lock(&BTRFS_I(inode)->lock);
1579                 BTRFS_I(inode)->delalloc_bytes += len;
1580                 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1581                                          &BTRFS_I(inode)->runtime_flags))
1582                         btrfs_add_delalloc_inodes(root, inode);
1583                 spin_unlock(&BTRFS_I(inode)->lock);
1584         }
1585 }
1586 
1587 /*
1588  * extent_io.c clear_bit_hook, see set_bit_hook for why
1589  */
1590 static void btrfs_clear_bit_hook(struct inode *inode,
1591                                  struct extent_state *state,
1592                                  unsigned long *bits)
1593 {
1594         /*
1595          * set_bit and clear_bit hooks normally require _irqsave/restore,
1596          * but in this case we are only testing for the DELALLOC
1597          * bit, which is only set or cleared with irqs on.
1598          */
1599         if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1600                 struct btrfs_root *root = BTRFS_I(inode)->root;
1601                 u64 len = state->end + 1 - state->start;
1602                 bool do_list = !btrfs_is_free_space_inode(inode);
1603 
1604                 if (*bits & EXTENT_FIRST_DELALLOC) {
1605                         *bits &= ~EXTENT_FIRST_DELALLOC;
1606                 } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
1607                         spin_lock(&BTRFS_I(inode)->lock);
1608                         BTRFS_I(inode)->outstanding_extents--;
1609                         spin_unlock(&BTRFS_I(inode)->lock);
1610                 }
1611 
1612                 /*
1613                  * We don't reserve metadata space for space cache inodes, so we
1614                  * don't need to call btrfs_delalloc_release_metadata if there is
1615                  * an error.
1616                  */
1617                 if (*bits & EXTENT_DO_ACCOUNTING &&
1618                     root != root->fs_info->tree_root)
1619                         btrfs_delalloc_release_metadata(inode, len);
1620 
1621                 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
1622                     && do_list && !(state->state & EXTENT_NORESERVE))
1623                         btrfs_free_reserved_data_space(inode, len);
1624 
1625                 __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
1626                                      root->fs_info->delalloc_batch);
1627                 spin_lock(&BTRFS_I(inode)->lock);
1628                 BTRFS_I(inode)->delalloc_bytes -= len;
1629                 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
1630                     test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1631                              &BTRFS_I(inode)->runtime_flags))
1632                         btrfs_del_delalloc_inode(root, inode);
1633                 spin_unlock(&BTRFS_I(inode)->lock);
1634         }
1635 }
1636 
1637 /*
1638  * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1639  * we don't create bios that span stripes or chunks
1640  */
1641 int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
1642                          size_t size, struct bio *bio,
1643                          unsigned long bio_flags)
1644 {
1645         struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1646         u64 logical = (u64)bio->bi_iter.bi_sector << 9;
1647         u64 length = 0;
1648         u64 map_length;
1649         int ret;
1650 
1651         if (bio_flags & EXTENT_BIO_COMPRESSED)
1652                 return 0;
1653 
1654         length = bio->bi_iter.bi_size;
1655         map_length = length;
1656         ret = btrfs_map_block(root->fs_info, rw, logical,
1657                               &map_length, NULL, 0);
1658         /* Will always return 0 when the bbio out-pointer is NULL */
1659         BUG_ON(ret < 0);
1660         if (map_length < length + size)
1661                 return 1;
1662         return 0;
1663 }
1664 
1665 /*
1666  * in order to insert checksums into the metadata in large chunks,
1667  * we wait until bio submission time.  All the pages in the bio are
1668  * checksummed and sums are attached onto the ordered extent record.
1669  *
1670  * At IO completion time the csums attached to the ordered extent record
1671  * are inserted into the btree.
1672  */
1673 static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1674                                     struct bio *bio, int mirror_num,
1675                                     unsigned long bio_flags,
1676                                     u64 bio_offset)
1677 {
1678         struct btrfs_root *root = BTRFS_I(inode)->root;
1679         int ret = 0;
1680 
1681         ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1682         BUG_ON(ret); /* -ENOMEM */
1683         return 0;
1684 }
1685 
1686 /*
1687  * in order to insert checksums into the metadata in large chunks,
1688  * we wait until bio submission time.  All the pages in the bio are
1689  * checksummed and sums are attached onto the ordered extent record.
1690  *
1691  * This is the second half of the async path: the csums were generated in
1692  * __btrfs_submit_bio_start, so here the bio is simply mapped and submitted.
1693  */
1694 static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1695                           int mirror_num, unsigned long bio_flags,
1696                           u64 bio_offset)
1697 {
1698         struct btrfs_root *root = BTRFS_I(inode)->root;
1699         int ret;
1700 
1701         ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
1702         if (ret)
1703                 bio_endio(bio, ret);
1704         return ret;
1705 }
1706 
1707 /*
1708  * extent_io.c submission hook. This does the right thing for csum calculation
1709  * on write, or reading the csums from the tree before a read
1710  */
1711 static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1712                           int mirror_num, unsigned long bio_flags,
1713                           u64 bio_offset)
1714 {
1715         struct btrfs_root *root = BTRFS_I(inode)->root;
1716         int ret = 0;
1717         int skip_sum;
1718         int metadata = 0;
1719         int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1720 
1721         skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1722 
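        /*
         * Free space cache bios complete on a dedicated endio workqueue
         * (metadata == 2) so cache writeout cannot stall behind regular
         * data endio work during a commit.
         */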
1723         if (btrfs_is_free_space_inode(inode))
1724                 metadata = 2;
1725 
1726         if (!(rw & REQ_WRITE)) {
1727                 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
1728                 if (ret)
1729                         goto out;
1730 
1731                 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1732                         ret = btrfs_submit_compressed_read(inode, bio,
1733                                                            mirror_num,
1734                                                            bio_flags);
1735                         goto out;
1736                 } else if (!skip_sum) {
1737                         ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
1738                         if (ret)
1739                                 goto out;
1740                 }
1741                 goto mapit;
1742         } else if (async && !skip_sum) {
1743                 /* csum items have already been cloned */
1744                 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1745                         goto mapit;
1746                 /* we're doing a write, do the async checksumming */
1747                 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1748                                    inode, rw, bio, mirror_num,
1749                                    bio_flags, bio_offset,
1750                                    __btrfs_submit_bio_start,
1751                                    __btrfs_submit_bio_done);
1752                 goto out;
1753         } else if (!skip_sum) {
1754                 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1755                 if (ret)
1756                         goto out;
1757         }
1758 
1759 mapit:
1760         ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
1761 
1762 out:
1763         if (ret < 0)
1764                 bio_endio(bio, ret);
1765         return ret;
1766 }
1767 
1768 /*
1769  * given a list of ordered sums, record them in the inode.  This happens
1770  * at IO completion time based on sums calculated at bio submission time.
1771  */
1772 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1773                              struct inode *inode, u64 file_offset,
1774                              struct list_head *list)
1775 {
1776         struct btrfs_ordered_sum *sum;
1777 
1778         list_for_each_entry(sum, list, list) {
1779                 trans->adding_csums = 1;
1780                 btrfs_csum_file_blocks(trans,
1781                        BTRFS_I(inode)->root->fs_info->csum_root, sum);
1782                 trans->adding_csums = 0;
1783         }
1784         return 0;
1785 }
1786 
1787 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
1788                               struct extent_state **cached_state)
1789 {
1790         WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
1791         return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1792                                    cached_state, GFP_NOFS);
1793 }
1794 
1795 /* see btrfs_writepage_start_hook for details on why this is required */
1796 struct btrfs_writepage_fixup {
1797         struct page *page;
1798         struct btrfs_work work;
1799 };
1800 
1801 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1802 {
1803         struct btrfs_writepage_fixup *fixup;
1804         struct btrfs_ordered_extent *ordered;
1805         struct extent_state *cached_state = NULL;
1806         struct page *page;
1807         struct inode *inode;
1808         u64 page_start;
1809         u64 page_end;
1810         int ret;
1811 
1812         fixup = container_of(work, struct btrfs_writepage_fixup, work);
1813         page = fixup->page;
1814 again:
1815         lock_page(page);
1816         if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
1817                 ClearPageChecked(page);
1818                 goto out_page;
1819         }
1820 
1821         inode = page->mapping->host;
1822         page_start = page_offset(page);
1823         page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1824 
1825         lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
1826                          &cached_state);
1827 
1828         /* already ordered? We're done */
1829         if (PagePrivate2(page))
1830                 goto out;
1831 
1832         ordered = btrfs_lookup_ordered_extent(inode, page_start);
1833         if (ordered) {
1834                 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
1835                                      page_end, &cached_state, GFP_NOFS);
1836                 unlock_page(page);
1837                 btrfs_start_ordered_extent(inode, ordered, 1);
1838                 btrfs_put_ordered_extent(ordered);
1839                 goto again;
1840         }
1841 
1842         ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
1843         if (ret) {
1844                 mapping_set_error(page->mapping, ret);
1845                 end_extent_writepage(page, ret, page_start, page_end);
1846                 ClearPageChecked(page);
1847                 goto out;
1848         }
1849 
1850         btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
1851         ClearPageChecked(page);
1852         set_page_dirty(page);
1853 out:
1854         unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
1855                              &cached_state, GFP_NOFS);
1856 out_page:
1857         unlock_page(page);
1858         page_cache_release(page);
1859         kfree(fixup);
1860 }
1861 
1862 /*
1863  * There are a few paths in the higher layers of the kernel that directly
1864  * set the page dirty bit without asking the filesystem if it is a
1865  * good idea.  This causes problems because we want to make sure COW
1866  * properly happens and the data=ordered rules are followed.
1867  *
1868  * In our case any range that doesn't have the ORDERED bit set
1869  * hasn't been properly setup for IO.  We kick off an async process
1870  * to fix it up.  The async helper will wait for ordered extents, set
1871  * the delalloc bit and make it safe to write the page.
1872  */
1873 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1874 {
1875         struct inode *inode = page->mapping->host;
1876         struct btrfs_writepage_fixup *fixup;
1877         struct btrfs_root *root = BTRFS_I(inode)->root;
1878 
1879         /* this page is properly in the ordered list */
1880         if (TestClearPagePrivate2(page))
1881                 return 0;
1882 
1883         if (PageChecked(page))
1884                 return -EAGAIN;
1885 
1886         fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
1887         if (!fixup)
1888                 return -EAGAIN;
1889 
1890         SetPageChecked(page);
1891         page_cache_get(page);
1892         btrfs_init_work(&fixup->work, btrfs_fixup_helper,
1893                         btrfs_writepage_fixup_worker, NULL, NULL);
1894         fixup->page = page;
1895         btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
1896         return -EBUSY;
1897 }
1898 
1899 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1900                                        struct inode *inode, u64 file_pos,
1901                                        u64 disk_bytenr, u64 disk_num_bytes,
1902                                        u64 num_bytes, u64 ram_bytes,
1903                                        u8 compression, u8 encryption,
1904                                        u16 other_encoding, int extent_type)
1905 {
1906         struct btrfs_root *root = BTRFS_I(inode)->root;
1907         struct btrfs_file_extent_item *fi;
1908         struct btrfs_path *path;
1909         struct extent_buffer *leaf;
1910         struct btrfs_key ins;
1911         int extent_inserted = 0;
1912         int ret;
1913 
1914         path = btrfs_alloc_path();
1915         if (!path)
1916                 return -ENOMEM;
1917 
1918         /*
1919          * we may be replacing one extent in the tree with another.
1920          * The new extent is pinned in the extent map, and we don't want
1921          * to drop it from the cache until it is completely in the btree.
1922          *
1923          * So, tell btrfs_drop_extents to leave this extent in the cache.
1924          * the caller is expected to unpin it and allow it to be merged
1925          * with the others.
1926          */
1927         ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
1928                                    file_pos + num_bytes, NULL, 0,
1929                                    1, sizeof(*fi), &extent_inserted);
1930         if (ret)
1931                 goto out;
1932 
1933         if (!extent_inserted) {
1934                 ins.objectid = btrfs_ino(inode);
1935                 ins.offset = file_pos;
1936                 ins.type = BTRFS_EXTENT_DATA_KEY;
1937 
1938                 path->leave_spinning = 1;
1939                 ret = btrfs_insert_empty_item(trans, root, path, &ins,
1940                                               sizeof(*fi));
1941                 if (ret)
1942                         goto out;
1943         }
1944         leaf = path->nodes[0];
1945         fi = btrfs_item_ptr(leaf, path->slots[0],
1946                             struct btrfs_file_extent_item);
1947         btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1948         btrfs_set_file_extent_type(leaf, fi, extent_type);
1949         btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
1950         btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
1951         btrfs_set_file_extent_offset(leaf, fi, 0);
1952         btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1953         btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
1954         btrfs_set_file_extent_compression(leaf, fi, compression);
1955         btrfs_set_file_extent_encryption(leaf, fi, encryption);
1956         btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1957 
1958         btrfs_mark_buffer_dirty(leaf);
1959         btrfs_release_path(path);
1960 
1961         inode_add_bytes(inode, num_bytes);
1962 
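        /*
         * Record the allocation in the extent tree: the key is
         * (disk_bytenr, EXTENT_ITEM, disk_num_bytes) and the new extent
         * gets a backref to this file position.
         */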
1963         ins.objectid = disk_bytenr;
1964         ins.offset = disk_num_bytes;
1965         ins.type = BTRFS_EXTENT_ITEM_KEY;
1966         ret = btrfs_alloc_reserved_file_extent(trans, root,
1967                                         root->root_key.objectid,
1968                                         btrfs_ino(inode), file_pos, &ins);
1969 out:
1970         btrfs_free_path(path);
1971 
1972         return ret;
1973 }
1974 
1975 /* snapshot-aware defrag */
1976 struct sa_defrag_extent_backref {
1977         struct rb_node node;
1978         struct old_sa_defrag_extent *old;
1979         u64 root_id;
1980         u64 inum;
1981         u64 file_pos;
1982         u64 extent_offset;
1983         u64 num_bytes;
1984         u64 generation;
1985 };
1986 
1987 struct old_sa_defrag_extent {
1988         struct list_head list;
1989         struct new_sa_defrag_extent *new;
1990 
1991         u64 extent_offset;
1992         u64 bytenr;
1993         u64 offset;
1994         u64 len;
1995         int count;
1996 };
1997 
1998 struct new_sa_defrag_extent {
1999         struct rb_root root;
2000         struct list_head head;
2001         struct btrfs_path *path;
2002         struct inode *inode;
2003         u64 file_pos;
2004         u64 len;
2005         u64 bytenr;
2006         u64 disk_len;
2007         u8 compress_type;
2008 };
2009 
2010 static int backref_comp(struct sa_defrag_extent_backref *b1,
2011                         struct sa_defrag_extent_backref *b2)
2012 {
2013         if (b1->root_id < b2->root_id)
2014                 return -1;
2015         else if (b1->root_id > b2->root_id)
2016                 return 1;
2017 
2018         if (b1->inum < b2->inum)
2019                 return -1;
2020         else if (b1->inum > b2->inum)
2021                 return 1;
2022 
2023         if (b1->file_pos < b2->file_pos)
2024                 return -1;
2025         else if (b1->file_pos > b2->file_pos)
2026                 return 1;
2027 
2028                 /*
2029                  * [------------------------------] ===> (a range of space)
2030                  *     |<--->|   |<---->| =============> (fs/file tree A)
2031                  * |<---------------------------->| ===> (fs/file tree B)
2032                  *
2033                  * A range of space can refer to two file extents in one tree
2034                  * while referring to only one file extent in another tree.
2035                  *
2036                  * So we may process the same disk offset more than once (two
2037                  * extents in A) landing in one extent (B), and then insert
2038                  * two identical backrefs, both referring to the extent in B.
2039                  */
2040         return 0;
2041 }
2042 
2043 static void backref_insert(struct rb_root *root,
2044                            struct sa_defrag_extent_backref *backref)
2045 {
2046         struct rb_node **p = &root->rb_node;
2047         struct rb_node *parent = NULL;
2048         struct sa_defrag_extent_backref *entry;
2049         int ret;
2050 
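        /*
         * Equal keys (ret == 0) descend to the right, so the duplicate
         * backrefs described in backref_comp() are inserted rather than
         * dropped.
         */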
2051         while (*p) {
2052                 parent = *p;
2053                 entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2054 
2055                 ret = backref_comp(backref, entry);
2056                 if (ret < 0)
2057                         p = &(*p)->rb_left;
2058                 else
2059                         p = &(*p)->rb_right;
2060         }
2061 
2062         rb_link_node(&backref->node, parent, p);
2063         rb_insert_color(&backref->node, root);
2064 }
2065 
2066 /*
2067  * Note the backref might have changed, and in this case we just return 0.
2068  */
2069 static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2070                                        void *ctx)
2071 {
2072         struct btrfs_file_extent_item *extent;
2073         struct btrfs_fs_info *fs_info;
2074         struct old_sa_defrag_extent *old = ctx;
2075         struct new_sa_defrag_extent *new = old->new;
2076         struct btrfs_path *path = new->path;
2077         struct btrfs_key key;
2078         struct btrfs_root *root;
2079         struct sa_defrag_extent_backref *backref;
2080         struct extent_buffer *leaf;
2081         struct inode *inode = new->inode;
2082         int slot;
2083         int ret;
2084         u64 extent_offset;
2085         u64 num_bytes;
2086 
2087         if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2088             inum == btrfs_ino(inode))
2089                 return 0;
2090 
2091         key.objectid = root_id;
2092         key.type = BTRFS_ROOT_ITEM_KEY;
2093         key.offset = (u64)-1;
2094 
2095         fs_info = BTRFS_I(inode)->root->fs_info;
2096         root = btrfs_read_fs_root_no_name(fs_info, &key);
2097         if (IS_ERR(root)) {
2098                 if (PTR_ERR(root) == -ENOENT)
2099                         return 0;
2100                 WARN_ON(1);
2101                 pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
2102                          inum, offset, root_id);
2103                 return PTR_ERR(root);
2104         }
2105 
2106         key.objectid = inum;
2107         key.type = BTRFS_EXTENT_DATA_KEY;
2108         if (offset > (u64)-1 << 32)
2109                 key.offset = 0;
2110         else
2111                 key.offset = offset;
2112 
2113         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2114         if (WARN_ON(ret < 0))
2115                 return ret;
2116         ret = 0;
2117 
2118         while (1) {
2119                 cond_resched();
2120 
2121                 leaf = path->nodes[0];
2122                 slot = path->slots[0];
2123 
2124                 if (slot >= btrfs_header_nritems(leaf)) {
2125                         ret = btrfs_next_leaf(root, path);
2126                         if (ret < 0) {
2127                                 goto out;
2128                         } else if (ret > 0) {
2129                                 ret = 0;
2130                                 goto out;
2131                         }
2132                         continue;
2133                 }
2134 
2135                 path->slots[0]++;
2136 
2137                 btrfs_item_key_to_cpu(leaf, &key, slot);
2138 
2139                 if (key.objectid > inum)
2140                         goto out;
2141 
2142                 if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2143                         continue;
2144 
2145                 extent = btrfs_item_ptr(leaf, slot,
2146                                         struct btrfs_file_extent_item);
2147 
2148                 if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2149                         continue;
2150 
2151                 /*
2152                  * 'offset' refers to the exact key.offset,
2153                  * NOT the 'offset' field in btrfs_extent_data_ref, ie.
2154                  * (key.offset - extent_offset).
2155                  */
2156                 if (key.offset != offset)
2157                         continue;
2158 
2159                 extent_offset = btrfs_file_extent_offset(leaf, extent);
2160                 num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2161 
2162                 if (extent_offset >= old->extent_offset + old->offset +
2163                     old->len || extent_offset + num_bytes <=
2164                     old->extent_offset + old->offset)
2165                         continue;
2166                 break;
2167         }
2168 
2169         backref = kmalloc(sizeof(*backref), GFP_NOFS);
2170         if (!backref) {
2171                 ret = -ENOMEM;
2172                 goto out;
2173         }
2174 
2175         backref->root_id = root_id;
2176         backref->inum = inum;
2177         backref->file_pos = offset;
2178         backref->num_bytes = num_bytes;
2179         backref->extent_offset = extent_offset;
2180         backref->generation = btrfs_file_extent_generation(leaf, extent);
2181         backref->old = old;
2182         backref_insert(&new->root, backref);
2183         old->count++;
2184 out:
2185         btrfs_release_path(path);
2186         WARN_ON(ret);
2187         return ret;
2188 }
2189 
2190 static noinline bool record_extent_backrefs(struct btrfs_path *path,
2191                                    struct new_sa_defrag_extent *new)
2192 {
2193         struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
2194         struct old_sa_defrag_extent *old, *tmp;
2195         int ret;
2196 
2197         new->path = path;
2198 
2199         list_for_each_entry_safe(old, tmp, &new->head, list) {
2200                 ret = iterate_inodes_from_logical(old->bytenr +
2201                                                   old->extent_offset, fs_info,
2202                                                   path, record_one_backref,
2203                                                   old);
2204                 if (ret < 0 && ret != -ENOENT)
2205                         return false;
2206 
2207                 /* no backref to be processed for this extent */
2208                 if (!old->count) {
2209                         list_del(&old->list);
2210                         kfree(old);
2211                 }
2212         }
2213 
2214         if (list_empty(&new->head))
2215                 return false;
2216 
2217         return true;
2218 }
2219 
2220 static int relink_is_mergable(struct extent_buffer *leaf,
2221                               struct btrfs_file_extent_item *fi,
2222                               struct new_sa_defrag_extent *new)
2223 {
2224         if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2225                 return 0;
2226 
2227         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2228                 return 0;
2229 
2230         if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2231                 return 0;
2232 
2233         if (btrfs_file_extent_encryption(leaf, fi) ||
2234             btrfs_file_extent_other_encoding(leaf, fi))
2235                 return 0;
2236 
2237         return 1;
2238 }
2239 
2240 /*
2241  * Note the backref might have changed, and in this case we just return 0.
2242  */
2243 static noinline int relink_extent_backref(struct btrfs_path *path,
2244                                  struct sa_defrag_extent_backref *prev,
2245                                  struct sa_defrag_extent_backref *backref)
2246 {
2247         struct btrfs_file_extent_item *extent;
2248         struct btrfs_file_extent_item *item;
2249         struct btrfs_ordered_extent *ordered;
2250         struct btrfs_trans_handle *trans;
2251         struct btrfs_fs_info *fs_info;
2252         struct btrfs_root *root;
2253         struct btrfs_key key;
2254         struct extent_buffer *leaf;
2255         struct old_sa_defrag_extent *old = backref->old;
2256         struct new_sa_defrag_extent *new = old->new;
2257         struct inode *src_inode = new->inode;
2258         struct inode *inode;
2259         struct extent_state *cached = NULL;
2260         int ret = 0;
2261         u64 start;
2262         u64 len;
2263         u64 lock_start;
2264         u64 lock_end;
2265         bool merge = false;
2266         int index;
2267 
2268         if (prev && prev->root_id == backref->root_id &&
2269             prev->inum == backref->inum &&
2270             prev->file_pos + prev->num_bytes == backref->file_pos)
2271                 merge = true;
2272 
2273         /* step 1: get root */
2274         key.objectid = backref->root_id;
2275         key.type = BTRFS_ROOT_ITEM_KEY;
2276         key.offset = (u64)-1;
2277 
2278         fs_info = BTRFS_I(src_inode)->root->fs_info;
2279         index = srcu_read_lock(&fs_info->subvol_srcu);
2280 
2281         root = btrfs_read_fs_root_no_name(fs_info, &key);
2282         if (IS_ERR(root)) {
2283                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2284                 if (PTR_ERR(root) == -ENOENT)
2285                         return 0;
2286                 return PTR_ERR(root);
2287         }
2288 
2289         if (btrfs_root_readonly(root)) {
2290                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2291                 return 0;
2292         }
2293 
2294         /* step 2: get inode */
2295         key.objectid = backref->inum;
2296         key.type = BTRFS_INODE_ITEM_KEY;
2297         key.offset = 0;
2298 
2299         inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2300         if (IS_ERR(inode)) {
2301                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2302                 return 0;
2303         }
2304 
2305         srcu_read_unlock(&fs_info->subvol_srcu, index);
2306 
2307         /* step 3: relink backref */
2308         lock_start = backref->file_pos;
2309         lock_end = backref->file_pos + backref->num_bytes - 1;
2310         lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2311                          0, &cached);
2312 
2313         ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2314         if (ordered) {
2315                 btrfs_put_ordered_extent(ordered);
2316                 goto out_unlock;
2317         }
2318 
2319         trans = btrfs_join_transaction(root);
2320         if (IS_ERR(trans)) {
2321                 ret = PTR_ERR(trans);
2322                 goto out_unlock;
2323         }
2324 
2325         key.objectid = backref->inum;
2326         key.type = BTRFS_EXTENT_DATA_KEY;
2327         key.offset = backref->file_pos;
2328 
2329         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2330         if (ret < 0) {
2331                 goto out_free_path;
2332         } else if (ret > 0) {
2333                 ret = 0;
2334                 goto out_free_path;
2335         }
2336 
2337         extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2338                                 struct btrfs_file_extent_item);
2339 
2340         if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2341             backref->generation)
2342                 goto out_free_path;
2343 
2344         btrfs_release_path(path);
2345 
2346         start = backref->file_pos;
2347         if (backref->extent_offset < old->extent_offset + old->offset)
2348                 start += old->extent_offset + old->offset -
2349                          backref->extent_offset;
2350 
2351         len = min(backref->extent_offset + backref->num_bytes,
2352                   old->extent_offset + old->offset + old->len);
2353         len -= max(backref->extent_offset, old->extent_offset + old->offset);
2354 
2355         ret = btrfs_drop_extents(trans, root, inode, start,
2356                                  start + len, 1);
2357         if (ret)
2358                 goto out_free_path;
2359 again:
2360         key.objectid = btrfs_ino(inode);
2361         key.type = BTRFS_EXTENT_DATA_KEY;
2362         key.offset = start;
2363 
2364         path->leave_spinning = 1;
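        /*
         * Try to merge first: look up the item that ends where our range
         * starts and, if it is a mergeable extent of the same physical
         * bytes, simply extend its num_bytes instead of inserting.
         */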
2365         if (merge) {
2366                 struct btrfs_file_extent_item *fi;
2367                 u64 extent_len;
2368                 struct btrfs_key found_key;
2369 
2370                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2371                 if (ret < 0)
2372                         goto out_free_path;
2373 
2374                 path->slots[0]--;
2375                 leaf = path->nodes[0];
2376                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2377 
2378                 fi = btrfs_item_ptr(leaf, path->slots[0],
2379                                     struct btrfs_file_extent_item);
2380                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2381 
2382                 if (extent_len + found_key.offset == start &&
2383                     relink_is_mergable(leaf, fi, new)) {
2384                         btrfs_set_file_extent_num_bytes(leaf, fi,
2385                                                         extent_len + len);
2386                         btrfs_mark_buffer_dirty(leaf);
2387                         inode_add_bytes(inode, len);
2388 
2389                         ret = 1;
2390                         goto out_free_path;
2391                 } else {
2392                         merge = false;
2393                         btrfs_release_path(path);
2394                         goto again;
2395                 }
2396         }
2397 
2398         ret = btrfs_insert_empty_item(trans, root, path, &key,
2399                                         sizeof(*extent));
2400         if (ret) {
2401                 btrfs_abort_transaction(trans, root, ret);
2402                 goto out_free_path;
2403         }
2404 
2405         leaf = path->nodes[0];
2406         item = btrfs_item_ptr(leaf, path->slots[0],
2407                                 struct btrfs_file_extent_item);
2408         btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2409         btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2410         btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2411         btrfs_set_file_extent_num_bytes(leaf, item, len);
2412         btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2413         btrfs_set_file_extent_generation(leaf, item, trans->transid);
2414         btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2415         btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2416         btrfs_set_file_extent_encryption(leaf, item, 0);
2417         btrfs_set_file_extent_other_encoding(leaf, item, 0);
2418 
2419         btrfs_mark_buffer_dirty(leaf);
2420         inode_add_bytes(inode, len);
2421         btrfs_release_path(path);
2422 
2423         ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2424                         new->disk_len, 0,
2425                         backref->root_id, backref->inum,
2426                         new->file_pos, 0);      /* start - extent_offset */
2427         if (ret) {
2428                 btrfs_abort_transaction(trans, root, ret);
2429                 goto out_free_path;
2430         }
2431 
2432         ret = 1;
2433 out_free_path:
2434         btrfs_release_path(path);
2435         path->leave_spinning = 0;
2436         btrfs_end_transaction(trans, root);
2437 out_unlock:
2438         unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2439                              &cached, GFP_NOFS);
2440         iput(inode);
2441         return ret;
2442 }
2443 
2444 static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2445 {
2446         struct old_sa_defrag_extent *old, *tmp;
2447 
2448         if (!new)
2449                 return;
2450 
2451         list_for_each_entry_safe(old, tmp, &new->head, list) {
2452                 list_del(&old->list);
2453                 kfree(old);
2454         }
2455         kfree(new);
2456 }
2457 
2458 static void relink_file_extents(struct new_sa_defrag_extent *new)
2459 {
2460         struct btrfs_path *path;
2461         struct sa_defrag_extent_backref *backref;
2462         struct sa_defrag_extent_backref *prev = NULL;
2463         struct inode *inode;
2464         struct btrfs_root *root;
2465         struct rb_node *node;
2466         int ret;
2467 
2468         inode = new->inode;
2469         root = BTRFS_I(inode)->root;
2470 
2471         path = btrfs_alloc_path();
2472         if (!path)
2473                 return;
2474 
2475         if (!record_extent_backrefs(path, new)) {
2476                 btrfs_free_path(path);
2477                 goto out;
2478         }
2479         btrfs_release_path(path);
2480 
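        /*
         * Walk the backrefs in sorted order; prev is kept only after a
         * successful relink (ret == 1) so that the next contiguous backref
         * can be merged onto it.
         */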
2481         while (1) {
2482                 node = rb_first(&new->root);
2483                 if (!node)
2484                         break;
2485                 rb_erase(node, &new->root);
2486 
2487                 backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2488 
2489                 ret = relink_extent_backref(path, prev, backref);
2490                 WARN_ON(ret < 0);
2491 
2492                 kfree(prev);
2493 
2494                 if (ret == 1)
2495                         prev = backref;
2496                 else
2497                         prev = NULL;
2498                 cond_resched();
2499         }
2500         kfree(prev);
2501 
2502         btrfs_free_path(path);
2503 out:
2504         free_sa_defrag_extent(new);
2505 
2506         atomic_dec(&root->fs_info->defrag_running);
2507         wake_up(&root->fs_info->transaction_wait);
2508 }
2509 
2510 static struct new_sa_defrag_extent *
2511 record_old_file_extents(struct inode *inode,
2512                         struct btrfs_ordered_extent *ordered)
2513 {
2514         struct btrfs_root *root = BTRFS_I(inode)->root;
2515         struct btrfs_path *path;
2516         struct btrfs_key key;
2517         struct old_sa_defrag_extent *old;
2518         struct new_sa_defrag_extent *new;
2519         int ret;
2520 
2521         new = kmalloc(sizeof(*new), GFP_NOFS);
2522         if (!new)
2523                 return NULL;
2524 
2525         new->inode = inode;
2526         new->file_pos = ordered->file_offset;
2527         new->len = ordered->len;
2528         new->bytenr = ordered->start;
2529         new->disk_len = ordered->disk_len;
2530         new->compress_type = ordered->compress_type;
2531         new->root = RB_ROOT;
2532         INIT_LIST_HEAD(&new->head);
2533 
2534         path = btrfs_alloc_path();
2535         if (!path)
2536                 goto out_kfree;
2537 
2538         key.objectid = btrfs_ino(inode);
2539         key.type = BTRFS_EXTENT_DATA_KEY;
2540         key.offset = new->file_pos;
2541 
2542         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2543         if (ret < 0)
2544                 goto out_free_path;
2545         if (ret > 0 && path->slots[0] > 0)
2546                 path->slots[0]--;
2547 
2548         /* find out all the old extents for the file range */
2549         while (1) {
2550                 struct btrfs_file_extent_item *extent;
2551                 struct extent_buffer *l;
2552                 int slot;
2553                 u64 num_bytes;
2554                 u64 offset;
2555                 u64 end;
2556                 u64 disk_bytenr;
2557                 u64 extent_offset;
2558 
2559                 l = path->nodes[0];
2560                 slot = path->slots[0];
2561 
2562                 if (slot >= btrfs_header_nritems(l)) {
2563                         ret = btrfs_next_leaf(root, path);
2564                         if (ret < 0)
2565                                 goto out_free_path;
2566                         else if (ret > 0)
2567                                 break;
2568                         continue;
2569                 }
2570 
2571                 btrfs_item_key_to_cpu(l, &key, slot);
2572 
2573                 if (key.objectid != btrfs_ino(inode))
2574                         break;
2575                 if (key.type != BTRFS_EXTENT_DATA_KEY)
2576                         break;
2577                 if (key.offset >= new->file_pos + new->len)
2578                         break;
2579 
2580                 extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2581 
2582                 num_bytes = btrfs_file_extent_num_bytes(l, extent);
2583                 if (key.offset + num_bytes < new->file_pos)
2584                         goto next;
2585 
2586                 disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2587                 if (!disk_bytenr)
2588                         goto next;
2589 
2590                 extent_offset = btrfs_file_extent_offset(l, extent);
2591 
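                /*
                 * Clamp the old extent to its overlap with the defragged
                 * range: offset is relative to key.offset and len is the
                 * number of overlapping bytes.
                 */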
2592                 old = kmalloc(sizeof(*old), GFP_NOFS);
2593                 if (!old)
2594                         goto out_free_path;
2595 
2596                 offset = max(new->file_pos, key.offset);
2597                 end = min(new->file_pos + new->len, key.offset + num_bytes);
2598 
2599                 old->bytenr = disk_bytenr;
2600                 old->extent_offset = extent_offset;
2601                 old->offset = offset - key.offset;
2602                 old->len = end - offset;
2603                 old->new = new;
2604                 old->count = 0;
2605                 list_add_tail(&old->list, &new->head);
2606 next:
2607                 path->slots[0]++;
2608                 cond_resched();
2609         }
2610 
2611         btrfs_free_path(path);
2612         atomic_inc(&root->fs_info->defrag_running);
2613 
2614         return new;
2615 
2616 out_free_path:
2617         btrfs_free_path(path);
2618 out_kfree:
2619         free_sa_defrag_extent(new);
2620         return NULL;
2621 }
2622 
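     /*
      * the ordered extent has been written out, so drop its bytes from
      * the delalloc accounting of the block group it was allocated in.
      */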
2623 static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
2624                                          u64 start, u64 len)
2625 {
2626         struct btrfs_block_group_cache *cache;
2627 
2628         cache = btrfs_lookup_block_group(root->fs_info, start);
2629         ASSERT(cache);
2630 
2631         spin_lock(&cache->lock);
2632         cache->delalloc_bytes -= len;
2633         spin_unlock(&cache->lock);
2634 
2635         btrfs_put_block_group(cache);
2636 }
2637 
2638 /* as ordered data IO finishes, this gets called so we can finish
2639  * an ordered extent if the range of bytes in the file it covers is
2640  * fully written.
2641  */
2642 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2643 {
2644         struct inode *inode = ordered_extent->inode;
2645         struct btrfs_root *root = BTRFS_I(inode)->root;
2646         struct btrfs_trans_handle *trans = NULL;
2647         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2648         struct extent_state *cached_state = NULL;
2649         struct new_sa_defrag_extent *new = NULL;
2650         int compress_type = 0;
2651         int ret = 0;
2652         u64 logical_len = ordered_extent->len;
2653         bool nolock;
2654         bool truncated = false;
2655 
2656         nolock = btrfs_is_free_space_inode(inode);
2657 
2658         if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2659                 ret = -EIO;
2660                 goto out;
2661         }
2662 
2663         if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
2664                 truncated = true;
2665                 logical_len = ordered_extent->truncated_len;
2666                 /* Truncated the entire extent, don't bother adding */
2667                 if (!logical_len)
2668                         goto out;
2669         }
2670 
2671         if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
2672                 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
2673                 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2674                 if (nolock)
2675                         trans = btrfs_join_transaction_nolock(root);
2676                 else
2677                         trans = btrfs_join_transaction(root);
2678                 if (IS_ERR(trans)) {
2679                         ret = PTR_ERR(trans);
2680                         trans = NULL;
2681                         goto out;
2682                 }
2683                 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
2684                 ret = btrfs_update_inode_fallback(trans, root, inode);
2685                 if (ret) /* -ENOMEM or corruption */
2686                         btrfs_abort_transaction(trans, root, ret);
2687                 goto out;
2688         }
2689 
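             /*
              * lock the full range of the ordered extent while we test for
              * the EXTENT_DEFRAG bit and insert the new file extent item.
              */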
2690         lock_extent_bits(io_tree, ordered_extent->file_offset,
2691                          ordered_extent->file_offset + ordered_extent->len - 1,
2692                          0, &cached_state);
2693 
2694         ret = test_range_bit(io_tree, ordered_extent->file_offset,
2695                         ordered_extent->file_offset + ordered_extent->len - 1,
2696                         EXTENT_DEFRAG, 1, cached_state);
2697         if (ret) {
2698                 u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
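                     /*
                      * the "0 &&" below intentionally disables snapshot-aware
                      * defrag: the relink path proved unreliable upstream, so
                      * old extents are never recorded here for now.
                      */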
2699                 if (0 && last_snapshot >= BTRFS_I(inode)->generation)
2700                         /* the inode is shared */
2701                         new = record_old_file_extents(inode, ordered_extent);
2702 
2703                 clear_extent_bit(io_tree, ordered_extent->file_offset,
2704                         ordered_extent->file_offset + ordered_extent->len - 1,
2705                         EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
2706         }
2707 
2708         if (nolock)
2709                 trans = btrfs_join_transaction_nolock(root);
2710         else
2711                 trans = btrfs_join_transaction(root);
2712         if (IS_ERR(trans)) {
2713                 ret = PTR_ERR(trans);
2714                 trans = NULL;
2715                 goto out_unlock;
2716         }
2717 
2718         trans->block_rsv = &root->fs_info->delalloc_block_rsv;
2719 
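             /*
              * record the extent on disk: a preallocated extent is simply
              * marked written, anything else gets a fresh file extent item
              * pointing at the newly allocated (possibly compressed) data.
              */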
2720         if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
2721                 compress_type = ordered_extent->compress_type;
2722         if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
2723                 BUG_ON(compress_type);
2724                 ret = btrfs_mark_extent_written(trans, inode,
2725                                                 ordered_extent->file_offset,
2726                                                 ordered_extent->file_offset +
2727                                                 logical_len);
2728         } else {
2729                 BUG_ON(root == root->fs_info->tree_root);
2730                 ret = insert_reserved_file_extent(trans, inode,
2731                                                 ordered_extent->file_offset,
2732                                                 ordered_extent->start,
2733                                                 ordered_extent->disk_len,
2734                                                 logical_len, logical_len,
2735                                                 compress_type, 0, 0,
2736                                                 BTRFS_FILE_EXTENT_REG);
2737                 if (!ret)
2738                         btrfs_release_delalloc_bytes(root,
2739                                                      ordered_extent->start,
2740                                                      ordered_extent->disk_len);
2741         }
2742         unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
2743                            ordered_extent->file_offset, ordered_extent->len,
2744                            trans->transid);
2745         if (ret < 0) {
2746                 btrfs_abort_transaction(trans, root, ret);
2747                 goto out_unlock;
2748         }
2749 
2750         add_pending_csums(trans, inode, ordered_extent->file_offset,
2751                           &ordered_extent->list);
2752 
2753         btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2754         ret = btrfs_update_inode_fallback(trans, root, inode);
2755         if (ret) { /* -ENOMEM or corruption */
2756                 btrfs_abort_transaction(trans, root, ret);
2757                 goto out_unlock;
2758         }
2759         ret = 0;
2760 out_unlock:
2761         unlock_extent_cached(io_tree, ordered_extent->file_offset,
2762                              ordered_extent->file_offset +
2763                              ordered_extent->len - 1, &cached_state, GFP_NOFS);
2764 out:
2765         if (root != root->fs_info->tree_root)
2766                 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
2767         if (trans)
2768                 btrfs_end_transaction(trans, root);
2769 
2770         if (ret || truncated) {
2771                 u64 start, end;
2772 
2773                 if (truncated)
2774                         start = ordered_extent->file_offset + logical_len;
2775                 else
2776                         start = ordered_extent->file_offset;
2777                 end = ordered_extent->file_offset + ordered_extent->len - 1;
2778                 clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS);
2779 
2780                 /* Drop the cache for the part of the extent we didn't write. */
2781                 btrfs_drop_extent_cache(inode, start, end, 0);
2782 
2783                 /*
2784                  * If the ordered extent had an IOERR or something else went
2785                  * wrong, we need to return the space for this ordered extent
2786                  * back to the allocator.  We only free the extent in the
2787                  * truncated case if we didn't write out the extent at all.
2788                  */
2789                 if ((ret || !logical_len) &&
2790                     !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2791                     !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
2792                         btrfs_free_reserved_extent(root, ordered_extent->start,
2793                                                    ordered_extent->disk_len, 1);
2794         }
2795 
2796 
2797         /*
2798          * This needs to be done to make sure anybody waiting knows we are done
2799          * updating everything for this ordered extent.
2800          */
2801         btrfs_remove_ordered_extent(inode, ordered_extent);
2802 
2803         /* for snapshot-aware defrag */
2804         if (new) {
2805                 if (ret) {
2806                         free_sa_defrag_extent(new);
2807                         atomic_dec(&root->fs_info->defrag_running);
2808                 } else {
2809                         relink_file_extents(new);
2810                 }
2811         }
2812 
2813         /* once for us */
2814         btrfs_put_ordered_extent(ordered_extent);
2815         /* once for the tree */
2816         btrfs_put_ordered_extent(ordered_extent);
2817 
2818         return ret;
2819 }
2820 
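     /* workqueue callback: finish one ordered extent from process context */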
2821 static void finish_ordered_fn(struct btrfs_work *work)
2822 {
2823         struct btrfs_ordered_extent *ordered_extent;
2824         ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
2825         btrfs_finish_ordered_io(ordered_extent);
2826 }
2827 
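     /*
      * writeback end-io hook: if this page completes its ordered extent,
      * queue btrfs_finish_ordered_io() on the right workqueue instead of
      * doing the metadata updates directly from bio completion context.
      */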
2828 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2829                                 struct extent_state *state, int uptodate)
2830 {
2831         struct inode *inode = page->mapping->host;
2832         struct btrfs_root *root = BTRFS_I(inode)->root;
2833         struct btrfs_ordered_extent *ordered_extent = NULL;
2834         struct btrfs_workqueue *wq;
2835         btrfs_work_func_t func;
2836 
2837         trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
2838 
2839         ClearPagePrivate2(page);
2840         if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
2841                                             end - start + 1, uptodate))
2842                 return 0;
2843 
2844         if (btrfs_is_free_space_inode(inode)) {
2845                 wq = root->fs_info->endio_freespace_worker;
2846                 func = btrfs_freespace_write_helper;
2847         } else {
2848                 wq = root->fs_info->endio_write_workers;
2849                 func = btrfs_endio_write_helper;
2850         }
2851 
2852         btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
2853                         NULL);
2854         btrfs_queue_work(wq, &ordered_extent->work);
2855 
2856         return 0;
2857 }
2858 
2859 /*
2860  * when reads are done, we need to check csums to verify the data is correct.
2861  * If there's a match, we allow the bio to finish.  If not, the code in
2862  * extent_io.c will try to find good copies for us.
2863  */
2864 static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
2865                                       u64 phy_offset, struct page *page,
2866                                       u64 start, u64 end, int mirror)
2867 {
2868         size_t offset = start - page_offset(page);
2869         struct inode *inode = page->mapping->host;
2870         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2871         char *kaddr;
2872         struct btrfs_root *root = BTRFS_I(inode)->root;
2873         u32 csum_expected;
2874         u32 csum = ~(u32)0;
2875         static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
2876                                       DEFAULT_RATELIMIT_BURST);
2877 
2878         if (PageChecked(page)) {
2879                 ClearPageChecked(page);
2880                 goto good;
2881         }
2882 
2883         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
2884                 goto good;
2885 
2886         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
2887             test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
2888                 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
2889                                   GFP_NOFS);
2890                 return 0;
2891         }
2892 
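             /*
              * io_bio->csum stores one u32 csum per block; convert the byte
              * offset within the bio into an index to find the expected csum.
              */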
2893         phy_offset >>= inode->i_sb->s_blocksize_bits;
2894         csum_expected = *(((u32 *)io_bio->csum) + phy_offset);
2895 
2896         kaddr = kmap_atomic(page);
2897         csum = btrfs_csum_data(kaddr + offset, csum, end - start + 1);
2898         btrfs_csum_final(csum, (char *)&csum);
2899         if (csum != csum_expected)
2900                 goto zeroit;
2901 
2902         kunmap_atomic(kaddr);
2903 good:
2904         return 0;
2905 
2906 zeroit:
2907         if (__ratelimit(&_rs))
2908                 btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
2909                         btrfs_ino(page->mapping->host), start, csum, csum_expected);
2910         memset(kaddr + offset, 1, end - start + 1);
2911         flush_dcache_page(page);
2912         kunmap_atomic(kaddr);
2913         if (csum_expected == 0)
2914                 return 0;
2915         return -EIO;
2916 }
2917 
2918 struct delayed_iput {
2919         struct list_head list;
2920         struct inode *inode;
2921 };
2922 
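     /*
      * iput() can take locks and even start a transaction, which is not
      * safe from end-io or similar contexts; instead of dropping the last
      * reference directly, queue it and let btrfs_run_delayed_iputs() do
      * the real iput() later from a safe context.
      */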
2923 /* JDM: If this is fs-wide, why can't we add a pointer to
2924  * btrfs_inode instead and avoid the allocation? */
2925 void btrfs_add_delayed_iput(struct inode *inode)
2926 {
2927         struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2928         struct delayed_iput *delayed;
2929 
2930         if (atomic_add_unless(&inode->i_count, -1, 1))
2931                 return;
2932 
2933         delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
2934         delayed->inode = inode;
2935 
2936         spin_lock(&fs_info->delayed_iput_lock);
2937         list_add_tail(&delayed->list, &fs_info->delayed_iputs);
2938         spin_unlock(&fs_info->delayed_iput_lock);
2939 }
2940 
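     /* run all queued delayed iputs; called from contexts where iput() is safe */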
2941 void btrfs_run_delayed_iputs(struct btrfs_root *root)
2942 {
2943         LIST_HEAD(list);
2944         struct btrfs_fs_info *fs_info = root->fs_info;
2945         struct delayed_iput *delayed;
2946         int empty;
2947 
2948         spin_lock(&fs_info->delayed_iput_lock);
2949         empty = list_empty(&fs_info->delayed_iputs);
2950         spin_unlock(&fs_info->delayed_iput_lock);
2951         if (empty)
2952                 return;
2953 
2954         spin_lock(&fs_info->delayed_iput_lock);
2955         list_splice_init(&fs_info->delayed_iputs, &list);
2956         spin_unlock(&fs_info->delayed_iput_lock);
2957 
2958         while (!list_empty(&list)) {
2959                 delayed = list_entry(list.next, struct delayed_iput, list);
2960                 list_del(&delayed->list);
2961                 iput(delayed->inode);
2962                 kfree(delayed);
2963         }
2964 }
2965 
2966 /*
2967  * This is called in transaction commit time. If there are no orphan
2968  * files in the subvolume, it removes orphan item and frees block_rsv
2969  * structure.
2970  */
2971 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2972                               struct btrfs_root *root)
2973 {
2974         struct btrfs_block_rsv *block_rsv;
2975         int ret;
2976 
2977         if (atomic_read(&root->orphan_inodes) ||
2978             root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
2979                 return;
2980 
2981         spin_lock(&root->orphan_lock);
2982         if (atomic_read(&root->orphan_inodes)) {
2983                 spin_unlock(&root->orphan_lock);
2984                 return;
2985         }
2986 
2987         if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
2988                 spin_unlock(&root->orphan_lock);
2989                 return;
2990         }
2991 
2992         block_rsv = root->orphan_block_rsv;
2993         root->orphan_block_rsv = NULL;
2994         spin_unlock(&root->orphan_lock);
2995 
2996         if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
2997             btrfs_root_refs(&root->root_item) > 0) {
2998                 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
2999                                             root->root_key.objectid);
3000                 if (ret)
3001                         btrfs_abort_transaction(trans, root, ret);
3002                 else
3003                         clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
3004                                   &root->state);
3005         }
3006 
3007         if (block_rsv) {
3008                 WARN_ON(block_rsv->size > 0);
3009                 btrfs_free_block_rsv(root, block_rsv);
3010         }
3011 }
3012 
3013 /*
3014  * This creates an orphan entry for the given inode in case something goes
3015  * wrong in the middle of an unlink/truncate.
3016  *
3017  * NOTE: the caller must reserve 5 units of metadata before
3018  *       calling this function.
3019  */
3020 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
3021 {
3022         struct btrfs_root *root = BTRFS_I(inode)->root;
3023         struct btrfs_block_rsv *block_rsv = NULL;
3024         int reserve = 0;
3025         int insert = 0;
3026         int ret;
3027 
3028         if (!root->orphan_block_rsv) {
3029                 block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
3030                 if (!block_rsv)
3031                         return -ENOMEM;
3032         }
3033 
3034         spin_lock(&root->orphan_lock);
3035         if (!root->orphan_block_rsv) {
3036                 root->orphan_block_rsv = block_rsv;
3037         } else if (block_rsv) {
3038                 btrfs_free_block_rsv(root, block_rsv);
3039                 block_rsv = NULL;
3040         }
3041 
3042         if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3043                               &BTRFS_I(inode)->runtime_flags)) {
3044 #if 0
3045                 /*
3046                  * For proper ENOSPC handling, we should do orphan
3047                  * cleanup when mounting.  But this introduces a backward
3048                  * compatibility issue.
3049                  */
3050                 if (!xchg(&root->orphan_item_inserted, 1))
3051                         insert = 2;
3052                 else
3053                         insert = 1;
3054 #endif
3055                 insert = 1;
3056                 atomic_inc(&root->orphan_inodes);
3057         }
3058 
3059         if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3060                               &BTRFS_I(inode)->runtime_flags))
3061                 reserve = 1;
3062         spin_unlock(&root->orphan_lock);
3063 
3064         /* grab metadata reservation from transaction handle */
3065         if (reserve) {
3066                 ret = btrfs_orphan_reserve_metadata(trans, inode);
3067                 BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */
3068         }
3069 
3070         /* insert an orphan item to track this unlinked/truncated file */
3071         if (insert >= 1) {
3072                 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
3073                 if (ret) {
3074                         atomic_dec(&root->orphan_inodes);
3075                         if (reserve) {
3076                                 clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3077                                           &BTRFS_I(inode)->runtime_flags);
3078                                 btrfs_orphan_release_metadata(inode);
3079                         }
3080                         if (ret != -EEXIST) {
3081                                 clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3082                                           &BTRFS_I(inode)->runtime_flags);
3083                                 btrfs_abort_transaction(trans, root, ret);
3084                                 return ret;
3085                         }
3086                 }
3087                 ret = 0;
3088         }
3089 
3090         /* insert an orphan item to track that the subvolume contains orphan files */
3091         if (insert >= 2) {
3092                 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
3093                                                root->root_key.objectid);
3094                 if (ret && ret != -EEXIST) {
3095                         btrfs_abort_transaction(trans, root, ret);
3096                         return ret;
3097                 }
3098         }
3099         return 0;
3100 }
3101 
3102 /*
3103  * We have done the truncate/delete so we can go ahead and remove the orphan
3104  * item for this particular inode.
3105  */
3106 static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3107                             struct inode *inode)
3108 {
3109         struct btrfs_root *root = BTRFS_I(inode)->root;
3110         int delete_item = 0;
3111         int release_rsv = 0;
3112         int ret = 0;
3113 
3114         spin_lock(&root->orphan_lock);
3115         if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3116                                &BTRFS_I(inode)->runtime_flags))
3117                 delete_item = 1;
3118 
3119         if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3120                                &BTRFS_I(inode)->runtime_flags))
3121                 release_rsv = 1;
3122         spin_unlock(&root->orphan_lock);
3123 
3124         if (delete_item) {
3125                 atomic_dec(&root->orphan_inodes);
3126                 if (trans)
3127                         ret = btrfs_del_orphan_item(trans, root,
3128                                                     btrfs_ino(inode));
3129         }
3130 
3131         if (release_rsv)
3132                 btrfs_orphan_release_metadata(inode);
3133 
3134         return ret;
3135 }
3136 
3137 /*
3138  * this cleans up any orphans that may be left on the list from the last use
3139  * of this root.
3140  */
3141 int btrfs_orphan_cleanup(struct btrfs_root *root)
3142 {
3143         struct btrfs_path *path;
3144         struct extent_buffer *leaf;
3145         struct btrfs_key key, found_key;
3146         struct btrfs_trans_handle *trans;
3147         struct inode *inode;
3148         u64 last_objectid = 0;
3149         int ret = 0, nr_unlink = 0, nr_truncate = 0;
3150 
3151         if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
3152                 return 0;
3153 
3154         path = btrfs_alloc_path();
3155         if (!path) {
3156                 ret = -ENOMEM;
3157                 goto out;
3158         }
3159         path->reada = -1;
3160 
3161         key.objectid = BTRFS_ORPHAN_OBJECTID;
3162         btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
3163         key.offset = (u64)-1;
3164 
3165         while (1) {
3166                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3167                 if (ret < 0)
3168                         goto out;
3169 
3170                 /*
3171                  * ret == 0 means we found what we were searching for, which
3172                  * is weird, but possible, so only adjust the path if we didn't
3173                  * find the key, and see if we have stuff that matches
3174                  */
3175                 if (ret > 0) {
3176                         ret = 0;
3177                         if (path->slots[0] == 0)
3178                                 break;
3179                         path->slots[0]--;
3180                 }
3181 
3182                 /* pull out the item */
3183                 leaf = path->nodes[0];
3184                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3185 
3186                 /* make sure the item matches what we want */
3187                 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3188                         break;
3189                 if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
3190                         break;
3191 
3192                 /* release the path since we're done with it */
3193                 btrfs_release_path(path);
3194 
3195                 /*
3196                  * this is where we basically do btrfs_lookup, without the
3197                  * crossing root thing.  We store the inode number in the
3198                  * offset of the orphan item.
3199                  */
3200 
3201                 if (found_key.offset == last_objectid) {
3202                         btrfs_err(root->fs_info,
3203                                 "Error removing orphan entry, stopping orphan cleanup");
3204                         ret = -EINVAL;
3205                         goto out;
3206                 }
3207 
3208                 last_objectid = found_key.offset;
3209 
3210                 found_key.objectid = found_key.offset;
3211                 found_key.type = BTRFS_INODE_ITEM_KEY;
3212                 found_key.offset = 0;
3213                 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
3214                 ret = PTR_ERR_OR_ZERO(inode);
3215                 if (ret && ret != -ESTALE)
3216                         goto out;
3217 
3218                 if (ret == -ESTALE && root == root->fs_info->tree_root) {
3219                         struct btrfs_root *dead_root;
3220                         struct btrfs_fs_info *fs_info = root->fs_info;
3221                         int is_dead_root = 0;
3222 
3223                         /*
3224                          * this is an orphan in the tree root. Currently these
3225                          * could come from 2 sources:
3226                          *  a) a snapshot deletion in progress
3227                          *  b) a free space cache inode
3228                          * We need to distinguish those two, as the snapshot
3229                          * orphan must not get deleted.
3230                          * find_dead_roots already ran before us, so if this
3231                          * is a snapshot deletion, we should find the root
3232                          * in the dead_roots list
3233                          */
3234                         spin_lock(&fs_info->trans_lock);
3235                         list_for_each_entry(dead_root, &fs_info->dead_roots,
3236                                             root_list) {
3237                                 if (dead_root->root_key.objectid ==
3238                                     found_key.objectid) {
3239                                         is_dead_root = 1;
3240                                         break;
3241                                 }
3242                         }
3243                         spin_unlock(&fs_info->trans_lock);
3244                         if (is_dead_root) {
3245                                 /* prevent this orphan from being found again */
3246                                 key.offset = found_key.objectid - 1;
3247                                 continue;
3248                         }
3249                 }
3250                 /*
3251                  * Inode is already gone but the orphan item is still there,
3252                  * kill the orphan item.
3253                  */
3254                 if (ret == -ESTALE) {
3255                         trans = btrfs_start_transaction(root, 1);
3256                         if (IS_ERR(trans)) {
3257                                 ret = PTR_ERR(trans);
3258                                 goto out;
3259                         }
3260                         btrfs_debug(root->fs_info, "auto deleting %Lu",
3261                                 found_key.objectid);
3262                         ret = btrfs_del_orphan_item(trans, root,
3263                                                     found_key.objectid);
3264                         btrfs_end_transaction(trans, root);
3265                         if (ret)
3266                                 goto out;
3267                         continue;
3268                 }
3269 
3270                 /*
3271                  * add this inode to the orphan list so btrfs_orphan_del does
3272                  * the proper thing when we hit it
3273                  */
3274                 set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3275                         &BTRFS_I(inode)->runtime_flags);
3276                 atomic_inc(&root->orphan_inodes);
3277 
3278                 /* if we have links, this was a truncate, let's do that */
3279                 if (inode->i_nlink) {
3280                         if (WARN_ON(!S_ISREG(inode->i_mode))) {
3281                                 iput(inode);
3282                                 continue;
3283                         }
3284                         nr_truncate++;
3285 
3286                         /* 1 for the orphan item deletion. */
3287                         trans = btrfs_start_transaction(root, 1);
3288                         if (IS_ERR(trans)) {
3289                                 iput(inode);
3290                                 ret = PTR_ERR(trans);
3291                                 goto out;
3292                         }
3293                         ret = btrfs_orphan_add(trans, inode);
3294                         btrfs_end_transaction(trans, root);
3295                         if (ret) {
3296                                 iput(inode);
3297                                 goto out;
3298                         }
3299 
3300                         ret = btrfs_truncate(inode);
3301                         if (ret)
3302                                 btrfs_orphan_del(NULL, inode);
3303                 } else {
3304                         nr_unlink++;
3305                 }
3306 
3307                 /* this will do delete_inode and everything for us */
3308                 iput(inode);
3309                 if (ret)
3310                         goto out;
3311         }
3312         /* release the path since we're done with it */
3313         btrfs_release_path(path);
3314 
3315         root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
3316 
3317         if (root->orphan_block_rsv)
3318                 btrfs_block_rsv_release(root, root->orphan_block_rsv,
3319                                         (u64)-1);
3320 
3321         if (root->orphan_block_rsv ||
3322             test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3323                 trans = btrfs_join_transaction(root);
3324                 if (!IS_ERR(trans))
3325                         btrfs_end_transaction(trans, root);
3326         }
3327 
3328         if (nr_unlink)
3329                 btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink);
3330         if (nr_truncate)
3331                 btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate);
3332 
3333 out:
3334         if (ret)
3335                 btrfs_crit(root->fs_info,
3336                         "could not do orphan cleanup %d", ret);
3337         btrfs_free_path(path);
3338         return ret;
3339 }
3340 
3341 /*
3342  * very simple check to peek ahead in the leaf looking for xattrs.  If we
3343  * don't find any xattrs, we know there can't be any acls.
3344  *
3345  * slot is the slot the inode is in, objectid is the objectid of the inode
3346  */
3347 static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3348                                           int slot, u64 objectid,
3349                                           int *first_xattr_slot)
3350 {
3351         u32 nritems = btrfs_header_nritems(leaf);
3352         struct btrfs_key found_key;
3353         static u64 xattr_access = 0;
3354         static u64 xattr_default = 0;
3355         int scanned = 0;
3356 
3357         if (!xattr_access) {
3358                 xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
3359                                         strlen(POSIX_ACL_XATTR_ACCESS));
3360                 xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
3361                                         strlen(POSIX_ACL_XATTR_DEFAULT));
3362         }
3363 
3364         slot++;
3365         *first_xattr_slot = -1;
3366         while (slot < nritems) {
3367                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3368 
3369                 /* we found a different objectid, there must not be acls */
3370                 if (found_key.objectid != objectid)
3371                         return 0;
3372 
3373                 /* we found an xattr, assume we've got an acl */
3374                 if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3375                         if (*first_xattr_slot == -1)
3376                                 *first_xattr_slot = slot;
3377                         if (found_key.offset == xattr_access ||
3378                             found_key.offset == xattr_default)
3379                                 return 1;
3380                 }
3381 
3382                 /*
3383                  * we found a key greater than an xattr key, there can't
3384                  * be any acls later on
3385                  */
3386                 if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3387                         return 0;
3388 
3389                 slot++;
3390                 scanned++;
3391 
3392                 /*
3393                  * it goes inode, inode backrefs, xattrs, extents,
3394                  * so if there are a ton of hard links to an inode there can
3395                  * be a lot of backrefs.  Don't waste time searching too hard,
3396                  * this is just an optimization
3397                  */
3398                 if (scanned >= 8)
3399                         break;
3400         }
3401         /* we hit the end of the leaf before we found an xattr or
3402          * something larger than an xattr.  We have to assume the inode
3403          * has acls
3404          */
3405         if (*first_xattr_slot == -1)
3406                 *first_xattr_slot = slot;
3407         return 1;
3408 }
3409 
3410 /*
3411  * read an inode from the btree into the in-memory inode
3412  */
3413 static void btrfs_read_locked_inode(struct inode *inode)
3414 {
3415         struct btrfs_path *path;
3416         struct extent_buffer *leaf;
3417         struct btrfs_inode_item *inode_item;
3418         struct btrfs_timespec *tspec;
3419         struct btrfs_root *root = BTRFS_I(inode)->root;
3420         struct btrfs_key location;
3421         unsigned long ptr;
3422         int maybe_acls;
3423         u32 rdev;
3424         int ret;
3425         bool filled = false;
3426         int first_xattr_slot;
3427 
3428         ret = btrfs_fill_inode(inode, &rdev);
3429         if (!ret)
3430                 filled = true;
3431 
3432         path = btrfs_alloc_path();
3433         if (!path)
3434                 goto make_bad;
3435 
3436         memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
3437 
3438         ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3439         if (ret)
3440                 goto make_bad;
3441 
3442         leaf = path->nodes[0];
3443 
3444         if (filled)
3445                 goto cache_index;
3446 
3447         inode_item = btrfs_item_ptr(leaf, path->slots[0],
3448                                     struct btrfs_inode_item);
3449         inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3450         set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
3451         i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
3452         i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
3453         btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
3454 
3455         tspec = btrfs_inode_atime(inode_item);
3456         inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
3457         inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
3458 
3459         tspec = btrfs_inode_mtime(inode_item);
3460         inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
3461         inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
3462 
3463         tspec = btrfs_inode_ctime(inode_item);
3464         inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
3465         inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
3466 
3467         inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
3468         BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
3469         BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
3470 
3471         /*
3472          * If we were modified in the current generation and evicted from memory
3473          * and then re-read we need to do a full sync since we don't have any
3474          * idea about which extents were modified before we were evicted from
3475          * cache.
3476          */
3477         if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
3478                 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3479                         &BTRFS_I(inode)->runtime_flags);
3480 
3481         inode->i_version = btrfs_inode_sequence(leaf, inode_item);
3482         inode->i_generation = BTRFS_I(inode)->generation;
3483         inode->i_rdev = 0;
3484         rdev = btrfs_inode_rdev(leaf, inode_item);
3485 
3486         BTRFS_I(inode)->index_cnt = (u64)-1;
3487         BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
3488 
3489 cache_index:
3490         path->slots[0]++;
3491         if (inode->i_nlink != 1 ||
3492             path->slots[0] >= btrfs_header_nritems(leaf))
3493                 goto cache_acl;
3494 
3495         btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
3496         if (location.objectid != btrfs_ino(inode))
3497                 goto cache_acl;
3498 
3499         ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3500         if (location.type == BTRFS_INODE_REF_KEY) {
3501                 struct btrfs_inode_ref *ref;
3502 
3503                 ref = (struct btrfs_inode_ref *)ptr;
3504                 BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
3505         } else if (location.type == BTRFS_INODE_EXTREF_KEY) {
3506                 struct btrfs_inode_extref *extref;
3507 
3508                 extref = (struct btrfs_inode_extref *)ptr;
3509                 BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
3510                                                                      extref);
3511         }
3512 cache_acl:
3513         /*
3514          * try to precache a NULL acl entry for files that don't have
3515          * any xattrs or acls
3516          */
3517         maybe_acls = acls_after_inode_item(leaf, path->slots[0],
3518                                            btrfs_ino(inode), &first_xattr_slot);
3519         if (first_xattr_slot != -1) {
3520                 path->slots[0] = first_xattr_slot;
3521                 ret = btrfs_load_inode_props(inode, path);
3522                 if (ret)
3523                         btrfs_err(root->fs_info,
3524                                   "error loading props for ino %llu (root %llu): %d",
3525                                   btrfs_ino(inode),
3526                                   root->root_key.objectid, ret);
3527         }
3528         btrfs_free_path(path);
3529 
3530         if (!maybe_acls)
3531                 cache_no_acl(inode);
3532 
3533         switch (inode->i_mode & S_IFMT) {
3534         case S_IFREG:
3535                 inode->i_mapping->a_ops = &btrfs_aops;
3536                 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3537                 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3538                 inode->i_fop = &btrfs_file_operations;
3539                 inode->i_op = &btrfs_file_inode_operations;
3540                 break;
3541         case S_IFDIR:
3542                 inode->i_fop = &btrfs_dir_file_operations;
3543                 if (root == root->fs_info->tree_root)
3544                         inode->i_op = &btrfs_dir_ro_inode_operations;
3545                 else
3546                         inode->i_op = &btrfs_dir_inode_operations;
3547                 break;
3548         case S_IFLNK:
3549                 inode->i_op = &btrfs_symlink_inode_operations;
3550                 inode->i_mapping->a_ops = &btrfs_symlink_aops;
3551                 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3552                 break;
3553         default:
3554                 inode->i_op = &btrfs_special_inode_operations;
3555                 init_special_inode(inode, inode->i_mode, rdev);
3556                 break;
3557         }
3558 
3559         btrfs_update_iflags(inode);
3560         return;
3561 
3562 make_bad:
3563         btrfs_free_path(path);
3564         make_bad_inode(inode);
3565 }
3566 
3567 /*
3568  * given a leaf and an inode, copy the inode fields into the leaf
3569  */
3570 static void fill_inode_item(struct btrfs_trans_handle *trans,
3571                             struct extent_buffer *leaf,
3572                             struct btrfs_inode_item *item,
3573                             struct inode *inode)
3574 {
3575         struct btrfs_map_token token;
3576 
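             /*
              * the map token caches the currently mapped extent buffer page so
              * the set_token helpers below avoid remapping it for every field.
              */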
3577         btrfs_init_map_token(&token);
3578 
3579         btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3580         btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3581         btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
3582                                    &token);
3583         btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3584         btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3585 
3586         btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
3587                                      inode->i_atime.tv_sec, &token);
3588         btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
3589                                       inode->i_atime.tv_nsec, &token);
3590 
3591         btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
3592                                      inode->i_mtime.tv_sec, &token);
3593         btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
3594                                       inode->i_mtime.tv_nsec, &token);
3595 
3596         btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
3597                                      inode->i_ctime.tv_sec, &token);
3598         btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
3599                                       inode->i_ctime.tv_nsec, &token);
3600 
3601         btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3602                                      &token);
3603         btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
3604                                          &token);
3605         btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3606         btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3607         btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3608         btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3609         btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3610 }
3611 
3612 /*
3613  * copy everything in the in-memory inode into its inode item in the btree.
3614  */
3615 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
3616                                 struct btrfs_root *root, struct inode *inode)
3617 {
3618         struct btrfs_inode_item *inode_item;
3619         struct btrfs_path *path;
3620         struct extent_buffer *leaf;
3621         int ret;
3622 
3623         path = btrfs_alloc_path();
3624         if (!path)
3625                 return -ENOMEM;
3626 
3627         path->leave_spinning = 1;
3628         ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
3629                                  1);
3630         if (ret) {
3631                 if (ret > 0)
3632                         ret = -ENOENT;
3633                 goto failed;
3634         }
3635 
3636         leaf = path->nodes[0];
3637         inode_item = btrfs_item_ptr(leaf, path->slots[0],
3638                                     struct btrfs_inode_item);
3639 
3640         fill_inode_item(trans, leaf, inode_item, inode);
3641         btrfs_mark_buffer_dirty(leaf);
3642         btrfs_set_inode_last_trans(trans, inode);
3643         ret = 0;
3644 failed:
3645         btrfs_free_path(path);
3646         return ret;
3647 }
3648 
3649 /*
3650  * copy everything in the in-memory inode into the btree, via the delayed path when safe.
3651  */
3652 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
3653                                 struct btrfs_root *root, struct inode *inode)
3654 {
3655         int ret;
3656 
3657         /*
3658          * If the inode is a free space inode, we can deadlock during commit
3659          * if we put it into the delayed code.
3660          *
3661          * The data relocation inode should also be directly updated
3662          * without delay.
3663          */
3664         if (!btrfs_is_free_space_inode(inode)
3665             && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
3666             && !root->fs_info->log_root_recovering) {
3667                 btrfs_update_root_times(trans, root);
3668 
3669                 ret = btrfs_delayed_update_inode(trans, root, inode);
3670                 if (!ret)
3671                         btrfs_set_inode_last_trans(trans, inode);
3672                 return ret;
3673         }
3674 
3675         return btrfs_update_inode_item(trans, root, inode);
3676 }
3677 
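     /*
      * like btrfs_update_inode(), but if the delayed path fails with
      * -ENOSPC (its reservation can be exhausted), retry by writing the
      * inode item directly.
      */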
3678 noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
3679                                          struct btrfs_root *root,
3680                                          struct inode *inode)
3681 {
3682         int ret;
3683 
3684         ret = btrfs_update_inode(trans, root, inode);
3685         if (ret == -ENOSPC)
3686                 return btrfs_update_inode_item(trans, root, inode);
3687         return ret;
3688 }
3689 
3690 /*
3691  * unlink helper that gets used here in inode.c and in the tree logging
3692  * recovery code.  It removes a link in a directory with a given name, and
3693  * also drops the back refs in the inode to the directory.
3694  */
3695 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
3696                                 struct btrfs_root *root,
3697                                 struct inode *dir, struct inode *inode,
3698                                 const char *name, int name_len)
3699 {
3700         struct btrfs_path *path;
3701         int ret = 0;
3702         struct extent_buffer *leaf;
3703         struct btrfs_dir_item *di;
3704         struct btrfs_key key;
3705         u64 index;
3706         u64 ino = btrfs_ino(inode);
3707         u64 dir_ino = btrfs_ino(dir);
3708 
3709         path = btrfs_alloc_path();
3710         if (!path) {
3711                 ret = -ENOMEM;
3712                 goto out;
3713         }
3714 
3715         path->leave_spinning = 1;
3716         di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
3717                                     name, name_len, -1);
3718         if (IS_ERR(di)) {
3719                 ret = PTR_ERR(di);
3720                 goto err;
3721         }
3722         if (!di) {
3723                 ret = -ENOENT;
3724                 goto err;
3725         }
3726         leaf = path->nodes[0];
3727         btrfs_dir_item_key_to_cpu(leaf, di, &key);
3728         ret = btrfs_delete_one_dir_name(trans, root, path, di);
3729         if (ret)
3730                 goto err;
3731         btrfs_release_path(path);
3732 
3733         /*
3734          * If we don't have a dir index, we have to get it by looking up
3735          * the inode ref; since we fetch the inode ref anyway, we remove
3736          * it directly and there is no need for delayed deletion.
3737          *
3738          * But if we do have a dir index, we needn't search the inode ref
3739          * to get it.  Since the inode ref is close to the inode item, it
3740          * is better to delay its deletion and do it when we update the
3741          * inode item.
3742          */
3743         if (BTRFS_I(inode)->dir_index) {
3744                 ret = btrfs_delayed_delete_inode_ref(inode);
3745                 if (!ret) {
3746                         index = BTRFS_I(inode)->dir_index;
3747                         goto skip_backref;
3748                 }
3749         }
3750 
3751         ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
3752                                   dir_ino, &index);
3753         if (ret) {
3754                 btrfs_info(root->fs_info,
3755                         "failed to delete reference to %.*s, inode %llu parent %llu",
3756                         name_len, name, ino, dir_ino);
3757                 btrfs_abort_transaction(trans, root, ret);
3758                 goto err;
3759         }
3760 skip_backref:
3761         ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
3762         if (ret) {
3763                 btrfs_abort_transaction(trans, root, ret);
3764                 goto err;
3765         }
3766 
3767         ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
3768                                          inode, dir_ino);
3769         if (ret != 0 && ret != -ENOENT) {
3770                 btrfs_abort_transaction(trans, root, ret);
3771                 goto err;
3772         }
3773 
3774         ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
3775                                            dir, index);
3776         if (ret == -ENOENT)
3777                 ret = 0;
3778         else if (ret)
3779                 btrfs_abort_transaction(trans, root, ret);
3780 err:
3781         btrfs_free_path(path);
3782         if (ret)
3783                 goto out;
3784 
3785         btrfs_i_size_write(dir, dir->i_size - name_len * 2);
3786         inode_inc_iversion(inode);
3787         inode_inc_iversion(dir);
3788         inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
3789         ret = btrfs_update_inode(trans, root, dir);
3790 out:
3791         return ret;
3792 }
3793 
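     /*
      * remove one directory entry for @inode and drop a link count; the
      * caller is responsible for orphan handling if i_nlink reaches zero.
      */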
3794 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
3795                        struct btrfs_root *root,
3796                        struct inode *dir, struct inode *inode,
3797                        const char *name, int name_len)
3798 {
3799         int ret;
3800         ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
3801         if (!ret) {
3802                 drop_nlink(inode);
3803                 ret = btrfs_update_inode(trans, root, inode);
3804         }
3805         return ret;
3806 }
3807 
3808 /*
3809  * helper to start transaction for unlink and rmdir.
3810  *
3811  * unlink and rmdir are special in btrfs: they do not always free space, so
3812  * if we cannot make our reservations the normal way, try to see if there is
3813  * plenty of slack room in the global reserve to migrate; otherwise we cannot
3814  * allow the unlink to occur.
3815  */
3816 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
3817 {
3818         struct btrfs_trans_handle *trans;
3819         struct btrfs_root *root = BTRFS_I(dir)->root;
3820         int ret;
3821 
3822         /*
3823          * 1 for the possible orphan item
3824          * 1 for the dir item
3825          * 1 for the dir index
3826          * 1 for the inode ref
3827          * 1 for the inode
3828          */
3829         trans = btrfs_start_transaction(root, 5);
3830         if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
3831                 return trans;
3832 
3833         if (PTR_ERR(trans) == -ENOSPC) {
3834                 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
3835 
3836                 trans = btrfs_start_transaction(root, 0);
3837                 if (IS_ERR(trans))
3838                         return trans;
3839                 ret = btrfs_cond_migrate_bytes(root->fs_info,
3840                                                &root->fs_info->trans_block_rsv,
3841                                                num_bytes, 5);
3842                 if (ret) {
3843                         btrfs_end_transaction(trans, root);
3844                         return ERR_PTR(ret);
3845                 }
3846                 trans->block_rsv = &root->fs_info->trans_block_rsv;
3847                 trans->bytes_reserved = num_bytes;
3848         }
3849         return trans;
3850 }
3851 
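     /* .unlink callback: drop one name, orphaning the inode if i_nlink hits 0 */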
3852 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3853 {
3854         struct btrfs_root *root = BTRFS_I(dir)->root;
3855         struct btrfs_trans_handle *trans;
3856         struct inode *inode = dentry->d_inode;
3857         int ret;
3858 
3859         trans = __unlink_start_trans(dir);
3860         if (IS_ERR(trans))
3861                 return PTR_ERR(trans);
3862 
3863         btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
3864 
3865         ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
3866                                  dentry->d_name.name, dentry->d_name.len);
3867         if (ret)
3868                 goto out;
3869 
3870         if (inode->i_nlink == 0) {
3871                 ret = btrfs_orphan_add(trans, inode);
3872                 if (ret)
3873                         goto out;
3874         }
3875 
3876 out:
3877         btrfs_end_transaction(trans, root);
3878         btrfs_btree_balance_dirty(root);
3879         return ret;
3880 }
3881 
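     /*
      * removing a subvolume from a directory is special: the dir item
      * points at a ROOT_ITEM key and the backref lives in the tree root
      * as a root ref, not an inode ref, so this open codes the unlink.
      */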
3882 int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
3883                         struct btrfs_root *root,
3884                         struct inode *dir, u64 objectid,
3885                         const char *name, int name_len)
3886 {
3887         struct btrfs_path *path;
3888         struct extent_buffer *leaf;
3889         struct btrfs_dir_item *di;
3890         struct btrfs_key key;
3891         u64 index;
3892         int ret;
3893         u64 dir_ino = btrfs_ino(dir);
3894 
3895         path = btrfs_alloc_path();
3896         if (!path)
3897                 return -ENOMEM;
3898 
3899         di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
3900                                    name, name_len, -1);
3901         if (IS_ERR_OR_NULL(di)) {
3902                 if (!di)
3903                         ret = -ENOENT;
3904                 else
3905                         ret = PTR_ERR(di);
3906                 goto out;
3907         }
3908 
3909         leaf = path->nodes[0];
3910         btrfs_dir_item_key_to_cpu(leaf, di, &key);
3911         WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
3912         ret = btrfs_delete_one_dir_name(trans, root, path, di);
3913         if (ret) {
3914                 btrfs_abort_transaction(trans, root, ret);
3915                 goto out;
3916         }
3917         btrfs_release_path(path);
3918 
3919         ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
3920                                  objectid, root->root_key.objectid,
3921                                  dir_ino, &index, name, name_len);
3922         if (ret < 0) {
3923                 if (ret != -ENOENT) {
3924                         btrfs_abort_transaction(trans, root, ret);
3925                         goto out;
3926                 }
3927                 di = btrfs_search_dir_index_item(root, path, dir_ino,
3928                                                  name, name_len);
3929                 if (IS_ERR_OR_NULL(di)) {
3930                         if (!di)
3931                                 ret = -ENOENT;
3932                         else
3933                                 ret = PTR_ERR(di);
3934                         btrfs_abort_transaction(trans, root, ret);
3935                         goto out;
3936                 }
3937 
3938                 leaf = path->nodes[0];
3939                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3940                 btrfs_release_path(path);
3941                 index = key.offset;
3942         }
3943         btrfs_release_path(path);
3944 
3945         ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
3946         if (ret) {
3947                 btrfs_abort_transaction(trans, root, ret);
3948                 goto out;
3949         }
3950 
3951         btrfs_i_size_write(dir, dir->i_size - name_len * 2);
3952         inode_inc_iversion(dir);
3953         dir->i_mtime = dir->i_ctime = CURRENT_TIME;
3954         ret = btrfs_update_inode_fallback(trans, root, dir);
3955         if (ret)
3956                 btrfs_abort_transaction(trans, root, ret);
3957 out:
3958         btrfs_free_path(path);
3959         return ret;
3960 }
3961 
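     /*
      * rmdir: only empty directories can be removed.  Subvolume roots are
      * rejected with -EPERM, and the dummy empty-subvolume directory is
      * handled by btrfs_unlink_subvol() instead.
      */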
3962 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
3963 {
3964         struct inode *inode = dentry->d_inode;
3965         int err = 0;
3966         struct btrfs_root *root = BTRFS_I(dir)->root;
3967         struct btrfs_trans_handle *trans;
3968 
3969         if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
3970                 return -ENOTEMPTY;
3971         if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
3972                 return -EPERM;
3973 
3974         trans = __unlink_start_trans(dir);
3975         if (IS_ERR(trans))
3976                 return PTR_ERR(trans);
3977 
3978         if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
3979                 err = btrfs_unlink_subvol(trans, root, dir,
3980                                           BTRFS_I(inode)->location.objectid,
3981                                           dentry->d_name.name,
3982                                           dentry->d_name.len);
3983                 goto out;
3984         }
3985 
3986         err = btrfs_orphan_add(trans, inode);
3987         if (err)
3988                 goto out;
3989 
3990         /* now the directory is empty */
3991         err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
3992                                  dentry->d_name.name, dentry->d_name.len);
3993         if (!err)
3994                 btrfs_i_size_write(inode, 0);
3995 out:
3996         btrfs_end_transaction(trans, root);
3997         btrfs_btree_balance_dirty(root);
3998 
3999         return err;
4000 }
4001 
4002 /*
4003  * this can truncate away extent items, csum items and directory items.
4004  * It starts at a high offset and removes keys until it can't find
4005  * any higher than new_size.
4006  *
4007  * csum items that cross the new i_size are truncated to the new size
4008  * as well.
4009  *
4010  * min_type is the minimum key type to truncate down to.  If set to 0, this
4011  * will kill all the items on this inode, including the INODE_ITEM_KEY.
4012  */
4013 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
4014                                struct btrfs_root *root,
4015                                struct inode *inode,
4016                                u64 new_size, u32 min_type)
4017 {
4018         struct btrfs_path *path;
4019         struct extent_buffer *leaf;
4020         struct btrfs_file_extent_item *fi;
4021         struct btrfs_key key;
4022         struct btrfs_key found_key;
4023         u64 extent_start = 0;
4024         u64 extent_num_bytes = 0;
4025         u64 extent_offset = 0;
4026         u64 item_end = 0;
4027         u64 last_size = (u64)-1;
4028         u32 found_type = (u8)-1;
4029         int found_extent;
4030         int del_item;
4031         int pending_del_nr = 0;
4032         int pending_del_slot = 0;
4033         int extent_type = -1;
4034         int ret;
4035         int err = 0;
4036         u64 ino = btrfs_ino(inode);
4037 
4038         BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
4039 
4040         path = btrfs_alloc_path();
4041         if (!path)
4042                 return -ENOMEM;
4043         path->reada = -1;
4044 
4045         /*
4046          * We want to drop from the next block forward in case this new size is
4047          * not block aligned since we will be keeping the last block of the
4048          * extent just the way it is.
4049          */
4050         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4051             root == root->fs_info->tree_root)
4052                 btrfs_drop_extent_cache(inode, ALIGN(new_size,
4053                                         root->sectorsize), (u64)-1, 0);
4054 
4055         /*
4056          * This function is also used to drop the items in the log tree before
4057          * we relog the inode, so if root != BTRFS_I(inode)->root, it means
4058          * it is used to drop the logged items. So we shouldn't kill the delayed
4059          * items.
4060          */
4061         if (min_type == 0 && root == BTRFS_I(inode)->root)
4062                 btrfs_kill_delayed_inode_items(inode);
4063 
4064         key.objectid = ino;
4065         key.offset = (u64)-1;
4066         key.type = (u8)-1;
4067 
4068 search_again:
4069         path->leave_spinning = 1;
4070         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
4071         if (ret < 0) {
4072                 err = ret;
4073                 goto out;
4074         }
4075 
4076         if (ret > 0) {
4077                 /* there are no items in the tree for us to truncate, we're
4078                  * done
4079                  */
4080                 if (path->slots[0] == 0)
4081                         goto out;
4082                 path->slots[0]--;
4083         }
4084 
4085         while (1) {
4086                 fi = NULL;
4087                 leaf = path->nodes[0];
4088                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4089                 found_type = btrfs_key_type(&found_key);
4090 
4091                 if (found_key.objectid != ino)
4092                         break;
4093 
4094                 if (found_type < min_type)
4095                         break;
4096 
4097                 item_end = found_key.offset;
4098                 if (found_type == BTRFS_EXTENT_DATA_KEY) {
4099                         fi = btrfs_item_ptr(leaf, path->slots[0],
4100                                             struct btrfs_file_extent_item);
4101                         extent_type = btrfs_file_extent_type(leaf, fi);
4102                         if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4103                                 item_end +=
4104                                     btrfs_file_extent_num_bytes(leaf, fi);
4105                         } else {
4106                                 item_end += btrfs_file_extent_inline_len(leaf,
4107                                                          path->slots[0], fi);
4108                         }
4109                         item_end--;
4110                 }
4111                 if (found_type > min_type) {
4112                         del_item = 1;
4113                 } else {
4114                         if (item_end < new_size)
4115                                 break;
4116                         if (found_key.offset >= new_size)
4117                                 del_item = 1;
4118                         else
4119                                 del_item = 0;
4120                 }
4121                 found_extent = 0;
4122                 /* FIXME, shrink the extent if the ref count is only 1 */
4123                 if (found_type != BTRFS_EXTENT_DATA_KEY)
4124                         goto delete;
4125 
4126                 if (del_item)
4127                         last_size = found_key.offset;
4128                 else
4129                         last_size = new_size;
4130 
4131                 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4132                         u64 num_dec;
4133                         extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
4134                         if (!del_item) {
4135                                 u64 orig_num_bytes =
4136                                         btrfs_file_extent_num_bytes(leaf, fi);
4137                                 extent_num_bytes = ALIGN(new_size -
4138                                                 found_key.offset,
4139                                                 root->sectorsize);
4140                                 btrfs_set_file_extent_num_bytes(leaf, fi,
4141                                                          extent_num_bytes);
4142                                 num_dec = (orig_num_bytes -
4143                                            extent_num_bytes);
4144                                 if (test_bit(BTRFS_ROOT_REF_COWS,
4145                                              &root->state) &&
4146                                     extent_start != 0)
4147                                         inode_sub_bytes(inode, num_dec);
4148                                 btrfs_mark_buffer_dirty(leaf);
4149                         } else {
4150                                 extent_num_bytes =
4151                                         btrfs_file_extent_disk_num_bytes(leaf,
4152                                                                          fi);
4153                                 extent_offset = found_key.offset -
4154                                         btrfs_file_extent_offset(leaf, fi);
4155 
4156                                 /* FIXME blocksize != 4096 */
4157                                 num_dec = btrfs_file_extent_num_bytes(leaf, fi);
4158                                 if (extent_start != 0) {
4159                                         found_extent = 1;
4160                                         if (test_bit(BTRFS_ROOT_REF_COWS,
4161                                                      &root->state))
4162                                                 inode_sub_bytes(inode, num_dec);
4163                                 }
4164                         }
4165                 } else {
4166                         /*
4167                          * we can't truncate inline items that have had
4168                          * special encodings
4169                          */
4170                         if (!del_item &&
4171                             btrfs_file_extent_compression(leaf, fi) == 0 &&
4172                             btrfs_file_extent_encryption(leaf, fi) == 0 &&
4173                             btrfs_file_extent_other_encoding(leaf, fi) == 0) {
4174                                 u32 size = new_size - found_key.offset;
4175 
4176                                 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4177                                         inode_sub_bytes(inode, item_end + 1 -
4178                                                         new_size);
4179 
4180                                 /*
4181                                  * update the ram bytes to properly reflect
4182                                  * the new size of our item
4183                                  */
4184                                 btrfs_set_file_extent_ram_bytes(leaf, fi, size);
4185                                 size =
4186                                     btrfs_file_extent_calc_inline_size(size);
4187                                 btrfs_truncate_item(root, path, size, 1);
4188                         } else if (test_bit(BTRFS_ROOT_REF_COWS,
4189                                             &root->state)) {
4190                                 inode_sub_bytes(inode, item_end + 1 -
4191                                                 found_key.offset);
4192                         }
4193                 }
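                     /*
                      * Deletions are batched: contiguous leaf slots
                      * accumulate in pending_del_slot/pending_del_nr and
                      * are removed with a single btrfs_del_items() call.
                      */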
4194 delete:
4195                 if (del_item) {
4196                         if (!pending_del_nr) {
4197                                 /* no pending yet, add ourselves */
4198                                 pending_del_slot = path->slots[0];
4199                                 pending_del_nr = 1;
4200                         } else if (pending_del_nr &&
4201                                    path->slots[0] + 1 == pending_del_slot) {
4202                                 /* hop on the pending chunk */
4203                                 pending_del_nr++;
4204                                 pending_del_slot = path->slots[0];
4205                         } else {
4206                                 BUG();
4207                         }
4208                 } else {
4209                         break;
4210                 }
4211                 if (found_extent &&
4212                     (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4213                      root == root->fs_info->tree_root)) {
4214                         btrfs_set_path_blocking(path);
4215                         ret = btrfs_free_extent(trans, root, extent_start,
4216                                                 extent_num_bytes, 0,
4217                                                 btrfs_header_owner(leaf),
4218                                                 ino, extent_offset, 0);
4219                         BUG_ON(ret);
4220                 }
4221 
4222                 if (found_type == BTRFS_INODE_ITEM_KEY)
4223                         break;
4224 
4225                 if (path->slots[0] == 0 ||
4226                     path->slots[0] != pending_del_slot) {
4227                         if (pending_del_nr) {
4228                                 ret = btrfs_del_items(trans, root, path,
4229                                                 pending_del_slot,
4230                                                 pending_del_nr);
4231                                 if (ret) {
4232                                         btrfs_abort_transaction(trans,
4233                                                                 root, ret);
4234                                         goto error;
4235                                 }
4236                                 pending_del_nr = 0;
4237                         }
4238                         btrfs_release_path(path);
4239                         goto search_again;
4240                 } else {
4241                         path->slots[0]--;
4242                 }
4243         }
4244 out:
4245         if (pending_del_nr) {
4246                 ret = btrfs_del_items(trans, root, path, pending_del_slot,
4247                                       pending_del_nr);
4248                 if (ret)
4249                         btrfs_abort_transaction(trans, root, ret);
4250         }
4251 error:
4252         if (last_size != (u64)-1 &&
4253             root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
4254                 btrfs_ordered_update_i_size(inode, last_size, NULL);
4255         btrfs_free_path(path);
4256         return err;
4257 }
4258 
4259 /*
4260  * btrfs_truncate_page - read, zero a chunk and write a page
4261  * @inode - inode that we're zeroing
4262  * @from - the offset to start zeroing
4263  * @len - the length to zero, 0 to zero the entire range relative to the
4264  *      offset
4265  * @front - zero up to the offset instead of from the offset on
4266  *
4267  * This will find the page for the "from" offset, cow the page and zero the
4268  * part we want to zero.  This is used with truncate and hole punching.
4269  */
4270 int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
4271                         int front)
4272 {
4273         struct address_space *mapping = inode->i_mapping;
4274         struct btrfs_root *root = BTRFS_I(inode)->root;
4275         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4276         struct btrfs_ordered_extent *ordered;
4277         struct extent_state *cached_state = NULL;
4278         char *kaddr;
4279         u32 blocksize = root->sectorsize;
4280         pgoff_t index = from >> PAGE_CACHE_SHIFT;
4281         unsigned offset = from & (PAGE_CACHE_SIZE-1);
4282         struct page *page;
4283         gfp_t mask = btrfs_alloc_write_mask(mapping);
4284         int ret = 0;
4285         u64 page_start;
4286         u64 page_end;
4287 
4288         if ((offset & (blocksize - 1)) == 0 &&
4289             (!len || ((len & (blocksize - 1)) == 0)))
4290                 goto out;
4291         ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
4292         if (ret)
4293                 goto out;
4294 
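             /*
              * Lock the page and its extent range; if an ordered extent is
              * still pending for this page, wait for it to finish and
              * retry, so the zeroing never races with in-flight writeback.
              */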
4295 again:
4296         page = find_or_create_page(mapping, index, mask);
4297         if (!page) {
4298                 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
4299                 ret = -ENOMEM;
4300                 goto out;
4301         }
4302 
4303         page_start = page_offset(page);
4304         page_end = page_start + PAGE_CACHE_SIZE - 1;
4305 
4306         if (!PageUptodate(page)) {
4307                 ret = btrfs_readpage(NULL, page);
4308                 lock_page(page);
4309                 if (page->mapping != mapping) {
4310                         unlock_page(page);
4311                         page_cache_release(page);
4312                         goto again;
4313                 }
4314                 if (!PageUptodate(page)) {
4315                         ret = -EIO;
4316                         goto out_unlock;
4317                 }
4318         }
4319         wait_on_page_writeback(page);
4320 
4321         lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
4322         set_page_extent_mapped(page);
4323 
4324         ordered = btrfs_lookup_ordered_extent(inode, page_start);
4325         if (ordered) {
4326                 unlock_extent_cached(io_tree, page_start, page_end,
4327                                      &cached_state, GFP_NOFS);
4328                 unlock_page(page);
4329                 page_cache_release(page);
4330                 btrfs_start_ordered_extent(inode, ordered, 1);
4331                 btrfs_put_ordered_extent(ordered);
4332                 goto again;
4333         }
4334 
4335         clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
4336                           EXTENT_DIRTY | EXTENT_DELALLOC |
4337                           EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4338                           0, 0, &cached_state, GFP_NOFS);
4339 
4340         ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
4341                                         &cached_state);
4342         if (ret) {
4343                 unlock_extent_cached(io_tree, page_start, page_end,
4344                                      &cached_state, GFP_NOFS);
4345                 goto out_unlock;
4346         }
4347 
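             /*
              * Zero the range through the page cache: the head of the page
              * up to @offset when @front is set, otherwise @len bytes from
              * @offset on, then dirty the page so delalloc writes it back.
              */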
4348         if (offset != PAGE_CACHE_SIZE) {
4349                 if (!len)
4350                         len = PAGE_CACHE_SIZE - offset;
4351                 kaddr = kmap(page);
4352                 if (front)
4353                         memset(kaddr, 0, offset);
4354                 else
4355                         memset(kaddr + offset, 0, len);
4356                 flush_dcache_page(page);
4357                 kunmap(page);
4358         }
4359         ClearPageChecked(page);
4360         set_page_dirty(page);
4361         unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
4362                              GFP_NOFS);
4363 
4364 out_unlock:
4365         if (ret)
4366                 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
4367         unlock_page(page);
4368         page_cache_release(page);
4369 out:
4370         return ret;
4371 }
4372 
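     /*
      * Make sure the range [offset, offset + len) reads back as a hole.
      * With the NO_HOLES incompat feature nothing is inserted on disk;
      * otherwise any old extents are dropped and an explicit hole extent
      * is inserted in their place.
      */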
4373 static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
4374                              u64 offset, u64 len)
4375 {
4376         struct btrfs_trans_handle *trans;
4377         int ret;
4378 
4379         /*
4380          * Still need to make sure the inode looks like it's been updated so
4381          * that any holes get logged if we fsync.
4382          */
4383         if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) {
4384                 BTRFS_I(inode)->last_trans = root->fs_info->generation;
4385                 BTRFS_I(inode)->last_sub_trans = root->log_transid;
4386                 BTRFS_I(inode)->last_log_commit = root->last_log_commit;
4387                 return 0;
4388         }
4389 
4390         /*
4391          * 1 - for the one we're dropping
4392          * 1 - for the one we're adding
4393          * 1 - for updating the inode.
4394          */
4395         trans = btrfs_start_transaction(root, 3);
4396         if (IS_ERR(trans))
4397                 return PTR_ERR(trans);
4398 
4399         ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
4400         if (ret) {
4401                 btrfs_abort_transaction(trans, root, ret);
4402                 btrfs_end_transaction(trans, root);
4403                 return ret;
4404         }
4405 
4406         ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
4407                                        0, 0, len, 0, len, 0, 0, 0);
4408         if (ret)
4409                 btrfs_abort_transaction(trans, root, ret);
4410         else
4411                 btrfs_update_inode(trans, root, inode);
4412         btrfs_end_transaction(trans, root);
4413         return ret;
4414 }
4415 
4416 /*
4417  * This function puts in dummy file extents for the area we're creating a hole
4418  * for.  So if we are truncating this file to a larger size we need to insert
4419  * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE for
4420  * the range between oldsize and size.
4421  */
4422 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4423 {
4424         struct btrfs_root *root = BTRFS_I(inode)->root;
4425         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4426         struct extent_map *em = NULL;
4427         struct extent_state *cached_state = NULL;
4428         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4429         u64 hole_start = ALIGN(oldsize, root->sectorsize);
4430         u64 block_end = ALIGN(size, root->sectorsize);
4431         u64 last_byte;
4432         u64 cur_offset;
4433         u64 hole_size;
4434         int err = 0;
4435 
4436         /*
4437          * If our size started in the middle of a page we need to zero out the
4438          * rest of the page before we expand the i_size, otherwise we could
4439          * expose stale data.
4440          */
4441         err = btrfs_truncate_page(inode, oldsize, 0, 0);
4442         if (err)
4443                 return err;
4444 
4445         if (size <= hole_start)
4446                 return 0;
4447 
4448         while (1) {
4449                 struct btrfs_ordered_extent *ordered;
4450 
4451                 lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
4452                                  &cached_state);
4453                 ordered = btrfs_lookup_ordered_range(inode, hole_start,
4454                                                      block_end - hole_start);
4455                 if (!ordered)
4456                         break;
4457                 unlock_extent_cached(io_tree, hole_start, block_end - 1,
4458                                      &cached_state, GFP_NOFS);
4459                 btrfs_start_ordered_extent(inode, ordered, 1);
4460                 btrfs_put_ordered_extent(ordered);
4461         }
4462 
4463         cur_offset = hole_start;
4464         while (1) {
4465                 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4466                                 block_end - cur_offset, 0);
4467                 if (IS_ERR(em)) {
4468                         err = PTR_ERR(em);
4469                         em = NULL;
4470                         break;
4471                 }
4472                 last_byte = min(extent_map_end(em), block_end);
4473                 last_byte = ALIGN(last_byte, root->sectorsize);
4474                 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
4475                         struct extent_map *hole_em;
4476                         hole_size = last_byte - cur_offset;
4477 
4478                         err = maybe_insert_hole(root, inode, cur_offset,
4479                                                 hole_size);
4480                         if (err)
4481                                 break;
4482                         btrfs_drop_extent_cache(inode, cur_offset,
4483                                                 cur_offset + hole_size - 1, 0);
4484                         hole_em = alloc_extent_map();
4485                         if (!hole_em) {
4486                                 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4487                                         &BTRFS_I(inode)->runtime_flags);
4488                                 goto next;
4489                         }
4490                         hole_em->start = cur_offset;
4491                         hole_em->len = hole_size;
4492                         hole_em->orig_start = cur_offset;
4493 
4494                         hole_em->block_start = EXTENT_MAP_HOLE;
4495                         hole_em->block_len = 0;
4496                         hole_em->orig_block_len = 0;
4497                         hole_em->ram_bytes = hole_size;
4498                         hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
4499                         hole_em->compress_type = BTRFS_COMPRESS_NONE;
4500                         hole_em->generation = root->fs_info->generation;
4501 
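                             /*
                              * Insert the hole mapping, retrying after
                              * dropping any racing cached mapping when the
                              * range is already occupied (-EEXIST).
                              */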
4502                         while (1) {
4503                                 write_lock(&em_tree->lock);
4504                                 err = add_extent_mapping(em_tree, hole_em, 1);
4505                                 write_unlock(&em_tree->lock);
4506                                 if (err != -EEXIST)
4507                                         break;
4508                                 btrfs_drop_extent_cache(inode, cur_offset,
4509                                                         cur_offset +
4510                                                         hole_size - 1, 0);
4511                         }
4512                         free_extent_map(hole_em);
4513                 }
4514 next:
4515                 free_extent_map(em);
4516                 em = NULL;
4517                 cur_offset = last_byte;
4518                 if (cur_offset >= block_end)
4519                         break;
4520         }
4521         free_extent_map(em);
4522         unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
4523                              GFP_NOFS);
4524         return err;
4525 }
4526 
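     /*
      * Handle ATTR_SIZE changes.  Growing zeroes the tail of the last page
      * and inserts hole extents via btrfs_cont_expand(); shrinking adds an
      * orphan item first, so a crash mid-truncate can be cleaned up, and
      * then truncates the items on disk.
      */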
4527 static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4528 {
4529         struct btrfs_root *root = BTRFS_I(inode)->root;
4530         struct btrfs_trans_handle *trans;
4531         loff_t oldsize = i_size_read(inode);
4532         loff_t newsize = attr->ia_size;
4533         int mask = attr->ia_valid;
4534         int ret;
4535 
4536         /*
4537          * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
4538          * special case where we need to update the times despite not having
4539          * these flags set.  For all other operations the VFS sets these flags
4540          * explicitly if it wants a timestamp update.
4541          */
4542         if (newsize != oldsize) {
4543                 inode_inc_iversion(inode);
4544                 if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
4545                         inode->i_ctime = inode->i_mtime =
4546                                 current_fs_time(inode->i_sb);
4547         }
4548 
4549         if (newsize > oldsize) {
4550                 truncate_pagecache(inode, newsize);
4551                 ret = btrfs_cont_expand(inode, oldsize, newsize);
4552                 if (ret)
4553                         return ret;
4554 
4555                 trans = btrfs_start_transaction(root, 1);
4556                 if (IS_ERR(trans))
4557                         return PTR_ERR(trans);
4558 
4559                 i_size_write(inode, newsize);
4560                 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
4561                 ret = btrfs_update_inode(trans, root, inode);
4562                 btrfs_end_transaction(trans, root);
4563         } else {
4564 
4565                 /*
4566                  * We're truncating a file that used to have good data down to
4567                  * zero. Make sure it gets into the ordered flush list so that
4568                  * any new writes get down to disk quickly.
4569                  */
4570                 if (newsize == 0)
4571                         set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
4572                                 &BTRFS_I(inode)->runtime_flags);
4573 
4574                 /*
4575                  * 1 for the orphan item we're going to add
4576                  * 1 for the orphan item deletion.
4577                  */
4578                 trans = btrfs_start_transaction(root, 2);
4579                 if (IS_ERR(trans))
4580                         return PTR_ERR(trans);
4581 
4582                 /*
4583                  * We need to do this in case we fail at _any_ point during the
4584                  * actual truncate.  Once we do the truncate_setsize we could
4585                  * invalidate pages which forces any outstanding ordered io to
4586                  * be instantly completed which will give us extents that need
4587                  * to be truncated.  If we fail to get an orphan inode down we
4588                  * could have left over extents that were never meant to live,
4589                  * so we need to guarantee from this point on that everything
4590                  * will be consistent.
4591                  */
4592                 ret = btrfs_orphan_add(trans, inode);
4593                 btrfs_end_transaction(trans, root);
4594                 if (ret)
4595                         return ret;
4596 
4597                 /* we don't support swapfiles, so vmtruncate shouldn't fail */
4598                 truncate_setsize(inode, newsize);
4599 
4600                 /* Disable nonlocked DIO reads to avoid an endless truncate */
4601                 btrfs_inode_block_unlocked_dio(inode);
4602                 inode_dio_wait(inode);
4603                 btrfs_inode_resume_unlocked_dio(inode);
4604 
4605                 ret = btrfs_truncate(inode);
4606                 if (ret && inode->i_nlink) {
4607                         int err;
4608 
4609                         /*
4610                          * failed to truncate: disk_i_size is only adjusted down
4611                          * as we remove extents, so it should represent the true
4612                          * size of the inode. Reset the in-memory size and
4613                          * delete our orphan entry.
4614                          */
4615                         trans = btrfs_join_transaction(root);
4616                         if (IS_ERR(trans)) {
4617                                 btrfs_orphan_del(NULL, inode);
4618                                 return ret;
4619                         }
4620                         i_size_write(inode, BTRFS_I(inode)->disk_i_size);
4621                         err = btrfs_orphan_del(trans, inode);
4622                         if (err)
4623                                 btrfs_abort_transaction(trans, root, err);
4624                         btrfs_end_transaction(trans, root);
4625                 }
4626         }
4627 
4628         return ret;
4629 }
4630 
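     /*
      * ->setattr: size changes are routed through btrfs_setsize(); the
      * remaining attributes are copied into the inode and, on mode
      * changes, the POSIX ACLs are updated to match.
      */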
4631 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
4632 {
4633         struct inode *inode = dentry->d_inode;
4634         struct btrfs_root *root = BTRFS_I(inode)->root;
4635         int err;
4636 
4637         if (btrfs_root_readonly(root))
4638                 return -EROFS;
4639 
4640         err = inode_change_ok(inode, attr);
4641         if (err)
4642                 return err;
4643 
4644         if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
4645                 err = btrfs_setsize(inode, attr);
4646                 if (err)
4647                         return err;
4648         }
4649 
4650         if (attr->ia_valid) {
4651                 setattr_copy(inode, attr);
4652                 inode_inc_iversion(inode);
4653                 err = btrfs_dirty_inode(inode);
4654 
4655                 if (!err && attr->ia_valid & ATTR_MODE)
4656                         err = posix_acl_chmod(inode, inode->i_mode);
4657         }
4658 
4659         return err;
4660 }
4661 
4662 /*
4663  * While truncating the inode pages during eviction, we get the VFS calling
4664  * btrfs_invalidatepage() against each page of the inode. This is slow because
4665  * the calls to btrfs_invalidatepage() result in a huge amount of calls to
4666  * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
4667  * extent_state structures over and over, wasting lots of time.
4668  *
4669  * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
4670  * those expensive operations on a per page basis and do only the ordered io
4671  * finishing, while we release here the extent_map and extent_state structures,
4672  * without the excessive merging and splitting.
4673  */
4674 static void evict_inode_truncate_pages(struct inode *inode)
4675 {
4676         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4677         struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
4678         struct rb_node *node;
4679 
4680         ASSERT(inode->i_state & I_FREEING);
4681         truncate_inode_pages_final(&inode->i_data);
4682 
4683         write_lock(&map_tree->lock);
4684         while (!RB_EMPTY_ROOT(&map_tree->map)) {
4685                 struct extent_map *em;
4686 
4687                 node = rb_first(&map_tree->map);
4688                 em = rb_entry(node, struct extent_map, rb_node);
4689                 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
4690                 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
4691                 remove_extent_mapping(map_tree, em);
4692                 free_extent_map(em);
4693                 if (need_resched()) {
4694                         write_unlock(&map_tree->lock);
4695                         cond_resched();
4696                         write_lock(&map_tree->lock);
4697                 }
4698         }
4699         write_unlock(&map_tree->lock);
4700 
4701         spin_lock(&io_tree->lock);
4702         while (!RB_EMPTY_ROOT(&io_tree->state)) {
4703                 struct extent_state *state;
4704                 struct extent_state *cached_state = NULL;
4705 
4706                 node = rb_first(&io_tree->state);
4707                 state = rb_entry(node, struct extent_state, rb_node);
4708                 atomic_inc(&state->refs);
4709                 spin_unlock(&io_tree->lock);
4710 
4711                 lock_extent_bits(io_tree, state->start, state->end,
4712                                  0, &cached_state);
4713                 clear_extent_bit(io_tree, state->start, state->end,
4714                                  EXTENT_LOCKED | EXTENT_DIRTY |
4715                                  EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
4716                                  EXTENT_DEFRAG, 1, 1,
4717                                  &cached_state, GFP_NOFS);
4718                 free_extent_state(state);
4719 
4720                 cond_resched();
4721                 spin_lock(&io_tree->lock);
4722         }
4723         spin_unlock(&io_tree->lock);
4724 }
4725 
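     /*
      * Final inode teardown.  Inodes that are still linked, or that belong
      * to a root being log-recovered, only have their pages dropped; for
      * everything else the inode's items are truncated away, refilling a
      * temporary block reserve (or stealing from the global one) as
      * needed, and the orphan item is removed once that succeeds.
      */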
4726 void btrfs_evict_inode(struct inode *inode)
4727 {
4728         struct btrfs_trans_handle *trans;
4729         struct btrfs_root *root = BTRFS_I(inode)->root;
4730         struct btrfs_block_rsv *rsv, *global_rsv;
4731         u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
4732         int ret;
4733 
4734         trace_btrfs_inode_evict(inode);
4735 
4736         evict_inode_truncate_pages(inode);
4737 
4738         if (inode->i_nlink &&
4739             ((btrfs_root_refs(&root->root_item) != 0 &&
4740               root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
4741              btrfs_is_free_space_inode(inode)))
4742                 goto no_delete;
4743 
4744         if (is_bad_inode(inode)) {
4745                 btrfs_orphan_del(NULL, inode);
4746                 goto no_delete;
4747         }
4748         /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
4749         btrfs_wait_ordered_range(inode, 0, (u64)-1);
4750 
4751         if (root->fs_info->log_root_recovering) {
4752                 BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
4753                                  &BTRFS_I(inode)->runtime_flags));
4754                 goto no_delete;
4755         }
4756 
4757         if (inode->i_nlink > 0) {
4758                 BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
4759                        root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
4760                 goto no_delete;
4761         }
4762 
4763         ret = btrfs_commit_inode_delayed_inode(inode);
4764         if (ret) {
4765                 btrfs_orphan_del(NULL, inode);
4766                 goto no_delete;
4767         }
4768 
4769         rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
4770         if (!rsv) {
4771                 btrfs_orphan_del(NULL, inode);
4772                 goto no_delete;
4773         }
4774         rsv->size = min_size;
4775         rsv->failfast = 1;
4776         global_rsv = &root->fs_info->global_block_rsv;
4777 
4778         btrfs_i_size_write(inode, 0);
4779 
4780         /*
4781          * This is a bit simpler than btrfs_truncate since we've already
4782          * reserved our space for our orphan item in the unlink, so we just
4783          * need to reserve some slack space in case we add bytes and update
4784          * the inode item when doing the truncate.
4785          */
4786         while (1) {
4787                 ret = btrfs_block_rsv_refill(root, rsv, min_size,
4788                                              BTRFS_RESERVE_FLUSH_LIMIT);
4789 
4790                 /*
4791                  * Try and steal from the global reserve since we will
4792                  * likely not use this space anyway; we want to try as
4793                  * hard as possible to get this to work.
4794                  */
4795                 if (ret)
4796                         ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
4797 
4798                 if (ret) {
4799                         btrfs_warn(root->fs_info,
4800                                 "Could not get space for a delete, will truncate on mount %d",
4801                                 ret);
4802                         btrfs_orphan_del(NULL, inode);
4803                         btrfs_free_block_rsv(root, rsv);
4804                         goto no_delete;
4805                 }
4806 
4807                 trans = btrfs_join_transaction(root);
4808                 if (IS_ERR(trans)) {
4809                         btrfs_orphan_del(NULL, inode);
4810                         btrfs_free_block_rsv(root, rsv);
4811                         goto no_delete;
4812                 }
4813 
4814                 trans->block_rsv = rsv;
4815 
4816                 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
4817                 if (ret != -ENOSPC)
4818                         break;
4819 
4820                 trans->block_rsv = &root->fs_info->trans_block_rsv;
4821                 btrfs_end_transaction(trans, root);
4822                 trans = NULL;
4823                 btrfs_btree_balance_dirty(root);
4824         }
4825 
4826         btrfs_free_block_rsv(root, rsv);
4827 
4828         /*
4829          * Errors here aren't a big deal; they just mean we leave orphan items
4830          * in the tree.  They will be cleaned up on the next mount.
4831          */
4832         if (ret == 0) {
4833                 trans->block_rsv = root->orphan_block_rsv;
4834                 btrfs_orphan_del(trans, inode);
4835         } else {
4836                 btrfs_orphan_del(NULL, inode);
4837         }
4838 
4839         trans->block_rsv = &root->fs_info->trans_block_rsv;
4840         if (!(root == root->fs_info->tree_root ||
4841               root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
4842                 btrfs_return_ino(root, btrfs_ino(inode));
4843 
4844         btrfs_end_transaction(trans, root);
4845         btrfs_btree_balance_dirty(root);
4846 no_delete:
4847         btrfs_remove_delayed_node(inode);
4848         clear_inode(inode);
4849         return;
4850 }
4851 
4852 /*
4853  * this returns the key found in the dir entry in the location pointer.
4854  * If no dir entries were found, location->objectid is 0.
4855  */
4856 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
4857                                struct btrfs_key *location)
4858 {
4859         const char *name = dentry->d_name.name;
4860         int namelen = dentry->d_name.len;
4861         struct btrfs_dir_item *di;
4862         struct btrfs_path *path;
4863         struct btrfs_root *root = BTRFS_I(dir)->root;
4864         int ret = 0;
4865 
4866         path = btrfs_alloc_path();
4867         if (!path)
4868                 return -ENOMEM;
4869 
4870         di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
4871                                     namelen, 0);
4872         if (IS_ERR(di))
4873                 ret = PTR_ERR(di);
4874 
4875         if (IS_ERR_OR_NULL(di))
4876                 goto out_err;
4877 
4878         btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
4879 out:
4880         btrfs_free_path(path);
4881         return ret;
4882 out_err:
4883         location->objectid = 0;
4884         goto out;
4885 }
4886 
4887 /*
4888  * when we hit a tree root in a directory, the btrfs part of the inode
4889  * needs to be changed to reflect the root directory of the tree root.  This
4890  * is kind of like crossing a mount point.
4891  */
4892 static int fixup_tree_root_location(struct btrfs_root *root,
4893                                     struct inode *dir,
4894                                     struct dentry *dentry,
4895                                     struct btrfs_key *location,
4896                                     struct btrfs_root **sub_root)
4897 {
4898         struct btrfs_path *path;
4899         struct btrfs_root *new_root;
4900         struct btrfs_root_ref *ref;
4901         struct extent_buffer *leaf;
4902         int ret;
4903         int err = 0;
4904 
4905         path = btrfs_alloc_path();
4906         if (!path) {
4907                 err = -ENOMEM;
4908                 goto out;
4909         }
4910 
4911         err = -ENOENT;
4912         ret = btrfs_find_item(root->fs_info->tree_root, path,
4913                                 BTRFS_I(dir)->root->root_key.objectid,
4914                                 location->objectid, BTRFS_ROOT_REF_KEY, NULL);
4915         if (ret) {
4916                 if (ret < 0)
4917                         err = ret;
4918                 goto out;
4919         }
4920 
4921         leaf = path->nodes[0];
4922         ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
4923         if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
4924             btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
4925                 goto out;
4926 
4927         ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
4928                                    (unsigned long)(ref + 1),
4929                                    dentry->d_name.len);
4930         if (ret)
4931                 goto out;
4932 
4933         btrfs_release_path(path);
4934 
4935         new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
4936         if (IS_ERR(new_root)) {
4937                 err = PTR_ERR(new_root);
4938                 goto out;
4939         }
4940 
4941         *sub_root = new_root;
4942         location->objectid = btrfs_root_dirid(&new_root->root_item);
4943         location->type = BTRFS_INODE_ITEM_KEY;
4944         location->offset = 0;
4945         err = 0;
4946 out:
4947         btrfs_free_path(path);
4948         return err;
4949 }
4950 
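     /*
      * Track the inode in the root's rbtree of in-memory inodes, keyed by
      * inode number.  A colliding node must belong to an inode that is
      * being freed and is simply replaced.
      */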
4951 static void inode_tree_add(struct inode *inode)
4952 {
4953         struct btrfs_root *root = BTRFS_I(inode)->root;
4954         struct btrfs_inode *entry;
4955         struct rb_node **p;
4956         struct rb_node *parent;
4957         struct rb_node *new = &BTRFS_I(inode)->rb_node;
4958         u64 ino = btrfs_ino(inode);
4959 
4960         if (inode_unhashed(inode))
4961                 return;
4962         parent = NULL;
4963         spin_lock(&root->inode_lock);
4964         p = &root->inode_tree.rb_node;
4965         while (*p) {
4966                 parent = *p;
4967                 entry = rb_entry(parent, struct btrfs_inode, rb_node);
4968 
4969                 if (ino < btrfs_ino(&entry->vfs_inode))
4970                         p = &parent->rb_left;
4971                 else if (ino > btrfs_ino(&entry->vfs_inode))
4972                         p = &parent->rb_right;
4973                 else {
4974                         WARN_ON(!(entry->vfs_inode.i_state &
4975                                   (I_WILL_FREE | I_FREEING)));
4976                         rb_replace_node(parent, new, &root->inode_tree);
4977                         RB_CLEAR_NODE(parent);
4978                         spin_unlock(&root->inode_lock);
4979                         return;
4980                 }
4981         }
4982         rb_link_node(new, parent, p);
4983         rb_insert_color(new, &root->inode_tree);
4984         spin_unlock(&root->inode_lock);
4985 }
4986 
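     /*
      * Drop the inode from the root's rbtree.  When the tree empties and
      * the root has no references left, the root is queued for cleanup.
      */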
4987 static void inode_tree_del(struct inode *inode)
4988 {
4989         struct btrfs_root *root = BTRFS_I(inode)->root;
4990         int empty = 0;
4991 
4992         spin_lock(&root->inode_lock);
4993         if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
4994                 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
4995                 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
4996                 empty = RB_EMPTY_ROOT(&root->inode_tree);
4997         }
4998         spin_unlock(&root->inode_lock);
4999 
5000         if (empty && btrfs_root_refs(&root->root_item) == 0) {
5001                 synchronize_srcu(&root->fs_info->subvol_srcu);
5002                 spin_lock(&root->inode_lock);
5003                 empty = RB_EMPTY_ROOT(&root->inode_tree);
5004                 spin_unlock(&root->inode_lock);
5005                 if (empty)
5006                         btrfs_add_dead_root(root);
5007         }
5008 }
5009 
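     /*
      * Walk all in-memory inodes of a root in inode-number order, pruning
      * dentry aliases and dropping our references so the inodes can be
      * evicted; used when a root is going away.
      */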
5010 void btrfs_invalidate_inodes(struct btrfs_root *root)
5011 {
5012         struct rb_node *node;
5013         struct rb_node *prev;
5014         struct btrfs_inode *entry;
5015         struct inode *inode;
5016         u64 objectid = 0;
5017 
5018         if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
5019                 WARN_ON(btrfs_root_refs(&root->root_item) != 0);
5020 
5021         spin_lock(&root->inode_lock);
5022 again:
5023         node = root->inode_tree.rb_node;
5024         prev = NULL;
5025         while (node) {
5026                 prev = node;
5027                 entry = rb_entry(node, struct btrfs_inode, rb_node);
5028 
5029                 if (objectid < btrfs_ino(&entry->vfs_inode))
5030                         node = node->rb_left;
5031                 else if (objectid > btrfs_ino(&entry->vfs_inode))
5032                         node = node->rb_right;
5033                 else
5034                         break;
5035         }
5036         if (!node) {
5037                 while (prev) {
5038                         entry = rb_entry(prev, struct btrfs_inode, rb_node);
5039                         if (objectid <= btrfs_ino(&entry->vfs_inode)) {
5040                                 node = prev;
5041                                 break;
5042                         }
5043                         prev = rb_next(prev);
5044                 }
5045         }
5046         while (node) {
5047                 entry = rb_entry(node, struct btrfs_inode, rb_node);
5048                 objectid = btrfs_ino(&entry->vfs_inode) + 1;
5049                 inode = igrab(&entry->vfs_inode);
5050                 if (inode) {
5051                         spin_unlock(&root->inode_lock);
5052                         if (atomic_read(&inode->i_count) > 1)
5053                                 d_prune_aliases(inode);
5054                         /*
5055                          * btrfs_drop_inode will have it removed from
5056                          * the inode cache when its usage count
5057                          * hits zero.
5058                          */
5059                         iput(inode);
5060                         cond_resched();
5061                         spin_lock(&root->inode_lock);
5062                         goto again;
5063                 }
5064 
5065                 if (cond_resched_lock(&root->inode_lock))
5066                         goto again;
5067 
5068                 node = rb_next(node);
5069         }
5070         spin_unlock(&root->inode_lock);
5071 }
5072 
5073 static int btrfs_init_locked_inode(struct inode *inode, void *p)
5074 {
5075         struct btrfs_iget_args *args = p;
5076         inode->i_ino = args->location->objectid;
5077         memcpy(&BTRFS_I(inode)->location, args->location,
5078                sizeof(*args->location));
5079         BTRFS_I(inode)->root = args->root;
5080         return 0;
5081 }
5082 
5083 static int btrfs_find_actor(struct inode *inode, void *opaque)
5084 {
5085         struct btrfs_iget_args *args = opaque;
5086         return args->location->objectid == BTRFS_I(inode)->location.objectid &&
5087                 args->root == BTRFS_I(inode)->root;
5088 }
5089 
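     /*
      * Find or allocate the VFS inode in the inode hash.  The hash mixes
      * the objectid with the root, and btrfs_find_actor() does the exact
      * match, so equal inode numbers in different subvolumes map to
      * distinct in-memory inodes.
      */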
5090 static struct inode *btrfs_iget_locked(struct super_block *s,
5091                                        struct btrfs_key *location,
5092                                        struct btrfs_root *root)
5093 {
5094         struct inode *inode;
5095         struct btrfs_iget_args args;
5096         unsigned long hashval = btrfs_inode_hash(location->objectid, root);
5097 
5098         args.location = location;
5099         args.root = root;
5100 
5101         inode = iget5_locked(s, hashval, btrfs_find_actor,
5102                              btrfs_init_locked_inode,
5103                              (void *)&args);
5104         return inode;
5105 }
5106 
5107 /* Get an inode object given its location and corresponding root.
5108  * Returns in *new whether the inode was read from disk.
5109  */
5110 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
5111                          struct btrfs_root *root, int *new)
5112 {
5113         struct inode *inode;
5114 
5115         inode = btrfs_iget_locked(s, location, root);
5116         if (!inode)
5117                 return ERR_PTR(-ENOMEM);
5118 
5119         if (inode->i_state & I_NEW) {
5120                 btrfs_read_locked_inode(inode);
5121                 if (!is_bad_inode(inode)) {
5122                         inode_tree_add(inode);
5123                         unlock_new_inode(inode);
5124                         if (new)
5125                                 *new = 1;
5126                 } else {
5127                         unlock_new_inode(inode);
5128                         iput(inode);
5129                         inode = ERR_PTR(-ESTALE);
5130                 }
5131         }
5132 
5133         return inode;
5134 }
5135 
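     /*
      * Build an in-memory-only placeholder for a subvolume directory we
      * cannot resolve; it is marked BTRFS_INODE_DUMMY and behaves as an
      * empty read-only directory.
      */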
5136 static struct inode *new_simple_dir(struct super_block *s,
5137                                     struct btrfs_key *key,
5138                                     struct btrfs_root *root)
5139 {
5140         struct inode *inode = new_inode(s);
5141 
5142         if (!inode)
5143                 return ERR_PTR(-ENOMEM);
5144 
5145         BTRFS_I(inode)->root = root;
5146         memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
5147         set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
5148 
5149         inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
5150         inode->i_op = &btrfs_dir_ro_inode_operations;
5151         inode->i_fop = &simple_dir_operations;
5152         inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5153         inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
5154 
5155         return inode;
5156 }
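     /*
      * Resolve a name in @dir to an inode.  INODE_ITEM keys are ordinary
      * inodes in the same root; ROOT_ITEM keys cross into another
      * subvolume, which gets looked up (and orphan-cleaned when the fs is
      * writable) first.
      */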
5157 
5158 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5159 {
5160         struct inode *inode;
5161         struct btrfs_root *root = BTRFS_I(dir)->root;
5162         struct btrfs_root *sub_root = root;
5163         struct btrfs_key location;
5164         int index;
5165         int ret = 0;
5166 
5167         if (dentry->d_name.len > BTRFS_NAME_LEN)
5168                 return ERR_PTR(-ENAMETOOLONG);
5169 
5170         ret = btrfs_inode_by_name(dir, dentry, &location);
5171         if (ret < 0)
5172                 return ERR_PTR(ret);
5173 
5174         if (location.objectid == 0)
5175                 return ERR_PTR(-ENOENT);
5176 
5177         if (location.type == BTRFS_INODE_ITEM_KEY) {
5178                 inode = btrfs_iget(dir->i_sb, &location, root, NULL);
5179                 return inode;
5180         }
5181 
5182         BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
5183 
5184         index = srcu_read_lock(&root->fs_info->subvol_srcu);
5185         ret = fixup_tree_root_location(root, dir, dentry,
5186                                        &location, &sub_root);
5187         if (ret < 0) {
5188                 if (ret != -ENOENT)
5189                         inode = ERR_PTR(ret);
5190                 else
5191                         inode = new_simple_dir(dir->i_sb, &location, sub_root);
5192         } else {
5193                 inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
5194         }
5195         srcu_read_unlock(&root->fs_info->subvol_srcu, index);
5196 
5197         if (!IS_ERR(inode) && root != sub_root) {
5198                 down_read(&root->fs_info->cleanup_work_sem);
5199                 if (!(inode->i_sb->s_flags & MS_RDONLY))
5200                         ret = btrfs_orphan_cleanup(sub_root);
5201                 up_read(&root->fs_info->cleanup_work_sem);
5202                 if (ret) {
5203                         iput(inode);
5204                         inode = ERR_PTR(ret);
5205                 }
5206         }
5207 
5208         return inode;
5209 }
5210 
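     /*
      * Tell the dcache not to keep dentries that point into a dead root
      * or at the dummy empty-subvolume directory.
      */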
5211 static int btrfs_dentry_delete(const struct dentry *dentry)
5212 {
5213         struct btrfs_root *root;
5214         struct inode *inode = dentry->d_inode;
5215 
5216         if (!inode && !IS_ROOT(dentry))
5217                 inode = dentry->d_parent->d_inode;
5218 
5219         if (inode) {
5220                 root = BTRFS_I(inode)->root;
5221                 if (btrfs_root_refs(&root->root_item) == 0)
5222                         return 1;
5223 
5224                 if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
5225                         return 1;
5226         }
5227         return 0;
5228 }
5229 
5230 static void btrfs_dentry_release(struct dentry *dentry)
5231 {
5232         kfree(dentry->d_fsdata);
5233 }
5234 
5235 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
5236                                    unsigned int flags)
5237 {
5238         struct inode *inode;
5239 
5240         inode = btrfs_lookup_dentry(dir, dentry);
5241         if (IS_ERR(inode)) {
5242                 if (PTR_ERR(inode) == -ENOENT)
5243                         inode = NULL;
5244                 else
5245                         return ERR_CAST(inode);
5246         }
5247 
5248         return d_materialise_unique(dentry, inode);
5249 }
5250 
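     /* Map the on-disk BTRFS_FT_* dir entry types to VFS DT_* values. */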
5251 unsigned char btrfs_filetype_table[] = {
5252         DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
5253 };
5254 
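     /*
      * readdir: walks DIR_INDEX items (DIR_ITEM for the tree root) and
      * merges in the per-inode delayed insertion and deletion lists so
      * directory entries that only exist as delayed items are reported
      * correctly.
      */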
5255 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5256 {
5257         struct inode *inode = file_inode(file);
5258         struct btrfs_root *root = BTRFS_I(inode)->root;
5259         struct btrfs_item *item;
5260         struct btrfs_dir_item *di;
5261         struct btrfs_key key;
5262         struct btrfs_key found_key;
5263         struct btrfs_path *path;
5264         struct list_head ins_list;
5265         struct list_head del_list;
5266         int ret;
5267         struct extent_buffer *leaf;
5268         int slot;
5269         unsigned char d_type;
5270         int over = 0;
5271         u32 di_cur;
5272         u32 di_total;
5273         u32 di_len;
5274         int key_type = BTRFS_DIR_INDEX_KEY;
5275         char tmp_name[32];
5276         char *name_ptr;
5277         int name_len;
5278         int is_curr = 0;        /* 1 if ctx->pos points to the current index */
5279 
5280         /* FIXME, use a real flag for deciding about the key type */
5281         if (root->fs_info->tree_root == root)
5282                 key_type = BTRFS_DIR_ITEM_KEY;
5283 
5284         if (!dir_emit_dots(file, ctx))
5285                 return 0;
5286 
5287         path = btrfs_alloc_path();
5288         if (!path)
5289                 return -ENOMEM;
5290 
5291         path->reada = 1;
5292 
5293         if (key_type == BTRFS_DIR_INDEX_KEY) {
5294                 INIT_LIST_HEAD(&ins_list);
5295                 INIT_LIST_HEAD(&del_list);
5296                 btrfs_get_delayed_items(inode, &ins_list, &del_list);
5297         }
5298 
5299         btrfs_set_key_type(&key, key_type);
5300         key.offset = ctx->pos;
5301         key.objectid = btrfs_ino(inode);
5302 
5303         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5304         if (ret < 0)
5305                 goto err;
5306 
5307         while (1) {
5308                 leaf = path->nodes[0];
5309                 slot = path->slots[0];
5310                 if (slot >= btrfs_header_nritems(leaf)) {
5311                         ret = btrfs_next_leaf(root, path);
5312                         if (ret < 0)
5313                                 goto err;
5314                         else if (ret > 0)
5315                                 break;
5316                         continue;
5317                 }
5318 
5319                 item = btrfs_item_nr(slot);
5320                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
5321 
5322                 if (found_key.objectid != key.objectid)
5323                         break;
5324                 if (btrfs_key_type(&found_key) != key_type)
5325                         break;
5326                 if (found_key.offset < ctx->pos)
5327                         goto next;
5328                 if (key_type == BTRFS_DIR_INDEX_KEY &&
5329                     btrfs_should_delete_dir_index(&del_list,
5330                                                   found_key.offset))
5331                         goto next;
5332 
5333                 ctx->pos = found_key.offset;
5334                 is_curr = 1;
5335 
5336                 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
5337                 di_cur = 0;
5338                 di_total = btrfs_item_size(leaf, item);
5339 
5340                 while (di_cur < di_total) {
5341                         struct btrfs_key location;
5342 
5343                         if (verify_dir_item(root, leaf, di))
5344                                 break;
5345 
5346                         name_len = btrfs_dir_name_len(leaf, di);
5347                         if (name_len <= sizeof(tmp_name)) {
5348                                 name_ptr = tmp_name;
5349                         } else {
5350                                 name_ptr = kmalloc(name_len, GFP_NOFS);
5351                                 if (!name_ptr) {
5352                                         ret = -ENOMEM;
5353                                         goto err;
5354                                 }
5355                         }
5356                         read_extent_buffer(leaf, name_ptr,
5357                                            (unsigned long)(di + 1), name_len);
5358 
5359                         d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
5360                         btrfs_dir_item_key_to_cpu(leaf, di, &location);
5361 
5362 
5363                         /* Is this a reference to our own snapshot?  If so,
5364                          * skip it.
5365                          *
5366                          * In contrast to old kernels, we insert the snapshot's
5367                          * dir item and dir index after it has been created, so
5368                          * we won't find a reference to our own snapshot. We
5369                          * still keep the following code for backward
5370                          * compatibility.
5371                          */
5372                         if (location.type == BTRFS_ROOT_ITEM_KEY &&
5373                             location.objectid == root->root_key.objectid) {
5374                                 over = 0;
5375                                 goto skip;
5376                         }
5377                         over = !dir_emit(ctx, name_ptr, name_len,
5378                                        location.objectid, d_type);
5379 
5380 skip:
5381                         if (name_ptr != tmp_name)
5382                                 kfree(name_ptr);
5383 
5384                         if (over)
5385                                 goto nopos;
5386                         di_len = btrfs_dir_name_len(leaf, di) +
5387                                  btrfs_dir_data_len(leaf, di) + sizeof(*di);
5388                         di_cur += di_len;
5389                         di = (struct btrfs_dir_item *)((char *)di + di_len);
5390                 }
5391 next:
5392                 path->slots[0]++;
5393         }
5394 
5395         if (key_type == BTRFS_DIR_INDEX_KEY) {
5396                 if (is_curr)
5397                         ctx->pos++;
5398                 ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
5399                 if (ret)
5400                         goto nopos;
5401         }
5402 
5403         /* Reached end of directory/root. Bump pos past the last item. */
5404         ctx->pos++;
5405 
5406         /*
5407          * Stop new entries from being returned after we return the last
5408          * entry.
5409          *
5410          * New directory entries are assigned a strictly increasing
5411          * offset.  This means that new entries created during readdir
5412          * are *guaranteed* to be seen in the future by that readdir.
5413          * This has broken buggy programs which operate on names as
5414          * they're returned by readdir.  Until we re-use freed offsets
5415          * we have this hack to stop new entries from being returned
5416          * under the assumption that they'll never reach this huge
5417          * offset.
5418          *
5419          * This is being careful not to overflow a 32-bit loff_t unless the
5420          * last entry requires it, because doing so has broken 32-bit apps
5421          * in the past.
5422          */
5423         if (key_type == BTRFS_DIR_INDEX_KEY) {
5424                 if (ctx->pos >= INT_MAX)
5425                         ctx->pos = LLONG_MAX;
5426                 else
5427                         ctx->pos = INT_MAX;
5428         }
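        /*
         * Worked example (illustrative): a small directory leaves ctx->pos
         * just past its last index after the increments above, well below
         * INT_MAX (2147483647), so it is rounded up to INT_MAX here.  Only
         * a directory whose last index already reached INT_MAX ends up
         * reporting LLONG_MAX.
         */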
5429 nopos:
5430         ret = 0;
5431 err:
5432         if (key_type == BTRFS_DIR_INDEX_KEY)
5433                 btrfs_put_delayed_items(&ins_list, &del_list);
5434         btrfs_free_path(path);
5435         return ret;
5436 }
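The clamping above is visible from userspace: with this code, the directory
position reported after the last real entry jumps to INT_MAX (or LLONG_MAX
for very large directories) rather than growing by one.  A minimal
userspace sketch to observe it, using only POSIX opendir/readdir/telldir:

        #include <dirent.h>
        #include <stdio.h>

        int main(int argc, char **argv)
        {
                DIR *d = opendir(argc > 1 ? argv[1] : ".");
                struct dirent *de;

                if (!d)
                        return 1;
                /* print each name plus the position following it */
                while ((de = readdir(d)) != NULL)
                        printf("%-24s pos=%ld\n", de->d_name, telldir(d));
                closedir(d);
                return 0;
        }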
5437 
5438 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
5439 {
5440         struct btrfs_root *root = BTRFS_I(inode)->root;
5441         struct btrfs_trans_handle *trans;
5442         int ret = 0;
5443         bool nolock = false;
5444 
5445         if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
5446                 return 0;
5447 
5448         if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode))
5449                 nolock = true;
5450 
5451         if (wbc->sync_mode == WB_SYNC_ALL) {
5452                 if (nolock)
5453                         trans = btrfs_join_transaction_nolock(root);
5454                 else
5455                         trans = btrfs_join_transaction(root);
5456                 if (IS_ERR(trans))
5457                         return PTR_ERR(trans);
5458                 ret = btrfs_commit_transaction(trans, root);
5459         }
5460         return ret;
5461 }
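This is btrfs's ->write_inode method; the VFS calls it during writeback
through the super_operations table (btrfs's table lives in super.c).  A
sketch of the wiring, assuming the 3.x-era field name in <linux/fs.h>:

        static const struct super_operations example_super_ops = {
                /* ... other methods elided ... */
                .write_inode = btrfs_write_inode,
        };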
5462 
5463 /*
5464  * This is somewhat expensive, updating the tree every time the
5465  * inode changes.  But it is most likely to find the inode in cache.
5466  * FIXME, needs more benchmarking... there are no reasons other than performance
5467  * to keep or drop this code.
5468  */
5469 static int btrfs_dirty_inode(struct inode *inode)
5470 {
5471         struct btrfs_root *root = BTRFS_I(inode)->root;
5472         struct btrfs_trans_handle *trans;
5473         int ret;
5474 
5475         if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
5476                 return 0;
5477 
5478         trans = btrfs_join_transaction(root);
5479         if (IS_ERR(trans))
5480                 return PTR_ERR(trans);
5481 
5482         ret = btrfs_update_inode(trans, root, inode);
5483         if (ret == -ENOSPC) {
5484                 /* whoops, let's try again with the full transaction */
5485                 btrfs_end_transaction(trans, root);
5486                 trans = btrfs_start_transaction(root, 1);
5487                 if (IS_ERR(trans))
5488                         return PTR_ERR(trans);
5489 
5490                 ret = btrfs_update_inode(trans, root, inode);
5491         }
5492         btrfs_end_transaction(trans, root);
5493         if (BTRFS_I(inode)->delayed_node)
5494                 btrfs_balance_delayed_items(root);
5495 
5496         return ret;
5497 }
5498 
5499 /*
5500  * This is a copy of file_update_time.  We need it so that we can return an
5501  * error on ENOSPC when updating the inode for file writes and mmap writes.
5502  */
5503 static int btrfs_update_time(struct inode *inode, struct timespec *now,
5504                              int flags)
5505 {
5506         struct btrfs_root *root = BTRFS_I(inode)->root;
5507 
5508         if (btrfs_root_readonly(root))
5509                 return -EROFS;
5510 
5511         if (flags & S_VERSION)
5512                 inode_inc_iversion(inode);
5513         if (flags & S_CTIME)
5514                 inode->i_ctime = *now;
5515         if (flags & S_MTIME)
5516                 inode->i_mtime = *now;
5517         if (flags & S_ATIME)
5518                 inode->i_atime = *now;
5519         return btrfs_dirty_inode(inode);
5520 }
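The VFS reaches this function through the ->update_time hook: in 3.x-era
kernels, fs/inode.c's update_time() calls inode->i_op->update_time() when
the method is set and only falls back to the generic timestamp update
otherwise.  A sketch of how it is installed (illustrative; the real
inode_operations tables appear near the end of this file):

        static const struct inode_operations example_file_iops = {
                /* ... other methods elided ... */
                .update_time = btrfs_update_time,
        };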
5521 
5522 /*
5523  * find the highest existing sequence number in a directory
5524  * and then set the in-memory index_cnt variable to point at
5525  * the next free sequence number
5526  */
5527 static int btrfs_set_inode_index_count(struct inode *inode)
5528 {
5529         struct btrfs_root *root = BTRFS_I(inode)->root;
5530         struct btrfs_key key, found_key;
5531         struct btrfs_path *path;
5532         struct extent_buffer *leaf;
5533         int ret;
5534 
5535         key.objectid = btrfs_ino(inode);
5536         btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
5537         key.offset = (u64)-1;
5538 
5539         path = btrfs_alloc_path();
5540         if (!path)
5541                 return -ENOMEM;
5542 
5543         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5544         if (ret < 0)
5545                 goto out;
5546         /* FIXME: we should be able to handle this */
5547         if (ret == 0)
5548                 goto out;
5549         ret = 0;
5550 
5551         /*
5552          * MAGIC NUMBER EXPLANATION:
5553          * we search the directory based on f_pos, and '.' and '..' occupy
5554          * f_pos 0 and 1 respectively, so every other entry has to start
5555          * at index 2
5556          */
5557         if (path->slots[0] == 0) {
5558                 BTRFS_I(inode)->index_cnt = 2;
5559                 goto out;
5560         }
5561 
5562         path->slots[0]--;
5563 
5564         leaf = path->nodes[0];
5565         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5566 
5567         if (found_key.objectid != btrfs_ino(inode) ||
5568             btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
5569                 BTRFS_I(inode)->index_cnt = 2;
5570                 goto out;
5571         }
5572 
5573         BTRFS_I(inode)->index_cnt = found_key.offset + 1;
5574 out:
5575         btrfs_free_path(path);
5576         return ret;
5577 }
5578 
5579 /*
5580  * helper to find a free sequence number in a given directory.  The current
5581  * code is very simple; later versions will do smarter things in the btree.
5582  */
5583 int btrfs_set_inode_index(struct inode *dir, u64 *index)
5584 {
5585         int ret = 0;
5586 
5587         if (BTRFS_I(dir)->index_cnt == (u64)-1) {
5588                 ret = btrfs_inode_delayed_dir_index_count(dir);
5589                 if (ret) {
5590                         ret = btrfs_set_inode_index_count(dir);
5591                         if (ret)
5592                                 return ret;
5593                 }
5594         }
5595 
5596         *index = BTRFS_I(dir)->index_cnt;
5597         BTRFS_I(dir)->index_cnt++;
5598 
5599         return ret;
5600 }
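A sketch of a typical caller (hypothetical helper, mirroring the way
btrfs_new_inode() below consumes this API): reserve the next directory
index before linking a new name into 'dir'.

        static int example_reserve_dir_index(struct inode *dir, u64 *index)
        {
                int ret = btrfs_set_inode_index(dir, index);

                if (ret)
                        return ret; /* e.g. -ENOMEM from the btree search */
                /* *index now names the slot for the new dir index item */
                return 0;
        }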
5601 
5602 static int btrfs_insert_inode_locked(struct inode *inode)
5603 {
5604         struct btrfs_iget_args args;
5605         args.location = &BTRFS_I(inode)->location;
5606         args.root = BTRFS_I(inode)->root;
5607 
5608         return insert_inode_locked4(inode,
5609                    btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
5610                    btrfs_find_actor, &args);
5611 }
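insert_inode_locked4() walks the inode hash chain for the given hash value
and uses the actor callback to decide whether an existing inode matches.
A hedged sketch of what such an actor checks, assuming the btrfs_iget_args
layout declared at the top of this file (the real callback used here is
btrfs_find_actor, defined earlier):

        static int example_find_actor(struct inode *inode, void *opaque)
        {
                struct btrfs_iget_args *args = opaque;

                /* nonzero means 'inode' is the one described by 'opaque' */
                return args->location->objectid ==
                               BTRFS_I(inode)->location.objectid &&
                       args->root == BTRFS_I(inode)->root;
        }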
5612 
5613 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5614                                      struct btrfs_root *root,
5615                                      struct inode *dir,
5616                                      const char *name, int name_len,
5617                                      u64 ref_objectid, u64 objectid,
5618                                      umode_t mode, u64 *index)
5619 {
5620         struct inode *inode;
5621         struct btrfs_inode_item *inode_item;
5622         struct btrfs_key *location;
5623         struct btrfs_path *path;
5624         struct btrfs_inode_ref *ref;
5625         struct btrfs_key key[2];
5626         u32 sizes[2];
5627         int nitems = name ? 2 : 1;
5628         unsigned long ptr;
5629         int ret;
5630 
5631         path = btrfs_alloc_path();
5632         if (!path)
5633                 return ERR_PTR(-ENOMEM);
5634 
5635         inode = new_inode(root->fs_info->sb);
5636         if (!inode) {
5637