~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/fs/btrfs/tree-log.c

Version: ~ [ linux-5.6-rc1 ] ~ [ linux-5.5.2 ] ~ [ linux-5.4.17 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.102 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.170 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.213 ] ~ [ linux-4.8.17 ] ~ [ linux-4.7.10 ] ~ [ linux-4.6.7 ] ~ [ linux-4.5.7 ] ~ [ linux-4.4.213 ] ~ [ linux-4.3.6 ] ~ [ linux-4.2.8 ] ~ [ linux-4.1.52 ] ~ [ linux-4.0.9 ] ~ [ linux-3.19.8 ] ~ [ linux-3.18.140 ] ~ [ linux-3.17.8 ] ~ [ linux-3.16.81 ] ~ [ linux-3.15.10 ] ~ [ linux-3.14.79 ] ~ [ linux-3.13.11 ] ~ [ linux-3.12.74 ] ~ [ linux-3.11.10 ] ~ [ linux-3.10.108 ] ~ [ linux-3.9.11 ] ~ [ linux-3.8.13 ] ~ [ linux-3.7.10 ] ~ [ linux-3.6.11 ] ~ [ linux-3.5.7 ] ~ [ linux-3.4.113 ] ~ [ linux-3.3.8 ] ~ [ linux-3.2.102 ] ~ [ linux-3.1.10 ] ~ [ linux-3.0.101 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.5 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /*
  2  * Copyright (C) 2008 Oracle.  All rights reserved.
  3  *
  4  * This program is free software; you can redistribute it and/or
  5  * modify it under the terms of the GNU General Public
  6  * License v2 as published by the Free Software Foundation.
  7  *
  8  * This program is distributed in the hope that it will be useful,
  9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 11  * General Public License for more details.
 12  *
 13  * You should have received a copy of the GNU General Public
 14  * License along with this program; if not, write to the
 15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 16  * Boston, MA 021110-1307, USA.
 17  */
 18 
 19 #include <linux/sched.h>
 20 #include <linux/slab.h>
 21 #include <linux/blkdev.h>
 22 #include <linux/list_sort.h>
 23 #include "tree-log.h"
 24 #include "disk-io.h"
 25 #include "locking.h"
 26 #include "print-tree.h"
 27 #include "backref.h"
 28 #include "hash.h"
 29 
/*
 * magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1
 38 
 39 /*
 40  * directory trouble cases
 41  *
 42  * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 43  * log, we must force a full commit before doing an fsync of the directory
 44  * where the unlink was done.
 45  * ---> record transid of last unlink/rename per directory
 46  *
 47  * mkdir foo/some_dir
 48  * normal commit
 49  * rename foo/some_dir foo2/some_dir
 50  * mkdir foo/some_dir
 51  * fsync foo/some_dir/some_file
 52  *
 53  * The fsync above will unlink the original some_dir without recording
 54  * it in its new location (foo2).  After a crash, some_dir will be gone
 55  * unless the fsync of some_file forces a full commit
 56  *
 57  * 2) we must log any new names for any file or dir that is in the fsync
 58  * log. ---> check inode while renaming/linking.
 59  *
 60  * 2a) we must log any new names for any file or dir during rename
 61  * when the directory they are being removed from was logged.
 62  * ---> check inode and old parent dir during rename
 63  *
 64  *  2a is actually the more important variant.  With the extra logging
 65  *  a crash might unlink the old name without recreating the new one
 66  *
 67  * 3) after a crash, we must go through any directories with a link count
 68  * of zero and redo the rm -rf
 69  *
 70  * mkdir f1/foo
 71  * normal commit
 72  * rm -rf f1/foo
 73  * fsync(f1)
 74  *
 75  * The directory f1 was fully removed from the FS, but fsync was never
 76  * called on f1, only its parent dir.  After a crash the rm -rf must
 77  * be replayed.  This must be able to recurse down the entire
 78  * directory tree.  The inode link count fixup code takes care of the
 79  * ugly details.
 80  */
 81 
/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 *
 * NOTE(review): stage 2 (LOG_WALK_REPLAY_DIR_INDEX) appears to be a
 * separate pass over directory index items before the final catch-all
 * stage — confirm against the walk/replay code that consumes wc->stage.
 */
#define LOG_WALK_PIN_ONLY 0
#define LOG_WALK_REPLAY_INODES 1
#define LOG_WALK_REPLAY_DIR_INDEX 2
#define LOG_WALK_REPLAY_ALL 3
 95 
 96 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 97                            struct btrfs_root *root, struct inode *inode,
 98                            int inode_only,
 99                            const loff_t start,
100                            const loff_t end,
101                            struct btrfs_log_ctx *ctx);
102 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
103                              struct btrfs_root *root,
104                              struct btrfs_path *path, u64 objectid);
105 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
106                                        struct btrfs_root *root,
107                                        struct btrfs_root *log,
108                                        struct btrfs_path *path,
109                                        u64 dirid, int del_all);
110 
111 /*
112  * tree logging is a special write ahead log used to make sure that
113  * fsyncs and O_SYNCs can happen without doing full tree commits.
114  *
115  * Full tree commits are expensive because they require commonly
116  * modified blocks to be recowed, creating many dirty pages in the
 117  * extent tree and a 4x-6x higher write load than ext3.
118  *
119  * Instead of doing a tree commit on every fsync, we use the
120  * key ranges and transaction ids to find items for a given file or directory
121  * that have changed in this transaction.  Those items are copied into
122  * a special tree (one per subvolume root), that tree is written to disk
123  * and then the fsync is considered complete.
124  *
125  * After a crash, items are copied out of the log-tree back into the
126  * subvolume tree.  Any file data extents found are recorded in the extent
127  * allocation tree, and the log-tree freed.
128  *
 129  * The log tree is read three times: once to pin down all the extents it is
 130  * using in ram, once to create all the inodes logged in the tree
 131  * and once to do all the other items.
132  */
133 
/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 *
 * Returns 0 on success, -EAGAIN when a full transaction commit is
 * required instead of a log sync, or another negative errno when the
 * log tree(s) could not be created.
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct btrfs_log_ctx *ctx)
{
        int index;
        int ret;

        mutex_lock(&root->log_mutex);
        if (root->log_root) {
                /* a log tree already exists for this root: just join it */
                if (btrfs_need_log_full_commit(root->fs_info, trans)) {
                        ret = -EAGAIN;
                        goto out;
                }
                /*
                 * track whether more than one task is writing to this
                 * log tree in the current log transaction
                 */
                if (!root->log_start_pid) {
                        root->log_start_pid = current->pid;
                        clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
                } else if (root->log_start_pid != current->pid) {
                        set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
                }

                atomic_inc(&root->log_batch);
                atomic_inc(&root->log_writers);
                if (ctx) {
                        /* two ctx lists: one per in-flight log transid */
                        index = root->log_transid % 2;
                        list_add_tail(&ctx->list, &root->log_ctxs[index]);
                        ctx->log_transid = root->log_transid;
                }
                mutex_unlock(&root->log_mutex);
                return 0;
        }

        ret = 0;
        /* first log anywhere: create the tree of log roots if needed */
        mutex_lock(&root->fs_info->tree_log_mutex);
        if (!root->fs_info->log_root_tree)
                ret = btrfs_init_log_root_tree(trans, root->fs_info);
        mutex_unlock(&root->fs_info->tree_log_mutex);
        if (ret)
                goto out;

        /* create this subvolume's own log tree */
        if (!root->log_root) {
                ret = btrfs_add_log_tree(trans, root);
                if (ret)
                        goto out;
        }
        clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
        root->log_start_pid = current->pid;
        atomic_inc(&root->log_batch);
        atomic_inc(&root->log_writers);
        if (ctx) {
                index = root->log_transid % 2;
                list_add_tail(&ctx->list, &root->log_ctxs[index]);
                ctx->log_transid = root->log_transid;
        }
out:
        mutex_unlock(&root->log_mutex);
        return ret;
}
196 
197 /*
198  * returns 0 if there was a log transaction running and we were able
199  * to join, or returns -ENOENT if there were not transactions
200  * in progress
201  */
202 static int join_running_log_trans(struct btrfs_root *root)
203 {
204         int ret = -ENOENT;
205 
206         smp_mb();
207         if (!root->log_root)
208                 return -ENOENT;
209 
210         mutex_lock(&root->log_mutex);
211         if (root->log_root) {
212                 ret = 0;
213                 atomic_inc(&root->log_writers);
214         }
215         mutex_unlock(&root->log_mutex);
216         return ret;
217 }
218 
219 /*
220  * This either makes the current running log transaction wait
221  * until you call btrfs_end_log_trans() or it makes any future
222  * log transactions wait until you call btrfs_end_log_trans()
223  */
224 int btrfs_pin_log_trans(struct btrfs_root *root)
225 {
226         int ret = -ENOENT;
227 
228         mutex_lock(&root->log_mutex);
229         atomic_inc(&root->log_writers);
230         mutex_unlock(&root->log_mutex);
231         return ret;
232 }
233 
234 /*
235  * indicate we're done making changes to the log tree
236  * and wake up anyone waiting to do a sync
237  */
238 void btrfs_end_log_trans(struct btrfs_root *root)
239 {
240         if (atomic_dec_and_test(&root->log_writers)) {
241                 smp_mb();
242                 if (waitqueue_active(&root->log_writer_wait))
243                         wake_up(&root->log_writer_wait);
244         }
245 }
246 
247 
/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
        /* should we free the extent on disk when done?  This is used
         * at transaction commit time while freeing a log tree
         */
        int free;

        /* should we write out the extent buffer?  This is used
         * while flushing the log tree to disk during a sync
         */
        int write;

        /* should we wait for the extent buffer io to finish?  Also used
         * while flushing the log tree to disk for a sync
         */
        int wait;

        /* pin only walk, we record which extents on disk belong to the
         * log trees
         */
        int pin;

        /* what stage of the replay code we're currently in (LOG_WALK_*) */
        int stage;

        /* the root we are currently replaying */
        struct btrfs_root *replay_dest;

        /* the trans handle for the current replay */
        struct btrfs_trans_handle *trans;

        /* the function that gets used to process blocks we find in the
         * tree.  Note the extent_buffer might not be up to date when it is
         * passed in, and it must be checked or read if you need the data
         * inside it
         */
        int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
                            struct walk_control *wc, u64 gen);
};
292 
293 /*
294  * process_func used to pin down extents, write them or wait on them
295  */
296 static int process_one_buffer(struct btrfs_root *log,
297                               struct extent_buffer *eb,
298                               struct walk_control *wc, u64 gen)
299 {
300         int ret = 0;
301 
302         /*
303          * If this fs is mixed then we need to be able to process the leaves to
304          * pin down any logged extents, so we have to read the block.
305          */
306         if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
307                 ret = btrfs_read_buffer(eb, gen);
308                 if (ret)
309                         return ret;
310         }
311 
312         if (wc->pin)
313                 ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
314                                                       eb->start, eb->len);
315 
316         if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
317                 if (wc->pin && btrfs_header_level(eb) == 0)
318                         ret = btrfs_exclude_logged_extents(log, eb);
319                 if (wc->write)
320                         btrfs_write_tree_block(eb);
321                 if (wc->wait)
322                         btrfs_wait_tree_block_writeback(eb);
323         }
324         return ret;
325 }
326 
/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 *
 * Returns 0 on success or a negative errno on failure.
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
                                   struct btrfs_path *path,
                                   struct extent_buffer *eb, int slot,
                                   struct btrfs_key *key)
{
        int ret;
        u32 item_size;
        u64 saved_i_size = 0;
        int save_old_i_size = 0;
        unsigned long src_ptr;
        unsigned long dst_ptr;
        int overwrite_root = 0;
        bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

        /* replaying into a subvolume tree, not into a log tree itself */
        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
                overwrite_root = 1;

        item_size = btrfs_item_size_nr(eb, slot);
        src_ptr = btrfs_item_ptr_offset(eb, slot);

        /* look for the key in the destination tree */
        ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
        if (ret < 0)
                return ret;

        if (ret == 0) {
                char *src_copy;
                char *dst_copy;
                u32 dst_size = btrfs_item_size_nr(path->nodes[0],
                                                  path->slots[0]);
                /* sizes differ: go resize the existing item via insert */
                if (dst_size != item_size)
                        goto insert;

                if (item_size == 0) {
                        btrfs_release_path(path);
                        return 0;
                }
                dst_copy = kmalloc(item_size, GFP_NOFS);
                src_copy = kmalloc(item_size, GFP_NOFS);
                if (!dst_copy || !src_copy) {
                        btrfs_release_path(path);
                        kfree(dst_copy);
                        kfree(src_copy);
                        return -ENOMEM;
                }

                read_extent_buffer(eb, src_copy, src_ptr, item_size);

                dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
                read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
                                   item_size);
                ret = memcmp(dst_copy, src_copy, item_size);

                kfree(dst_copy);
                kfree(src_copy);
                /*
                 * they have the same contents, just return, this saves
                 * us from cowing blocks in the destination tree and doing
                 * extra writes that may not have been done by a previous
                 * sync
                 */
                if (ret == 0) {
                        btrfs_release_path(path);
                        return 0;
                }

                /*
                 * We need to load the old nbytes into the inode so when we
                 * replay the extents we've logged we get the right nbytes.
                 */
                if (inode_item) {
                        struct btrfs_inode_item *item;
                        u64 nbytes;
                        u32 mode;

                        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                              struct btrfs_inode_item);
                        nbytes = btrfs_inode_nbytes(path->nodes[0], item);
                        /* stash the existing nbytes into the source item */
                        item = btrfs_item_ptr(eb, slot,
                                              struct btrfs_inode_item);
                        btrfs_set_inode_nbytes(eb, item, nbytes);

                        /*
                         * If this is a directory we need to reset the i_size to
                         * 0 so that we can set it up properly when replaying
                         * the rest of the items in this log.
                         */
                        mode = btrfs_inode_mode(eb, item);
                        if (S_ISDIR(mode))
                                btrfs_set_inode_size(eb, item, 0);
                }
        } else if (inode_item) {
                struct btrfs_inode_item *item;
                u32 mode;

                /*
                 * New inode, set nbytes to 0 so that the nbytes comes out
                 * properly when we replay the extents.
                 */
                item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
                btrfs_set_inode_nbytes(eb, item, 0);

                /*
                 * If this is a directory we need to reset the i_size to 0 so
                 * that we can set it up properly when replaying the rest of
                 * the items in this log.
                 */
                mode = btrfs_inode_mode(eb, item);
                if (S_ISDIR(mode))
                        btrfs_set_inode_size(eb, item, 0);
        }
insert:
        btrfs_release_path(path);
        /* try to insert the key into the destination tree */
        ret = btrfs_insert_empty_item(trans, root, path,
                                      key, item_size);

        /* make sure any existing item is the correct size */
        if (ret == -EEXIST) {
                u32 found_size;
                found_size = btrfs_item_size_nr(path->nodes[0],
                                                path->slots[0]);
                if (found_size > item_size)
                        btrfs_truncate_item(root, path, item_size, 1);
                else if (found_size < item_size)
                        btrfs_extend_item(root, path,
                                          item_size - found_size);
        } else if (ret) {
                return ret;
        }
        dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
                                        path->slots[0]);

        /* don't overwrite an existing inode if the generation number
         * was logged as zero.  This is done when the tree logging code
         * is just logging an inode to make sure it exists after recovery.
         *
         * Also, don't overwrite i_size on directories during replay.
         * log replay inserts and removes directory items based on the
         * state of the tree found in the subvolume, and i_size is modified
         * as it goes
         */
        if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
                struct btrfs_inode_item *src_item;
                struct btrfs_inode_item *dst_item;

                src_item = (struct btrfs_inode_item *)src_ptr;
                dst_item = (struct btrfs_inode_item *)dst_ptr;

                if (btrfs_inode_generation(eb, src_item) == 0) {
                        struct extent_buffer *dst_eb = path->nodes[0];

                        /*
                         * Even when skipping the copy, carry the logged
                         * i_size over for regular files — presumably so a
                         * size update from the log survives; confirm with
                         * the logging side.
                         */
                        if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
                            S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) {
                                struct btrfs_map_token token;
                                u64 ino_size = btrfs_inode_size(eb, src_item);

                                btrfs_init_map_token(&token);
                                btrfs_set_token_inode_size(dst_eb, dst_item,
                                                           ino_size, &token);
                        }
                        goto no_copy;
                }

                if (overwrite_root &&
                    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
                    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
                        save_old_i_size = 1;
                        saved_i_size = btrfs_inode_size(path->nodes[0],
                                                        dst_item);
                }
        }

        copy_extent_buffer(path->nodes[0], eb, dst_ptr,
                           src_ptr, item_size);

        /* restore the directory i_size the copy just clobbered */
        if (save_old_i_size) {
                struct btrfs_inode_item *dst_item;
                dst_item = (struct btrfs_inode_item *)dst_ptr;
                btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
        }

        /* make sure the generation is filled in */
        if (key->type == BTRFS_INODE_ITEM_KEY) {
                struct btrfs_inode_item *dst_item;
                dst_item = (struct btrfs_inode_item *)dst_ptr;
                if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
                        btrfs_set_inode_generation(path->nodes[0], dst_item,
                                                   trans->transid);
                }
        }
no_copy:
        btrfs_mark_buffer_dirty(path->nodes[0]);
        btrfs_release_path(path);
        return 0;
}
538 
539 /*
540  * simple helper to read an inode off the disk from a given root
541  * This can only be called for subvolume roots and not for the log
542  */
543 static noinline struct inode *read_one_inode(struct btrfs_root *root,
544                                              u64 objectid)
545 {
546         struct btrfs_key key;
547         struct inode *inode;
548 
549         key.objectid = objectid;
550         key.type = BTRFS_INODE_ITEM_KEY;
551         key.offset = 0;
552         inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
553         if (IS_ERR(inode)) {
554                 inode = NULL;
555         } else if (is_bad_inode(inode)) {
556                 iput(inode);
557                 inode = NULL;
558         }
559         return inode;
560 }
561 
/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 *
 * Returns 0 on success or a negative errno on failure.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *root,
                                      struct btrfs_path *path,
                                      struct extent_buffer *eb, int slot,
                                      struct btrfs_key *key)
{
        int found_type;
        u64 extent_end;
        u64 start = key->offset;
        u64 nbytes = 0;
        struct btrfs_file_extent_item *item;
        struct inode *inode = NULL;
        unsigned long size;
        int ret = 0;

        item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
        found_type = btrfs_file_extent_type(eb, item);

        if (found_type == BTRFS_FILE_EXTENT_REG ||
            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
                nbytes = btrfs_file_extent_num_bytes(eb, item);
                extent_end = start + nbytes;

                /*
                 * We don't add to the inodes nbytes if we are prealloc or a
                 * hole.
                 */
                if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
                        nbytes = 0;
        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
                size = btrfs_file_extent_inline_len(eb, slot, item);
                nbytes = btrfs_file_extent_ram_bytes(eb, item);
                /* inline extents are rounded up to a sector boundary */
                extent_end = ALIGN(start + size, root->sectorsize);
        } else {
                /* unknown extent type: nothing to replay */
                ret = 0;
                goto out;
        }

        inode = read_one_inode(root, key->objectid);
        if (!inode) {
                ret = -EIO;
                goto out;
        }

        /*
         * first check to see if we already have this extent in the
         * file.  This must be done before the btrfs_drop_extents run
         * so we don't try to drop this extent.
         */
        ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
                                       start, 0);

        if (ret == 0 &&
            (found_type == BTRFS_FILE_EXTENT_REG ||
             found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
                struct btrfs_file_extent_item cmp1;
                struct btrfs_file_extent_item cmp2;
                struct btrfs_file_extent_item *existing;
                struct extent_buffer *leaf;

                leaf = path->nodes[0];
                existing = btrfs_item_ptr(leaf, path->slots[0],
                                          struct btrfs_file_extent_item);

                read_extent_buffer(eb, &cmp1, (unsigned long)item,
                                   sizeof(cmp1));
                read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
                                   sizeof(cmp2));

                /*
                 * we already have a pointer to this exact extent,
                 * we don't have to do anything
                 */
                if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
                        btrfs_release_path(path);
                        goto out;
                }
        }
        btrfs_release_path(path);

        /* drop any overlapping extents */
        ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
        if (ret)
                goto out;

        if (found_type == BTRFS_FILE_EXTENT_REG ||
            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
                u64 offset;
                unsigned long dest_offset;
                struct btrfs_key ins;

                /* copy the file extent item into the subvolume tree */
                ret = btrfs_insert_empty_item(trans, root, path, key,
                                              sizeof(*item));
                if (ret)
                        goto out;
                dest_offset = btrfs_item_ptr_offset(path->nodes[0],
                                                    path->slots[0]);
                copy_extent_buffer(path->nodes[0], eb, dest_offset,
                                (unsigned long)item,  sizeof(*item));

                ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
                ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
                ins.type = BTRFS_EXTENT_ITEM_KEY;
                offset = key->offset - btrfs_file_extent_offset(eb, item);

                /* a zero disk bytenr is a hole: no extent tree work needed */
                if (ins.objectid > 0) {
                        u64 csum_start;
                        u64 csum_end;
                        LIST_HEAD(ordered_sums);
                        /*
                         * is this extent already allocated in the extent
                         * allocation tree?  If so, just add a reference
                         */
                        ret = btrfs_lookup_data_extent(root, ins.objectid,
                                                ins.offset);
                        if (ret == 0) {
                                ret = btrfs_inc_extent_ref(trans, root,
                                                ins.objectid, ins.offset,
                                                0, root->root_key.objectid,
                                                key->objectid, offset, 0);
                                if (ret)
                                        goto out;
                        } else {
                                /*
                                 * insert the extent pointer in the extent
                                 * allocation tree
                                 */
                                ret = btrfs_alloc_logged_file_extent(trans,
                                                root, root->root_key.objectid,
                                                key->objectid, offset, &ins);
                                if (ret)
                                        goto out;
                        }
                        btrfs_release_path(path);

                        /*
                         * pick the checksum range: compressed extents are
                         * checksummed over the whole disk extent, others
                         * only over the referenced part
                         */
                        if (btrfs_file_extent_compression(eb, item)) {
                                csum_start = ins.objectid;
                                csum_end = csum_start + ins.offset;
                        } else {
                                csum_start = ins.objectid +
                                        btrfs_file_extent_offset(eb, item);
                                csum_end = csum_start +
                                        btrfs_file_extent_num_bytes(eb, item);
                        }

                        ret = btrfs_lookup_csums_range(root->log_root,
                                                csum_start, csum_end - 1,
                                                &ordered_sums, 0);
                        if (ret)
                                goto out;
                        /*
                         * copy each logged checksum into the fs csum tree,
                         * freeing the list entries as we go (even on error,
                         * so nothing leaks)
                         */
                        while (!list_empty(&ordered_sums)) {
                                struct btrfs_ordered_sum *sums;
                                sums = list_entry(ordered_sums.next,
                                                struct btrfs_ordered_sum,
                                                list);
                                if (!ret)
                                        ret = btrfs_csum_file_blocks(trans,
                                                root->fs_info->csum_root,
                                                sums);
                                list_del(&sums->list);
                                kfree(sums);
                        }
                        if (ret)
                                goto out;
                } else {
                        btrfs_release_path(path);
                }
        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
                /* inline extents are easy, we just overwrite them */
                ret = overwrite_item(trans, root, path, eb, slot, key);
                if (ret)
                        goto out;
        }

        /* account the replayed bytes and persist the inode item */
        inode_add_bytes(inode, nbytes);
        ret = btrfs_update_inode(trans, root, inode);
out:
        if (inode)
                iput(inode);
        return ret;
}
755 
756 /*
757  * when cleaning up conflicts between the directory names in the
758  * subvolume, directory names in the log and directory names in the
759  * inode back references, we may have to unlink inodes from directories.
760  *
761  * This is a helper function to do the unlink of a specific directory
762  * item
763  */
764 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
765                                       struct btrfs_root *root,
766                                       struct btrfs_path *path,
767                                       struct inode *dir,
768                                       struct btrfs_dir_item *di)
769 {
770         struct inode *inode;
771         char *name;
772         int name_len;
773         struct extent_buffer *leaf;
774         struct btrfs_key location;
775         int ret;
776 
777         leaf = path->nodes[0];
778 
779         btrfs_dir_item_key_to_cpu(leaf, di, &location);
780         name_len = btrfs_dir_name_len(leaf, di);
781         name = kmalloc(name_len, GFP_NOFS);
782         if (!name)
783                 return -ENOMEM;
784 
785         read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
786         btrfs_release_path(path);
787 
788         inode = read_one_inode(root, location.objectid);
789         if (!inode) {
790                 ret = -EIO;
791                 goto out;
792         }
793 
794         ret = link_to_fixup_dir(trans, root, path, location.objectid);
795         if (ret)
796                 goto out;
797 
798         ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
799         if (ret)
800                 goto out;
801         else
802                 ret = btrfs_run_delayed_items(trans, root);
803 out:
804         kfree(name);
805         iput(inode);
806         return ret;
807 }
808 
809 /*
810  * helper function to see if a given name and sequence number found
811  * in an inode back reference are already in a directory and correctly
812  * point to this inode
813  */
814 static noinline int inode_in_dir(struct btrfs_root *root,
815                                  struct btrfs_path *path,
816                                  u64 dirid, u64 objectid, u64 index,
817                                  const char *name, int name_len)
818 {
819         struct btrfs_dir_item *di;
820         struct btrfs_key location;
821         int match = 0;
822 
823         di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
824                                          index, name, name_len, 0);
825         if (di && !IS_ERR(di)) {
826                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
827                 if (location.objectid != objectid)
828                         goto out;
829         } else
830                 goto out;
831         btrfs_release_path(path);
832 
833         di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
834         if (di && !IS_ERR(di)) {
835                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
836                 if (location.objectid != objectid)
837                         goto out;
838         } else
839                 goto out;
840         match = 1;
841 out:
842         btrfs_release_path(path);
843         return match;
844 }
845 
846 /*
847  * helper function to check a log tree for a named back reference in
848  * an inode.  This is used to decide if a back reference that is
849  * found in the subvolume conflicts with what we find in the log.
850  *
851  * inode backreferences may have multiple refs in a single item,
852  * during replay we process one reference at a time, and we don't
853  * want to delete valid links to a file from the subvolume if that
854  * link is also in the log.
855  */
static noinline int backref_in_log(struct btrfs_root *log,
                                   struct btrfs_key *key,
                                   u64 ref_objectid,
                                   char *name, int namelen)
{
        struct btrfs_path *path;
        struct btrfs_inode_ref *ref;
        unsigned long ptr;
        unsigned long ptr_end;
        unsigned long name_ptr;
        int found_name_len;
        int item_size;
        int ret;
        int match = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /*
         * NOTE(review): callers only test this function's result for
         * zero/non-zero.  A search error (ret < 0) falls through to the
         * "no match" return just like ret > 0 (key absent), so errors are
         * silently treated as "not in the log"; conversely the -ENOMEM
         * above is non-zero and would read as "match" to callers — confirm
         * this is acceptable for the replay paths that use it.
         */
        ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
        if (ret != 0)
                goto out;

        ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);

        /* extended refs hash the name into the key; use the ext helper */
        if (key->type == BTRFS_INODE_EXTREF_KEY) {
                if (btrfs_find_name_in_ext_backref(path, ref_objectid,
                                                   name, namelen, NULL))
                        match = 1;

                goto out;
        }

        /*
         * An old style INODE_REF item packs multiple (header, name) pairs
         * back to back; walk every entry looking for an exact name match.
         */
        item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
        ptr_end = ptr + item_size;
        while (ptr < ptr_end) {
                ref = (struct btrfs_inode_ref *)ptr;
                found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
                if (found_name_len == namelen) {
                        /* the name bytes immediately follow the ref header */
                        name_ptr = (unsigned long)(ref + 1);
                        ret = memcmp_extent_buffer(path->nodes[0], name,
                                                   name_ptr, namelen);
                        if (ret == 0) {
                                match = 1;
                                goto out;
                        }
                }
                ptr = (unsigned long)(ref + 1) + found_name_len;
        }
out:
        btrfs_free_path(path);
        return match;
}
909 
/*
 * Resolve back-reference conflicts before replaying a logged inode ref:
 * any name linking (inode_objectid, parent_objectid) in the subvolume that
 * is NOT also present in the log is unlinked, as are directory entries
 * that collide with the logged name or index.  Returns 0 on success, 1 if
 * the ref is for the root directory (nothing to do), or a negative errno.
 * *search_done is set once the subvolume refs have been fully scanned so
 * the caller can skip this work for subsequent names of the same item.
 */
static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
                                  struct btrfs_path *path,
                                  struct btrfs_root *log_root,
                                  struct inode *dir, struct inode *inode,
                                  struct extent_buffer *eb,
                                  u64 inode_objectid, u64 parent_objectid,
                                  u64 ref_index, char *name, int namelen,
                                  int *search_done)
{
        int ret;
        char *victim_name;
        int victim_name_len;
        struct extent_buffer *leaf;
        struct btrfs_dir_item *di;
        struct btrfs_key search_key;
        struct btrfs_inode_extref *extref;

again:
        /* Search old style refs */
        search_key.objectid = inode_objectid;
        search_key.type = BTRFS_INODE_REF_KEY;
        search_key.offset = parent_objectid;
        ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
        if (ret == 0) {
                struct btrfs_inode_ref *victim_ref;
                unsigned long ptr;
                unsigned long ptr_end;

                leaf = path->nodes[0];

                /* are we trying to overwrite a back ref for the root directory
                 * if so, just jump out, we're done
                 */
                if (search_key.objectid == search_key.offset)
                        return 1;

                /* check all the names in this back reference to see
                 * if they are in the log.  if so, we allow them to stay
                 * otherwise they must be unlinked as a conflict
                 */
                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
                ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
                while (ptr < ptr_end) {
                        victim_ref = (struct btrfs_inode_ref *)ptr;
                        victim_name_len = btrfs_inode_ref_name_len(leaf,
                                                                   victim_ref);
                        victim_name = kmalloc(victim_name_len, GFP_NOFS);
                        if (!victim_name)
                                return -ENOMEM;

                        read_extent_buffer(leaf, victim_name,
                                           (unsigned long)(victim_ref + 1),
                                           victim_name_len);

                        if (!backref_in_log(log_root, &search_key,
                                            parent_objectid,
                                            victim_name,
                                            victim_name_len)) {
                                /*
                                 * Bump nlink so btrfs_unlink_inode's
                                 * drop_nlink doesn't push the count to
                                 * zero prematurely during replay.
                                 */
                                inc_nlink(inode);
                                btrfs_release_path(path);

                                ret = btrfs_unlink_inode(trans, root, dir,
                                                         inode, victim_name,
                                                         victim_name_len);
                                kfree(victim_name);
                                if (ret)
                                        return ret;
                                ret = btrfs_run_delayed_items(trans, root);
                                if (ret)
                                        return ret;
                                /*
                                 * The unlink changed the tree; restart the
                                 * scan from scratch instead of continuing
                                 * with a stale item layout.
                                 */
                                *search_done = 1;
                                goto again;
                        }
                        kfree(victim_name);

                        ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
                }

                /*
                 * NOTE: we have searched root tree and checked the
                 * corresponding ref, it does not need to check again.
                 */
                *search_done = 1;
        }
        btrfs_release_path(path);

        /* Same search but for extended refs */
        extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
                                           inode_objectid, parent_objectid, 0,
                                           0);
        if (!IS_ERR_OR_NULL(extref)) {
                u32 item_size;
                u32 cur_offset = 0;
                unsigned long base;
                struct inode *victim_parent;

                leaf = path->nodes[0];

                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
                base = btrfs_item_ptr_offset(leaf, path->slots[0]);

                /* walk every extref entry packed into this one item */
                while (cur_offset < item_size) {
                        extref = (struct btrfs_inode_extref *)(base + cur_offset);

                        victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

                        if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
                                goto next;

                        victim_name = kmalloc(victim_name_len, GFP_NOFS);
                        if (!victim_name)
                                return -ENOMEM;
                        read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
                                           victim_name_len);

                        /* extref keys use the hash of (parent, name) */
                        search_key.objectid = inode_objectid;
                        search_key.type = BTRFS_INODE_EXTREF_KEY;
                        search_key.offset = btrfs_extref_hash(parent_objectid,
                                                              victim_name,
                                                              victim_name_len);
                        ret = 0;
                        if (!backref_in_log(log_root, &search_key,
                                            parent_objectid, victim_name,
                                            victim_name_len)) {
                                /* -ENOENT stands if the parent can't be read */
                                ret = -ENOENT;
                                victim_parent = read_one_inode(root,
                                                               parent_objectid);
                                if (victim_parent) {
                                        inc_nlink(inode);
                                        btrfs_release_path(path);

                                        ret = btrfs_unlink_inode(trans, root,
                                                                 victim_parent,
                                                                 inode,
                                                                 victim_name,
                                                                 victim_name_len);
                                        if (!ret)
                                                ret = btrfs_run_delayed_items(
                                                                  trans, root);
                                }
                                /* victim_parent may be NULL; iput(NULL) is a no-op */
                                iput(victim_parent);
                                kfree(victim_name);
                                if (ret)
                                        return ret;
                                *search_done = 1;
                                goto again;
                        }
                        kfree(victim_name);
                        if (ret)
                                return ret;
next:
                        cur_offset += victim_name_len + sizeof(*extref);
                }
                *search_done = 1;
        }
        btrfs_release_path(path);

        /* look for a conflicting sequence number */
        di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
                                         ref_index, name, namelen, 0);
        if (di && !IS_ERR(di)) {
                ret = drop_one_dir_item(trans, root, path, dir, di);
                if (ret)
                        return ret;
        }
        btrfs_release_path(path);

        /* look for a conflicting name */
        di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
                                   name, namelen, 0);
        if (di && !IS_ERR(di)) {
                ret = drop_one_dir_item(trans, root, path, dir, di);
                if (ret)
                        return ret;
        }
        btrfs_release_path(path);

        return 0;
}
1090 
1091 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1092                              u32 *namelen, char **name, u64 *index,
1093                              u64 *parent_objectid)
1094 {
1095         struct btrfs_inode_extref *extref;
1096 
1097         extref = (struct btrfs_inode_extref *)ref_ptr;
1098 
1099         *namelen = btrfs_inode_extref_name_len(eb, extref);
1100         *name = kmalloc(*namelen, GFP_NOFS);
1101         if (*name == NULL)
1102                 return -ENOMEM;
1103 
1104         read_extent_buffer(eb, *name, (unsigned long)&extref->name,
1105                            *namelen);
1106 
1107         *index = btrfs_inode_extref_index(eb, extref);
1108         if (parent_objectid)
1109                 *parent_objectid = btrfs_inode_extref_parent(eb, extref);
1110 
1111         return 0;
1112 }
1113 
1114 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1115                           u32 *namelen, char **name, u64 *index)
1116 {
1117         struct btrfs_inode_ref *ref;
1118 
1119         ref = (struct btrfs_inode_ref *)ref_ptr;
1120 
1121         *namelen = btrfs_inode_ref_name_len(eb, ref);
1122         *name = kmalloc(*namelen, GFP_NOFS);
1123         if (*name == NULL)
1124                 return -ENOMEM;
1125 
1126         read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1127 
1128         *index = btrfs_inode_ref_index(eb, ref);
1129 
1130         return 0;
1131 }
1132 
1133 /*
1134  * replay one inode back reference item found in the log tree.
1135  * eb, slot and key refer to the buffer and key found in the log tree.
1136  * root is the destination we are replaying into, and path is for temp
1137  * use by this function.  (it should be released on return).
1138  */
1139 static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1140                                   struct btrfs_root *root,
1141                                   struct btrfs_root *log,
1142                                   struct btrfs_path *path,
1143                                   struct extent_buffer *eb, int slot,
1144                                   struct btrfs_key *key)
1145 {
1146         struct inode *dir = NULL;
1147         struct inode *inode = NULL;
1148         unsigned long ref_ptr;
1149         unsigned long ref_end;
1150         char *name = NULL;
1151         int namelen;
1152         int ret;
1153         int search_done = 0;
1154         int log_ref_ver = 0;
1155         u64 parent_objectid;
1156         u64 inode_objectid;
1157         u64 ref_index = 0;
1158         int ref_struct_size;
1159 
1160         ref_ptr = btrfs_item_ptr_offset(eb, slot);
1161         ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1162 
1163         if (key->type == BTRFS_INODE_EXTREF_KEY) {
1164                 struct btrfs_inode_extref *r;
1165 
1166                 ref_struct_size = sizeof(struct btrfs_inode_extref);
1167                 log_ref_ver = 1;
1168                 r = (struct btrfs_inode_extref *)ref_ptr;
1169                 parent_objectid = btrfs_inode_extref_parent(eb, r);
1170         } else {
1171                 ref_struct_size = sizeof(struct btrfs_inode_ref);
1172                 parent_objectid = key->offset;
1173         }
1174         inode_objectid = key->objectid;
1175 
1176         /*
1177          * it is possible that we didn't log all the parent directories
1178          * for a given inode.  If we don't find the dir, just don't
1179          * copy the back ref in.  The link count fixup code will take
1180          * care of the rest
1181          */
1182         dir = read_one_inode(root, parent_objectid);
1183         if (!dir) {
1184                 ret = -ENOENT;
1185                 goto out;
1186         }
1187 
1188         inode = read_one_inode(root, inode_objectid);
1189         if (!inode) {
1190                 ret = -EIO;
1191                 goto out;
1192         }
1193 
1194         while (ref_ptr < ref_end) {
1195                 if (log_ref_ver) {
1196                         ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1197                                                 &ref_index, &parent_objectid);
1198                         /*
1199                          * parent object can change from one array
1200                          * item to another.
1201                          */
1202                         if (!dir)
1203                                 dir = read_one_inode(root, parent_objectid);
1204                         if (!dir) {
1205                                 ret = -ENOENT;
1206                                 goto out;
1207                         }
1208                 } else {
1209                         ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1210                                              &ref_index);
1211                 }
1212                 if (ret)
1213                         goto out;
1214 
1215                 /* if we already have a perfect match, we're done */
1216                 if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
1217                                   ref_index, name, namelen)) {
1218                         /*
1219                          * look for a conflicting back reference in the
1220                          * metadata. if we find one we have to unlink that name
1221                          * of the file before we add our new link.  Later on, we
1222                          * overwrite any existing back reference, and we don't
1223                          * want to create dangling pointers in the directory.
1224                          */
1225 
1226                         if (!search_done) {
1227                                 ret = __add_inode_ref(trans, root, path, log,
1228                                                       dir, inode, eb,
1229                                                       inode_objectid,
1230                                                       parent_objectid,
1231                                                       ref_index, name, namelen,
1232                                                       &search_done);
1233                                 if (ret) {
1234                                         if (ret == 1)
1235                                                 ret = 0;
1236                                         goto out;
1237                                 }
1238                         }
1239 
1240                         /* insert our name */
1241                         ret = btrfs_add_link(trans, dir, inode, name, namelen,
1242                                              0, ref_index);
1243                         if (ret)
1244                                 goto out;
1245 
1246                         btrfs_update_inode(trans, root, inode);
1247                 }
1248 
1249                 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1250                 kfree(name);
1251                 name = NULL;
1252                 if (log_ref_ver) {
1253                         iput(dir);
1254                         dir = NULL;
1255                 }
1256         }
1257 
1258         /* finally write the back reference in the inode */
1259         ret = overwrite_item(trans, root, path, eb, slot, key);
1260 out:
1261         btrfs_release_path(path);
1262         kfree(name);
1263         iput(dir);
1264         iput(inode);
1265         return ret;
1266 }
1267 
1268 static int insert_orphan_item(struct btrfs_trans_handle *trans,
1269                               struct btrfs_root *root, u64 ino)
1270 {
1271         int ret;
1272 
1273         ret = btrfs_insert_orphan_item(trans, root, ino);
1274         if (ret == -EEXIST)
1275                 ret = 0;
1276 
1277         return ret;
1278 }
1279 
1280 static int count_inode_extrefs(struct btrfs_root *root,
1281                                struct inode *inode, struct btrfs_path *path)
1282 {
1283         int ret = 0;
1284         int name_len;
1285         unsigned int nlink = 0;
1286         u32 item_size;
1287         u32 cur_offset = 0;
1288         u64 inode_objectid = btrfs_ino(inode);
1289         u64 offset = 0;
1290         unsigned long ptr;
1291         struct btrfs_inode_extref *extref;
1292         struct extent_buffer *leaf;
1293 
1294         while (1) {
1295                 ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1296                                             &extref, &offset);
1297                 if (ret)
1298                         break;
1299 
1300                 leaf = path->nodes[0];
1301                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1302                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1303 
1304                 while (cur_offset < item_size) {
1305                         extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1306                         name_len = btrfs_inode_extref_name_len(leaf, extref);
1307 
1308                         nlink++;
1309 
1310                         cur_offset += name_len + sizeof(*extref);
1311                 }
1312 
1313                 offset++;
1314                 btrfs_release_path(path);
1315         }
1316         btrfs_release_path(path);
1317 
1318         if (ret < 0)
1319                 return ret;
1320         return nlink;
1321 }
1322 
1323 static int count_inode_refs(struct btrfs_root *root,
1324                                struct inode *inode, struct btrfs_path *path)
1325 {
1326         int ret;
1327         struct btrfs_key key;
1328         unsigned int nlink = 0;
1329         unsigned long ptr;
1330         unsigned long ptr_end;
1331         int name_len;
1332         u64 ino = btrfs_ino(inode);
1333 
1334         key.objectid = ino;
1335         key.type = BTRFS_INODE_REF_KEY;
1336         key.offset = (u64)-1;
1337 
1338         while (1) {
1339                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1340                 if (ret < 0)
1341                         break;
1342                 if (ret > 0) {
1343                         if (path->slots[0] == 0)
1344                                 break;
1345                         path->slots[0]--;
1346                 }
1347 process_slot:
1348                 btrfs_item_key_to_cpu(path->nodes[0], &key,
1349                                       path->slots[0]);
1350                 if (key.objectid != ino ||
1351                     key.type != BTRFS_INODE_REF_KEY)
1352                         break;
1353                 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1354                 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1355                                                    path->slots[0]);
1356                 while (ptr < ptr_end) {
1357                         struct btrfs_inode_ref *ref;
1358 
1359                         ref = (struct btrfs_inode_ref *)ptr;
1360                         name_len = btrfs_inode_ref_name_len(path->nodes[0],
1361                                                             ref);
1362                         ptr = (unsigned long)(ref + 1) + name_len;
1363                         nlink++;
1364                 }
1365 
1366                 if (key.offset == 0)
1367                         break;
1368                 if (path->slots[0] > 0) {
1369                         path->slots[0]--;
1370                         goto process_slot;
1371                 }
1372                 key.offset--;
1373                 btrfs_release_path(path);
1374         }
1375         btrfs_release_path(path);
1376 
1377         return nlink;
1378 }
1379 
1380 /*
1381  * There are a few corners where the link count of the file can't
1382  * be properly maintained during replay.  So, instead of adding
1383  * lots of complexity to the log code, we just scan the backrefs
1384  * for any file that has been through replay.
1385  *
1386  * The scan will update the link count on the inode to reflect the
1387  * number of back refs found.  If it goes down to zero, the iput
1388  * will free the inode.
1389  */
1390 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1391                                            struct btrfs_root *root,
1392                                            struct inode *inode)
1393 {
1394         struct btrfs_path *path;
1395         int ret;
1396         u64 nlink = 0;
1397         u64 ino = btrfs_ino(inode);
1398 
1399         path = btrfs_alloc_path();
1400         if (!path)
1401                 return -ENOMEM;
1402 
1403         ret = count_inode_refs(root, inode, path);
1404         if (ret < 0)
1405                 goto out;
1406 
1407         nlink = ret;
1408 
1409         ret = count_inode_extrefs(root, inode, path);
1410         if (ret == -ENOENT)
1411                 ret = 0;
1412 
1413         if (ret < 0)
1414                 goto out;
1415 
1416         nlink += ret;
1417 
1418         ret = 0;
1419 
1420         if (nlink != inode->i_nlink) {
1421                 set_nlink(inode, nlink);
1422                 btrfs_update_inode(trans, root, inode);
1423         }
1424         BTRFS_I(inode)->index_cnt = (u64)-1;
1425 
1426         if (inode->i_nlink == 0) {
1427                 if (S_ISDIR(inode->i_mode)) {
1428                         ret = replay_dir_deletes(trans, root, NULL, path,
1429                                                  ino, 1);
1430                         if (ret)
1431                                 goto out;
1432                 }
1433                 ret = insert_orphan_item(trans, root, ino);
1434         }
1435 
1436 out:
1437         btrfs_free_path(path);
1438         return ret;
1439 }
1440 
/*
 * Walk all BTRFS_TREE_LOG_FIXUP_OBJECTID orphan items (one per inode whose
 * link count may be stale after replay), delete each marker and recompute
 * that inode's link count.  Iterates from the highest key offset downwards.
 */
static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
                                            struct btrfs_root *root,
                                            struct btrfs_path *path)
{
        int ret;
        struct btrfs_key key;
        struct inode *inode;

        key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
        key.type = BTRFS_ORPHAN_ITEM_KEY;
        key.offset = (u64)-1;
        while (1) {
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret < 0)
                        break;

                /* exact key (u64)-1 never exists; step back to the last item */
                if (ret == 1) {
                        if (path->slots[0] == 0)
                                break;
                        path->slots[0]--;
                }

                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
                if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
                    key.type != BTRFS_ORPHAN_ITEM_KEY)
                        break;

                /* remove the fixup marker before processing the inode */
                ret = btrfs_del_item(trans, root, path);
                if (ret)
                        goto out;

                btrfs_release_path(path);
                /* the marker's key offset is the inode number to fix up */
                inode = read_one_inode(root, key.offset);
                if (!inode)
                        return -EIO;

                ret = fixup_inode_link_count(trans, root, inode);
                iput(inode);
                if (ret)
                        goto out;

                /*
                 * fixup on a directory may create new entries,
                 * make sure we always look for the highest possible
                 * offset
                 */
                key.offset = (u64)-1;
        }
        ret = 0;
out:
        btrfs_release_path(path);
        return ret;
}
1494 
1495 
1496 /*
1497  * record a given inode in the fixup dir so we can check its link
1498  * count when replay is done.  The link count is incremented here
1499  * so the inode won't go away until we check it
1500  */
1501 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1502                                       struct btrfs_root *root,
1503                                       struct btrfs_path *path,
1504                                       u64 objectid)
1505 {
1506         struct btrfs_key key;
1507         int ret = 0;
1508         struct inode *inode;
1509 
1510         inode = read_one_inode(root, objectid);
1511         if (!inode)
1512                 return -EIO;
1513 
1514         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1515         key.type = BTRFS_ORPHAN_ITEM_KEY;
1516         key.offset = objectid;
1517 
1518         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1519 
1520         btrfs_release_path(path);
1521         if (ret == 0) {
1522                 if (!inode->i_nlink)
1523                         set_nlink(inode, 1);
1524                 else
1525                         inc_nlink(inode);
1526                 ret = btrfs_update_inode(trans, root, inode);
1527         } else if (ret == -EEXIST) {
1528                 ret = 0;
1529         } else {
1530                 BUG(); /* Logic Error */
1531         }
1532         iput(inode);
1533 
1534         return ret;
1535 }
1536 
/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist.  This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 *
 * Returns 0 on success, -ENOENT if the target inode is not in the
 * subvolume (the name is skipped by the caller), -EIO if the directory
 * itself cannot be read, or the error from btrfs_add_link().
 * NOTE(review): the @type parameter is unused here — presumably kept so
 * the signature matches what replay_one_name() passes; confirm before
 * removing.
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    u64 dirid, u64 index,
				    char *name, int name_len, u8 type,
				    struct btrfs_key *location)
{
	struct inode *inode;
	struct inode *dir;
	int ret;

	/* the inode the name points to must already exist in the subvolume */
	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(inode);
		return -EIO;
	}

	ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	iput(dir);
	return ret;
}
1571 
1572 /*
1573  * take a single entry in a log directory item and replay it into
1574  * the subvolume.
1575  *
1576  * if a conflicting item exists in the subdirectory already,
1577  * the inode it points to is unlinked and put into the link count
1578  * fix up tree.
1579  *
1580  * If a name from the log points to a file or directory that does
1581  * not exist in the FS, it is skipped.  fsyncs on directories
1582  * do not force down inodes inside that directory, just changes to the
1583  * names or unlinks in a directory.
1584  */
1585 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1586                                     struct btrfs_root *root,
1587                                     struct btrfs_path *path,
1588                                     struct extent_buffer *eb,
1589                                     struct btrfs_dir_item *di,
1590                                     struct btrfs_key *key)
1591 {
1592         char *name;
1593         int name_len;
1594         struct btrfs_dir_item *dst_di;
1595         struct btrfs_key found_key;
1596         struct btrfs_key log_key;
1597         struct inode *dir;
1598         u8 log_type;
1599         int exists;
1600         int ret = 0;
1601         bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
1602 
1603         dir = read_one_inode(root, key->objectid);
1604         if (!dir)
1605                 return -EIO;
1606 
1607         name_len = btrfs_dir_name_len(eb, di);
1608         name = kmalloc(name_len, GFP_NOFS);
1609         if (!name) {
1610                 ret = -ENOMEM;
1611                 goto out;
1612         }
1613 
1614         log_type = btrfs_dir_type(eb, di);
1615         read_extent_buffer(eb, name, (unsigned long)(di + 1),
1616                    name_len);
1617 
1618         btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1619         exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1620         if (exists == 0)
1621                 exists = 1;
1622         else
1623                 exists = 0;
1624         btrfs_release_path(path);
1625 
1626         if (key->type == BTRFS_DIR_ITEM_KEY) {
1627                 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1628                                        name, name_len, 1);
1629         } else if (key->type == BTRFS_DIR_INDEX_KEY) {
1630                 dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1631                                                      key->objectid,
1632                                                      key->offset, name,
1633                                                      name_len, 1);
1634         } else {
1635                 /* Corruption */
1636                 ret = -EINVAL;
1637                 goto out;
1638         }
1639         if (IS_ERR_OR_NULL(dst_di)) {
1640                 /* we need a sequence number to insert, so we only
1641                  * do inserts for the BTRFS_DIR_INDEX_KEY types
1642                  */
1643                 if (key->type != BTRFS_DIR_INDEX_KEY)
1644                         goto out;
1645                 goto insert;
1646         }
1647 
1648         btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1649         /* the existing item matches the logged item */
1650         if (found_key.objectid == log_key.objectid &&
1651             found_key.type == log_key.type &&
1652             found_key.offset == log_key.offset &&
1653             btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1654                 update_size = false;
1655                 goto out;
1656         }
1657 
1658         /*
1659          * don't drop the conflicting directory entry if the inode
1660          * for the new entry doesn't exist
1661          */
1662         if (!exists)
1663                 goto out;
1664 
1665         ret = drop_one_dir_item(trans, root, path, dir, dst_di);
1666         if (ret)
1667                 goto out;
1668 
1669         if (key->type == BTRFS_DIR_INDEX_KEY)
1670                 goto insert;
1671 out:
1672         btrfs_release_path(path);
1673         if (!ret && update_size) {
1674                 btrfs_i_size_write(dir, dir->i_size + name_len * 2);
1675                 ret = btrfs_update_inode(trans, root, dir);
1676         }
1677         kfree(name);
1678         iput(dir);
1679         return ret;
1680 
1681 insert:
1682         btrfs_release_path(path);
1683         ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1684                               name, name_len, log_type, &log_key);
1685         if (ret && ret != -ENOENT)
1686                 goto out;
1687         update_size = false;
1688         ret = 0;
1689         goto out;
1690 }
1691 
1692 /*
1693  * find all the names in a directory item and reconcile them into
1694  * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
1695  * one name in a directory item, but the same code gets used for
1696  * both directory index types
1697  */
1698 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1699                                         struct btrfs_root *root,
1700                                         struct btrfs_path *path,
1701                                         struct extent_buffer *eb, int slot,
1702                                         struct btrfs_key *key)
1703 {
1704         int ret;
1705         u32 item_size = btrfs_item_size_nr(eb, slot);
1706         struct btrfs_dir_item *di;
1707         int name_len;
1708         unsigned long ptr;
1709         unsigned long ptr_end;
1710 
1711         ptr = btrfs_item_ptr_offset(eb, slot);
1712         ptr_end = ptr + item_size;
1713         while (ptr < ptr_end) {
1714                 di = (struct btrfs_dir_item *)ptr;
1715                 if (verify_dir_item(root, eb, di))
1716                         return -EIO;
1717                 name_len = btrfs_dir_name_len(eb, di);
1718                 ret = replay_one_name(trans, root, path, eb, di, key);
1719                 if (ret)
1720                         return ret;
1721                 ptr = (unsigned long)(di + 1);
1722                 ptr += name_len;
1723         }
1724         return 0;
1725 }
1726 
/*
 * directory replay has two parts.  There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
 *
 * The range items tell us which parts of the key space the log
 * is authoritative for.  During replay, if a key in the subvolume
 * directory is in a logged range item, but not actually in the log
 * that means it was deleted from the directory before the fsync
 * and should be removed.
 *
 * On success returns 0 and sets *start_ret/*end_ret to the bounds of
 * the range item covering (or following) *start_ret.  Returns 1 when
 * no further range exists, or a negative errno on search failure.
 */
static noinline int find_dir_range(struct btrfs_root *root,
				   struct btrfs_path *path,
				   u64 dirid, int key_type,
				   u64 *start_ret, u64 *end_ret)
{
	struct btrfs_key key;
	u64 found_end;
	struct btrfs_dir_log_item *item;
	int ret;
	int nritems;

	/* (u64)-1 marks that the whole key space has been covered already */
	if (*start_ret == (u64)-1)
		return 1;

	key.objectid = dirid;
	key.type = key_type;
	key.offset = *start_ret;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		/* no exact match: step back to the preceding range item */
		if (path->slots[0] == 0)
			goto out;
		path->slots[0]--;
	}
	if (ret != 0)
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto next;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);

	/* key.offset is the range start, dir_log_end is the range end */
	if (*start_ret >= key.offset && *start_ret <= found_end) {
		ret = 0;
		*start_ret = key.offset;
		*end_ret = found_end;
		goto out;
	}
	ret = 1;
next:
	/* check the next slot in the tree to see if it is a valid item */
	nritems = btrfs_header_nritems(path->nodes[0]);
	if (path->slots[0] >= nritems) {
		ret = btrfs_next_leaf(root, path);
		if (ret)
			goto out;
	} else {
		path->slots[0]++;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto out;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);
	*start_ret = key.offset;
	*end_ret = found_end;
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}
1809 
/*
 * this looks for a given directory item in the log.  If the directory
 * item is not in the log, the item is removed and the inode it points
 * to is unlinked
 *
 * @log may be NULL, in which case every name in the item is treated as
 * "not in the log" and unlinked.  @path must be positioned at the
 * subvolume directory item to check; it is released before returning.
 */
static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_root *log,
				      struct btrfs_path *path,
				      struct btrfs_path *log_path,
				      struct inode *dir,
				      struct btrfs_key *dir_key)
{
	int ret;
	struct extent_buffer *eb;
	int slot;
	u32 item_size;
	struct btrfs_dir_item *di;
	struct btrfs_dir_item *log_di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;
	char *name;
	struct inode *inode;
	struct btrfs_key location;

again:
	eb = path->nodes[0];
	slot = path->slots[0];
	item_size = btrfs_item_size_nr(eb, slot);
	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	/* a DIR_ITEM key may pack several names into one item; walk each */
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		if (verify_dir_item(root, eb, di)) {
			ret = -EIO;
			goto out;
		}

		name_len = btrfs_dir_name_len(eb, di);
		name = kmalloc(name_len, GFP_NOFS);
		if (!name) {
			ret = -ENOMEM;
			goto out;
		}
		read_extent_buffer(eb, name, (unsigned long)(di + 1),
				  name_len);
		log_di = NULL;
		/* look the same name up in the log tree (skipped if !log) */
		if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
			log_di = btrfs_lookup_dir_item(trans, log, log_path,
						       dir_key->objectid,
						       name, name_len, 0);
		} else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
			log_di = btrfs_lookup_dir_index_item(trans, log,
						     log_path,
						     dir_key->objectid,
						     dir_key->offset,
						     name, name_len, 0);
		}
		if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) {
			/*
			 * The name is not in the log: it was deleted before
			 * the fsync, so unlink it from the subvolume.  The
			 * inode goes into the fixup dir so its link count is
			 * re-checked when replay finishes.
			 */
			btrfs_dir_item_key_to_cpu(eb, di, &location);
			btrfs_release_path(path);
			btrfs_release_path(log_path);
			inode = read_one_inode(root, location.objectid);
			if (!inode) {
				kfree(name);
				return -EIO;
			}

			ret = link_to_fixup_dir(trans, root,
						path, location.objectid);
			if (ret) {
				kfree(name);
				iput(inode);
				goto out;
			}

			/* temporarily bump the link count for the unlink below */
			inc_nlink(inode);
			ret = btrfs_unlink_inode(trans, root, dir, inode,
						 name, name_len);
			if (!ret)
				ret = btrfs_run_delayed_items(trans, root);
			kfree(name);
			iput(inode);
			if (ret)
				goto out;

			/* there might still be more names under this key
			 * check and repeat if required
			 */
			ret = btrfs_search_slot(NULL, root, dir_key, path,
						0, 0);
			if (ret == 0)
				goto again;
			ret = 0;
			goto out;
		} else if (IS_ERR(log_di)) {
			kfree(name);
			return PTR_ERR(log_di);
		}
		/* name exists in the log: keep it and move to the next one */
		btrfs_release_path(log_path);
		kfree(name);

		ptr = (unsigned long)(di + 1);
		ptr += name_len;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	btrfs_release_path(log_path);
	return ret;
}
1922 
/*
 * deletion replay happens before we copy any new directory items
 * out of the log or out of backreferences from inodes.  It
 * scans the log to find ranges of keys that log is authoritative for,
 * and then scans the directory to find items in those ranges that are
 * not present in the log.
 *
 * Anything we don't find in the log is unlinked and removed from the
 * directory.
 *
 * With @del_all set, every entry in the directory is checked against
 * the log (used when the whole directory contents are authoritative).
 */
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all)
{
	u64 range_start;
	u64 range_end;
	/* first pass covers DIR_ITEM keys, second pass DIR_INDEX keys */
	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
	int ret = 0;
	struct btrfs_key dir_key;
	struct btrfs_key found_key;
	struct btrfs_path *log_path;
	struct inode *dir;

	dir_key.objectid = dirid;
	dir_key.type = BTRFS_DIR_ITEM_KEY;
	log_path = btrfs_alloc_path();
	if (!log_path)
		return -ENOMEM;

	dir = read_one_inode(root, dirid);
	/* it isn't an error if the inode isn't there, that can happen
	 * because we replay the deletes before we copy in the inode item
	 * from the log
	 */
	if (!dir) {
		btrfs_free_path(log_path);
		return 0;
	}
again:
	range_start = 0;
	range_end = 0;
	while (1) {
		if (del_all)
			range_end = (u64)-1;
		else {
			/* next key range the log is authoritative for */
			ret = find_dir_range(log, path, dirid, key_type,
					     &range_start, &range_end);
			if (ret != 0)
				break;
		}

		dir_key.offset = range_start;
		/* walk the subvolume dir entries inside this range */
		while (1) {
			int nritems;
			ret = btrfs_search_slot(NULL, root, &dir_key, path,
						0, 0);
			if (ret < 0)
				goto out;

			nritems = btrfs_header_nritems(path->nodes[0]);
			if (path->slots[0] >= nritems) {
				ret = btrfs_next_leaf(root, path);
				if (ret)
					break;
			}
			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
					      path->slots[0]);
			if (found_key.objectid != dirid ||
			    found_key.type != dir_key.type)
				goto next_type;

			if (found_key.offset > range_end)
				break;

			/* unlink this entry if the log does not contain it */
			ret = check_item_in_log(trans, root, log, path,
						log_path, dir,
						&found_key);
			if (ret)
				goto out;
			if (found_key.offset == (u64)-1)
				break;
			dir_key.offset = found_key.offset + 1;
		}
		btrfs_release_path(path);
		if (range_end == (u64)-1)
			break;
		range_start = range_end + 1;
	}

next_type:
	ret = 0;
	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
		/* repeat the whole scan for the DIR_INDEX key space */
		key_type = BTRFS_DIR_LOG_INDEX_KEY;
		dir_key.type = BTRFS_DIR_INDEX_KEY;
		btrfs_release_path(path);
		goto again;
	}
out:
	btrfs_release_path(path);
	btrfs_free_path(log_path);
	iput(dir);
	return ret;
}
2028 
/*
 * the process_func used to replay items from the log tree.  This
 * gets called in two different stages.  The first stage just looks
 * for inodes and makes sure they are all copied into the subvolume.
 *
 * The second stage copies all the other item types from the log into
 * the subvolume.  The two stage approach is slower, but gets rid of
 * lots of complexity around inodes referencing other inodes that exist
 * only in the log (references come from either directory items or inode
 * back refs).
 */
static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
			     struct walk_control *wc, u64 gen)
{
	int nritems;
	struct btrfs_path *path;
	struct btrfs_root *root = wc->replay_dest;
	struct btrfs_key key;
	int level;
	int i;
	int ret;

	ret = btrfs_read_buffer(eb, gen);
	if (ret)
		return ret;

	level = btrfs_header_level(eb);

	/* only leaves carry items to replay */
	if (level != 0)
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	nritems = btrfs_header_nritems(eb);
	for (i = 0; i < nritems; i++) {
		btrfs_item_key_to_cpu(eb, &key, i);

		/* inode keys are done during the first stage */
		if (key.type == BTRFS_INODE_ITEM_KEY &&
		    wc->stage == LOG_WALK_REPLAY_INODES) {
			struct btrfs_inode_item *inode_item;
			u32 mode;

			inode_item = btrfs_item_ptr(eb, i,
					    struct btrfs_inode_item);
			mode = btrfs_inode_mode(eb, inode_item);
			if (S_ISDIR(mode)) {
				/* remove dir entries deleted before the fsync */
				ret = replay_dir_deletes(wc->trans,
					 root, log, path, key.objectid, 0);
				if (ret)
					break;
			}
			ret = overwrite_item(wc->trans, root, path,
					     eb, i, &key);
			if (ret)
				break;

			/* for regular files, make sure corresponding
			 * orphan item exist. extents past the new EOF
			 * will be truncated later by orphan cleanup.
			 */
			if (S_ISREG(mode)) {
				ret = insert_orphan_item(wc->trans, root,
							 key.objectid);
				if (ret)
					break;
			}

			ret = link_to_fixup_dir(wc->trans, root,
						path, key.objectid);
			if (ret)
				break;
		}

		if (key.type == BTRFS_DIR_INDEX_KEY &&
		    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
			ret = replay_one_dir_item(wc->trans, root, path,
						  eb, i, &key);
			if (ret)
				break;
		}

		if (wc->stage < LOG_WALK_REPLAY_ALL)
			continue;

		/* these keys are simply copied */
		if (key.type == BTRFS_XATTR_ITEM_KEY) {
			ret = overwrite_item(wc->trans, root, path,
					     eb, i, &key);
			if (ret)
				break;
		} else if (key.type == BTRFS_INODE_REF_KEY ||
			   key.type == BTRFS_INODE_EXTREF_KEY) {
			ret = add_inode_ref(wc->trans, root, log, path,
					    eb, i, &key);
			/* -ENOENT means the referenced name was skipped */
			if (ret && ret != -ENOENT)
				break;
			ret = 0;
		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
			ret = replay_one_extent(wc->trans, root, path,
						eb, i, &key);
			if (ret)
				break;
		} else if (key.type == BTRFS_DIR_ITEM_KEY) {
			ret = replay_one_dir_item(wc->trans, root, path,
						  eb, i, &key);
			if (ret)
				break;
		}
	}
	btrfs_free_path(path);
	return ret;
}
2144 
/*
 * Descend the log tree from path->nodes[*level], running wc->process_func
 * on each block one level above the leaves.  When wc->free is set, each
 * processed child block is also cleaned and its reserved extent freed
 * (used when the log tree is being deleted).  On return *level and the
 * path point at the deepest node reached, with its slot set past the
 * last item so walk_up_log_tree can take over.
 *
 * Returns 0 on success or a negative errno.
 */
static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path, int *level,
				   struct walk_control *wc)
{
	u64 root_owner;
	u64 bytenr;
	u64 ptr_gen;
	struct extent_buffer *next;
	struct extent_buffer *cur;
	struct extent_buffer *parent;
	u32 blocksize;
	int ret = 0;

	WARN_ON(*level < 0);
	WARN_ON(*level >= BTRFS_MAX_LEVEL);

	while (*level > 0) {
		WARN_ON(*level < 0);
		WARN_ON(*level >= BTRFS_MAX_LEVEL);
		cur = path->nodes[*level];

		WARN_ON(btrfs_header_level(cur) != *level);

		/* done with every pointer in this node */
		if (path->slots[*level] >=
		    btrfs_header_nritems(cur))
			break;

		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
		blocksize = root->nodesize;

		parent = path->nodes[*level];
		root_owner = btrfs_header_owner(parent);

		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
		if (!next)
			return -ENOMEM;

		if (*level == 1) {
			/* children of level 1 are leaves: process, don't descend */
			ret = wc->process_func(root, next, wc, ptr_gen);
			if (ret) {
				free_extent_buffer(next);
				return ret;
			}

			path->slots[*level]++;
			if (wc->free) {
				ret = btrfs_read_buffer(next, ptr_gen);
				if (ret) {
					free_extent_buffer(next);
					return ret;
				}

				/* trans may be NULL during unclean-log cleanup */
				if (trans) {
					btrfs_tree_lock(next);
					btrfs_set_lock_blocking(next);
					clean_tree_block(trans, root, next);
					btrfs_wait_tree_block_writeback(next);
					btrfs_tree_unlock(next);
				}

				WARN_ON(root_owner !=
					BTRFS_TREE_LOG_OBJECTID);
				ret = btrfs_free_and_pin_reserved_extent(root,
							 bytenr, blocksize);
				if (ret) {
					free_extent_buffer(next);
					return ret;
				}
			}
			free_extent_buffer(next);
			continue;
		}
		ret = btrfs_read_buffer(next, ptr_gen);
		if (ret) {
			free_extent_buffer(next);
			return ret;
		}

		/* step down one level into the child node */
		WARN_ON(*level <= 0);
		if (path->nodes[*level-1])
			free_extent_buffer(path->nodes[*level-1]);
		path->nodes[*level-1] = next;
		*level = btrfs_header_level(next);
		path->slots[*level] = 0;
		cond_resched();
	}
	WARN_ON(*level < 0);
	WARN_ON(*level >= BTRFS_MAX_LEVEL);

	/* mark this node exhausted for the caller's walk_up pass */
	path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);

	cond_resched();
	return 0;
}
2241 
/*
 * Walk back up the log tree from the current position.  Each node whose
 * children have all been visited is handed to wc->process_func and, when
 * wc->free is set, its block is cleaned and its reserved extent released.
 *
 * Returns 0 when an unvisited sibling slot was found (*level is updated
 * and the caller resumes walking down from there), 1 when the walk has
 * climbed past the root, or a negative errno.
 */
static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path, int *level,
				 struct walk_control *wc)
{
	u64 root_owner;
	int i;
	int slot;
	int ret;

	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
		slot = path->slots[i];
		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
			/* more siblings at this level; resume the down-walk */
			path->slots[i]++;
			*level = i;
			WARN_ON(*level == 0);
			return 0;
		} else {
			struct extent_buffer *parent;
			if (path->nodes[*level] == root->node)
				parent = path->nodes[*level];
			else
				parent = path->nodes[*level + 1];

			root_owner = btrfs_header_owner(parent);
			ret = wc->process_func(root, path->nodes[*level], wc,
				 btrfs_header_generation(path->nodes[*level]));
			if (ret)
				return ret;

			if (wc->free) {
				struct extent_buffer *next;

				next = path->nodes[*level];

				/*
				 * With a transaction handle we can still be
				 * racing with writeback, so clean the block
				 * and wait for in-flight IO under the lock.
				 */
				if (trans) {
					btrfs_tree_lock(next);
					btrfs_set_lock_blocking(next);
					clean_tree_block(trans, root, next);
					btrfs_wait_tree_block_writeback(next);
					btrfs_tree_unlock(next);
				}

				/* log tree blocks are always log-owned */
				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
				ret = btrfs_free_and_pin_reserved_extent(root,
						path->nodes[*level]->start,
						path->nodes[*level]->len);
				if (ret)
					return ret;
			}
			free_extent_buffer(path->nodes[*level]);
			path->nodes[*level] = NULL;
			*level = i + 1;
		}
	}
	return 1;
}
2299 
/*
 * Walk the whole tree rooted at 'log', invoking wc->process_func on every
 * block.  When wc->free is set the blocks are also cleaned and their
 * reserved extents released, which tears the log tree down.
 *
 * Returns 0 on success or a negative errno from the walk helpers.
 */
static int walk_log_tree(struct btrfs_trans_handle *trans,
			 struct btrfs_root *log, struct walk_control *wc)
{
	int ret = 0;
	int wret;
	int level;
	struct btrfs_path *path;
	int orig_level;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* start the walk at the root node; hold an extra buffer reference */
	level = btrfs_header_level(log->node);
	orig_level = level;
	path->nodes[level] = log->node;
	extent_buffer_get(log->node);
	path->slots[level] = 0;

	/* alternate down/up walks until walk_up reports the tree is done */
	while (1) {
		wret = walk_down_log_tree(trans, log, path, &level, wc);
		if (wret > 0)
			break;
		if (wret < 0) {
			ret = wret;
			goto out;
		}

		wret = walk_up_log_tree(trans, log, path, &level, wc);
		if (wret > 0)
			break;
		if (wret < 0) {
			ret = wret;
			goto out;
		}
	}

	/* was the root node processed? if not, catch it here */
	if (path->nodes[orig_level]) {
		ret = wc->process_func(log, path->nodes[orig_level], wc,
			 btrfs_header_generation(path->nodes[orig_level]));
		if (ret)
			goto out;
		if (wc->free) {
			struct extent_buffer *next;

			next = path->nodes[orig_level];

			if (trans) {
				btrfs_tree_lock(next);
				btrfs_set_lock_blocking(next);
				clean_tree_block(trans, log, next);
				btrfs_wait_tree_block_writeback(next);
				btrfs_tree_unlock(next);
			}

			WARN_ON(log->root_key.objectid !=
				BTRFS_TREE_LOG_OBJECTID);
			ret = btrfs_free_and_pin_reserved_extent(log, next->start,
							 next->len);
			if (ret)
				goto out;
		}
	}

out:
	btrfs_free_path(path);
	return ret;
}
2374 
2375 /*
2376  * helper function to update the item for a given subvolumes log root
2377  * in the tree of log roots
2378  */
2379 static int update_log_root(struct btrfs_trans_handle *trans,
2380                            struct btrfs_root *log)
2381 {
2382         int ret;
2383 
2384         if (log->log_transid == 1) {
2385                 /* insert root item on the first sync */
2386                 ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
2387                                 &log->root_key, &log->root_item);
2388         } else {
2389                 ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
2390                                 &log->root_key, &log->root_item);
2391         }
2392         return ret;
2393 }
2394 
/*
 * Sleep until the log commit identified by 'transid' has finished.
 * Must be called with root->log_mutex held; the mutex is dropped while
 * sleeping and re-taken before returning.
 */
static void wait_log_commit(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, int transid)
{
	DEFINE_WAIT(wait);
	int index = transid % 2;

	/*
	 * we only allow two pending log transactions at a time,
	 * so we know that if ours is more than 2 older than the
	 * current transaction, we're done
	 */
	do {
		prepare_to_wait(&root->log_commit_wait[index],
				&wait, TASK_UNINTERRUPTIBLE);
		mutex_unlock(&root->log_mutex);

		/* re-check after queueing so a concurrent wakeup isn't lost */
		if (root->log_transid_committed < transid &&
		    atomic_read(&root->log_commit[index]))
			schedule();

		finish_wait(&root->log_commit_wait[index], &wait);
		mutex_lock(&root->log_mutex);
	} while (root->log_transid_committed < transid &&
		 atomic_read(&root->log_commit[index]));
}
2420 
/*
 * Wait until no task is currently adding items to this log tree.
 * Must be called with root->log_mutex held; the mutex is dropped while
 * sleeping so writers can finish, and re-taken before returning.
 */
static void wait_for_writer(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	DEFINE_WAIT(wait);

	while (atomic_read(&root->log_writers)) {
		prepare_to_wait(&root->log_writer_wait,
				&wait, TASK_UNINTERRUPTIBLE);
		mutex_unlock(&root->log_mutex);
		/* re-check after queueing so the wakeup cannot be missed */
		if (atomic_read(&root->log_writers))
			schedule();
		mutex_lock(&root->log_mutex);
		finish_wait(&root->log_writer_wait, &wait);
	}
}
2436 
2437 static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2438                                         struct btrfs_log_ctx *ctx)
2439 {
2440         if (!ctx)
2441                 return;
2442 
2443         mutex_lock(&root->log_mutex);
2444         list_del_init(&ctx->list);
2445         mutex_unlock(&root->log_mutex);
2446 }
2447 
2448 /* 
2449  * Invoked in log mutex context, or be sure there is no other task which
2450  * can access the list.
2451  */
2452 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2453                                              int index, int error)
2454 {
2455         struct btrfs_log_ctx *ctx;
2456 
2457         if (!error) {
2458                 INIT_LIST_HEAD(&root->log_ctxs[index]);
2459                 return;
2460         }
2461 
2462         list_for_each_entry(ctx, &root->log_ctxs[index], list)
2463                 ctx->log_ret = error;
2464 
2465         INIT_LIST_HEAD(&root->log_ctxs[index]);
2466 }
2467 
/*
 * btrfs_sync_log sends a given tree log down to the disk and
 * updates the super blocks to record it.  When this call is done,
 * you know that any inodes previously logged are safely on disk only
 * if it returns 0.
 *
 * Any other return value means you need to call btrfs_commit_transaction.
 * Some of the edge cases for fsyncing directories that have had unlinks
 * or renames done in the past mean that sometimes the only safe
 * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
 * that has happened.
 */
int btrfs_sync_log(struct btrfs_trans_handle *trans,
		   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
{
	int index1;
	int index2;
	int mark;
	int ret;
	struct btrfs_root *log = root->log_root;
	struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
	int log_transid = 0;
	struct btrfs_log_ctx root_log_ctx;
	struct blk_plug plug;

	mutex_lock(&root->log_mutex);
	log_transid = ctx->log_transid;
	/* someone else already committed our log transid; return its result */
	if (root->log_transid_committed >= log_transid) {
		mutex_unlock(&root->log_mutex);
		return ctx->log_ret;
	}

	index1 = log_transid % 2;
	/* a commit of this transid is in flight; just wait for it */
	if (atomic_read(&root->log_commit[index1])) {
		wait_log_commit(trans, root, log_transid);
		mutex_unlock(&root->log_mutex);
		return ctx->log_ret;
	}
	ASSERT(log_transid == root->log_transid);
	atomic_set(&root->log_commit[index1], 1);

	/* wait for previous tree log sync to complete */
	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
		wait_log_commit(trans, root, log_transid - 1);

	/*
	 * Batch with other fsyncs: keep waiting while new writers keep
	 * bumping log_batch, so one commit covers as many of them as
	 * possible.
	 */
	while (1) {
		int batch = atomic_read(&root->log_batch);
		/* when we're on an ssd, just kick the log commit out */
		if (!btrfs_test_opt(root, SSD) &&
		    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
			mutex_unlock(&root->log_mutex);
			schedule_timeout_uninterruptible(1);
			mutex_lock(&root->log_mutex);
		}
		wait_for_writer(trans, root);
		if (batch == atomic_read(&root->log_batch))
			break;
	}

	/* bail out if we need to do a full commit */
	if (btrfs_need_log_full_commit(root->fs_info, trans)) {
		ret = -EAGAIN;
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&root->log_mutex);
		goto out;
	}

	/* even/odd transids use alternating extent state bits for IO */
	if (log_transid % 2 == 0)
		mark = EXTENT_DIRTY;
	else
		mark = EXTENT_NEW;

	/* we start IO on all the marked extents here, but we don't actually
	 * wait for them until later.
	 */
	blk_start_plug(&plug);
	ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
	if (ret) {
		blk_finish_plug(&plug);
		btrfs_abort_transaction(trans, root, ret);
		btrfs_free_logged_extents(log, log_transid);
		btrfs_set_log_full_commit(root->fs_info, trans);
		mutex_unlock(&root->log_mutex);
		goto out;
	}

	btrfs_set_root_node(&log->root_item, log->node);

	root->log_transid++;
	log->log_transid = root->log_transid;
	root->log_start_pid = 0;
	/*
	 * IO has been started, blocks of the log tree have WRITTEN flag set
	 * in their headers. new modifications of the log will be written to
	 * new positions. so it's safe to allow log writers to go in.
	 */
	mutex_unlock(&root->log_mutex);

	btrfs_init_log_ctx(&root_log_ctx);

	/* register as a writer on the tree of log roots */
	mutex_lock(&log_root_tree->log_mutex);
	atomic_inc(&log_root_tree->log_batch);
	atomic_inc(&log_root_tree->log_writers);

	index2 = log_root_tree->log_transid % 2;
	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
	root_log_ctx.log_transid = log_root_tree->log_transid;

	mutex_unlock(&log_root_tree->log_mutex);

	ret = update_log_root(trans, log);

	mutex_lock(&log_root_tree->log_mutex);
	if (atomic_dec_and_test(&log_root_tree->log_writers)) {
		smp_mb();
		if (waitqueue_active(&log_root_tree->log_writer_wait))
			wake_up(&log_root_tree->log_writer_wait);
	}

	if (ret) {
		/* unregister our ctx before anyone propagates a result to it */
		if (!list_empty(&root_log_ctx.list))
			list_del_init(&root_log_ctx.list);

		blk_finish_plug(&plug);
		btrfs_set_log_full_commit(root->fs_info, trans);

		if (ret != -ENOSPC) {
			btrfs_abort_transaction(trans, root, ret);
			mutex_unlock(&log_root_tree->log_mutex);
			goto out;
		}
		/* ENOSPC: wait out our IO, then fall back to a full commit */
		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&log_root_tree->log_mutex);
		ret = -EAGAIN;
		goto out;
	}

	/* another task already committed the log root tree for us */
	if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
		blk_finish_plug(&plug);
		mutex_unlock(&log_root_tree->log_mutex);
		ret = root_log_ctx.log_ret;
		goto out;
	}

	index2 = root_log_ctx.log_transid % 2;
	/* a log root tree commit is in flight; wait for it instead */
	if (atomic_read(&log_root_tree->log_commit[index2])) {
		blk_finish_plug(&plug);
		ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages,
						mark);
		btrfs_wait_logged_extents(trans, log, log_transid);
		wait_log_commit(trans, log_root_tree,
				root_log_ctx.log_transid);
		mutex_unlock(&log_root_tree->log_mutex);
		if (!ret)
			ret = root_log_ctx.log_ret;
		goto out;
	}
	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
	atomic_set(&log_root_tree->log_commit[index2], 1);

	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
		wait_log_commit(trans, log_root_tree,
				root_log_ctx.log_transid - 1);
	}

	wait_for_writer(trans, log_root_tree);

	/*
	 * now that we've moved on to the tree of log tree roots,
	 * check the full commit flag again
	 */
	if (btrfs_need_log_full_commit(root->fs_info, trans)) {
		blk_finish_plug(&plug);
		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&log_root_tree->log_mutex);
		ret = -EAGAIN;
		goto out_wake_log_root;
	}

	ret = btrfs_write_marked_extents(log_root_tree,
					 &log_root_tree->dirty_log_pages,
					 EXTENT_DIRTY | EXTENT_NEW);
	blk_finish_plug(&plug);
	if (ret) {
		btrfs_set_log_full_commit(root->fs_info, trans);
		btrfs_abort_transaction(trans, root, ret);
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&log_root_tree->log_mutex);
		goto out_wake_log_root;
	}
	/* wait for both the subvolume log and the log root tree IO */
	ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
	if (!ret)
		ret = btrfs_wait_marked_extents(log_root_tree,
						&log_root_tree->dirty_log_pages,
						EXTENT_NEW | EXTENT_DIRTY);
	if (ret) {
		btrfs_set_log_full_commit(root->fs_info, trans);
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&log_root_tree->log_mutex);
		goto out_wake_log_root;
	}
	btrfs_wait_logged_extents(trans, log, log_transid);

	/* point the super block at the new log root tree */
	btrfs_set_super_log_root(root->fs_info->super_for_commit,
				log_root_tree->node->start);
	btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
				btrfs_header_level(log_root_tree->node));

	log_root_tree->log_transid++;
	mutex_unlock(&log_root_tree->log_mutex);

	/*
	 * nobody else is going to jump in and write the ctree
	 * super here because the log_commit atomic below is protecting
	 * us.  We must be called with a transaction handle pinning
	 * the running transaction open, so a full commit can't hop
	 * in and cause problems either.
	 */
	ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
	if (ret) {
		btrfs_set_log_full_commit(root->fs_info, trans);
		btrfs_abort_transaction(trans, root, ret);
		goto out_wake_log_root;
	}

	mutex_lock(&root->log_mutex);
	if (root->last_log_commit < log_transid)
		root->last_log_commit = log_transid;
	mutex_unlock(&root->log_mutex);

out_wake_log_root:
	/*
	 * We needn't get log_mutex here because we are sure all
	 * the other tasks are blocked.
	 */
	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);

	mutex_lock(&log_root_tree->log_mutex);
	log_root_tree->log_transid_committed++;
	atomic_set(&log_root_tree->log_commit[index2], 0);
	mutex_unlock(&log_root_tree->log_mutex);

	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
		wake_up(&log_root_tree->log_commit_wait[index2]);
out:
	/* See above. */
	btrfs_remove_all_log_ctxs(root, index1, ret);

	mutex_lock(&root->log_mutex);
	root->log_transid_committed++;
	atomic_set(&root->log_commit[index1], 0);
	mutex_unlock(&root->log_mutex);

	if (waitqueue_active(&root->log_commit_wait[index1]))
		wake_up(&root->log_commit_wait[index1]);
	return ret;
}
2727 
2728 static void free_log_tree(struct btrfs_trans_handle *trans,
2729                           struct btrfs_root *log)
2730 {
2731         int ret;
2732         u64 start;
2733         u64 end;
2734         struct walk_control wc = {
2735                 .free = 1,
2736                 .process_func = process_one_buffer
2737         };
2738 
2739         ret = walk_log_tree(trans, log, &wc);
2740         /* I don't think this can happen but just in case */
2741         if (ret)
2742                 btrfs_abort_transaction(trans, log, ret);
2743 
2744         while (1) {
2745                 ret = find_first_extent_bit(&log->dirty_log_pages,
2746                                 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
2747                                 NULL);
2748                 if (ret)
2749                         break;
2750 
2751                 clear_extent_bits(&log->dirty_log_pages, start, end,
2752                                   EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
2753         }
2754 
2755         /*
2756          * We may have short-circuited the log tree with the full commit logic
2757          * and left ordered extents on our list, so clear these out to keep us
2758          * from leaking inodes and memory.
2759          */
2760         btrfs_free_logged_extents(log, 0);
2761         btrfs_free_logged_extents(log, 1);
2762 
2763         free_extent_buffer(log->node);
2764         kfree(log);
2765 }
2766 
2767 /*
2768  * free all the extents used by the tree log.  This should be called
2769  * at commit time of the full transaction
2770  */
2771 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2772 {
2773         if (root->log_root) {
2774                 free_log_tree(trans, root->log_root);
2775                 root->log_root = NULL;
2776         }
2777         return 0;
2778 }
2779 
2780 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
2781                              struct btrfs_fs_info *fs_info)
2782 {
2783         if (fs_info->log_root_tree) {
2784                 free_log_tree(trans, fs_info->log_root_tree);
2785                 fs_info->log_root_tree = NULL;
2786         }
2787         return 0;
2788 }
2789 
2790 /*
2791  * If both a file and directory are logged, and unlinks or renames are
2792  * mixed in, we have a few interesting corners:
2793  *
2794  * create file X in dir Y
2795  * link file X to X.link in dir Y
2796  * fsync file X
2797  * unlink file X but leave X.link
2798  * fsync dir Y
2799  *
2800  * After a crash we would expect only X.link to exist.  But file X
2801  * didn't get fsync'd again so the log has back refs for X and X.link.
2802  *
2803  * We solve this by removing directory entries and inode backrefs from the
2804  * log when a file that was logged in the current transaction is
2805  * unlinked.  Any later fsync will include the updated log entries, and
2806  * we'll be able to reconstruct the proper directory items from backrefs.
2807  *
2808  * This optimizations allows us to avoid relogging the entire inode
2809  * or the entire directory.
2810  */
2811 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2812                                  struct btrfs_root *root,
2813                                  const char *name, int name_len,
2814                                  struct inode *dir, u64 index)
2815 {
2816         struct btrfs_root *log;
2817         struct btrfs_dir_item *di;
2818         struct btrfs_path *path;
2819         int ret;
2820         int err = 0;
2821         int bytes_del = 0;
2822         u64 dir_ino = btrfs_ino(dir);
2823 
2824         if (BTRFS_I(dir)->logged_trans < trans->transid)
2825                 return 0;
2826 
2827         ret = join_running_log_trans(root);
2828         if (ret)
2829                 return 0;
2830 
2831         mutex_lock(&BTRFS_I(dir)->log_mutex);
2832 
2833         log = root->log_root;
2834         path = btrfs_alloc_path();
2835         if (!path) {
2836                 err = -ENOMEM;
2837                 goto out_unlock;
2838         }
2839 
2840         di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
2841                                    name, name_len, -1);
2842         if (IS_ERR(di)) {
2843                 err = PTR_ERR(di);
2844                 goto fail;
2845         }
2846         if (di) {
2847                 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2848                 bytes_del += name_len;
2849                 if (ret) {
2850                         err = ret;
2851                         goto fail;
2852                 }
2853         }
2854         btrfs_release_path(path);
2855         di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
2856                                          index, name, name_len, -1);
2857         if (IS_ERR(di)) {
2858                 err = PTR_ERR(di);
2859                 goto fail;
2860         }
2861         if (di) {
2862                 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2863                 bytes_del += name_len;
2864                 if (ret) {
2865                         err = ret;
2866                         goto fail;
2867                 }
2868         }
2869 
2870         /* update the directory size in the log to reflect the names
2871          * we have removed
2872          */
2873         if (bytes_del) {
2874                 struct btrfs_key key;
2875 
2876                 key.objectid = dir_ino;
2877                 key.offset = 0;
2878                 key.type = BTRFS_INODE_ITEM_KEY;
2879                 btrfs_release_path(path);
2880 
2881                 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2882                 if (ret < 0) {
2883                         err = ret;
2884                         goto fail;
2885                 }
2886                 if (ret == 0) {
2887                         struct btrfs_inode_item *item;
2888                         u64 i_size;
2889 
2890                         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2891                                               struct btrfs_inode_item);
2892                         i_size = btrfs_inode_size(path->nodes[0], item);
2893                         if (i_size > bytes_del)
2894                                 i_size -= bytes_del;
2895                         else
2896                                 i_size = 0;
2897                         btrfs_set_inode_size(path->nodes[0], item, i_size);
2898                         btrfs_mark_buffer_dirty(path->nodes[0]);
2899                 } else
2900                         ret = 0;
2901                 btrfs_release_path(path);
2902         }
2903 fail:
2904         btrfs_free_path(path);
2905 out_unlock:
2906         mutex_unlock(&BTRFS_I(dir)->log_mutex);
2907         if (ret == -ENOSPC) {
2908                 btrfs_set_log_full_commit(root->fs_info, trans);
2909                 ret = 0;
2910         } else if (ret < 0)
2911                 btrfs_abort_transaction(trans, root, ret);
2912 
2913         btrfs_end_log_trans(root);
2914 
2915         return err;
2916 }
2917 
2918 /* see comments for btrfs_del_dir_entries_in_log */
2919 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2920                                struct btrfs_root *root,
2921                                const char *name, int name_len,
2922                                struct inode *inode, u64 dirid)
2923 {
2924         struct btrfs_root *log;
2925         u64 index;
2926         int ret;
2927 
2928         if (BTRFS_I(inode)->logged_trans < trans->transid)
2929                 return 0;
2930 
2931         ret = join_running_log_trans(root);
2932         if (ret)
2933                 return 0;
2934         log = root->log_root;
2935         mutex_lock(&BTRFS_I(inode)->log_mutex);
2936 
2937         ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
2938                                   dirid, &index);
2939         mutex_unlock(&BTRFS_I(inode)->log_mutex);
2940         if (ret == -ENOSPC) {
2941                 btrfs_set_log_full_commit(root->fs_info, trans);
2942                 ret = 0;
2943         } else if (ret < 0 && ret != -ENOENT)
2944                 btrfs_abort_transaction(trans, root, ret);
2945         btrfs_end_log_trans(root);
2946 
2947         return ret;
2948 }
2949 
2950 /*
2951  * creates a range item in the log for 'dirid'.  first_offset and
2952  * last_offset tell us which parts of the key space the log should
2953  * be considered authoritative for.
2954  */
2955 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2956                                        struct btrfs_root *log,
2957                                        struct btrfs_path *path,
2958                                        int key_type, u64 dirid,
2959                                        u64 first_offset, u64 last_offset)
2960 {
2961         int ret;
2962         struct btrfs_key key;
2963         struct btrfs_dir_log_item *item;
2964 
2965         key.objectid = dirid;
2966         key.offset = first_offset;
2967         if (key_type == BTRFS_DIR_ITEM_KEY)
2968                 key.type = BTRFS_DIR_LOG_ITEM_KEY;
2969         else
2970                 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2971         ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2972         if (ret)
2973                 return ret;
2974 
2975         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2976                               struct btrfs_dir_log_item);
2977         btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
2978         btrfs_mark_buffer_dirty(path->nodes[0]);
2979         btrfs_release_path(path);
2980         return 0;
2981 }
2982 
2983 /*
2984  * log all the items included in the current transaction for a given
2985  * directory.  This also creates the range items in the log tree required
2986  * to replay anything deleted before the fsync
2987  */
2988 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2989                           struct btrfs_root *root, struct inode *inode,
2990                           struct btrfs_path *path,
2991                           struct btrfs_path *dst_path, int key_type,
2992                           u64 min_offset, u64 *last_offset_ret)
2993 {
2994         struct btrfs_key min_key;
2995         struct btrfs_root *log = root->log_root;
2996         struct extent_buffer *src;
2997         int err = 0;
2998         int ret;
2999         int i;
3000         int nritems;
3001         u64 first_offset = min_offset;
3002         u64 last_offset = (u64)-1;
3003         u64 ino = btrfs_ino(inode);
3004 
3005         log = root->log_root;
3006 
3007         min_key.objectid = ino;
3008         min_key.type = key_type;
3009         min_key.offset = min_offset;
3010 
3011         ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3012 
3013         /*
3014          * we didn't find anything from this transaction, see if there
3015          * is anything at all
3016          */
3017         if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
3018                 min_key.objectid = ino;
3019                 min_key.type = key_type;
3020                 min_key.offset = (u64)-1;
3021                 btrfs_release_path(path);
3022                 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3023                 if (ret < 0) {
3024                         btrfs_release_path(path);
3025                         return ret;
3026                 }
3027                 ret = btrfs_previous_item(root, path, ino, key_type);
3028 
3029                 /* if ret == 0 there are items for this type,
3030                  * create a range to tell us the last key of this type.
3031                  * otherwise, there are no items in this directory after
3032                  * *min_offset, and we create a range to indicate that.
3033                  */
3034                 if (ret == 0) {
3035                         struct btrfs_key tmp;
3036                         btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3037                                               path->slots[0]);
3038                         if (key_type == tmp.type)
3039                                 first_offset = max(min_offset, tmp.offset) + 1;
3040                 }
3041                 goto done;
3042         }
3043 
3044         /* go backward to find any previous key */
3045         ret = btrfs_previous_item(root, path, ino, key_type);
3046         if (ret == 0) {
3047                 struct btrfs_key tmp;
3048                 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3049                 if (key_type == tmp.type) {
3050                         first_offset = tmp.offset;
3051                         ret = overwrite_item(trans, log, dst_path,
3052                                              path->nodes[0], path->slots[0],
3053                                              &tmp);
3054                         if (ret) {
3055                                 err = ret;
3056                                 goto done;
3057                         }
3058                 }
3059         }
3060         btrfs_release_path(path);
3061 
3062         /* find the first key from this transaction again */
3063         ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3064         if (WARN_ON(ret != 0))
3065                 goto done;
3066 
3067         /*
3068          * we have a block from this transaction, log every item in it
3069          * from our directory
3070          */
3071         while (1) {
3072                 struct btrfs_key tmp;
3073                 src = path->nodes[0];
3074                 nritems = btrfs_header_nritems(src);
3075                 for (i = path->slots[0]; i < nritems; i++) {
3076                         btrfs_item_key_to_cpu(src, &min_key, i);
3077 
3078                         if (min_key.objectid != ino || min_key.type != key_type)
3079                                 goto done;
3080                         ret = overwrite_item(trans, log, dst_path, src, i,
3081                                              &min_key);
3082                         if (ret) {
3083                                 err = ret;
3084                                 goto done;
3085                         }
3086                 }
3087                 path->slots[0] = nritems;
3088 
3089                 /*
3090                  * look ahead to the next item and see if it is also
3091                  * from this directory and from this transaction
3092                  */
3093                 ret = btrfs_next_leaf(root, path);
3094                 if (ret == 1) {
3095                         last_offset = (u64)-1;
3096                         goto done;
3097                 }
3098                 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3099                 if (tmp.objectid != ino || tmp.type != key_type) {
3100                         last_offset = (u64)-1;
3101                         goto done;
3102                 }
3103                 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3104                         ret = overwrite_item(trans, log, dst_path,
3105                                              path->nodes[0], path->slots[0],
3106                                              &tmp);
3107                         if (ret)
3108                                 err = ret;
3109                         else
3110                                 last_offset = tmp.offset;
3111                         goto done;
3112                 }
3113         }
3114 done:
3115         btrfs_release_path(path);
3116         btrfs_release_path(dst_path);
3117 
3118         if (err == 0) {
3119                 *last_offset_ret = last_offset;
3120                 /*
3121                  * insert the log range keys to indicate where the log
3122                  * is valid
3123                  */
3124                 ret = insert_dir_log_key(trans, log, path, key_type,
3125                                          ino, first_offset, last_offset);
3126                 if (ret)
3127                         err = ret;
3128         }
3129         return err;
3130 }
3131 
3132 /*
3133  * logging directories is very similar to logging inodes, We find all the items
3134  * from the current transaction and write them to the log.
3135  *
3136  * The recovery code scans the directory in the subvolume, and if it finds a
3137  * key in the range logged that is not present in the log tree, then it means
3138  * that dir entry was unlinked during the transaction.
3139  *
3140  * In order for that scan to work, we must include one key smaller than
3141  * the smallest logged by this transaction and one key larger than the largest
3142  * key logged by this transaction.
3143  */
3144 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3145                           struct btrfs_root *root, struct inode *inode,
3146                           struct btrfs_path *path,
3147                           struct btrfs_path *dst_path)
3148 {
3149         u64 min_key;
3150         u64 max_key;
3151         int ret;
3152         int key_type = BTRFS_DIR_ITEM_KEY;
3153 
3154 again:
3155         min_key = 0;
3156         max_key = 0;
3157         while (1) {
3158                 ret = log_dir_items(trans, root, inode, path,
3159                                     dst_path, key_type, min_key,
3160                                     &max_key);
3161                 if (ret)
3162                         return ret;
3163                 if (max_key == (u64)-1)
3164                         break;
3165                 min_key = max_key + 1;
3166         }
3167 
3168         if (key_type == BTRFS_DIR_ITEM_KEY) {
3169                 key_type = BTRFS_DIR_INDEX_KEY;
3170                 goto again;
3171         }
3172         return 0;
3173 }
3174 
/*
 * a helper function to drop items from the log before we relog an
 * inode.  max_key_type indicates the highest item type to remove.
 * This cannot be run for file data extents because it does not
 * free the extents they point to.
 *
 * Deletes every item in @log with key.objectid == @objectid whose type is
 * <= @max_key_type, working backwards from (objectid, max_key_type, -1)
 * one leaf's worth of items at a time.  Returns 0 on success or a negative
 * errno from the search/delete.
 */
static int drop_objectid_items(struct btrfs_trans_handle *trans,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  u64 objectid, int max_key_type)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int start_slot;

	/* start past the highest possible key for this objectid/type */
	key.objectid = objectid;
	key.type = max_key_type;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
		/* (objectid, type, -1) can never be an existing key */
		BUG_ON(ret == 0); /* Logic error */
		if (ret < 0)
			break;

		/* slot 0 means everything in the tree is past our key */
		if (path->slots[0] == 0)
			break;

		/* step back onto the last item at or before our key */
		path->slots[0]--;
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);

		if (found_key.objectid != objectid)
			break;

		/*
		 * Find the first slot in this leaf belonging to @objectid
		 * (smallest possible key for it: type 0, offset 0), so one
		 * btrfs_del_items() call can remove the whole run.
		 */
		found_key.offset = 0;
		found_key.type = 0;
		/*
		 * NOTE(review): the return of btrfs_bin_search() is
		 * immediately overwritten below; only start_slot is used.
		 * Looks intentional (an exact miss still yields the right
		 * slot), but worth confirming against the ctree API.
		 */
		ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
				       &start_slot);

		ret = btrfs_del_items(trans, log, path, start_slot,
				      path->slots[0] - start_slot + 1);
		/*
		 * If start slot isn't 0 then we don't need to re-search, we've
		 * found the last guy with the objectid in this tree.
		 */
		if (ret || start_slot != 0)
			break;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);
	/* positive search results just mean "nothing left to delete" */
	if (ret > 0)
		ret = 0;
	return ret;
}
3231 
3232 static void fill_inode_item(struct btrfs_trans_handle *trans,
3233                             struct extent_buffer *leaf,
3234                             struct btrfs_inode_item *item,
3235                             struct inode *inode, int log_inode_only,
3236                             u64 logged_isize)
3237 {
3238         struct btrfs_map_token token;
3239 
3240         btrfs_init_map_token(&token);
3241 
3242         if (log_inode_only) {
3243                 /* set the generation to zero so the recover code
3244                  * can tell the difference between an logging
3245                  * just to say 'this inode exists' and a logging
3246                  * to say 'update this inode with these values'
3247                  */
3248                 btrfs_set_token_inode_generation(leaf, item, 0, &token);
3249                 btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
3250         } else {
3251                 btrfs_set_token_inode_generation(leaf, item,
3252                                                  BTRFS_I(inode)->generation,
3253                                                  &token);
3254                 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
3255         }
3256 
3257         btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3258         btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3259         btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3260         btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3261 
3262         btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
3263                                      inode->i_atime.tv_sec, &token);
3264         btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
3265                                       inode->i_atime.tv_nsec, &token);
3266 
3267         btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
3268                                      inode->i_mtime.tv_sec, &token);
3269         btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
3270                                       inode->i_mtime.tv_nsec, &token);
3271 
3272         btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
3273                                      inode->i_ctime.tv_sec, &token);
3274         btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
3275                                       inode->i_ctime.tv_nsec, &token);
3276 
3277         btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3278                                      &token);
3279 
3280         btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3281         btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3282         btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3283         btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3284         btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3285 }
3286 
3287 static int log_inode_item(struct btrfs_trans_handle *trans,
3288                           struct btrfs_root *log, struct btrfs_path *path,
3289                           struct inode *inode)
3290 {
3291         struct btrfs_inode_item *inode_item;
3292         int ret;
3293 
3294         ret = btrfs_insert_empty_item(trans, log, path,
3295                                       &BTRFS_I(inode)->location,
3296                                       sizeof(*inode_item));
3297         if (ret && ret != -EEXIST)
3298                 return ret;
3299         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3300                                     struct btrfs_inode_item);
3301         fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0);
3302         btrfs_release_path(path);
3303         return 0;
3304 }
3305 
/*
 * Copy @nr items, starting at @start_slot of the leaf in @src_path, from the
 * subvolume tree into the log tree.
 *
 * Inode items are regenerated from the in-memory inode via fill_inode_item()
 * rather than copied raw; everything else is a byte copy.  For regular file
 * extents created in this transaction the data checksums are looked up and
 * added to the log as well (unless NODATASUM).  Finally, holes between the
 * copied file extents are punched in the log via btrfs_insert_file_extent(),
 * tracking progress through *@last_extent.
 *
 * Returns 0 on success, a negative errno on failure, or 1 when @src_path was
 * dropped and re-searched, in which case the caller must re-search too (see
 * the comment at the end).
 */
static noinline int copy_items(struct btrfs_trans_handle *trans,
			       struct inode *inode,
			       struct btrfs_path *dst_path,
			       struct btrfs_path *src_path, u64 *last_extent,
			       int start_slot, int nr, int inode_only,
			       u64 logged_isize)
{
	unsigned long src_offset;
	unsigned long dst_offset;
	struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
	struct btrfs_file_extent_item *extent;
	struct btrfs_inode_item *inode_item;
	struct extent_buffer *src = src_path->nodes[0];
	struct btrfs_key first_key, last_key, key;
	int ret;
	struct btrfs_key *ins_keys;
	u32 *ins_sizes;
	char *ins_data;
	int i;
	struct list_head ordered_sums;
	int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
	bool has_extents = false;
	bool need_find_last_extent = true;
	bool done = false;

	INIT_LIST_HEAD(&ordered_sums);

	/* one allocation holds both the sizes array and the keys array */
	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
			   nr * sizeof(u32), GFP_NOFS);
	if (!ins_data)
		return -ENOMEM;

	/* sentinel: "no extent key seen yet" */
	first_key.objectid = (u64)-1;

	ins_sizes = (u32 *)ins_data;
	ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));

	/* gather the keys and sizes, then reserve room in the log leaf */
	for (i = 0; i < nr; i++) {
		ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
		btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
	}
	ret = btrfs_insert_empty_items(trans, log, dst_path,
				       ins_keys, ins_sizes, nr);
	if (ret) {
		kfree(ins_data);
		return ret;
	}

	for (i = 0; i < nr; i++, dst_path->slots[0]++) {
		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
						   dst_path->slots[0]);

		src_offset = btrfs_item_ptr_offset(src, start_slot + i);

		/* remember where the hole-filling walk below must stop */
		if ((i == (nr - 1)))
			last_key = ins_keys[i];

		if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
			/* rebuild the inode item from the live inode */
			inode_item = btrfs_item_ptr(dst_path->nodes[0],
						    dst_path->slots[0],
						    struct btrfs_inode_item);
			fill_inode_item(trans, dst_path->nodes[0], inode_item,
					inode, inode_only == LOG_INODE_EXISTS,
					logged_isize);
		} else {
			copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
					   src_offset, ins_sizes[i]);
		}

		/*
		 * We set need_find_last_extent here in case we know we were
		 * processing other items and then walk into the first extent in
		 * the inode.  If we don't hit an extent then nothing changes,
		 * we'll do the last search the next time around.
		 */
		if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
			has_extents = true;
			if (first_key.objectid == (u64)-1)
				first_key = ins_keys[i];
		} else {
			need_find_last_extent = false;
		}

		/* take a reference on file data extents so that truncates
		 * or deletes of this inode don't have to relog the inode
		 * again
		 */
		if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
		    !skip_csum) {
			int found_type;
			extent = btrfs_item_ptr(src, start_slot + i,
						struct btrfs_file_extent_item);

			/* extents from older transactions are already logged */
			if (btrfs_file_extent_generation(src, extent) < trans->transid)
				continue;

			found_type = btrfs_file_extent_type(src, extent);
			if (found_type == BTRFS_FILE_EXTENT_REG) {
				u64 ds, dl, cs, cl;
				ds = btrfs_file_extent_disk_bytenr(src,
								extent);
				/* ds == 0 is a hole */
				if (ds == 0)
					continue;

				dl = btrfs_file_extent_disk_num_bytes(src,
								extent);
				cs = btrfs_file_extent_offset(src, extent);
				cl = btrfs_file_extent_num_bytes(src,
								extent);
				/*
				 * compressed extents are checksummed over the
				 * whole on-disk (compressed) range
				 */
				if (btrfs_file_extent_compression(src,
								  extent)) {
					cs = 0;
					cl = dl;
				}

				ret = btrfs_lookup_csums_range(
						log->fs_info->csum_root,
						ds + cs, ds + cs + cl - 1,
						&ordered_sums, 0);
				if (ret) {
					btrfs_release_path(dst_path);
					kfree(ins_data);
					return ret;
				}
			}
		}
	}

	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
	btrfs_release_path(dst_path);
	kfree(ins_data);

	/*
	 * we have to do this after the loop above to avoid changing the
	 * log tree while trying to change the log tree.
	 */
	ret = 0;
	while (!list_empty(&ordered_sums)) {
		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
						   struct btrfs_ordered_sum,
						   list);
		/* after an error, keep draining the list to free entries */
		if (!ret)
			ret = btrfs_csum_file_blocks(trans, log, sums);
		list_del(&sums->list);
		kfree(sums);
	}

	/* no file extents copied: nothing to hole-fill */
	if (!has_extents)
		return ret;

	if (need_find_last_extent && *last_extent == first_key.offset) {
		/*
		 * We don't have any leafs between our current one and the one
		 * we processed before that can have file extent items for our
		 * inode (and have a generation number smaller than our current
		 * transaction id).
		 */
		need_find_last_extent = false;
	}

	/*
	 * Because we use btrfs_search_forward we could skip leaves that were
	 * not modified and then assume *last_extent is valid when it really
	 * isn't.  So back up to the previous leaf and read the end of the last
	 * extent before we go and fill in holes.
	 */
	if (need_find_last_extent) {
		u64 len;

		ret = btrfs_prev_leaf(BTRFS_I(inode)->root, src_path);
		if (ret < 0)
			return ret;
		if (ret)
			goto fill_holes;
		if (src_path->slots[0])
			src_path->slots[0]--;
		src = src_path->nodes[0];
		btrfs_item_key_to_cpu(src, &key, src_path->slots[0]);
		if (key.objectid != btrfs_ino(inode) ||
		    key.type != BTRFS_EXTENT_DATA_KEY)
			goto fill_holes;
		extent = btrfs_item_ptr(src, src_path->slots[0],
					struct btrfs_file_extent_item);
		if (btrfs_file_extent_type(src, extent) ==
		    BTRFS_FILE_EXTENT_INLINE) {
			/* inline extents end at a sector boundary */
			len = btrfs_file_extent_inline_len(src,
							   src_path->slots[0],
							   extent);
			*last_extent = ALIGN(key.offset + len,
					     log->sectorsize);
		} else {
			len = btrfs_file_extent_num_bytes(src, extent);
			*last_extent = key.offset + len;
		}
	}
fill_holes:
	/* So we did prev_leaf, now we need to move to the next leaf, but a few
	 * things could have happened
	 *
	 * 1) A merge could have happened, so we could currently be on a leaf
	 * that holds what we were copying in the first place.
	 * 2) A split could have happened, and now not all of the items we want
	 * are on the same leaf.
	 *
	 * So we need to adjust how we search for holes, we need to drop the
	 * path and re-search for the first extent key we found, and then walk
	 * forward until we hit the last one we copied.
	 */
	if (need_find_last_extent) {
		/* btrfs_prev_leaf could return 1 without releasing the path */
		btrfs_release_path(src_path);
		ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &first_key,
					src_path, 0, 0);
		if (ret < 0)
			return ret;
		ASSERT(ret == 0);
		src = src_path->nodes[0];
		i = src_path->slots[0];
	} else {
		i = start_slot;
	}

	/*
	 * Ok so here we need to go through and fill in any holes we may have
	 * to make sure that holes are punched for those areas in case they had
	 * extents previously.
	 */
	while (!done) {
		u64 offset, len;
		u64 extent_end;

		if (i >= btrfs_header_nritems(src_path->nodes[0])) {
			ret = btrfs_next_leaf(BTRFS_I(inode)->root, src_path);
			if (ret < 0)
				return ret;
			ASSERT(ret == 0);
			src = src_path->nodes[0];
			i = 0;
		}

		btrfs_item_key_to_cpu(src, &key, i);
		/* stop after processing the last key we copied above */
		if (!btrfs_comp_cpu_keys(&key, &last_key))
			done = true;
		if (key.objectid != btrfs_ino(inode) ||
		    key.type != BTRFS_EXTENT_DATA_KEY) {
			i++;
			continue;
		}
		extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
		if (btrfs_file_extent_type(src, extent) ==
		    BTRFS_FILE_EXTENT_INLINE) {
			len = btrfs_file_extent_inline_len(src, i, extent);
			extent_end = ALIGN(key.offset + len, log->sectorsize);
		} else {
			len = btrfs_file_extent_num_bytes(src, extent);
			extent_end = key.offset + len;
		}
		i++;

		/* contiguous with the previous extent: no hole to punch */
		if (*last_extent == key.offset) {
			*last_extent = extent_end;
			continue;
		}
		/* punch a hole covering [*last_extent, key.offset) */
		offset = *last_extent;
		len = key.offset - *last_extent;
		ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
					       offset, 0, 0, len, 0, len, 0,
					       0, 0);
		if (ret)
			break;
		*last_extent = extent_end;
	}
	/*
	 * Need to let the callers know we dropped the path so they should
	 * re-search.
	 */
	if (!ret && need_find_last_extent)
		ret = 1;
	return ret;
}
3587 
3588 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3589 {
3590         struct extent_map *em1, *em2;
3591 
3592         em1 = list_entry(a, struct extent_map, list);
3593         em2 = list_entry(b, struct extent_map, list);
3594 
3595         if (em1->start < em2->start)
3596                 return -1;
3597         else if (em1->start > em2->start)
3598                 return 1;
3599         return 0;
3600 }
3601 
/*
 * Wait for the ordered extents in @logged_list that overlap @em's modified
 * range to finish their IO, then copy the relevant data checksums into the
 * log tree.
 *
 * *@ordered_io_error is set to true (and the function returns early) if any
 * overlapping ordered extent completed with an IO error.  mod_start/mod_len
 * are trimmed as each ordered extent's csums are logged; whatever range
 * remains afterwards has its csums looked up directly in the csum root.
 * Preallocated extents and holes need no csums and return immediately.
 *
 * Returns 0 on success or a negative errno.
 */
static int wait_ordered_extents(struct btrfs_trans_handle *trans,
				struct inode *inode,
				struct btrfs_root *root,
				const struct extent_map *em,
				const struct list_head *logged_list,
				bool *ordered_io_error)
{
	struct btrfs_ordered_extent *ordered;
	struct btrfs_root *log = root->log_root;
	u64 mod_start = em->mod_start;
	u64 mod_len = em->mod_len;
	const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
	u64 csum_offset;
	u64 csum_len;
	LIST_HEAD(ordered_sums);
	int ret = 0;

	*ordered_io_error = false;

	/* prealloc extents and holes carry no data, hence no csums */
	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
	    em->block_start == EXTENT_MAP_HOLE)
		return 0;

	/*
	 * Wait for any ordered extent that covers our extent map. If it
	 * finishes without an error, first check and see if our csums are on
	 * our outstanding ordered extents.
	 */
	list_for_each_entry(ordered, logged_list, log_list) {
		struct btrfs_ordered_sum *sum;

		/* the whole modified range has been accounted for */
		if (!mod_len)
			break;

		/* skip ordered extents that don't overlap [mod_start, +mod_len) */
		if (ordered->file_offset + ordered->len <= mod_start ||
		    mod_start + mod_len <= ordered->file_offset)
			continue;

		if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
		    !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
		    !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
			const u64 start = ordered->file_offset;
			const u64 end = ordered->file_offset + ordered->len - 1;

			/* kick off writeback so the wait below can finish */
			WARN_ON(ordered->inode != inode);
			filemap_fdatawrite_range(inode->i_mapping, start, end);
		}

		wait_event(ordered->wait,
			   (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) ||
			    test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));

		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
			/*
			 * Clear the AS_EIO/AS_ENOSPC flags from the inode's
			 * i_mapping flags, so that the next fsync won't get
			 * an outdated io error too.
			 */
			btrfs_inode_check_errors(inode);
			*ordered_io_error = true;
			break;
		}
		/*
		 * We are going to copy all the csums on this ordered extent, so
		 * go ahead and adjust mod_start and mod_len in case this
		 * ordered extent has already been logged.
		 */
		if (ordered->file_offset > mod_start) {
			if (ordered->file_offset + ordered->len >=
			    mod_start + mod_len)
				mod_len = ordered->file_offset - mod_start;
			/*
			 * If we have this case
			 *
			 * |--------- logged extent ---------|
			 *       |----- ordered extent ----|
			 *
			 * Just don't mess with mod_start and mod_len, we'll
			 * just end up logging more csums than we need and it
			 * will be ok.
			 */
		} else {
			if (ordered->file_offset + ordered->len <
			    mod_start + mod_len) {
				mod_len = (mod_start + mod_len) -
					(ordered->file_offset + ordered->len);
				mod_start = ordered->file_offset +
					ordered->len;
			} else {
				mod_len = 0;
			}
		}

		if (skip_csum)
			continue;

		/*
		 * To keep us from looping for the above case of an ordered
		 * extent that falls inside of the logged extent.
		 */
		if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
				     &ordered->flags))
			continue;

		/* wait until all csums for this ordered extent are computed */
		if (ordered->csum_bytes_left) {
			btrfs_start_ordered_extent(inode, ordered, 0);
			wait_event(ordered->wait,
				   ordered->csum_bytes_left == 0);
		}

		list_for_each_entry(sum, &ordered->list, list) {
			ret = btrfs_csum_file_blocks(trans, log, sum);
			if (ret)
				break;
		}
	}

	if (*ordered_io_error || !mod_len || ret || skip_csum)
		return ret;

	if (em->compress_type) {
		/* compressed extents csum the entire on-disk range */
		csum_offset = 0;
		csum_len = max(em->block_len, em->orig_block_len);
	} else {
		csum_offset = mod_start - em->start;
		csum_len = mod_len;
	}

	/* block start is already adjusted for the file extent offset. */
	ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
				       em->block_start + csum_offset,
				       em->block_start + csum_offset +
				       csum_len - 1, &ordered_sums, 0);
	if (ret)
		return ret;

	while (!list_empty(&ordered_sums)) {
		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
						   struct btrfs_ordered_sum,
						   list);
		/* after an error, keep draining the list to free entries */
		if (!ret)
			ret = btrfs_csum_file_blocks(trans, log, sums);
		list_del(&sums->list);
		kfree(sums);
	}

	return ret;
}
3750 
/*
 * Log a single file extent item describing @em into the inode's log tree.
 *
 * Waits for overlapping ordered extents first (via wait_ordered_extents())
 * so the on-disk data and checksums are stable, then drops any stale file
 * extent items in the log over [em->start, em->start + em->len) and writes
 * a fresh BTRFS_EXTENT_DATA_KEY item for the extent.
 *
 * Returns 0 on success or a negative errno.  An I/O error on the ordered
 * extents is not returned directly; it is reported through ctx->io_err and
 * 0 is returned so the caller can surface it from fsync.
 */
static int log_one_extent(struct btrfs_trans_handle *trans,
			  struct inode *inode, struct btrfs_root *root,
			  const struct extent_map *em,
			  struct btrfs_path *path,
			  const struct list_head *logged_list,
			  struct btrfs_log_ctx *ctx)
{
	struct btrfs_root *log = root->log_root;
	struct btrfs_file_extent_item *fi;
	struct extent_buffer *leaf;
	struct btrfs_map_token token;
	struct btrfs_key key;
	u64 extent_offset = em->start - em->orig_start;
	u64 block_len;
	int ret;
	int extent_inserted = 0;
	bool ordered_io_err = false;

	ret = wait_ordered_extents(trans, inode, root, em, logged_list,
				   &ordered_io_err);
	if (ret)
		return ret;

	if (ordered_io_err) {
		/* Data writeback failed; record it for fsync, don't log. */
		ctx->io_err = -EIO;
		return 0;
	}

	btrfs_init_map_token(&token);

	/* Remove any older log items overlapping this extent's file range. */
	ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
				   em->start + em->len, NULL, 0, 1,
				   sizeof(*fi), &extent_inserted);
	if (ret)
		return ret;

	if (!extent_inserted) {
		/* The drop didn't leave a slot for us; insert an empty item. */
		key.objectid = btrfs_ino(inode);
		key.type = BTRFS_EXTENT_DATA_KEY;
		key.offset = em->start;

		ret = btrfs_insert_empty_item(trans, log, path, &key,
					      sizeof(*fi));
		if (ret)
			return ret;
	}
	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);

	btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
					       &token);
	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
		btrfs_set_token_file_extent_type(leaf, fi,
						 BTRFS_FILE_EXTENT_PREALLOC,
						 &token);
	else
		btrfs_set_token_file_extent_type(leaf, fi,
						 BTRFS_FILE_EXTENT_REG,
						 &token);

	block_len = max(em->block_len, em->orig_block_len);
	if (em->compress_type != BTRFS_COMPRESS_NONE) {
		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
							em->block_start,
							&token);
		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
							   &token);
	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
							em->block_start -
							extent_offset, &token);
		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
							   &token);
	} else {
		/*
		 * block_start is a sentinel (>= EXTENT_MAP_LAST_BYTE), so
		 * there are no real disk blocks to point at; record zeros.
		 */
		btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
							   &token);
	}

	btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token);
	btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
	btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
	btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
						&token);
	btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
	btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
	btrfs_mark_buffer_dirty(leaf);

	btrfs_release_path(path);

	return ret;
}
3844 
/*
 * Log all extent maps on the inode's modified_extents list (the "fast
 * fsync" path).  The list is snapshotted under the extent map tree lock,
 * sorted, and each extent is logged with log_one_extent().
 *
 * Returns 0 on success, -EFBIG when there are so many modified extents
 * that a full transaction commit would be cheaper, or another negative
 * errno on failure.
 */
static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct inode *inode,
				     struct btrfs_path *path,
				     struct list_head *logged_list,
				     struct btrfs_log_ctx *ctx)
{
	struct extent_map *em, *n;
	struct list_head extents;
	struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
	u64 test_gen;
	int ret = 0;
	int num = 0;

	INIT_LIST_HEAD(&extents);

	write_lock(&tree->lock);
	test_gen = root->fs_info->last_trans_committed;

	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
		list_del_init(&em->list);

		/*
		 * Just an arbitrary number, this can be really CPU intensive
		 * once we start getting a lot of extents, and really once we
		 * have a bunch of extents we just want to commit since it will
		 * be faster.
		 */
		if (++num > 32768) {
			list_del_init(&tree->modified_extents);
			ret = -EFBIG;
			goto process;
		}

		/* Skip extents already covered by a committed transaction. */
		if (em->generation <= test_gen)
			continue;
		/* Need a ref to keep it from getting evicted from cache */
		atomic_inc(&em->refs);
		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
		list_add_tail(&em->list, &extents);
		num++;
	}

	list_sort(NULL, &extents, extent_cmp);

process:
	while (!list_empty(&extents)) {
		em = list_entry(extents.next, struct extent_map, list);

		list_del_init(&em->list);

		/*
		 * If we had an error we just need to delete everybody from our
		 * private list.
		 */
		if (ret) {
			clear_em_logging(tree, em);
			free_extent_map(em);
			continue;
		}

		/*
		 * Drop the tree lock while logging; the reference taken above
		 * (and EXTENT_FLAG_LOGGING) keeps the em alive meanwhile.
		 */
		write_unlock(&tree->lock);

		ret = log_one_extent(trans, inode, root, em, path, logged_list,
				     ctx);
		write_lock(&tree->lock);
		clear_em_logging(tree, em);
		free_extent_map(em);
	}
	WARN_ON(!list_empty(&extents));
	write_unlock(&tree->lock);

	btrfs_release_path(path);
	return ret;
}
3920 
3921 static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
3922                              struct btrfs_path *path, u64 *size_ret)
3923 {
3924         struct btrfs_key key;
3925         int ret;
3926 
3927         key.objectid = btrfs_ino(inode);
3928         key.type = BTRFS_INODE_ITEM_KEY;
3929         key.offset = 0;
3930 
3931         ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
3932         if (ret < 0) {
3933                 return ret;
3934         } else if (ret > 0) {
3935                 *size_ret = i_size_read(inode);
3936         } else {
3937                 struct btrfs_inode_item *item;
3938 
3939                 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3940                                       struct btrfs_inode_item);
3941                 *size_ret = btrfs_inode_size(path->nodes[0], item);
3942         }
3943 
3944         btrfs_release_path(path);
3945         return 0;
3946 }
3947 
/* log a single inode in the tree log.
 * At least one parent directory for this inode must exist in the tree
 * or be logged already.
 *
 * Any items from this inode changed by the current transaction are copied
 * to the log tree.  An extra reference is taken on any extents in this
 * file, allowing us to avoid a whole pile of corner cases around logging
 * blocks that have been removed from the tree.
 *
 * See LOG_INODE_ALL and related defines for a description of what inode_only
 * does.
 *
 * This handles both files and directories.
 */
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct inode *inode,
			   int inode_only,
			   const loff_t start,
			   const loff_t end,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_path *path;
	struct btrfs_path *dst_path;
	struct btrfs_key min_key;
	struct btrfs_key max_key;
	struct btrfs_root *log = root->log_root;
	struct extent_buffer *src = NULL;
	LIST_HEAD(logged_list);
	u64 last_extent = 0;
	int err = 0;
	int ret;
	int nritems;
	int ins_start_slot = 0;
	int ins_nr;
	bool fast_search = false;
	u64 ino = btrfs_ino(inode);
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	u64 logged_isize = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	dst_path = btrfs_alloc_path();
	if (!dst_path) {
		btrfs_free_path(path);
		return -ENOMEM;
	}

	/* Scan the fs tree for this inode's items, from the inode item on. */
	min_key.objectid = ino;
	min_key.type = BTRFS_INODE_ITEM_KEY;
	min_key.offset = 0;

	max_key.objectid = ino;


	/* today the code can only do partial logging of directories */
	if (S_ISDIR(inode->i_mode) ||
	    (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
		       &BTRFS_I(inode)->runtime_flags) &&
	     inode_only == LOG_INODE_EXISTS))
		max_key.type = BTRFS_XATTR_ITEM_KEY;
	else
		max_key.type = (u8)-1;
	max_key.offset = (u64)-1;

	/* Only run delayed items if we are a dir or a new file */
	if (S_ISDIR(inode->i_mode) ||
	    BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
		ret = btrfs_commit_inode_delayed_items(trans, inode);
		if (ret) {
			btrfs_free_path(path);
			btrfs_free_path(dst_path);
			return ret;
		}
	}

	mutex_lock(&BTRFS_I(inode)->log_mutex);

	btrfs_get_logged_extents(inode, &logged_list, start, end);

	/*
	 * a brute force approach to making sure we get the most uptodate
	 * copies of everything.
	 */
	if (S_ISDIR(inode->i_mode)) {
		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;

		if (inode_only == LOG_INODE_EXISTS)
			max_key_type = BTRFS_XATTR_ITEM_KEY;
		ret = drop_objectid_items(trans, log, path, ino, max_key_type);
	} else {
		if (inode_only == LOG_INODE_EXISTS) {
			/*
			 * Make sure the new inode item we write to the log has
			 * the same isize as the current one (if it exists).
			 * This is necessary to prevent data loss after log
			 * replay, and also to prevent doing a wrong expanding
			 * truncate - for e.g. create file, write 4K into offset
			 * 0, fsync, write 4K into offset 4096, add hard link,
			 * fsync some other file (to sync log), power fail - if
			 * we use the inode's current i_size, after log replay
			 * we get a 8Kb file, with the last 4Kb extent as a hole
			 * (zeroes), as if an expanding truncate happened,
			 * instead of getting a file of 4Kb only.
			 */
			err = logged_inode_size(log, inode, path,
						&logged_isize);
			if (err)
				goto out_unlock;
		}
		if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
				       &BTRFS_I(inode)->runtime_flags)) {
			/* Full sync requested: wipe the inode's log items. */
			clear_bit(BTRFS_INODE_COPY_EVERYTHING,
				  &BTRFS_I(inode)->runtime_flags);
			ret = btrfs_truncate_inode_items(trans, log,
							 inode, 0, 0);
		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
					      &BTRFS_I(inode)->runtime_flags) ||
			   inode_only == LOG_INODE_EXISTS) {
			if (inode_only == LOG_INODE_ALL)
				fast_search = true;
			max_key.type = BTRFS_XATTR_ITEM_KEY;
			ret = drop_objectid_items(trans, log, path, ino,
						  max_key.type);
		} else {
			if (inode_only == LOG_INODE_ALL)
				fast_search = true;
			/* Only the inode item changed; log it and skip the scan. */
			ret = log_inode_item(trans, log, dst_path, inode);
			if (ret) {
				err = ret;
				goto out_unlock;
			}
			goto log_extents;
		}

	}
	if (ret) {
		err = ret;
		goto out_unlock;
	}

	/*
	 * Walk the inode's items changed in this transaction, batching
	 * consecutive leaf slots into single copy_items() calls.
	 */
	while (1) {
		ins_nr = 0;
		ret = btrfs_search_forward(root, &min_key,
					   path, trans->transid);
		if (ret != 0)
			break;
again:
		/* note, ins_nr might be > 0 here, cleanup outside the loop */
		if (min_key.objectid != ino)
			break;
		if (min_key.type > max_key.type)
			break;

		src = path->nodes[0];
		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
			/* Contiguous with the pending batch; extend it. */
			ins_nr++;
			goto next_slot;
		} else if (!ins_nr) {
			ins_start_slot = path->slots[0];
			ins_nr = 1;
			goto next_slot;
		}

		/* Batch broke; flush the pending run of items to the log. */
		ret = copy_items(trans, inode, dst_path, path, &last_extent,
				 ins_start_slot, ins_nr, inode_only,
				 logged_isize);
		if (ret < 0) {
			err = ret;
			goto out_unlock;
		}
		if (ret) {
			ins_nr = 0;
			btrfs_release_path(path);
			continue;
		}
		ins_nr = 1;
		ins_start_slot = path->slots[0];
next_slot:

		nritems = btrfs_header_nritems(path->nodes[0]);
		path->slots[0]++;
		if (path->slots[0] < nritems) {
			btrfs_item_key_to_cpu(path->nodes[0], &min_key,
					      path->slots[0]);
			goto again;
		}
		/* End of leaf; flush any pending batch before moving on. */
		if (ins_nr) {
			ret = copy_items(trans, inode, dst_path, path,
					 &last_extent, ins_start_slot,
					 ins_nr, inode_only, logged_isize);
			if (ret < 0) {
				err = ret;
				goto out_unlock;
			}
			ret = 0;
			ins_nr = 0;
		}
		btrfs_release_path(path);

		/* Advance the search key past the last item we processed. */
		if (min_key.offset < (u64)-1) {
			min_key.offset++;
		} else if (min_key.type < max_key.type) {
			min_key.type++;
			min_key.offset = 0;
		} else {
			break;
		}
	}
	if (ins_nr) {
		ret = copy_items(trans, inode, dst_path, path, &last_extent,
				 ins_start_slot, ins_nr, inode_only,
				 logged_isize);
		if (ret < 0) {
			err = ret;
			goto out_unlock;
		}
		ret = 0;
		ins_nr = 0;
	}

log_extents:
	btrfs_release_path(path);
	btrfs_release_path(dst_path);
	if (fast_search) {
		/*
		 * Some ordered extents started by fsync might have completed
		 * before we collected the ordered extents in logged_list, which
		 * means they're gone, not in our logged_list nor in the inode's
		 * ordered tree. We want the application/user space to know an
		 * error happened while attempting to persist file data so that
		 * it can take proper action. If such error happened, we leave
		 * without writing to the log tree and the fsync must report the
		 * file data write error and not commit the current transaction.
		 */
		err = btrfs_inode_check_errors(inode);
		if (err) {
			ctx->io_err = err;
			goto out_unlock;
		}
		ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
						&logged_list, ctx);
		if (ret) {
			err = ret;
			goto out_unlock;
		}
	} else if (inode_only == LOG_INODE_ALL) {
		struct extent_map *em, *n;

		write_lock(&em_tree->lock);
		/*
		 * We can't just remove every em if we're called for a ranged
		 * fsync - that is, one that doesn't cover the whole possible
		 * file range (0 to LLONG_MAX). This is because we can have
		 * em's that fall outside the range we're logging and therefore
		 * their ordered operations haven't completed yet
		 * (btrfs_finish_ordered_io() not invoked yet). This means we
		 * didn't get their respective file extent item in the fs/subvol
		 * tree yet, and need to let the next fast fsync (one which
		 * consults the list of modified extent maps) find the em so
		 * that it logs a matching file extent item and waits for the
		 * respective ordered operation to complete (if it's still
		 * running).
		 *
		 * Removing every em outside the range we're logging would make
		 * the next fast fsync not log their matching file extent items,
		 * therefore making us lose data after a log replay.
		 */
		list_for_each_entry_safe(em, n, &em_tree->modified_extents,
					 list) {
			const u64 mod_end = em->mod_start + em->mod_len - 1;

			if (em->mod_start >= start && mod_end <= end)
				list_del_init(&em->list);
		}
		write_unlock(&em_tree->lock);
	}

	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
		ret = log_directory_changes(trans, root, inode, path, dst_path);
		if (ret) {
			err = ret;
			goto out_unlock;
		}
	}

	BTRFS_I(inode)->logged_trans = trans->transid;
	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
out_unlock:
	if (unlikely(err))
		btrfs_put_logged_extents(&logged_list);
	else
		btrfs_submit_logged_extents(&logged_list, log);
	mutex_unlock(&BTRFS_I(inode)->log_mutex);

	btrfs_free_path(path);
	btrfs_free_path(dst_path);
	return err;
}
4247 
/*
 * follow the dentry parent pointers up the chain and see if any
 * of the directories in it require a full commit before they can
 * be logged.  Returns zero if nothing special needs to be done or 1 if
 * a full commit is required.
 */
static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
					       struct inode *inode,
					       struct dentry *parent,
					       struct super_block *sb,
					       u64 last_committed)
{
	int ret = 0;
	struct btrfs_root *root;
	struct dentry *old_parent = NULL;
	struct inode *orig_inode = inode;

	/*
	 * for regular files, if its inode is already on disk, we don't
	 * have to worry about the parents at all.  This is because
	 * we can use the last_unlink_trans field to record renames
	 * and other fun in this file.
	 */
	if (S_ISREG(inode->i_mode) &&
	    BTRFS_I(inode)->generation <= last_committed &&
	    BTRFS_I(inode)->last_unlink_trans <= last_committed)
			goto out;

	/* For non-directories, start the walk from the parent directory. */
	if (!S_ISDIR(inode->i_mode)) {
		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
			goto out;
		inode = parent->d_inode;
	}

	while (1) {
		/*
		 * If we are logging a directory then we start with our inode,
		 * not our parents inode, so we need to skip setting the
		 * logged_trans so that further down in the log code we don't
		 * think this inode has already been logged.
		 */
		if (inode != orig_inode)
			BTRFS_I(inode)->logged_trans = trans->transid;
		smp_mb();

		if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
			root = BTRFS_I(inode)->root;

			/*
			 * make sure any commits to the log are forced
			 * to be full commits
			 */
			btrfs_set_log_full_commit(root->fs_info, trans);
			ret = 1;
			break;
		}

		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
			break;

		if (IS_ROOT(parent))
			break;

		/* Walk up one level, dropping the ref on the previous parent. */
		parent = dget_parent(parent);
		dput(old_parent);
		old_parent = parent;
		inode = parent->d_inode;

	}
	dput(old_parent);
out:
	return ret;
}
4321 
/*
 * helper function around btrfs_log_inode to make sure newly created
 * parent directories also end up in the log.  A minimal inode and backref
 * only logging is done of any parent directories that are older than
 * the last committed transaction
 */
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root, struct inode *inode,
				  struct dentry *parent,
				  const loff_t start,
				  const loff_t end,
				  int exists_only,
				  struct btrfs_log_ctx *ctx)
{
	int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
	struct super_block *sb;
	struct dentry *old_parent = NULL;
	int ret = 0;
	u64 last_committed = root->fs_info->last_trans_committed;

	sb = inode->i_sb;

	/* Tree log disabled by mount option; force a transaction commit. */
	if (btrfs_test_opt(root, NOTREELOG)) {
		ret = 1;
		goto end_no_trans;
	}

	/*
	 * The prev transaction commit doesn't complete, we need do
	 * full commit by ourselves.
	 */
	if (root->fs_info->last_trans_log_full_commit >
	    root->fs_info->last_trans_committed) {
		ret = 1;
		goto end_no_trans;
	}

	if (root != BTRFS_I(inode)->root ||
	    btrfs_root_refs(&root->root_item) == 0) {
		ret = 1;
		goto end_no_trans;
	}

	ret = check_parent_dirs_for_sync(trans, inode, parent,
					 sb, last_committed);
	if (ret)
		goto end_no_trans;

	if (btrfs_inode_in_log(inode, trans->transid)) {
		/* Already logged in this transaction; no log sync needed. */
		ret = BTRFS_NO_LOG_SYNC;
		goto end_no_trans;
	}

	ret = start_log_trans(trans, root, ctx);
	if (ret)
		goto end_no_trans;

	ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
	if (ret)
		goto end_trans;

	/*
	 * for regular files, if its inode is already on disk, we don't
	 * have to worry about the parents at all.  This is because
	 * we can use the last_unlink_trans field to record renames
	 * and other fun in this file.
	 */
	if (S_ISREG(inode->i_mode) &&
	    BTRFS_I(inode)->generation <= last_committed &&
	    BTRFS_I(inode)->last_unlink_trans <= last_committed) {
		ret = 0;
		goto end_trans;
	}

	/* Walk up the parents, logging any created in this transaction. */
	inode_only = LOG_INODE_EXISTS;
	while (1) {
		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
			break;

		inode = parent->d_inode;
		if (root != BTRFS_I(inode)->root)
			break;

		if (BTRFS_I(inode)->generation >
		    root->fs_info->last_trans_committed) {
			ret = btrfs_log_inode(trans, root, inode, inode_only,
					      0, LLONG_MAX, ctx);
			if (ret)
				goto end_trans;
		}
		if (IS_ROOT(parent))
			break;

		parent = dget_parent(parent);
		dput(old_parent);
		old_parent = parent;
	}
	ret = 0;
end_trans:
	dput(old_parent);
	if (ret < 0) {
		/* Any error forces a full transaction commit. */
		btrfs_set_log_full_commit(root->fs_info, trans);
		ret = 1;
	}

	if (ret)
		btrfs_remove_log_ctx(root, ctx);
	btrfs_end_log_trans(root);
end_no_trans:
	return ret;
}
4433 
4434 /*
4435  * it is not safe to log dentry if the chunk root has added new
4436  * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
4437  * If this returns 1, you must commit the transaction to safely get your
4438  * data on disk.
4439  */
4440 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
4441                           struct btrfs_root *root, struct dentry *dentry,
4442                           const loff_t start,
4443                           const loff_t end,
4444                           struct btrfs_log_ctx *ctx)
4445 {
4446         struct dentry *parent = dget_parent(dentry);
4447         int ret;
4448 
4449         ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
4450                                      start, end, 0, ctx);
4451         dput(parent);
4452 
4453         return ret;
4454 }
4455 
/*
 * should be called during mount to recover and replay any log trees
 * from the FS
 */
int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_trans_handle *trans;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_key tmp_key;
        struct btrfs_root *log;
        struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
        struct walk_control wc = {
                .process_func = process_one_buffer,
                .stage = 0, /* stage 0: first walk only pins blocks */
        };

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        fs_info->log_root_recovering = 1;

        trans = btrfs_start_transaction(fs_info->tree_root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto error;
        }

        wc.trans = trans;
        wc.pin = 1;

        /* pin the blocks of the log root tree itself before any replay */
        ret = walk_log_tree(trans, log_root_tree, &wc);
        if (ret) {
                btrfs_error(fs_info, ret, "Failed to pin buffers while "
                            "recovering log root tree.");
                goto error;
        }

again:
        /*
         * scan all per-subvolume log roots, walking backwards from the
         * highest possible key offset down to 0
         */
        key.objectid = BTRFS_TREE_LOG_OBJECTID;
        key.offset = (u64)-1;
        key.type = BTRFS_ROOT_ITEM_KEY;

        while (1) {
                ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);

                if (ret < 0) {
                        btrfs_error(fs_info, ret,
                                    "Couldn't find tree log root.");
                        goto error;
                }
                if (ret > 0) {
                        /* exact key not found; step back to previous item */
                        if (path->slots[0] == 0)
                                break;
                        path->slots[0]--;
                }
                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                                      path->slots[0]);
                btrfs_release_path(path);
                if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
                        break;

                log = btrfs_read_fs_root(log_root_tree, &found_key);
                if (IS_ERR(log)) {
                        ret = PTR_ERR(log);
                        btrfs_error(fs_info, ret,
                                    "Couldn't read tree log root.");
                        goto error;
                }

                /*
                 * the log root item's offset names the tree it belongs
                 * to; look that tree up as the replay destination
                 */
                tmp_key.objectid = found_key.offset;
                tmp_key.type = BTRFS_ROOT_ITEM_KEY;
                tmp_key.offset = (u64)-1;

                wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
                if (IS_ERR(wc.replay_dest)) {
                        ret = PTR_ERR(wc.replay_dest);
                        free_extent_buffer(log->node);
                        free_extent_buffer(log->commit_root);
                        kfree(log);
                        btrfs_error(fs_info, ret, "Couldn't read target root "
                                    "for tree log recovery.");
                        goto error;
                }

                wc.replay_dest->log_root = log;
                btrfs_record_root_in_trans(trans, wc.replay_dest);
                ret = walk_log_tree(trans, log, &wc);

                /* link counts are only fixed up once everything is replayed */
                if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
                        ret = fixup_inode_link_counts(trans, wc.replay_dest,
                                                      path);
                }

                /* continue the backwards scan just below this log root */
                key.offset = found_key.offset - 1;
                wc.replay_dest->log_root = NULL;
                free_extent_buffer(log->node);
                free_extent_buffer(log->commit_root);
                kfree(log);

                if (ret)
                        goto error;

                if (found_key.offset == 0)
                        break;
        }
        btrfs_release_path(path);

        /* step one is to pin it all, step two is to replay just inodes */
        if (wc.pin) {
                wc.pin = 0;
                wc.process_func = replay_one_buffer;
                wc.stage = LOG_WALK_REPLAY_INODES;
                goto again;
        }
        /* step three is to replay everything */
        if (wc.stage < LOG_WALK_REPLAY_ALL) {
                wc.stage++;
                goto again;
        }

        btrfs_free_path(path);

        /* step 4: commit the transaction, which also unpins the blocks */
        ret = btrfs_commit_transaction(trans, fs_info->tree_root);
        if (ret)
                return ret;

        free_extent_buffer(log_root_tree->node);
        log_root_tree->log_root = NULL;
        fs_info->log_root_recovering = 0;
        kfree(log_root_tree);

        return 0;
error:
        if (wc.trans)
                btrfs_end_transaction(wc.trans, fs_info->tree_root);
        btrfs_free_path(path);
        return ret;
}
4599 
4600 /*
4601  * there are some corner cases where we want to force a full
4602  * commit instead of allowing a directory to be logged.
4603  *
 * They revolve around files that were unlinked from the directory, and
4605  * this function updates the parent directory so that a full commit is
4606  * properly done if it is fsync'd later after the unlinks are done.
4607  */
4608 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
4609                              struct inode *dir, struct inode *inode,
4610                              int for_rename)
4611 {
4612         /*
4613          * when we're logging a file, if it hasn't been renamed
4614          * or unlinked, and its inode is fully committed on disk,
4615          * we don't have to worry about walking up the directory chain
4616          * to log its parents.
4617          *
4618          * So, we use the last_unlink_trans field to put this transid
4619          * into the file.  When the file is logged we check it and
4620          * don't log the parents if the file is fully on disk.
4621          */
4622         if (S_ISREG(inode->i_mode))
4623                 BTRFS_I(inode)->last_unlink_trans = trans->transid;
4624 
4625         /*
4626          * if this directory was already logged any new
4627          * names for this file/dir will get recorded
4628          */
4629         smp_mb();
4630         if (BTRFS_I(dir)->logged_trans == trans->transid)
4631                 return;
4632 
4633         /*
4634          * if the inode we're about to unlink was logged,
4635          * the log will be properly updated for any new names
4636          */
4637         if (BTRFS_I(inode)->logged_trans == trans->transid)
4638                 return;
4639 
4640         /*
4641          * when renaming files across directories, if the directory
4642          * there we're unlinking from gets fsync'd later on, there's
4643          * no way to find the destination directory later and fsync it
4644          * properly.  So, we have to be conservative and force commits
4645          * so the new name gets discovered.
4646          */
4647         if (for_rename)
4648                 goto record;
4649 
4650         /* we can safely do the unlink without any special recording */
4651         return;
4652 
4653 record:
4654         BTRFS_I(dir)->last_unlink_trans = trans->transid;
4655 }
4656 
4657 /*
4658  * Call this after adding a new name for a file and it will properly
4659  * update the log to reflect the new name.
4660  *
4661  * It will return zero if all goes well, and it will return 1 if a
4662  * full transaction commit is required.
4663  */
4664 int btrfs_log_new_name(struct btrfs_trans_handle *trans,
4665                         struct inode *inode, struct inode *old_dir,
4666                         struct dentry *parent)
4667 {
4668         struct btrfs_root * root = BTRFS_I(inode)->root;
4669 
4670         /*
4671          * this will force the logging code to walk the dentry chain
4672          * up for the file
4673          */
4674         if (S_ISREG(inode->i_mode))
4675                 BTRFS_I(inode)->last_unlink_trans = trans->transid;
4676 
4677         /*
4678          * if this inode hasn't been logged and directory we're renaming it
4679          * from hasn't been logged, we don't need to log it
4680          */
4681         if (BTRFS_I(inode)->logged_trans <=
4682             root->fs_info->last_trans_committed &&
4683             (!old_dir || BTRFS_I(old_dir)->logged_trans <=
4684                     root->fs_info->last_trans_committed))
4685                 return 0;
4686 
4687         return btrfs_log_inode_parent(trans, root, inode, parent, 0,
4688                                       LLONG_MAX, 1, NULL);
4689 }
4690 
4691 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | Wiki (Japanese) | Wiki (English) | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

osdn.jp