~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/fs/btrfs/ctree.c

Version: ~ [ linux-5.8-rc4 ] ~ [ linux-5.7.7 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.50 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.131 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.187 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.229 ] ~ [ linux-4.8.17 ] ~ [ linux-4.7.10 ] ~ [ linux-4.6.7 ] ~ [ linux-4.5.7 ] ~ [ linux-4.4.229 ] ~ [ linux-4.3.6 ] ~ [ linux-4.2.8 ] ~ [ linux-4.1.52 ] ~ [ linux-4.0.9 ] ~ [ linux-3.19.8 ] ~ [ linux-3.18.140 ] ~ [ linux-3.17.8 ] ~ [ linux-3.16.85 ] ~ [ linux-3.15.10 ] ~ [ linux-3.14.79 ] ~ [ linux-3.13.11 ] ~ [ linux-3.12.74 ] ~ [ linux-3.11.10 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.5 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 // SPDX-License-Identifier: GPL-2.0
  2 /*
  3  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
  4  */
  5 
  6 #include <linux/sched.h>
  7 #include <linux/slab.h>
  8 #include <linux/rbtree.h>
  9 #include <linux/mm.h>
 10 #include "ctree.h"
 11 #include "disk-io.h"
 12 #include "transaction.h"
 13 #include "print-tree.h"
 14 #include "locking.h"
 15 
 16 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 17                       *root, struct btrfs_path *path, int level);
 18 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 19                       const struct btrfs_key *ins_key, struct btrfs_path *path,
 20                       int data_size, int extend);
 21 static int push_node_left(struct btrfs_trans_handle *trans,
 22                           struct btrfs_fs_info *fs_info,
 23                           struct extent_buffer *dst,
 24                           struct extent_buffer *src, int empty);
 25 static int balance_node_right(struct btrfs_trans_handle *trans,
 26                               struct btrfs_fs_info *fs_info,
 27                               struct extent_buffer *dst_buf,
 28                               struct extent_buffer *src_buf);
 29 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
 30                     int level, int slot);
 31 
/*
 * Allocate a zeroed btrfs_path from the dedicated slab cache.
 * Returns NULL on allocation failure.  Free with btrfs_free_path().
 */
struct btrfs_path *btrfs_alloc_path(void)
{
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}
 36 
/*
 * set all locked nodes in the path to blocking locks.  This should
 * be done before scheduling
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
	int i;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		/* Skip levels with no node or no lock held. */
		if (!p->nodes[i] || !p->locks[i])
			continue;
		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
		/*
		 * Record the new (blocking) lock type so it can later be
		 * matched/undone, e.g. by btrfs_clear_path_blocking().
		 */
		if (p->locks[i] == BTRFS_READ_LOCK)
			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
		else if (p->locks[i] == BTRFS_WRITE_LOCK)
			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
	}
}
 54 
/*
 * reset all the locked nodes in the path to spinning locks.
 *
 * held is used to keep lockdep happy, when lockdep is enabled
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely use NULL
 * for held
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held, int held_rw)
{
	int i;

	if (held) {
		/* Make 'held' blocking first so lockdep sees a sane state. */
		btrfs_set_lock_blocking_rw(held, held_rw);
		if (held_rw == BTRFS_WRITE_LOCK)
			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
		else if (held_rw == BTRFS_READ_LOCK)
			held_rw = BTRFS_READ_LOCK_BLOCKING;
	}
	btrfs_set_path_blocking(p);

	/* Walk from the highest level (root end) down to the leaves. */
	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i]) {
			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
			/* Record the spinning lock type again. */
			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
				p->locks[i] = BTRFS_WRITE_LOCK;
			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
				p->locks[i] = BTRFS_READ_LOCK;
		}
	}

	if (held)
		btrfs_clear_lock_blocking_rw(held, held_rw);
}
 90 
 91 /* this also releases the path */
 92 void btrfs_free_path(struct btrfs_path *p)
 93 {
 94         if (!p)
 95                 return;
 96         btrfs_release_path(p);
 97         kmem_cache_free(btrfs_path_cachep, p);
 98 }
 99 
/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that have no locks or extent buffers
 * held.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		/* Unlock first (read or write, per locks[i]), then unref. */
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}
122 
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		/* refs hit zero under us: wait out the grace period, retry. */
		synchronize_rcu();
	}
	return eb;
}
156 
157 /* loop around taking references on and locking the root node of the
158  * tree until you end up with a lock on the root.  A locked buffer
159  * is returned, with a reference held.
160  */
161 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
162 {
163         struct extent_buffer *eb;
164 
165         while (1) {
166                 eb = btrfs_root_node(root);
167                 btrfs_tree_lock(eb);
168                 if (eb == root->node)
169                         break;
170                 btrfs_tree_unlock(eb);
171                 free_extent_buffer(eb);
172         }
173         return eb;
174 }
175 
176 /* loop around taking references on and locking the root node of the
177  * tree until you end up with a lock on the root.  A locked buffer
178  * is returned, with a reference held.
179  */
180 struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
181 {
182         struct extent_buffer *eb;
183 
184         while (1) {
185                 eb = btrfs_root_node(root);
186                 btrfs_tree_read_lock(eb);
187                 if (eb == root->node)
188                         break;
189                 btrfs_tree_read_unlock(eb);
190                 free_extent_buffer(eb);
191         }
192         return eb;
193 }
194 
/* cowonly root (everything not a reference counted cow subvolume), just get
 * put onto a simple dirty list.  transaction.c walks this to make sure they
 * get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	/* Unlocked fast path: already dirty, or dirty tracking disabled. */
	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	/* test_and_set so a racing caller does a single list move. */
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}
219 
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	/*
	 * For reference counted roots, warn (but carry on) if the handle's
	 * transaction is not the running one or not root's last transaction.
	 */
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	/* First key of buf; leaves store item keys, nodes store node keys. */
	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
			&disk_key, level, buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/*
	 * Clone buf into the new block, then rewrite the header fields that
	 * must describe the copy rather than the original.
	 */
	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	/* Reloc trees keep the original owner and get the RELOC flag. */
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fsid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	/*
	 * Bump references on everything the new root points to.  NOTE(review):
	 * the last argument differs for the reloc tree (1 vs 0) — presumably
	 * selects full backref mode; confirm against btrfs_inc_ref().
	 */
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
278 
/* Operation types recorded in the tree modification log. */
enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};

/* Describes the old root in a MOD_LOG_ROOT_REPLACE record. */
struct tree_mod_root {
	u64 logical;
	u8 level;
};

/*
 * A single entry in the tree mod log rb-tree
 * (fs_info::tree_mod_log), keyed on (logical, seq).
 */
struct tree_mod_elem {
	struct rb_node node;
	/* logical address of the block the record applies to */
	u64 logical;
	/* sequence number, assigned in __tree_mod_log_insert() */
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};
319 
/*
 * Pull a new tree mod seq number for our operation.
 *
 * This is a bare atomic increment; it is called both with and without
 * fs_info::tree_mod_log_lock held.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}
327 
/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure to set elem->seq to zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	/* Lock order: tree_mod_log_lock first, then tree_mod_seq_lock. */
	write_lock(&fs_info->tree_mod_log_lock);
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!elem->seq) {
		/* First call for this elem: register it as a blocker. */
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}
350 
351 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
352                             struct seq_list *elem)
353 {
354         struct rb_root *tm_root;
355         struct rb_node *node;
356         struct rb_node *next;
357         struct seq_list *cur_elem;
358         struct tree_mod_elem *tm;
359         u64 min_seq = (u64)-1;
360         u64 seq_putting = elem->seq;
361 
362         if (!seq_putting)
363                 return;
364 
365         spin_lock(&fs_info->tree_mod_seq_lock);
366         list_del(&elem->list);
367         elem->seq = 0;
368 
369         list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
370                 if (cur_elem->seq < min_seq) {
371                         if (seq_putting > cur_elem->seq) {
372                                 /*
373                                  * blocker with lower sequence number exists, we
374                                  * cannot remove anything from the log
375                                  */
376                                 spin_unlock(&fs_info->tree_mod_seq_lock);
377                                 return;
378                         }
379                         min_seq = cur_elem->seq;
380                 }
381         }
382         spin_unlock(&fs_info->tree_mod_seq_lock);
383 
384         /*
385          * anything that's lower than the lowest existing (read: blocked)
386          * sequence number can be removed from the tree.
387          */
388         write_lock(&fs_info->tree_mod_log_lock);
389         tm_root = &fs_info->tree_mod_log;
390         for (node = rb_first(tm_root); node; node = next) {
391                 next = rb_next(node);
392                 tm = rb_entry(node, struct tree_mod_elem, node);
393                 if (tm->seq > min_seq)
394                         continue;
395                 rb_erase(node, tm_root);
396                 kfree(tm);
397         }
398         write_unlock(&fs_info->tree_mod_log_lock);
399 }
400 
/*
 * key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
 *
 * Note: must be called with write lock for fs_info::tree_mod_log_lock.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	/* The sequence number is assigned here, under the write lock. */
	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		/*
		 * Note the inverted comparisons versus the usual rb-tree
		 * insert pattern: smaller (logical, seq) keys descend to the
		 * right, so the tree is kept in descending key order.
		 * __tree_mod_log_search() walks it the same way.
		 */
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}
442 
443 /*
444  * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
445  * returns zero with the tree_mod_log_lock acquired. The caller must hold
446  * this until all tree mod log insertions are recorded in the rb tree and then
447  * write unlock fs_info::tree_mod_log_lock.
448  */
449 static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
450                                     struct extent_buffer *eb) {
451         smp_mb();
452         if (list_empty(&(fs_info)->tree_mod_seq_list))
453                 return 1;
454         if (eb && btrfs_header_level(eb) == 0)
455                 return 1;
456 
457         write_lock(&fs_info->tree_mod_log_lock);
458         if (list_empty(&(fs_info)->tree_mod_seq_list)) {
459                 write_unlock(&fs_info->tree_mod_log_lock);
460                 return 1;
461         }
462 
463         return 0;
464 }
465 
466 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
467 static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
468                                     struct extent_buffer *eb)
469 {
470         smp_mb();
471         if (list_empty(&(fs_info)->tree_mod_seq_list))
472                 return 0;
473         if (eb && btrfs_header_level(eb) == 0)
474                 return 0;
475 
476         return 1;
477 }
478 
/*
 * Allocate and fill a tree mod log element describing operation @op on
 * @slot of @eb.  Returns NULL if the allocation fails.
 */
static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
		    enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	/* For anything but an ADD, record the old key and block pointer. */
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	/* Mark as not-inserted; error paths test this with RB_EMPTY_NODE. */
	RB_CLEAR_NODE(&tm->node);

	return tm;
}
501 
/*
 * Record a single-key operation (@op) on @slot of @eb in the mod log.
 * Returns 0 on success or when no logging is needed, negative errno on
 * failure.
 */
static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
		enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	/* Unlocked check first, so the common case allocates nothing. */
	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	/* Locked recheck; the last blocker may have gone away meanwhile. */
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	/* tree_mod_dont_log() returned 0 with tree_mod_log_lock held. */
	ret = __tree_mod_log_insert(eb->fs_info, tm);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}
527 
/*
 * Record a move of @nr_items keys from @src_slot to @dst_slot within @eb.
 * Keys that the move overwrites (only possible when moving towards the
 * front of the buffer, i.e. dst_slot < src_slot) are additionally logged
 * as removals.  Returns 0 on success, negative errno on failure.
 */
static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
		int dst_slot, int src_slot, int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;	/* set once tree_mod_log_lock is write-held */

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	/* Allocate everything up front, before taking the lock. */
	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	/* Only the slots that will be overwritten need removal records. */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we override something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;
free_tms:
	/*
	 * Unwind: erase any elements already inserted into the rb-tree
	 * (RB_EMPTY_NODE is false only for those), then free everything.
	 * kfree(NULL) is a no-op for slots never allocated.
	 */
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}
600 
/*
 * Insert all elements of @tm_list into the mod log.  On failure, the
 * insertions already done by this call are erased again; the elements
 * themselves are not freed (that is left to the caller).
 * Caller must hold fs_info::tree_mod_log_lock for writing.
 */
static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	/* Insert from the highest slot down to slot 0. */
	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			/* Unwind the insertions done so far (j > i). */
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}
621 
/*
 * Record the replacement of tree root @old_root by @new_root.  When
 * @log_removal is set and the old root is a node (not a leaf), the removal
 * of every key in the old root is logged as well.
 * Returns 0 on success, negative errno on failure.
 */
static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
			 struct extent_buffer *new_root, int log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* Leaves (level 0) are not logged, only nodes. */
	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	/* Root replace records are keyed on the *new* root's address. */
	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	/* tree_mod_log_lock is now held for writing. */
	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	/* Nothing we allocated is in the rb-tree at this point; just free. */
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}
690 
/*
 * Search the mod log for an element of block @start with seq >= @min_seq.
 * With @smallest set, the element with the smallest such sequence number
 * is returned, otherwise the one with the largest.  Returns NULL if no
 * matching element exists.
 */
static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	/*
	 * The tree uses inverted ordering (see __tree_mod_log_insert):
	 * larger keys are to the left.
	 */
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			/* Exact match on min_seq; cannot do better. */
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}
732 
/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.  Returns NULL if there is
 * no such element.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}
744 
/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.  Returns NULL if
 * there is no such element.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}
755 
/*
 * Log the copy of @nr_items keys from @src (starting at @src_offset) into
 * @dst (starting at @dst_offset): one removal record per source slot plus
 * one add record per destination slot.
 * Returns 0 on success, negative errno on failure.
 */
static noinline int
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	/* First half of tm_list holds the adds, second half the removes. */
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;	/* set once tree_mod_log_lock is write-held */

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* Leaf-to-leaf copies are not logged. */
	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	/* Allocate all records before taking the lock. */
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	/* Unwind: erase anything already inserted, then free everything. */
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}
826 
/*
 * Log the removal of every key of node @eb, which is about to be freed.
 * Leaves are not logged.  Returns 0 on success (or when no logging is
 * needed), negative errno on failure.
 */
static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (btrfs_header_level(eb) == 0)
		return 0;

	if (!tree_mod_need_log(eb->fs_info, NULL))
		return 0;

	/* Allocate one removal record per key before taking the lock. */
	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;

	/* __tree_mod_log_free_eb() unwinds its own insertions on failure. */
	ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}
872 
873 /*
874  * check if the tree block can be shared by multiple trees
875  */
876 int btrfs_block_can_be_shared(struct btrfs_root *root,
877                               struct extent_buffer *buf)
878 {
879         /*
880          * Tree blocks not in reference counted trees and tree roots
881          * are never shared. If a block was allocated after the last
882          * snapshot and the block was not allocated by tree relocation,
883          * we know the block is not shared.
884          */
885         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
886             buf != root->node && buf != root->commit_root &&
887             (btrfs_header_generation(buf) <=
888              btrfs_root_last_snapshot(&root->root_item) ||
889              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
890                 return 1;
891 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
892         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
893             btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
894                 return 1;
895 #endif
896         return 0;
897 }
898 
/*
 * Update extent backrefs when @buf is COWed into @cow.
 *
 * @last_ref is set to 1 when @buf drops its last reference as a result of
 * the COW, so the caller can free the old block.  Returns 0 on success or
 * a negative errno from the extent/backref helpers.
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		/* Possibly shared: read the real refcount and flags. */
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			/*
			 * A reachable block with zero references means the
			 * extent tree is corrupted; force read-only.
			 */
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		/* Not shareable: exactly one reference, ours. */
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	/* Relocation-tree blocks must always carry full backrefs. */
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			/* Convert the old block to full backrefs. */
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			/* Reference the new copy on behalf of this root. */
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			/* Persist the FULL_BACKREF flag on the old extent. */
			ret = btrfs_set_disk_extent_flags(trans, fs_info,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		/* Last reference: move the refs from buf to cow. */
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		clean_tree_block(fs_info, buf);
		*last_ref = 1;
	}
	return 0;
}
1009 
1010 /*
1011  * does the dirty work in cow of a single block.  The parent block (if
1012  * supplied) is updated to point to the new cow copy.  The new buffer is marked
1013  * dirty and returned locked.  If you modify the block it needs to be marked
1014  * dirty again.
1015  *
1016  * search_start -- an allocation hint for the new block
1017  *
1018  * empty_size -- a hint that you plan on doing more cow.  This is the size in
1019  * bytes the allocator should try to find free next to the block it returns.
1020  * This is just a hint and may be ignored by the allocator.
1021  */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	/*
	 * When the caller asked for in-place replacement (*cow_ret == buf)
	 * we must drop buf's lock before returning.
	 */
	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	/* First key of the block, used as an allocation hint key. */
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	/*
	 * If we are COWing a node/leaf from the extent, chunk or device trees,
	 * make sure that we do not finish block group creation of pending block
	 * groups. We do this to avoid a deadlock.
	 * COWing can result in allocation of a new chunk, and flushing pending
	 * block groups (btrfs_create_pending_block_groups()) can be triggered
	 * when finishing allocation of a new chunk. Creation of a pending block
	 * group modifies the extent, chunk and device trees, therefore we could
	 * deadlock with ourselves since we are holding a lock on an extent
	 * buffer that btrfs_create_pending_block_groups() may try to COW later.
	 */
	if (root == fs_info->extent_root ||
	    root == fs_info->chunk_root ||
	    root == fs_info->dev_root)
		trans->can_flush_pending_bgs = false;

	cow = btrfs_alloc_tree_block(trans, root, parent_start,
			root->root_key.objectid, &disk_key, level,
			search_start, empty_size);
	trans->can_flush_pending_bgs = true;
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	/* Copy the contents, then fix up the header for the new location. */
	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fsid);

	/* Move/convert backrefs; last_ref tells us if buf can be freed. */
	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		/* COWing the root node: swap the root pointer itself. */
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		extent_buffer_get(cow);
		ret = tree_mod_log_insert_root(root->node, cow, 1);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		/* Non-root: point the parent slot at the new copy. */
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(parent, parent_slot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			/* Log the free so old-root rewinds still work. */
			ret = tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
1150 
1151 /*
1152  * returns the logical address of the oldest predecessor of the given root.
1153  * entries older than time_seq are ignored.
1154  */
static struct tree_mod_elem *__tree_mod_log_oldest_root(
		struct extent_buffer *eb_root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	int looped = 0;

	/* time_seq == 0 means no rewinding is requested. */
	if (!time_seq)
		return NULL;

	/*
	 * the very last operation that's logged for a root is the
	 * replacement operation (if it is replaced at all). this has
	 * the logical address of the *new* root, making it the very
	 * first operation that's logged for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		/* No log entries at all for the current root: nothing to do. */
		if (!looped && !tm)
			return NULL;
		/*
		 * if there are no tree operations for the oldest root, we
		 * simply return it. this should only happen if that (old)
		 * root is at level 0.
		 */
		if (!tm)
			break;

		/*
		 * if there's an operation that's not a root replacement, we
		 * found the oldest version of our root. normally, we'll find a
		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != MOD_LOG_ROOT_REPLACE)
			break;

		/* Follow the replacement chain one step further back. */
		found = tm;
		root_logical = tm->old_root.logical;
		looped = 1;
	}

	/* if there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}
1204 
1205 /*
1206  * tm is a pointer to the first operation to rewind within eb. then, all
1207  * previous operations will be rewound (until we reach something older than
1208  * time_seq).
1209  */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		      u64 time_seq, struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	/* n tracks the item count while we undo adds/removes below. */
	n = btrfs_header_nritems(eb);
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification. as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			/* Fallthrough */
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			/* Undo a removal: restore the logged key pointer. */
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			/* Undo a replacement: put the old key pointer back. */
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			/* Undo a move by shifting the pointers back. */
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special. for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. in the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is no root. we simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		/* Entries for a different block mean we are done with eb. */
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}
1280 
1281 /*
1282  * Called with eb read locked. If the buffer cannot be rewound, the same buffer
1283  * is returned. If rewind operations happen, a fresh buffer is returned. The
1284  * returned buffer is always read-locked. If the returned buffer is not the
1285  * input buffer, the lock on the input buffer is released and the input buffer
1286  * is freed (its refcount is decremented).
1287  */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		    struct extent_buffer *eb, u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	/* time_seq == 0 means no rewinding was requested. */
	if (!time_seq)
		return eb;

	/* Leaves are not tracked by the mod log; nothing to rewind. */
	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	btrfs_set_path_blocking(path);
	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/*
		 * The block was freed after time_seq: rebuild it on a dummy
		 * buffer instead of cloning the (reused) on-disk contents.
		 */
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		/* Rewind on a private copy so the live buffer is untouched. */
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	/* Hand over: drop the input buffer's lock and reference ... */
	btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
	btrfs_tree_read_unlock_blocking(eb);
	free_extent_buffer(eb);

	/* ... and return the rewound copy referenced and read-locked. */
	extent_buffer_get(eb_rewin);
	btrfs_tree_read_lock(eb_rewin);
	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}
1342 
1343 /*
1344  * get_old_root() rewinds the state of @root's root node to the given @time_seq
1345  * value. If there are no changes, the current root->root_node is returned. If
1346  * anything changed in between, there's a fresh buffer allocated on which the
1347  * rewind operations are done. In any case, the returned buffer is read locked.
1348  * Returns NULL on error (with no locks held).
1349  */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
	/* No logged changes: the current root node is already correct. */
	if (!tm)
		return eb_root;

	if (tm->op == MOD_LOG_ROOT_REPLACE) {
		/* The root was replaced; rewind starts from the old root. */
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/*
		 * The old root block was not fully freed in the log, so its
		 * on-disk contents are still meaningful: read and clone it.
		 */
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		old = read_tree_block(fs_info, logical, 0, level, NULL);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			eb = btrfs_clone_extent_buffer(old);
			free_extent_buffer(old);
		}
	} else if (old_root) {
		/* Old root fully logged: rebuild it on a dummy buffer. */
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		/* Current root, but with changes to rewind: work on a clone. */
		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock_blocking(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	extent_buffer_get(eb);
	btrfs_tree_read_lock(eb);
	if (old_root) {
		/* Restore the header fields the old root had. */
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	if (tm)
		__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}
1423 
1424 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1425 {
1426         struct tree_mod_elem *tm;
1427         int level;
1428         struct extent_buffer *eb_root = btrfs_root_node(root);
1429 
1430         tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1431         if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1432                 level = tm->old_root.level;
1433         } else {
1434                 level = btrfs_header_level(eb_root);
1435         }
1436         free_extent_buffer(eb_root);
1437 
1438         return level;
1439 }
1440 
1441 static inline int should_cow_block(struct btrfs_trans_handle *trans,
1442                                    struct btrfs_root *root,
1443                                    struct extent_buffer *buf)
1444 {
1445         if (btrfs_is_testing(root->fs_info))
1446                 return 0;
1447 
1448         /* Ensure we can see the FORCE_COW bit */
1449         smp_mb__before_atomic();
1450 
1451         /*
1452          * We do not need to cow a block if
1453          * 1) this block is not created or changed in this transaction;
1454          * 2) this block does not belong to TREE_RELOC tree;
1455          * 3) the root is not forced COW.
1456          *
1457          * What is forced COW:
1458          *    when we create snapshot during committing the transaction,
1459          *    after we've finished coping src root, we must COW the shared
1460          *    block to ensure the metadata consistency.
1461          */
1462         if (btrfs_header_generation(buf) == trans->transid &&
1463             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1464             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1465               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1466             !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1467                 return 0;
1468         return 1;
1469 }
1470 
1471 /*
1472  * cows a single block, see __btrfs_cow_block for the real work.
1473  * This version of it has extra checks so that a block isn't COWed more than
1474  * once per transaction, as long as it hasn't been written yet
1475  */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	/* Sanity diagnostics: COWing with a stale transaction handle. */
	if (trans->transaction != fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid,
		       fs_info->running_transaction->transid);

	if (trans->transid != fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid, fs_info->generation);

	/* Already COWed in this transaction: reuse the buffer as-is. */
	if (!should_cow_block(trans, root, buf)) {
		trans->dirty = true;
		*cow_ret = buf;
		return 0;
	}

	/* Allocation hint: keep the copy within the same 1GiB chunk. */
	search_start = buf->start & ~((u64)SZ_1G - 1);

	if (parent)
		btrfs_set_lock_blocking(parent);
	btrfs_set_lock_blocking(buf);

	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
1513 
1514 /*
1515  * helper function for defrag to decide if two blocks pointed to by a
1516  * node are actually close by
1517  */
1518 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1519 {
1520         if (blocknr < other && other - (blocknr + blocksize) < 32768)
1521                 return 1;
1522         if (blocknr > other && blocknr - (other + blocksize) < 32768)
1523                 return 1;
1524         return 0;
1525 }
1526 
1527 /*
1528  * compare two keys in a memcmp fashion
1529  */
1530 static int comp_keys(const struct btrfs_disk_key *disk,
1531                      const struct btrfs_key *k2)
1532 {
1533         struct btrfs_key k1;
1534 
1535         btrfs_disk_key_to_cpu(&k1, disk);
1536 
1537         return btrfs_comp_cpu_keys(&k1, k2);
1538 }
1539 
1540 /*
1541  * same as comp_keys only with two btrfs_key's
1542  */
1543 int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
1544 {
1545         if (k1->objectid > k2->objectid)
1546                 return 1;
1547         if (k1->objectid < k2->objectid)
1548                 return -1;
1549         if (k1->type > k2->type)
1550                 return 1;
1551         if (k1->type < k2->type)
1552                 return -1;
1553         if (k1->offset > k2->offset)
1554                 return 1;
1555         if (k1->offset < k2->offset)
1556                 return -1;
1557         return 0;
1558 }
1559 
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 *
 * @trans:      running transaction (checked against fs_info below)
 * @root:       tree root being defragged
 * @parent:     node whose children get relocated; caller holds its lock
 * @start_slot: first slot in @parent to consider
 * @last_ret:   in: allocation hint from the previous call;
 *              out: start of the last block we COWed
 * @progress:   skip children whose key sorts before this
 *
 * Returns 0 on success or a negative errno from reading/COWing a child.
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;	/* allocation hint carried between calls */
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);

	/* defrag must run inside the currently running transaction */
	WARN_ON(trans->transaction != fs_info->running_transaction);
	WARN_ON(trans->transid != fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	/* a node with 0 or 1 pointers has nothing to reorder */
	if (parent_nritems <= 1)
		return 0;

	btrfs_set_lock_blocking(parent);

	for (i = start_slot; i <= end_slot; i++) {
		struct btrfs_key first_key;
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		/* skip slots whose keys sort before 'progress' */
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		btrfs_node_key_to_cpu(parent, &first_key, i);
		if (last_block == 0)
			last_block = blocknr;

		/*
		 * leave the block alone when it is already close on disk
		 * to either of its neighbours in the node
		 */
		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		/* get the child buffer, from cache when possible */
		cur = find_extent_buffer(fs_info, blocknr);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (!cur) {
				/* not cached at all: read it from disk */
				cur = read_tree_block(fs_info, blocknr, gen,
						      parent_level - 1,
						      &first_key);
				if (IS_ERR(cur)) {
					return PTR_ERR(cur);
				} else if (!extent_buffer_uptodate(cur)) {
					free_extent_buffer(cur);
					return -EIO;
				}
			} else if (!uptodate) {
				/* cached but stale: re-read the contents */
				err = btrfs_read_buffer(cur, gen,
						parent_level - 1,&first_key);
				if (err) {
					free_extent_buffer(cur);
					return err;
				}
			}
		}
		if (search_start == 0)
			search_start = last_block;

		/*
		 * COW the child with a hint near search_start; this is
		 * what actually relocates it next to its neighbours
		 */
		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		/* chain the next allocation right after this new block */
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}
1676 
/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 *
 * Returns 0 when the key was found, 1 when not found (*slot is the
 * insertion point), or a negative errno on mapping failure.
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p, int item_size,
				       const struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
	struct btrfs_disk_key *tmp = NULL;
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
	int err;

	/* only possible when max < 0, i.e. a corrupted item count */
	if (low > high) {
		btrfs_err(eb->fs_info,
		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	while (low < high) {
		mid = (low + high) / 2;
		offset = p + mid * item_size;

		/*
		 * only remap when the key to compare falls outside the
		 * range covered by the currently mapped page
		 */
		if (!kaddr || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {

			err = map_private_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&kaddr, &map_start, &map_len);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else if (err == 1) {
				/*
				 * the key straddles a page boundary; copy
				 * it into an aligned stack buffer instead
				 */
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			} else {
				return err;
			}

		} else {
			/* still inside the cached mapping */
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	/* not found: low is where the key would be inserted */
	*slot = low;
	return 1;
}
1753 
1754 /*
1755  * simple bin_search frontend that does the right thing for
1756  * leaves vs nodes
1757  */
1758 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
1759                      int level, int *slot)
1760 {
1761         if (level == 0)
1762                 return generic_bin_search(eb,
1763                                           offsetof(struct btrfs_leaf, items),
1764                                           sizeof(struct btrfs_item),
1765                                           key, btrfs_header_nritems(eb),
1766                                           slot);
1767         else
1768                 return generic_bin_search(eb,
1769                                           offsetof(struct btrfs_node, ptrs),
1770                                           sizeof(struct btrfs_key_ptr),
1771                                           key, btrfs_header_nritems(eb),
1772                                           slot);
1773 }
1774 
1775 static void root_add_used(struct btrfs_root *root, u32 size)
1776 {
1777         spin_lock(&root->accounting_lock);
1778         btrfs_set_root_used(&root->root_item,
1779                             btrfs_root_used(&root->root_item) + size);
1780         spin_unlock(&root->accounting_lock);
1781 }
1782 
1783 static void root_sub_used(struct btrfs_root *root, u32 size)
1784 {
1785         spin_lock(&root->accounting_lock);
1786         btrfs_set_root_used(&root->root_item,
1787                             btrfs_root_used(&root->root_item) - size);
1788         spin_unlock(&root->accounting_lock);
1789 }
1790 
1791 /* given a node and slot number, this reads the blocks it points to.  The
1792  * extent buffer is returned with a reference taken (but unlocked).
1793  */
1794 static noinline struct extent_buffer *
1795 read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent,
1796                int slot)
1797 {
1798         int level = btrfs_header_level(parent);
1799         struct extent_buffer *eb;
1800         struct btrfs_key first_key;
1801 
1802         if (slot < 0 || slot >= btrfs_header_nritems(parent))
1803                 return ERR_PTR(-ENOENT);
1804 
1805         BUG_ON(level == 0);
1806 
1807         btrfs_node_key_to_cpu(parent, &first_key, slot);
1808         eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
1809                              btrfs_node_ptr_generation(parent, slot),
1810                              level - 1, &first_key);
1811         if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
1812                 free_extent_buffer(eb);
1813                 eb = ERR_PTR(-EIO);
1814         }
1815 
1816         return eb;
1817 }
1818 
/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave an node completely empty later on.
 *
 * On success the path may be rewritten to point at a neighbour that
 * absorbed our slot.  Returns 0 or a negative errno.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	if (level == 0)
		return 0;

	mid = path->nodes[level];

	/* the caller must hold a write lock on the node at this level */
	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	/* remembered so we can sanity-check the path at the end */
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = read_node_slot(fs_info, mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}

		btrfs_tree_lock(child);
		btrfs_set_lock_blocking(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		/* record the root swap for tree-mod-log readers */
		ret = tree_mod_log_insert_root(root->node, child, 1);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		clean_tree_block(fs_info, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	/* nothing to do while the node is more than a quarter full */
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	/* read and COW both neighbours so we can shift items around */
	left = read_node_slot(fs_info, parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	if (left) {
		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	right = read_node_slot(fs_info, parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	if (right) {
		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		/* orig_slot is now relative to left + mid combined */
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, fs_info, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, fs_info, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			/* right drained completely: delete it */
			clean_tree_block(fs_info, right);
			btrfs_tree_unlock(right);
			del_ptr(root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, root, right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			/* right's first key changed: fix the parent key */
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot + 1,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}
		wret = balance_node_right(trans, fs_info, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			/* left was too full to give anything; push instead */
			wret = push_node_left(trans, fs_info, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		/* mid is now empty: remove it from the tree */
		clean_tree_block(fs_info, mid);
		btrfs_tree_unlock(mid);
		del_ptr(root, path, level + 1, pslot);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		ret = tree_mod_log_insert_key(parent, pslot,
				MOD_LOG_KEY_REPLACE, GFP_NOFS);
		BUG_ON(ret < 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			/* our slot migrated into the left neighbour */
			extent_buffer_get(left);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			/* slot stayed in mid; re-base it past left's items */
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	/* drop locks/refs on whichever neighbours we still hold */
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}
2048 
/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 *
 * Returns 0 when room was made (and the path possibly redirected to a
 * neighbour), 1 when nothing could be pushed, or a negative errno.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/* at the root there is no neighbour to push into */
	if (!parent)
		return 1;

	left = read_node_slot(fs_info, parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			/* left is already full; can't push into it */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, fs_info,
						      left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/*
			 * items moved: mid's first key changed, so fix
			 * the parent's copy of it
			 */
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				/* our slot moved into the left neighbour */
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = read_node_slot(fs_info, parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			/* right is already full; can't push into it */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, fs_info,
							  right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* right's first key changed: fix the parent key */
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot + 1,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				/* our slot moved into the right neighbour */
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}
2191 
2192 /*
2193  * readahead one full node of leaves, finding things that are close
2194  * to the block in 'slot', and triggering ra on them.
2195  */
2196 static void reada_for_search(struct btrfs_fs_info *fs_info,
2197                              struct btrfs_path *path,
2198                              int level, int slot, u64 objectid)
2199 {
2200         struct extent_buffer *node;
2201         struct btrfs_disk_key disk_key;
2202         u32 nritems;
2203         u64 search;
2204         u64 target;
2205         u64 nread = 0;
2206         struct extent_buffer *eb;
2207         u32 nr;
2208         u32 blocksize;
2209         u32 nscan = 0;
2210 
2211         if (level != 1)
2212                 return;
2213 
2214         if (!path->nodes[level])
2215                 return;
2216 
2217         node = path->nodes[level];
2218 
2219         search = btrfs_node_blockptr(node, slot);
2220         blocksize = fs_info->nodesize;
2221         eb = find_extent_buffer(fs_info, search);
2222         if (eb) {
2223                 free_extent_buffer(eb);
2224                 return;
2225         }
2226 
2227         target = search;
2228 
2229         nritems = btrfs_header_nritems(node);
2230         nr = slot;
2231 
2232         while (1) {
2233                 if (path->reada == READA_BACK) {
2234                         if (nr == 0)
2235                                 break;
2236                         nr--;
2237                 } else if (path->reada == READA_FORWARD) {
2238                         nr++;
2239                         if (nr >= nritems)
2240                                 break;
2241                 }
2242                 if (path->reada == READA_BACK && objectid) {
2243                         btrfs_node_key(node, &disk_key, nr);
2244                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
2245                                 break;
2246                 }
2247                 search = btrfs_node_blockptr(node, nr);
2248                 if ((search <= target && target - search <= 65536) ||
2249                     (search > target && search - target <= 65536)) {
2250                         readahead_tree_block(fs_info, search);
2251                         nread += blocksize;
2252                 }
2253                 nscan++;
2254                 if ((nread > 65536 || nscan > 32))
2255                         break;
2256         }
2257 }
2258 
2259 static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
2260                                        struct btrfs_path *path, int level)
2261 {
2262         int slot;
2263         int nritems;
2264         struct extent_buffer *parent;
2265         struct extent_buffer *eb;
2266         u64 gen;
2267         u64 block1 = 0;
2268         u64 block2 = 0;
2269 
2270         parent = path->nodes[level + 1];
2271         if (!parent)
2272                 return;
2273 
2274         nritems = btrfs_header_nritems(parent);
2275         slot = path->slots[level + 1];
2276 
2277         if (slot > 0) {
2278                 block1 = btrfs_node_blockptr(parent, slot - 1);
2279                 gen = btrfs_node_ptr_generation(parent, slot - 1);
2280                 eb = find_extent_buffer(fs_info, block1);
2281                 /*
2282                  * if we get -eagain from btrfs_buffer_uptodate, we
2283                  * don't want to return eagain here.  That will loop
2284                  * forever
2285                  */
2286                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2287                         block1 = 0;
2288                 free_extent_buffer(eb);
2289         }
2290         if (slot + 1 < nritems) {
2291                 block2 = btrfs_node_blockptr(parent, slot + 1);
2292                 gen = btrfs_node_ptr_generation(parent, slot + 1);
2293                 eb = find_extent_buffer(fs_info, block2);
2294                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2295                         block2 = 0;
2296                 free_extent_buffer(eb);
2297         }
2298 
2299         if (block1)
2300                 readahead_tree_block(fs_info, block1);
2301         if (block2)
2302                 readahead_tree_block(fs_info, block2);
2303 }
2304 
2305 
/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
 * if lowest_unlock is 1, level 0 won't be unlocked
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	int skip_level = level;		/* levels <= skip_level keep their locks */
	int no_skips = 0;		/* once set, stop raising skip_level */
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
		/* slot 0: a key change here must propagate upward, keep lock */
		if (!no_skips && path->slots[i] == 0) {
			skip_level = i + 1;
			continue;
		}
		/* keep_locks: also hold the lock on a last-slot node */
		if (!no_skips && path->keep_locks) {
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
				skip_level = i + 1;
				continue;
			}
		}
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level) {
			btrfs_tree_unlock_rw(t, path->locks[i]);
			path->locks[i] = 0;
			/* lower the write-lock watermark past what we dropped */
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}
2361 
2362 /*
2363  * This releases any locks held in the path starting at level and
2364  * going all the way up to the root.
2365  *
2366  * btrfs_search_slot will keep the lock held on higher nodes in a few
2367  * corner cases, such as COW of the block at slot zero in the node.  This
2368  * ignores those rules, and it should only be called when there are no
2369  * more updates to be done higher up in the tree.
2370  */
2371 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2372 {
2373         int i;
2374 
2375         if (path->keep_locks)
2376                 return;
2377 
2378         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2379                 if (!path->nodes[i])
2380                         continue;
2381                 if (!path->locks[i])
2382                         continue;
2383                 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2384                 path->locks[i] = 0;
2385         }
2386 }
2387 
2388 /*
2389  * helper function for btrfs_search_slot.  The goal is to find a block
2390  * in cache without setting the path to blocking.  If we find the block
2391  * we return zero and the path is unchanged.
2392  *
2393  * If we can't find the block, we set the path blocking and do some
2394  * reada.  -EAGAIN is returned and the search must be repeated.
2395  */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
                      struct extent_buffer **eb_ret, int level, int slot,
                      const struct btrfs_key *key)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 blocknr;
        u64 gen;
        struct extent_buffer *b = *eb_ret;
        struct extent_buffer *tmp;
        struct btrfs_key first_key;
        int ret;
        int parent_level;

        /* target block, its expected generation and its first key */
        blocknr = btrfs_node_blockptr(b, slot);
        gen = btrfs_node_ptr_generation(b, slot);
        parent_level = btrfs_header_level(b);
        btrfs_node_key_to_cpu(b, &first_key, slot);

        tmp = find_extent_buffer(fs_info, blocknr);
        if (tmp) {
                /* first we do an atomic uptodate check */
                if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
                        *eb_ret = tmp;
                        return 0;
                }

                /* the pages were up to date, but we failed
                 * the generation number check.  Do a full
                 * read for the generation number that is correct.
                 * We must do this without dropping locks so
                 * we can trust our generation number
                 */
                btrfs_set_path_blocking(p);

                /* now we're allowed to do a blocking uptodate check */
                ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
                if (!ret) {
                        *eb_ret = tmp;
                        return 0;
                }
                free_extent_buffer(tmp);
                btrfs_release_path(p);
                return -EIO;
        }

        /*
         * reduce lock contention at high levels
         * of the btree by dropping locks before
         * we read.  Don't release the lock on the current
         * level because we need to walk this node to figure
         * out which blocks to read.
         */
        btrfs_unlock_up_safe(p, level + 1);
        btrfs_set_path_blocking(p);

        if (p->reada != READA_NONE)
                reada_for_search(fs_info, p, level, slot, key->objectid);

        ret = -EAGAIN;
        tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
                              &first_key);
        if (!IS_ERR(tmp)) {
                /*
                 * If the read above didn't mark this buffer up to date,
                 * it will never end up being up to date.  Set ret to EIO now
                 * and give up so that our caller doesn't loop forever
                 * on our EAGAINs.
                 */
                if (!extent_buffer_uptodate(tmp))
                        ret = -EIO;
                free_extent_buffer(tmp);
        } else {
                ret = PTR_ERR(tmp);
        }

        /* the path was released above; caller restarts the search */
        btrfs_release_path(p);
        return ret;
}
2475 
2476 /*
2477  * helper function for btrfs_search_slot.  This does all of the checks
2478  * for node-level blocks and does any balancing required based on
2479  * the ins_len.
2480  *
2481  * If no extra work was required, zero is returned.  If we had to
2482  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2483  * start over
2484  */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct btrfs_path *p,
                       struct extent_buffer *b, int level, int ins_len,
                       int *write_lock_level)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        int ret;

        /* node is nearly full and we may insert: split it first */
        if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
            BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
                int sret;

                /* splitting modifies the parent, need write locks there */
                if (*write_lock_level < level + 1) {
                        *write_lock_level = level + 1;
                        btrfs_release_path(p);
                        goto again;
                }

                btrfs_set_path_blocking(p);
                reada_for_balance(fs_info, p, level);
                sret = split_node(trans, root, p, level);
                btrfs_clear_path_blocking(p, NULL, 0);

                BUG_ON(sret > 0);
                if (sret) {
                        ret = sret;
                        goto done;
                }
                b = p->nodes[level];
        } else if (ins_len < 0 && btrfs_header_nritems(b) <
                   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
                /* node is under half full and we are deleting: rebalance */
                int sret;

                /* balancing also modifies the parent, need write locks */
                if (*write_lock_level < level + 1) {
                        *write_lock_level = level + 1;
                        btrfs_release_path(p);
                        goto again;
                }

                btrfs_set_path_blocking(p);
                reada_for_balance(fs_info, p, level);
                sret = balance_level(trans, root, p, level);
                btrfs_clear_path_blocking(p, NULL, 0);

                if (sret) {
                        ret = sret;
                        goto done;
                }
                b = p->nodes[level];
                /* balance_level may have emptied and freed this node */
                if (!b) {
                        btrfs_release_path(p);
                        goto again;
                }
                BUG_ON(btrfs_header_nritems(b) == 1);
        }
        return 0;

again:
        ret = -EAGAIN;
done:
        return ret;
}
2548 
/*
 * Debug-only check (CONFIG_BTRFS_ASSERT) that the first key stored in
 * block @b really equals @key, i.e. that skipping the binary search in
 * key_search() was valid.
 */
static void key_search_validate(struct extent_buffer *b,
                                const struct btrfs_key *key,
                                int level)
{
#ifdef CONFIG_BTRFS_ASSERT
        struct btrfs_disk_key disk_key;
        unsigned long key_off;

        btrfs_cpu_key_to_disk(&disk_key, key);

        /* the first key lives at a different offset in leaves vs nodes */
        if (level == 0)
                key_off = offsetof(struct btrfs_leaf, items[0].key);
        else
                key_off = offsetof(struct btrfs_node, ptrs[0].key);

        ASSERT(!memcmp_extent_buffer(b, &disk_key, key_off,
                                     sizeof(disk_key)));
#endif
}
2568 
static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
                      int level, int *prev_cmp, int *slot)
{
        /*
         * An exact match at a higher level means every block below starts
         * with @key at slot 0, so the binary search can be skipped.
         */
        if (*prev_cmp == 0) {
                key_search_validate(b, key, level);
                *slot = 0;
                return 0;
        }

        *prev_cmp = btrfs_bin_search(b, key, level, slot);
        return *prev_cmp;
}
2582 
2583 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2584                 u64 iobjectid, u64 ioff, u8 key_type,
2585                 struct btrfs_key *found_key)
2586 {
2587         int ret;
2588         struct btrfs_key key;
2589         struct extent_buffer *eb;
2590 
2591         ASSERT(path);
2592         ASSERT(found_key);
2593 
2594         key.type = key_type;
2595         key.objectid = iobjectid;
2596         key.offset = ioff;
2597 
2598         ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2599         if (ret < 0)
2600                 return ret;
2601 
2602         eb = path->nodes[0];
2603         if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2604                 ret = btrfs_next_leaf(fs_root, path);
2605                 if (ret)
2606                         return ret;
2607                 eb = path->nodes[0];
2608         }
2609 
2610         btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2611         if (found_key->type != key.type ||
2612                         found_key->objectid != key.objectid)
2613                 return 1;
2614 
2615         return 0;
2616 }
2617 
/*
 * Grab the root node of @root for a search, taking the appropriate lock
 * (read lock if possible, write lock when @write_lock_level demands it),
 * and record it in the path.  Returns the locked/referenced root buffer.
 */
static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
                                                        struct btrfs_path *p,
                                                        int write_lock_level)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *b;
        int root_lock;
        int level = 0;

        /* We try very hard to do read locks on the root */
        root_lock = BTRFS_READ_LOCK;

        if (p->search_commit_root) {
                /* The commit roots are read only so we always do read locks */
                if (p->need_commit_sem)
                        down_read(&fs_info->commit_root_sem);
                b = root->commit_root;
                extent_buffer_get(b);
                level = btrfs_header_level(b);
                if (p->need_commit_sem)
                        up_read(&fs_info->commit_root_sem);
                /*
                 * Ensure that all callers have set skip_locking when
                 * p->search_commit_root = 1.
                 */
                ASSERT(p->skip_locking == 1);

                goto out;
        }

        if (p->skip_locking) {
                b = btrfs_root_node(root);
                level = btrfs_header_level(b);
                goto out;
        }

        /*
         * If the level is set to maximum, we can skip trying to get the read
         * lock.
         */
        if (write_lock_level < BTRFS_MAX_LEVEL) {
                /*
                 * We don't know the level of the root node until we actually
                 * have it read locked
                 */
                b = btrfs_read_lock_root_node(root);
                level = btrfs_header_level(b);
                if (level > write_lock_level)
                        goto out;

                /* Whoops, must trade for write lock */
                btrfs_tree_read_unlock(b);
                free_extent_buffer(b);
        }

        b = btrfs_lock_root_node(root);
        root_lock = BTRFS_WRITE_LOCK;

        /* The level might have changed, check again */
        level = btrfs_header_level(b);

out:
        p->nodes[level] = b;
        if (!p->skip_locking)
                p->locks[level] = root_lock;
        /*
         * Callers are responsible for dropping b's references.
         */
        return b;
}
2688 
2689 
2690 /*
2691  * btrfs_search_slot - look for a key in a tree and perform necessary
2692  * modifications to preserve tree invariants.
2693  *
2694  * @trans:      Handle of transaction, used when modifying the tree
2695  * @p:          Holds all btree nodes along the search path
2696  * @root:       The root node of the tree
2697  * @key:        The key we are looking for
2698  * @ins_len:    Indicates purpose of search, for inserts it is 1, for
2699  *              deletions it's -1. 0 for plain searches
2700  * @cow:        boolean should CoW operations be performed. Must always be 1
2701  *              when modifying the tree.
2702  *
2703  * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2704  * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2705  *
2706  * If @key is found, 0 is returned and you can find the item in the leaf level
2707  * of the path (level 0)
2708  *
2709  * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2710  * points to the slot where it should be inserted
2711  *
2712  * If an error is encountered while searching the tree a negative error number
2713  * is returned
2714  */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                      const struct btrfs_key *key, struct btrfs_path *p,
                      int ins_len, int cow)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *b;
        int slot;
        int ret;
        int err;
        int level;
        int lowest_unlock = 1;
        /* everything at write_lock_level or lower must be write locked */
        int write_lock_level = 0;
        u8 lowest_level = 0;
        int min_write_lock_level;
        int prev_cmp;

        lowest_level = p->lowest_level;
        WARN_ON(lowest_level && ins_len > 0);
        WARN_ON(p->nodes[0] != NULL);
        BUG_ON(!cow && ins_len);

        if (ins_len < 0) {
                lowest_unlock = 2;

                /* when we are removing items, we might have to go up to level
                 * two as we update tree pointers  Make sure we keep write
                 * for those levels as well
                 */
                write_lock_level = 2;
        } else if (ins_len > 0) {
                /*
                 * for inserting items, make sure we have a write lock on
                 * level 1 so we can update keys
                 */
                write_lock_level = 1;
        }

        /* read-only search, never take a write lock */
        if (!cow)
                write_lock_level = -1;

        if (cow && (p->keep_locks || p->lowest_level))
                write_lock_level = BTRFS_MAX_LEVEL;

        min_write_lock_level = write_lock_level;

again:
        prev_cmp = -1;
        b = btrfs_search_slot_get_root(root, p, write_lock_level);

        while (b) {
                level = btrfs_header_level(b);

                /*
                 * setup the path here so we can release it under lock
                 * contention with the cow code
                 */
                if (cow) {
                        bool last_level = (level == (BTRFS_MAX_LEVEL - 1));

                        /*
                         * if we don't really need to cow this block
                         * then we don't want to set the path blocking,
                         * so we test it here
                         */
                        if (!should_cow_block(trans, root, b)) {
                                trans->dirty = true;
                                goto cow_done;
                        }

                        /*
                         * must have write locks on this node and the
                         * parent
                         */
                        if (level > write_lock_level ||
                            (level + 1 > write_lock_level &&
                            level + 1 < BTRFS_MAX_LEVEL &&
                            p->nodes[level + 1])) {
                                write_lock_level = level + 1;
                                btrfs_release_path(p);
                                goto again;
                        }

                        btrfs_set_path_blocking(p);
                        if (last_level)
                                err = btrfs_cow_block(trans, root, b, NULL, 0,
                                                      &b);
                        else
                                err = btrfs_cow_block(trans, root, b,
                                                      p->nodes[level + 1],
                                                      p->slots[level + 1], &b);
                        if (err) {
                                ret = err;
                                goto done;
                        }
                }
cow_done:
                p->nodes[level] = b;
                btrfs_clear_path_blocking(p, NULL, 0);

                /*
                 * we have a lock on b and as long as we aren't changing
                 * the tree, there is no way to for the items in b to change.
                 * It is safe to drop the lock on our parent before we
                 * go through the expensive btree search on b.
                 *
                 * If we're inserting or deleting (ins_len != 0), then we might
                 * be changing slot zero, which may require changing the parent.
                 * So, we can't drop the lock until after we know which slot
                 * we're operating on.
                 */
                if (!ins_len && !p->keep_locks) {
                        int u = level + 1;

                        if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
                                btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
                                p->locks[u] = 0;
                        }
                }

                ret = key_search(b, key, level, &prev_cmp, &slot);
                if (ret < 0)
                        goto done;

                if (level != 0) {
                        int dec = 0;
                        /* no exact match: step back to the covering slot */
                        if (ret && slot > 0) {
                                dec = 1;
                                slot -= 1;
                        }
                        p->slots[level] = slot;
                        err = setup_nodes_for_search(trans, root, p, b, level,
                                             ins_len, &write_lock_level);
                        if (err == -EAGAIN)
                                goto again;
                        if (err) {
                                ret = err;
                                goto done;
                        }
                        b = p->nodes[level];
                        slot = p->slots[level];

                        /*
                         * slot 0 is special, if we change the key
                         * we have to update the parent pointer
                         * which means we must have a write lock
                         * on the parent
                         */
                        if (slot == 0 && ins_len &&
                            write_lock_level < level + 1) {
                                write_lock_level = level + 1;
                                btrfs_release_path(p);
                                goto again;
                        }

                        unlock_up(p, level, lowest_unlock,
                                  min_write_lock_level, &write_lock_level);

                        if (level == lowest_level) {
                                if (dec)
                                        p->slots[level]++;
                                goto done;
                        }

                        err = read_block_for_search(root, p, &b, level,
                                                    slot, key);
                        if (err == -EAGAIN)
                                goto again;
                        if (err) {
                                ret = err;
                                goto done;
                        }

                        if (!p->skip_locking) {
                                level = btrfs_header_level(b);
                                if (level <= write_lock_level) {
                                        /* try the fast spinning lock first */
                                        err = btrfs_try_tree_write_lock(b);
                                        if (!err) {
                                                btrfs_set_path_blocking(p);
                                                btrfs_tree_lock(b);
                                                btrfs_clear_path_blocking(p, b,
                                                                  BTRFS_WRITE_LOCK);
                                        }
                                        p->locks[level] = BTRFS_WRITE_LOCK;
                                } else {
                                        err = btrfs_tree_read_lock_atomic(b);
                                        if (!err) {
                                                btrfs_set_path_blocking(p);
                                                btrfs_tree_read_lock(b);
                                                btrfs_clear_path_blocking(p, b,
                                                                  BTRFS_READ_LOCK);
                                        }
                                        p->locks[level] = BTRFS_READ_LOCK;
                                }
                                p->nodes[level] = b;
                        }
                } else {
                        /* leaf level: make room if this is an insert */
                        p->slots[level] = slot;
                        if (ins_len > 0 &&
                            btrfs_leaf_free_space(fs_info, b) < ins_len) {
                                if (write_lock_level < 1) {
                                        write_lock_level = 1;
                                        btrfs_release_path(p);
                                        goto again;
                                }

                                btrfs_set_path_blocking(p);
                                err = split_leaf(trans, root, key,
                                                 p, ins_len, ret == 0);
                                btrfs_clear_path_blocking(p, NULL, 0);

                                BUG_ON(err > 0);
                                if (err) {
                                        ret = err;
                                        goto done;
                                }
                        }
                        if (!p->search_for_split)
                                unlock_up(p, level, lowest_unlock,
                                          min_write_lock_level, &write_lock_level);
                        goto done;
                }
        }
        ret = 1;
done:
        /*
         * we don't really know what they plan on doing with the path
         * from here on, so for now just mark it as blocking
         */
        if (!p->leave_spinning)
                btrfs_set_path_blocking(p);
        if (ret < 0 && !p->skip_release_on_error)
                btrfs_release_path(p);
        return ret;
}
2950 
2951 /*
2952  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2953  * current state of the tree together with the operations recorded in the tree
2954  * modification log to search for the key in a previous version of this tree, as
2955  * denoted by the time_seq parameter.
2956  *
2957  * Naturally, there is no support for insert, delete or cow operations.
2958  *
2959  * The resulting path and return value will be set up as if we called
2960  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2961  */
int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
                          struct btrfs_path *p, u64 time_seq)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *b;
        int slot;
        int ret;
        int err;
        int level;
        int lowest_unlock = 1;
        u8 lowest_level = 0;
        int prev_cmp = -1;

        lowest_level = p->lowest_level;
        WARN_ON(p->nodes[0] != NULL);

        /* commit roots are immutable, a plain search is equivalent */
        if (p->search_commit_root) {
                BUG_ON(time_seq);
                return btrfs_search_slot(NULL, root, key, p, 0, 0);
        }

again:
        b = get_old_root(root, time_seq);
        level = btrfs_header_level(b);
        p->locks[level] = BTRFS_READ_LOCK;

        while (b) {
                level = btrfs_header_level(b);
                p->nodes[level] = b;
                btrfs_clear_path_blocking(p, NULL, 0);

                /*
                 * we have a lock on b and as long as we aren't changing
                 * the tree, there is no way to for the items in b to change.
                 * It is safe to drop the lock on our parent before we
                 * go through the expensive btree search on b.
                 */
                btrfs_unlock_up_safe(p, level + 1);

                /*
                 * Since we can unwind ebs we want to do a real search every
                 * time.
                 */
                prev_cmp = -1;
                ret = key_search(b, key, level, &prev_cmp, &slot);

                if (level != 0) {
                        int dec = 0;
                        /* no exact match: step back to the covering slot */
                        if (ret && slot > 0) {
                                dec = 1;
                                slot -= 1;
                        }
                        p->slots[level] = slot;
                        unlock_up(p, level, lowest_unlock, 0, NULL);

                        if (level == lowest_level) {
                                if (dec)
                                        p->slots[level]++;
                                goto done;
                        }

                        err = read_block_for_search(root, p, &b, level,
                                                    slot, key);
                        if (err == -EAGAIN)
                                goto again;
                        if (err) {
                                ret = err;
                                goto done;
                        }

                        level = btrfs_header_level(b);
                        err = btrfs_tree_read_lock_atomic(b);
                        if (!err) {
                                btrfs_set_path_blocking(p);
                                btrfs_tree_read_lock(b);
                                btrfs_clear_path_blocking(p, b,
                                                          BTRFS_READ_LOCK);
                        }
                        /* replay the mod log to get the old version of b */
                        b = tree_mod_log_rewind(fs_info, p, b, time_seq);
                        if (!b) {
                                ret = -ENOMEM;
                                goto done;
                        }
                        p->locks[level] = BTRFS_READ_LOCK;
                        p->nodes[level] = b;
                } else {
                        p->slots[level] = slot;
                        unlock_up(p, level, lowest_unlock, 0, NULL);
                        goto done;
                }
        }
        ret = 1;
done:
        if (!p->leave_spinning)
                btrfs_set_path_blocking(p);
        if (ret < 0)
                btrfs_release_path(p);

        return ret;
}
3062 
3063 /*
3064  * helper to use instead of search slot if no exact match is needed but
3065  * instead the next or previous item should be returned.
3066  * When find_higher is true, the next higher item is returned, the next lower
3067  * otherwise.
3068  * When return_any and find_higher are both true, and no higher item is found,
3069  * return the next lower instead.
3070  * When return_any is true and find_higher is false, and no lower item is found,
3071  * return the next higher instead.
3072  * It returns 0 if any item is found, 1 if none is found (tree empty), and
3073  * < 0 on error
3074  */
3075 int btrfs_search_slot_for_read(struct btrfs_root *root,
3076                                const struct btrfs_key *key,
3077                                struct btrfs_path *p, int find_higher,
3078                                int return_any)
3079 {
3080         int ret;
3081         struct extent_buffer *leaf;
3082 
3083 again:
3084         ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3085         if (ret <= 0)
3086                 return ret;
3087         /*
3088          * a return value of 1 means the path is at the position where the
3089          * item should be inserted. Normally this is the next bigger item,
3090          * but in case the previous item is the last in a leaf, path points
3091          * to the first free slot in the previous leaf, i.e. at an invalid
3092          * item.
3093          */
3094         leaf = p->nodes[0];
3095 
3096         if (find_higher) {
3097                 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3098                         ret = btrfs_next_leaf(root, p);
3099                         if (ret <= 0)
3100                                 return ret;
3101                         if (!return_any)
3102                                 return 1;
3103                         /*
3104                          * no higher item found, return the next
3105                          * lower instead
3106                          */
3107                         return_any = 0;
3108                         find_higher = 0;
3109                         btrfs_release_path(p);
3110                         goto again;
3111                 }
3112         } else {
3113                 if (p->slots[0] == 0) {
3114                         ret = btrfs_prev_leaf(root, p);
3115                         if (ret < 0)
3116                                 return ret;
3117                         if (!ret) {
3118                                 leaf = p->nodes[0];
3119                                 if (p->slots[0] == btrfs_header_nritems(leaf))
3120                                         p->slots[0]--;
3121                                 return 0;
3122                         }
3123                         if (!return_any)
3124                                 return 1;
3125                         /*
3126                          * no lower item found, return the next
3127                          * higher instead
3128                          */
3129                         return_any = 0;
3130                         find_higher = 1;
3131                         btrfs_release_path(p);
3132                         goto again;
3133                 } else {
3134                         --p->slots[0];
3135                 }
3136         }
3137         return 0;
3138 }
3139 
3140 /*
3141  * adjust the pointers going up the tree, starting at level
3142  * making sure the right key of each node is points to 'key'.
3143  * This is used after shifting pointers to the left, so it stops
3144  * fixing up pointers when a given leaf/node is not in slot 0 of the
3145  * higher levels
3146  *
3147  */
3148 static void fixup_low_keys(struct btrfs_fs_info *fs_info,
3149                            struct btrfs_path *path,
3150                            struct btrfs_disk_key *key, int level)
3151 {
3152         int i;
3153         struct extent_buffer *t;
3154         int ret;
3155 
3156         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3157                 int tslot = path->slots[i];
3158 
3159                 if (!path->nodes[i])
3160                         break;
3161                 t = path->nodes[i];
3162                 ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
3163                                 GFP_ATOMIC);
3164                 BUG_ON(ret < 0);
3165                 btrfs_set_node_key(t, key, tslot);
3166                 btrfs_mark_buffer_dirty(path->nodes[i]);
3167                 if (tslot != 0)
3168                         break;
3169         }
3170 }
3171 
3172 /*
3173  * update item key.
3174  *
3175  * This function isn't completely safe. It's the caller's responsibility
3176  * that the new key won't break the order
3177  */
3178 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3179                              struct btrfs_path *path,
3180                              const struct btrfs_key *new_key)
3181 {
3182         struct btrfs_disk_key disk_key;
3183         struct extent_buffer *eb;
3184         int slot;
3185 
3186         eb = path->nodes[0];
3187         slot = path->slots[0];
3188         if (slot > 0) {
3189                 btrfs_item_key(eb, &disk_key, slot - 1);
3190                 BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3191         }
3192         if (slot < btrfs_header_nritems(eb) - 1) {
3193                 btrfs_item_key(eb, &disk_key, slot + 1);
3194                 BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3195         }
3196 
3197         btrfs_cpu_key_to_disk(&disk_key, new_key);
3198         btrfs_set_item_key(eb, &disk_key, slot);
3199         btrfs_mark_buffer_dirty(eb);
3200         if (slot == 0)
3201                 fixup_low_keys(fs_info, path, &disk_key, 1);
3202 }
3203 
/*
 * try to push data from one node into the next node left in the
 * tree.
 *
 * @dst and @src must both belong to the current transaction (checked via
 * the generation WARN_ONs below), i.e. they have already been COWed.
 * When @empty is set the caller wants @src drained completely if possible.
 *
 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
 * error, and > 0 if there was no room in the left hand block.
 */
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_fs_info *fs_info,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty)
{
	int push_items = 0;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* free slots available in the destination node */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	/* not emptying: don't bother rebalancing an almost-empty source */
	if (!empty && src_nritems <= 8)
		return 1;

	if (push_items <= 0)
		return 1;

	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);

	/*
	 * Log the copy in the tree mod log before touching the buffers so
	 * tree_mod_log_rewind() can reconstruct the pre-push state.
	 */
	ret = tree_mod_log_eb_copy(fs_info, dst, src, dst_nritems, 0,
				   push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	/* append the first push_items key pointers of src to dst */
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
			   push_items * sizeof(struct btrfs_key_ptr));

	if (push_items < src_nritems) {
		/*
		 * Don't call tree_mod_log_insert_move here, key removal was
		 * already fully logged by tree_mod_log_eb_copy above.
		 */
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
3276 
/*
 * try to push data from one node into the next node right in the
 * tree.
 *
 * Both nodes must belong to the current transaction (generation WARN_ONs
 * below), i.e. they were COWed by the caller.
 *
 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
 * error, and > 0 if there was no room in the right hand block.
 *
 * this will  only push up to 1/2 the contents of the left node over
 */
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *fs_info,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
{
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* free slots available in the destination node */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	if (push_items <= 0)
		return 1;

	/* too few items to be worth rebalancing */
	if (src_nritems < 4)
		return 1;

	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
	if (max_push >= src_nritems)
		return 1;

	if (max_push < push_items)
		push_items = max_push;

	/* log the shift of dst's existing pointers before moving them */
	ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
	BUG_ON(ret < 0);
	/* make room at the front of dst */
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
				      btrfs_node_key_ptr_offset(0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));

	/* log the copy before touching the buffers (see push_node_left) */
	ret = tree_mod_log_eb_copy(fs_info, dst, src, 0,
				   src_nritems - push_items, push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	/* move the tail of src into the freed slots at the front of dst */
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
			   push_items * sizeof(struct btrfs_key_ptr));

	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);

	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
3343 
/*
 * helper function to insert a new root level in the tree.
 * A new node is allocated, and a single item is inserted to
 * point to the existing root
 *
 * On success the path holds the new root at path->nodes[level] with a
 * blocking write lock and slot 0.
 *
 * returns zero on success or < 0 on failure.
 */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 lower_gen;
	struct extent_buffer *lower;
	struct extent_buffer *c;
	struct extent_buffer *old;
	struct btrfs_disk_key lower_key;
	int ret;

	/* the caller must pass a path ending at the current root */
	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

	/* the new root's single key is the first key of the old root */
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				   &lower_key, level, root->node->start, 0);
	if (IS_ERR(c))
		return PTR_ERR(c);

	root_add_used(root, fs_info->nodesize);

	/* initialize the header of the freshly allocated node */
	memzero_extent_buffer(c, 0, sizeof(struct btrfs_header));
	btrfs_set_header_nritems(c, 1);
	btrfs_set_header_level(c, level);
	btrfs_set_header_bytenr(c, c->start);
	btrfs_set_header_generation(c, trans->transid);
	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(c, root->root_key.objectid);

	write_extent_buffer_fsid(c, fs_info->fsid);
	write_extent_buffer_chunk_tree_uuid(c, fs_info->chunk_tree_uuid);

	/* the only pointer in the new root points at the old root */
	btrfs_set_node_key(c, &lower_key, 0);
	btrfs_set_node_blockptr(c, 0, lower->start);
	lower_gen = btrfs_header_generation(lower);
	WARN_ON(lower_gen != trans->transid);

	btrfs_set_node_ptr_generation(c, 0, lower_gen);

	btrfs_mark_buffer_dirty(c);

	/* log the root replacement, then publish it for RCU readers */
	old = root->node;
	ret = tree_mod_log_insert_root(root->node, c, 0);
	BUG_ON(ret < 0);
	rcu_assign_pointer(root->node, c);

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

	add_root_to_dirty_list(root);
	/* extra ref for the path's reference to the new root */
	extent_buffer_get(c);
	path->nodes[level] = c;
	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
	path->slots[level] = 0;
	return 0;
}
3414 
3415 /*
3416  * worker function to insert a single pointer in a node.
3417  * the node should have enough room for the pointer already
3418  *
3419  * slot and level indicate where you want the key to go, and
3420  * blocknr is the block the key points to.
3421  */
3422 static void insert_ptr(struct btrfs_trans_handle *trans,
3423                        struct btrfs_fs_info *fs_info, struct btrfs_path *path,
3424                        struct btrfs_disk_key *key, u64 bytenr,
3425                        int slot, int level)
3426 {
3427         struct extent_buffer *lower;
3428         int nritems;
3429         int ret;
3430 
3431         BUG_ON(!path->nodes[level]);
3432         btrfs_assert_tree_locked(path->nodes[level]);
3433         lower = path->nodes[level];
3434         nritems = btrfs_header_nritems(lower);
3435         BUG_ON(slot > nritems);
3436         BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info));
3437         if (slot != nritems) {
3438                 if (level) {
3439                         ret = tree_mod_log_insert_move(lower, slot + 1, slot,
3440                                         nritems - slot);
3441                         BUG_ON(ret < 0);
3442                 }
3443                 memmove_extent_buffer(lower,
3444                               btrfs_node_key_ptr_offset(slot + 1),
3445                               btrfs_node_key_ptr_offset(slot),
3446                               (nritems - slot) * sizeof(struct btrfs_key_ptr));
3447         }
3448         if (level) {
3449                 ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
3450                                 GFP_NOFS);
3451                 BUG_ON(ret < 0);
3452         }
3453         btrfs_set_node_key(lower, key, slot);
3454         btrfs_set_node_blockptr(lower, slot, bytenr);
3455         WARN_ON(trans->transid == 0);
3456         btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3457         btrfs_set_header_nritems(lower, nritems + 1);
3458         btrfs_mark_buffer_dirty(lower);
3459 }
3460 
/*
 * split the node at the specified level in path in two.
 * The path is corrected to point to the appropriate node after the split
 *
 * Before splitting this tries to make some room in the node by pushing
 * left and right, if either one works, it returns right away.
 *
 * returns 0 on success and < 0 on failure
 */
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	u32 c_nritems;

	c = path->nodes[level];
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/*
		 * trying to split the root, lets make a new one
		 *
		 * tree mod log: We don't log_removal old root in
		 * insert_new_root, because that root buffer will be kept as a
		 * normal node. We are going to log removal of half of the
		 * elements below with tree_mod_log_eb_copy. We're holding a
		 * tree lock on the buffer, which is why we cannot race with
		 * other tree_mod_log users.
		 */
		ret = insert_new_root(trans, root, path, level + 1);
		if (ret)
			return ret;
	} else {
		/* not the root: try to avoid splitting by pushing siblings */
		ret = push_nodes_for_insert(trans, root, path, level);
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	/* items at [mid, c_nritems) move to the new right sibling */
	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
			&disk_key, level, c->start, 0);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used(root, fs_info->nodesize);

	/* initialize the header of the new sibling */
	memzero_extent_buffer(split, 0, sizeof(struct btrfs_header));
	btrfs_set_header_level(split, btrfs_header_level(c));
	btrfs_set_header_bytenr(split, split->start);
	btrfs_set_header_generation(split, trans->transid);
	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(split, root->root_key.objectid);
	write_extent_buffer_fsid(split, fs_info->fsid);
	write_extent_buffer_chunk_tree_uuid(split, fs_info->chunk_tree_uuid);

	/* log the copy before modifying the buffers */
	ret = tree_mod_log_eb_copy(fs_info, split, c, 0, mid, c_nritems - mid);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
	ret = 0;

	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	/* link the new sibling into the parent, right after c */
	insert_ptr(trans, fs_info, path, &disk_key, split->start,
		   path->slots[level + 1] + 1, level + 1);

	/* make the path point at whichever half holds the original slot */
	if (path->slots[level] >= mid) {
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return ret;
}
3559 
3560 /*
3561  * how many bytes are required to store the items in a leaf.  start
3562  * and nr indicate which items in the leaf to check.  This totals up the
3563  * space used both by the item structs and the item data
3564  */
3565 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3566 {
3567         struct btrfs_item *start_item;
3568         struct btrfs_item *end_item;
3569         struct btrfs_map_token token;
3570         int data_len;
3571         int nritems = btrfs_header_nritems(l);
3572         int end = min(nritems, start + nr) - 1;
3573 
3574         if (!nr)
3575                 return 0;
3576         btrfs_init_map_token(&token);
3577         start_item = btrfs_item_nr(start);
3578         end_item = btrfs_item_nr(end);
3579         data_len = btrfs_token_item_offset(l, start_item, &token) +
3580                 btrfs_token_item_size(l, start_item, &token);
3581         data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3582         data_len += sizeof(struct btrfs_item) * nr;
3583         WARN_ON(data_len < 0);
3584         return data_len;
3585 }
3586 
3587 /*
3588  * The space between the end of the leaf items and
3589  * the start of the leaf data.  IOW, how much room
3590  * the leaf has left for both items and data
3591  */
3592 noinline int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info,
3593                                    struct extent_buffer *leaf)
3594 {
3595         int nritems = btrfs_header_nritems(leaf);
3596         int ret;
3597 
3598         ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3599         if (ret < 0) {
3600                 btrfs_crit(fs_info,
3601                            "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3602                            ret,
3603                            (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3604                            leaf_space_used(leaf, 0, nritems), nritems);
3605         }
3606         return ret;
3607 }
3608 
/*
 * Push items from the leaf at path->nodes[0] into its right sibling
 * @right, which the caller has locked and verified to have @free_space
 * bytes available.
 *
 * min slot controls the lowest index we're willing to push to the
 * right.  We'll push up to and including min_slot, but no lower
 *
 * Returns 0 when something was pushed (the path may now point into
 * @right), 1 when nothing could be pushed (@right is unlocked/released).
 */
static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
				      struct btrfs_path *path,
				      int data_size, int empty,
				      struct extent_buffer *right,
				      int free_space, u32 left_nritems,
				      u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *upper = path->nodes[1];
	struct btrfs_map_token token;
	struct btrfs_disk_key disk_key;
	int slot;
	u32 i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 nr;
	u32 right_nritems;
	u32 data_end;
	u32 this_item_size;

	btrfs_init_map_token(&token);

	/* when emptying, everything may go; otherwise stop at min_slot */
	if (empty)
		nr = 0;
	else
		nr = max_t(u32, 1, min_slot);

	/* reserve room for the pending insertion if it lands past the end */
	if (path->slots[0] >= left_nritems)
		push_space += data_size;

	slot = path->slots[1];
	/* walk backwards from the last item, counting what fits in right */
	i = left_nritems - 1;
	while (i >= nr) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			/* don't push the slot the caller still needs */
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(fs_info, left);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		/* the pending insertion travels with its slot */
		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(left, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
		if (i == 0)
			break;
		i--;
	}

	if (push_items == 0)
		goto out_unlock;

	WARN_ON(!empty && push_items == left_nritems);

	/* push left to right */
	right_nritems = btrfs_header_nritems(right);

	/* bytes of item data being transferred */
	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
	push_space -= leaf_data_end(fs_info, left);

	/* make room in the right data area */
	data_end = leaf_data_end(fs_info, right);
	memmove_extent_buffer(right,
			      BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);

	/* copy from the left data area */
	copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
		     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
		     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left),
		     push_space);

	/* shift right's item headers to make room at the front */
	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

	/* copy the items from left to right */
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));

	/* update the item pointers */
	right_nritems += push_items;
	btrfs_set_header_nritems(right, right_nritems);
	/* recompute every data offset in right from the end inward */
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);
		push_space -= btrfs_token_item_size(right, item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	left_nritems -= push_items;
	btrfs_set_header_nritems(left, left_nritems);

	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
	else
		clean_tree_block(fs_info, left);

	btrfs_mark_buffer_dirty(right);

	/* right's first key changed; update the parent's copy of it */
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
	btrfs_mark_buffer_dirty(upper);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] >= left_nritems) {
		path->slots[0] -= left_nritems;
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			clean_tree_block(fs_info, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 0;

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3750 
/*
 * push some data in the path leaf to the right, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * returns 1 if the push failed because the other node didn't have enough
 * room, 0 if everything worked out and < 0 if there were major errors.
 *
 * this will push starting from min_slot to the end of the leaf.  It won't
 * push any slot lower than min_slot
 */
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
			   *root, struct btrfs_path *path,
			   int min_data_size, int data_size,
			   int empty, u32 min_slot)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
	struct extent_buffer *upper;
	int slot;
	int free_space;
	u32 left_nritems;
	int ret;

	/* no parent means we are at the root leaf: no sibling to push to */
	if (!path->nodes[1])
		return 1;

	slot = path->slots[1];
	upper = path->nodes[1];
	/* already the last child of the parent: no right sibling */
	if (slot >= btrfs_header_nritems(upper) - 1)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	right = read_node_slot(fs_info, upper, slot + 1);
	/*
	 * slot + 1 is not valid or we fail to read the right node,
	 * no big deal, just return.
	 */
	if (IS_ERR(right))
		return 1;

	btrfs_tree_lock(right);
	btrfs_set_lock_blocking(right);

	free_space = btrfs_leaf_free_space(fs_info, right);
	if (free_space < data_size)
		goto out_unlock;

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, right, upper,
			      slot + 1, &right);
	if (ret)
		goto out_unlock;

	/* COW may have changed the free space; re-check it */
	free_space = btrfs_leaf_free_space(fs_info, right);
	if (free_space < data_size)
		goto out_unlock;

	left_nritems = btrfs_header_nritems(left);
	if (left_nritems == 0)
		goto out_unlock;

	if (path->slots[0] == left_nritems && !empty) {
		/* Key greater than all keys in the leaf, right neighbor has
		 * enough room for it and we're not emptying our leaf to delete
		 * it, therefore use right neighbor to insert the new item and
		 * no need to touch/dirty our left leaf. */
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->nodes[0] = right;
		path->slots[0] = 0;
		path->slots[1]++;
		return 0;
	}

	return __push_leaf_right(fs_info, path, min_data_size, empty,
				right, free_space, left_nritems, min_slot);
out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3834 
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
 * items
 */
static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
				     struct btrfs_path *path, int data_size,
				     int empty, struct extent_buffer *left,
				     int free_space, u32 right_nritems,
				     u32 max_slot)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	int i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 old_left_nritems;
	u32 nr;
	int ret = 0;
	u32 this_item_size;
	u32 old_left_item_size;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	/* unless we're emptying the leaf, always keep at least one item */
	if (empty)
		nr = min(right_nritems, max_slot);
	else
		nr = min(right_nritems - 1, max_slot);

	/* count how many items (headers + data) fit into 'left' */
	for (i = 0; i < nr; i++) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			/* never push past the slot the caller will insert at */
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(fs_info, right);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		/* reserve room in 'left' for the pending insertion */
		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(right, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
	}

	if (push_items == 0) {
		ret = 1;
		goto out;
	}
	WARN_ON(!empty && push_items == btrfs_header_nritems(right));

	/* push data from right to left */
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

	/* bytes of item data being moved (data is packed at the leaf end) */
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
		     btrfs_item_offset_nr(right, push_items - 1);

	copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(fs_info, left) - push_space,
		     BTRFS_LEAF_DATA_OFFSET +
		     btrfs_item_offset_nr(right, push_items - 1),
		     push_space);
	old_left_nritems = btrfs_header_nritems(left);
	BUG_ON(old_left_nritems <= 0);

	/* rebase the moved items' data offsets into the left leaf */
	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
		u32 ioff;

		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(left, item, &token);
		btrfs_set_token_item_offset(left, item,
		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size),
		      &token);
	}
	btrfs_set_header_nritems(left, old_left_nritems + push_items);

	/* fixup right node */
	if (push_items > right_nritems)
		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
		       right_nritems);

	if (push_items < right_nritems) {
		/* compact the remaining data against the leaf end */
		push_space = btrfs_item_offset_nr(right, push_items - 1) -
						  leaf_data_end(fs_info, right);
		memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
				      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
				      BTRFS_LEAF_DATA_OFFSET +
				      leaf_data_end(fs_info, right), push_space);

		/* and slide the remaining item headers to slot 0 */
		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
	}
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
	/* recompute the data offsets of the items left in 'right' */
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);

		push_space = push_space - btrfs_token_item_size(right,
								item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	btrfs_mark_buffer_dirty(left);
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
	else
		clean_tree_block(fs_info, right);

	/* the right leaf's first key changed; propagate it up the path */
	btrfs_item_key(right, &disk_key, 0);
	fixup_low_keys(fs_info, path, &disk_key, 1);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		/* our target slot moved into the left leaf */
		path->slots[0] += old_left_nritems;
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
		path->slots[1] -= 1;
	} else {
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->slots[0] -= push_items;
	}
	BUG_ON(path->slots[0] < 0);
	return ret;
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
3986 
3987 /*
3988  * push some data in the path leaf to the left, trying to free up at
3989  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3990  *
3991  * max_slot can put a limit on how far into the leaf we'll push items.  The
3992  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3993  * items
3994  */
3995 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3996                           *root, struct btrfs_path *path, int min_data_size,
3997                           int data_size, int empty, u32 max_slot)
3998 {
3999         struct btrfs_fs_info *fs_info = root->fs_info;
4000         struct extent_buffer *right = path->nodes[0];
4001         struct extent_buffer *left;
4002         int slot;
4003         int free_space;
4004         u32 right_nritems;
4005         int ret = 0;
4006 
4007         slot = path->slots[1];
4008         if (slot == 0)
4009                 return 1;
4010         if (!path->nodes[1])
4011                 return 1;
4012 
4013         right_nritems = btrfs_header_nritems(right);
4014         if (right_nritems == 0)
4015                 return 1;
4016 
4017         btrfs_assert_tree_locked(path->nodes[1]);
4018 
4019         left = read_node_slot(fs_info, path->nodes[1], slot - 1);
4020         /*
4021          * slot - 1 is not valid or we fail to read the left node,
4022          * no big deal, just return.
4023          */
4024         if (IS_ERR(left))
4025                 return 1;
4026 
4027         btrfs_tree_lock(left);
4028         btrfs_set_lock_blocking(left);
4029 
4030         free_space = btrfs_leaf_free_space(fs_info, left);
4031         if (free_space < data_size) {
4032                 ret = 1;
4033                 goto out;
4034         }
4035 
4036         /* cow and double check */
4037         ret = btrfs_cow_block(trans, root, left,
4038                               path->nodes[1], slot - 1, &left);
4039         if (ret) {
4040                 /* we hit -ENOSPC, but it isn't fatal here */
4041                 if (ret == -ENOSPC)
4042                         ret = 1;
4043                 goto out;
4044         }
4045 
4046         free_space = btrfs_leaf_free_space(fs_info, left);
4047         if (free_space < data_size) {
4048                 ret = 1;
4049                 goto out;
4050         }
4051 
4052         return __push_leaf_left(fs_info, path, min_data_size,
4053                                empty, left, free_space, right_nritems,
4054                                max_slot);
4055 out:
4056         btrfs_tree_unlock(left);
4057         free_extent_buffer(left);
4058         return ret;
4059 }
4060 
/*
 * Copy the items [mid, nritems) of leaf 'l' into the freshly allocated
 * leaf 'right', insert a pointer to 'right' into the parent node and fix
 * up the path to point at whichever leaf now holds the target slot.
 */
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
				    struct btrfs_fs_info *fs_info,
				    struct btrfs_path *path,
				    struct extent_buffer *l,
				    struct extent_buffer *right,
				    int slot, int mid, int nritems)
{
	int data_copy_size;
	int rt_data_off;
	int i;
	struct btrfs_disk_key disk_key;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	/* 'right' receives everything from item 'mid' onward */
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(fs_info, l);

	/* copy the item headers ... */
	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	/* ... and the item data, packed against the end of the new leaf */
	copy_extent_buffer(right, l,
		     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
		     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(fs_info, l), data_copy_size);

	/* how far every copied item's data moved toward the leaf end */
	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);

	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(i);
		u32 ioff;

		ioff = btrfs_token_item_offset(right, item, &token);
		btrfs_set_token_item_offset(right, item,
					    ioff + rt_data_off, &token);
	}

	btrfs_set_header_nritems(l, mid);
	/* link 'right' into the parent, right after 'l' */
	btrfs_item_key(right, &disk_key, 0);
	insert_ptr(trans, fs_info, path, &disk_key, right->start,
		   path->slots[1] + 1, 1);

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
	BUG_ON(path->slots[0] != slot);

	/* point the path at whichever leaf now contains the target slot */
	if (mid <= slot) {
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[0] -= mid;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}

	BUG_ON(path->slots[0] < 0);
}
4126 
4127 /*
4128  * double splits happen when we need to insert a big item in the middle
4129  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4130  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4131  *          A                 B                 C
4132  *
4133  * We avoid this by trying to push the items on either side of our target
4134  * into the adjacent leaves.  If all goes well we can avoid the double split
4135  * completely.
4136  */
4137 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4138                                           struct btrfs_root *root,
4139                                           struct btrfs_path *path,
4140                                           int data_size)
4141 {
4142         struct btrfs_fs_info *fs_info = root->fs_info;
4143         int ret;
4144         int progress = 0;
4145         int slot;
4146         u32 nritems;
4147         int space_needed = data_size;
4148 
4149         slot = path->slots[0];
4150         if (slot < btrfs_header_nritems(path->nodes[0]))
4151                 space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
4152 
4153         /*
4154          * try to push all the items after our slot into the
4155          * right leaf
4156          */
4157         ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4158         if (ret < 0)
4159                 return ret;
4160 
4161         if (ret == 0)
4162                 progress++;
4163 
4164         nritems = btrfs_header_nritems(path->nodes[0]);
4165         /*
4166          * our goal is to get our slot at the start or end of a leaf.  If
4167          * we've done so we're done
4168          */
4169         if (path->slots[0] == 0 || path->slots[0] == nritems)
4170                 return 0;
4171 
4172         if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
4173                 return 0;
4174 
4175         /* try to push all the items before our slot into the next leaf */
4176         slot = path->slots[0];
4177         space_needed = data_size;
4178         if (slot > 0)
4179                 space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
4180         ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4181         if (ret < 0)
4182                 return ret;
4183 
4184         if (ret == 0)
4185                 progress++;
4186 
4187         if (progress)
4188                 return 0;
4189         return 1;
4190 }
4191 
/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
 *
 * returns 0 if all went well and < 0 on failure.
 */
static noinline int split_leaf(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       const struct btrfs_key *ins_key,
			       struct btrfs_path *path, int data_size,
			       int extend)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *l;
	u32 nritems;
	int mid;
	int slot;
	struct extent_buffer *right;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	int wret;
	int split;
	int num_doubles = 0;
	int tried_avoid_double = 0;

	l = path->nodes[0];
	slot = path->slots[0];
	/* an extended item can never exceed the maximum leaf data size */
	if (extend && data_size + btrfs_item_size_nr(l, slot) +
	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
		return -EOVERFLOW;

	/* first try to make some room by pushing left and right */
	if (data_size && path->nodes[1]) {
		int space_needed = data_size;

		if (slot < btrfs_header_nritems(l))
			space_needed -= btrfs_leaf_free_space(fs_info, l);

		wret = push_leaf_right(trans, root, path, space_needed,
				       space_needed, 0, 0);
		if (wret < 0)
			return wret;
		if (wret) {
			space_needed = data_size;
			if (slot > 0)
				space_needed -= btrfs_leaf_free_space(fs_info,
								      l);
			wret = push_leaf_left(trans, root, path, space_needed,
					      space_needed, 0, (u32)-1);
			if (wret < 0)
				return wret;
		}
		l = path->nodes[0];

		/* did the pushes work? */
		if (btrfs_leaf_free_space(fs_info, l) >= data_size)
			return 0;
	}

	/* splitting the root leaf: grow the tree by one level first */
	if (!path->nodes[1]) {
		ret = insert_new_root(trans, root, path, 1);
		if (ret)
			return ret;
	}
again:
	/*
	 * split == 1: split at 'mid', moving the upper half to the new leaf
	 * split == 0: move nothing, chain an empty leaf for the insertion
	 * split == 2: a second split will still be needed (double split)
	 */
	split = 1;
	l = path->nodes[0];
	slot = path->slots[0];
	nritems = btrfs_header_nritems(l);
	mid = (nritems + 1) / 2;

	if (mid <= slot) {
		/* target slot is in the upper half */
		if (nritems == 1 ||
		    leaf_space_used(l, mid, nritems - mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (slot >= nritems) {
				/* appending past the end: empty leaf will do */
				split = 0;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	} else {
		/* target slot is in the lower half */
		if (leaf_space_used(l, 0, mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (!extend && data_size && slot == 0) {
				split = 0;
			} else if ((extend || !data_size) && slot == 0) {
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	}

	/* the new leaf's first key: the insert key or the key at 'mid' */
	if (split == 0)
		btrfs_cpu_key_to_disk(&disk_key, ins_key);
	else
		btrfs_item_key(l, &disk_key, mid);

	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
			&disk_key, 0, l->start, 0);
	if (IS_ERR(right))
		return PTR_ERR(right);

	root_add_used(root, fs_info->nodesize);

	/* initialize the new leaf's header */
	memzero_extent_buffer(right, 0, sizeof(struct btrfs_header));
	btrfs_set_header_bytenr(right, right->start);
	btrfs_set_header_generation(right, trans->transid);
	btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(right, root->root_key.objectid);
	btrfs_set_header_level(right, 0);
	write_extent_buffer_fsid(right, fs_info->fsid);
	write_extent_buffer_chunk_tree_uuid(right, fs_info->chunk_tree_uuid);

	if (split == 0) {
		/* chain the empty leaf after (or before) the full one */
		if (mid <= slot) {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, fs_info, path, &disk_key,
				   right->start, path->slots[1] + 1, 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			path->slots[1] += 1;
		} else {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, fs_info, path, &disk_key,
				   right->start, path->slots[1], 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			if (path->slots[1] == 0)
				fixup_low_keys(fs_info, path, &disk_key, 1);
		}
		/*
		 * We create a new leaf 'right' for the required ins_len and
		 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
		 * the content of ins_len to 'right'.
		 */
		return ret;
	}

	copy_for_split(trans, fs_info, path, l, right, slot, mid, nritems);

	if (split == 2) {
		/* at most one extra round of splitting is ever needed */
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
	}

	return 0;

push_for_double:
	push_for_double_split(trans, root, path, data_size);
	tried_avoid_double = 1;
	if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
		return 0;
	goto again;
}
4367 
4368 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4369                                          struct btrfs_root *root,
4370                                          struct btrfs_path *path, int ins_len)
4371 {
4372         struct btrfs_fs_info *fs_info = root->fs_info;
4373         struct btrfs_key key;
4374         struct extent_buffer *leaf;
4375         struct btrfs_file_extent_item *fi;
4376         u64 extent_len = 0;
4377         u32 item_size;
4378         int ret;
4379 
4380         leaf = path->nodes[0];
4381         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4382 
4383         BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4384                key.type != BTRFS_EXTENT_CSUM_KEY);
4385 
4386         if (btrfs_leaf_free_space(fs_info, leaf) >= ins_len)
4387                 return 0;
4388 
4389         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4390         if (key.type == BTRFS_EXTENT_DATA_KEY) {
4391                 fi = btrfs_item_ptr(leaf, path->slots[0],
4392                                     struct btrfs_file_extent_item);
4393                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4394         }
4395         btrfs_release_path(path);
4396 
4397         path->keep_locks = 1;
4398         path->search_for_split = 1;
4399         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4400         path->search_for_split = 0;
4401         if (ret > 0)
4402                 ret = -EAGAIN;
4403         if (ret < 0)
4404                 goto err;
4405 
4406         ret = -EAGAIN;
4407         leaf = path->nodes[0];
4408         /* if our item isn't there, return now */
4409         if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4410                 goto err;
4411 
4412         /* the leaf has  changed, it now has room.  return now */
4413         if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= ins_len)
4414                 goto err;
4415 
4416         if (key.type == BTRFS_EXTENT_DATA_KEY) {
4417                 fi = btrfs_item_ptr(leaf, path->slots[0],
4418                                     struct btrfs_file_extent_item);
4419                 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4420                         goto err;
4421         }
4422 
4423         btrfs_set_path_blocking(path);
4424         ret = split_leaf(trans, root, &key, path, ins_len, 1);
4425         if (ret)
4426                 goto err;
4427 
4428         path->keep_locks = 0;
4429         btrfs_unlock_up_safe(path, 1);
4430         return 0;
4431 err:
4432         path->keep_locks = 0;
4433         return ret;
4434 }
4435 
/*
 * Worker for btrfs_split_item(): split the item at path->slots[0] at
 * 'split_offset' bytes into its data.  A second item keyed 'new_key' is
 * created right after it, holding the tail of the data.  Assumes the
 * leaf already has room for one more struct btrfs_item header.
 */
static noinline int split_item(struct btrfs_fs_info *fs_info,
			       struct btrfs_path *path,
			       const struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	/* setup_leaf_for_split() guaranteed room for one more item header */
	BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < sizeof(struct btrfs_item));

	btrfs_set_path_blocking(path);

	item = btrfs_item_nr(path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);

	/* stash a copy of the item data so it can be rewritten in two parts */
	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);

	/* make room for the new item header right after the original */
	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
				btrfs_item_nr_offset(slot),
				(nritems - slot) * sizeof(struct btrfs_item));
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(slot);

	/* the new item takes the tail [split_offset, item_size) of the data */
	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	/* the original item shrinks to the head [0, split_offset) */
	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < 0);
	kfree(buf);
	return 0;
}
4505 
4506 /*
4507  * This function splits a single item into two items,
4508  * giving 'new_key' to the new item and splitting the
4509  * old one at split_offset (from the start of the item).
4510  *
4511  * The path may be released by this operation.  After
4512  * the split, the path is pointing to the old item.  The
4513  * new item is going to be in the same node as the old one.
4514  *
4515  * Note, the item being split must be smaller enough to live alone on
4516  * a tree block with room for one extra struct btrfs_item
4517  *
4518  * This allows us to split the item in place, keeping a lock on the
4519  * leaf the entire time.
4520  */
4521 int btrfs_split_item(struct btrfs_trans_handle *trans,
4522                      struct btrfs_root *root,
4523                      struct btrfs_path *path,
4524                      const struct btrfs_key *new_key,
4525                      unsigned long split_offset)
4526 {
4527         int ret;
4528         ret = setup_leaf_for_split(trans, root, path,
4529                                    sizeof(struct btrfs_item));
4530         if (ret)
4531                 return ret;
4532 
4533         ret = split_item(root->fs_info, path, new_key, split_offset);
4534         return ret;
4535 }
4536 
4537 /*
4538  * This function duplicate a item, giving 'new_key' to the new item.
4539  * It guarantees both items live in the same tree leaf and the new item
4540  * is contiguous with the original item.
4541  *
4542  * This allows us to split file extent in place, keeping a lock on the
4543  * leaf the entire time.
4544  */
4545 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4546                          struct btrfs_root *root,
4547                          struct btrfs_path *path,
4548                          const struct btrfs_key *new_key)
4549 {
4550         struct extent_buffer *leaf;
4551         int ret;
4552         u32 item_size;
4553 
4554         leaf = path->nodes[0];
4555         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4556         ret = setup_leaf_for_split(trans, root, path,
4557                                    item_size + sizeof(struct btrfs_item));
4558         if (ret)
4559                 return ret;
4560 
4561         path->slots[0]++;
4562         setup_items_for_insert(root, path, new_key, &item_size,
4563                                item_size, item_size +
4564                                sizeof(struct btrfs_item), 1);
4565         leaf = path->nodes[0];
4566         memcpy_extent_buffer(leaf,
4567                              btrfs_item_ptr_offset(leaf, path->slots[0]),
4568                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4569                              item_size);
4570         return 0;
4571 }
4572 
/*
 * make the item pointed to by the path smaller.  new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
 */
void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
			 struct btrfs_path *path, u32 new_size, int from_end)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];
	slot = path->slots[0];

	old_size = btrfs_item_size_nr(leaf, slot);
	/* already the requested size: nothing to do */
	if (old_size == new_size)
		return;

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(fs_info, leaf);

	old_data_start = btrfs_item_offset_nr(leaf, slot);

	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff + size_diff, &token);
	}

	/* shift the data */
	if (from_end) {
		/* chopping the tail: slide the data between data_end and
		 * this item's new end toward the end of the leaf */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			/* where fi will land after the shift below */
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				/* keep the fixed-size extent header at the
				 * front of the item; only trim inline data */
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
				      (unsigned long)fi,
				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
			}
		}

		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start - data_end);

		/* chopping the front advances the item's key offset */
		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(fs_info, path, &disk_key, 1);
	}

	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, new_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
4674 
4675 /*
4676  * make the item pointed to by the path bigger, data_size is the added size.
4677  */
4678 void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
4679                        u32 data_size)
4680 {
4681         int slot;
4682         struct extent_buffer *leaf;
4683         struct btrfs_item *item;
4684         u32 nritems;
4685         unsigned int data_end;
4686         unsigned int old_data;
4687         unsigned int old_size;
4688         int i;
4689         struct btrfs_map_token token;
4690 
4691         btrfs_init_map_token(&token);
4692 
4693         leaf = path->nodes[0];
4694 
4695         nritems = btrfs_header_nritems(leaf);
4696         data_end = leaf_data_end(fs_info, leaf);
4697 
4698         if (btrfs_leaf_free_space(fs_info, leaf) < data_size) {
4699                 btrfs_print_leaf(leaf);
4700                 BUG();
4701         }
4702         slot = path->slots[0];
4703         old_data = btrfs_item_end_nr(leaf, slot);
4704 
4705         BUG_ON(slot < 0);
4706         if (slot >= nritems) {
4707                 btrfs_print_leaf(leaf);
4708                 btrfs_crit(fs_info, "slot %d too large, nritems %d",
4709                            slot, nritems);
4710                 BUG_ON(1);
4711         }
4712 
4713         /*
4714          * item0..itemN ... dataN.offset..dataN.size .. data0.size
4715          */
4716         /* first correct the data pointers */
4717         for (i = slot; i < nritems; i++) {
4718                 u32 ioff;
4719                 item = btrfs_item_nr(i);
4720 
4721                 ioff = btrfs_token_item_offset(leaf, item, &token);
4722                 btrfs_set_token_item_offset(leaf, item,
4723                                             ioff - data_size, &token);
4724         }
4725 
4726         /* shift the data */
4727         memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4728                       data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
4729                       data_end, old_data - data_end);
4730 
4731         data_end = old_data;
4732         old_size = btrfs_item_size_nr(leaf, slot);
4733         item = btrfs_item_nr(slot);
4734         btrfs_set_item_size(leaf, item, old_size + data_size);
4735         btrfs_mark_buffer_dirty(leaf);
4736 
4737         if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
4738                 btrfs_print_leaf(leaf);
4739                 BUG();
4740         }
4741 }
4742 
4743 /*
4744  * this is a helper for btrfs_insert_empty_items, the main goal here is
4745  * to save stack depth by doing the bulk of the work in a function
4746  * that doesn't call btrfs_search_slot
4747  */
4748 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4749                             const struct btrfs_key *cpu_key, u32 *data_size,
4750                             u32 total_data, u32 total_size, int nr)
4751 {
4752         struct btrfs_fs_info *fs_info = root->fs_info;
4753         struct btrfs_item *item;
4754         int i;
4755         u32 nritems;
4756         unsigned int data_end;
4757         struct btrfs_disk_key disk_key;
4758         struct extent_buffer *leaf;
4759         int slot;
4760         struct btrfs_map_token token;
4761 
4762         if (path->slots[0] == 0) {
4763                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4764                 fixup_low_keys(fs_info, path, &disk_key, 1);
4765         }
4766         btrfs_unlock_up_safe(path, 1);
4767 
4768         btrfs_init_map_token(&token);
4769 
4770         leaf = path->nodes[0];
4771         slot = path->slots[0];
4772 
4773         nritems = btrfs_header_nritems(leaf);
4774         data_end = leaf_data_end(fs_info, leaf);
4775 
4776         if (btrfs_leaf_free_space(fs_info, leaf) < total_size) {
4777                 btrfs_print_leaf(leaf);
4778                 btrfs_crit(fs_info, "not enough freespace need %u have %d",
4779                            total_size, btrfs_leaf_free_space(fs_info, leaf));
4780                 BUG();
4781         }
4782 
4783         if (slot != nritems) {
4784                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4785 
4786                 if (old_data < data_end) {
4787                         btrfs_print_leaf(leaf);
4788                         btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
4789                                    slot, old_data, data_end);
4790                         BUG_ON(1);
4791                 }
4792                 /*
4793                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
4794                  */
4795                 /* first correct the data pointers */
4796                 for (i = slot; i < nritems; i++) {
4797                         u32 ioff;
4798 
4799                         item = btrfs_item_nr(i);
4800                         ioff = btrfs_token_item_offset(leaf, item, &token);
4801                         btrfs_set_token_item_offset(leaf, item,
4802                                                     ioff - total_data, &token);
4803                 }
4804                 /* shift the items */
4805                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4806                               btrfs_item_nr_offset(slot),
4807                               (nritems - slot) * sizeof(struct btrfs_item));
4808 
4809                 /* shift the data */
4810                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4811                               data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
4812                               data_end, old_data - data_end);
4813                 data_end = old_data;
4814         }
4815 
4816         /* setup the item for the new data */
4817         for (i = 0; i < nr; i++) {
4818                 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4819                 btrfs_set_item_key(leaf, &disk_key, slot + i);
4820                 item = btrfs_item_nr(slot + i);
4821                 btrfs_set_token_item_offset(leaf, item,
4822                                             data_end - data_size[i], &token);
4823                 data_end -= data_size[i];
4824                 btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4825         }
4826 
4827         btrfs_set_header_nritems(leaf, nritems + nr);
4828         btrfs_mark_buffer_dirty(leaf);
4829 
4830         if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
4831                 btrfs_print_leaf(leaf);
4832                 BUG();
4833         }
4834 }
4835 
4836 /*
4837  * Given a key and some data, insert items into the tree.
4838  * This does all the path init required, making room in the tree if needed.
4839  */
4840 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4841                             struct btrfs_root *root,
4842                             struct btrfs_path *path,
4843                             const struct btrfs_key *cpu_key, u32 *data_size,
4844                             int nr)
4845 {
4846         int ret = 0;
4847         int slot;
4848         int i;
4849         u32 total_size = 0;
4850         u32 total_data = 0;
4851 
4852         for (i = 0; i < nr; i++)
4853                 total_data += data_size[i];
4854 
4855         total_size = total_data + (nr * sizeof(struct btrfs_item));
4856         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4857         if (ret == 0)
4858                 return -EEXIST;
4859         if (ret < 0)
4860                 return ret;
4861 
4862         slot = path->slots[0];
4863         BUG_ON(slot < 0);
4864 
4865         setup_items_for_insert(root, path, cpu_key, data_size,
4866                                total_data, total_size, nr);
4867         return 0;
4868 }
4869 
4870 /*
4871  * Given a key and some data, insert an item into the tree.
4872  * This does all the path init required, making room in the tree if needed.
4873  */
4874 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4875                       const struct btrfs_key *cpu_key, void *data,
4876                       u32 data_size)
4877 {
4878         int ret = 0;
4879         struct btrfs_path *path;
4880         struct extent_buffer *leaf;
4881         unsigned long ptr;
4882 
4883         path = btrfs_alloc_path();
4884         if (!path)
4885                 return -ENOMEM;
4886         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4887         if (!ret) {
4888                 leaf = path->nodes[0];
4889                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4890                 write_extent_buffer(leaf, data, ptr, data_size);
4891                 btrfs_mark_buffer_dirty(leaf);
4892         }
4893         btrfs_free_path(path);
4894         return ret;
4895 }
4896 
4897 /*
4898  * delete the pointer from a given node.
4899  *
4900  * the tree should have been previously balanced so the deletion does not
4901  * empty a node.
4902  */
4903 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4904                     int level, int slot)
4905 {
4906         struct btrfs_fs_info *fs_info = root->fs_info;
4907         struct extent_buffer *parent = path->nodes[level];
4908         u32 nritems;
4909         int ret;
4910 
4911         nritems = btrfs_header_nritems(parent);
4912         if (slot != nritems - 1) {
4913                 if (level) {
4914                         ret = tree_mod_log_insert_move(parent, slot, slot + 1,
4915                                         nritems - slot - 1);
4916                         BUG_ON(ret < 0);
4917                 }
4918                 memmove_extent_buffer(parent,
4919                               btrfs_node_key_ptr_offset(slot),
4920                               btrfs_node_key_ptr_offset(slot + 1),
4921                               sizeof(struct btrfs_key_ptr) *
4922                               (nritems - slot - 1));
4923         } else if (level) {
4924                 ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
4925                                 GFP_NOFS);
4926                 BUG_ON(ret < 0);
4927         }
4928 
4929         nritems--;
4930         btrfs_set_header_nritems(parent, nritems);
4931         if (nritems == 0 && parent == root->node) {
4932                 BUG_ON(btrfs_header_level(root->node) != 1);
4933                 /* just turn the root into a leaf and break */
4934                 btrfs_set_header_level(root->node, 0);
4935         } else if (slot == 0) {
4936                 struct btrfs_disk_key disk_key;
4937 
4938                 btrfs_node_key(parent, &disk_key, 0);
4939                 fixup_low_keys(fs_info, path, &disk_key, level + 1);
4940         }
4941         btrfs_mark_buffer_dirty(parent);
4942 }
4943 
4944 /*
4945  * a helper function to delete the leaf pointed to by path->slots[1] and
4946  * path->nodes[1].
4947  *
4948  * This deletes the pointer in path->nodes[1] and frees the leaf
4949  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4950  *
4951  * The path must have already been setup for deleting the leaf, including
4952  * all the proper balancing.  path->nodes[1] must be locked.
4953  */
4954 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4955                                     struct btrfs_root *root,
4956                                     struct btrfs_path *path,
4957                                     struct extent_buffer *leaf)
4958 {
4959         WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4960         del_ptr(root, path, 1, path->slots[1]);
4961 
4962         /*
4963          * btrfs_free_extent is expensive, we want to make sure we
4964          * aren't holding any locks when we call it
4965          */
4966         btrfs_unlock_up_safe(path, 0);
4967 
4968         root_sub_used(root, leaf->len);
4969 
4970         extent_buffer_get(leaf);
4971         btrfs_free_tree_block(trans, root, leaf, 0, 1);
4972         free_extent_buffer_stale(leaf);
4973 }
4974 /*
4975  * delete the item at the leaf level in path.  If that empties
4976  * the leaf, remove it from the tree
4977  */
4978 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4979                     struct btrfs_path *path, int slot, int nr)
4980 {
4981         struct btrfs_fs_info *fs_info = root->fs_info;
4982         struct extent_buffer *leaf;
4983         struct btrfs_item *item;
4984         u32 last_off;
4985         u32 dsize = 0;
4986         int ret = 0;
4987         int wret;
4988         int i;
4989         u32 nritems;
4990         struct btrfs_map_token token;
4991 
4992         btrfs_init_map_token(&token);
4993 
4994         leaf = path->nodes[0];
4995         last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4996 
4997         for (i = 0; i < nr; i++)
4998                 dsize += btrfs_item_size_nr(leaf, slot + i);
4999 
5000         nritems = btrfs_header_nritems(leaf);
5001 
5002         if (slot + nr != nritems) {
5003                 int data_end = leaf_data_end(fs_info, leaf);
5004 
5005                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
5006                               data_end + dsize,
5007                               BTRFS_LEAF_DATA_OFFSET + data_end,
5008                               last_off - data_end);
5009 
5010                 for (i = slot + nr; i < nritems; i++) {
5011                         u32 ioff;
5012 
5013                         item = btrfs_item_nr(i);
5014                         ioff = btrfs_token_item_offset(leaf, item, &token);
5015                         btrfs_set_token_item_offset(leaf, item,
5016                                                     ioff + dsize, &token);
5017                 }
5018 
5019                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
5020                               btrfs_item_nr_offset(slot + nr),
5021                               sizeof(struct btrfs_item) *
5022                               (nritems - slot - nr));
5023         }
5024         btrfs_set_header_nritems(leaf, nritems - nr);
5025         nritems -= nr;
5026 
5027         /* delete the leaf if we've emptied it */
5028         if (nritems == 0) {
5029                 if (leaf == root->node) {
5030                         btrfs_set_header_level(leaf, 0);
5031                 } else {
5032                         btrfs_set_path_blocking(path);
5033                         clean_tree_block(fs_info, leaf);
5034                         btrfs_del_leaf(trans, root, path, leaf);
5035                 }
5036         } else {
5037                 int used = leaf_space_used(leaf, 0, nritems);
5038                 if (slot == 0) {
5039                         struct btrfs_disk_key disk_key;
5040 
5041                         btrfs_item_key(leaf, &disk_key, 0);
5042                         fixup_low_keys(fs_info, path, &disk_key, 1);
5043                 }
5044 
5045                 /* delete the leaf if it is mostly empty */
5046                 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
5047                         /* push_leaf_left fixes the path.
5048                          * make sure the path still points to our leaf
5049                          * for possible call to del_ptr below
5050                          */
5051                         slot = path->slots[1];
5052                         extent_buffer_get(leaf);
5053 
5054                         btrfs_set_path_blocking(path);
5055                         wret = push_leaf_left(trans, root, path, 1, 1,
5056                                               1, (u32)-1);
5057                         if (wret < 0 && wret != -ENOSPC)
5058                                 ret = wret;
5059 
5060                         if (path->nodes[0] == leaf &&
5061                             btrfs_header_nritems(leaf)) {
5062                                 wret = push_leaf_right(trans, root, path, 1,
5063                                                        1, 1, 0);
5064                                 if (wret < 0 && wret != -ENOSPC)
5065                                         ret = wret;
5066                         }
5067 
5068                         if (btrfs_header_nritems(leaf) == 0) {
5069                                 path->slots[1] = slot;
5070                                 btrfs_del_leaf(trans, root, path, leaf);
5071                                 free_extent_buffer(leaf);
5072                                 ret = 0;
5073                         } else {
5074                                 /* if we're still in the path, make sure
5075                                  * we're dirty.  Otherwise, one of the
5076                                  * push_leaf functions must have already
5077                                  * dirtied this buffer
5078                                  */
5079                                 if (path->nodes[0] == leaf)
5080                                         btrfs_mark_buffer_dirty(leaf);
5081                                 free_extent_buffer(leaf);
5082                         }
5083                 } else {
5084                         btrfs_mark_buffer_dirty(leaf);
5085                 }
5086         }
5087         return ret;
5088 }
5089 
5090 /*
5091  * search the tree again to find a leaf with lesser keys
5092  * returns 0 if it found something or 1 if there are no lesser leaves.
5093  * returns < 0 on io errors.
5094  *
5095  * This may release the path, and so you may lose any locks held at the
5096  * time you call it.
5097  */
5098 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5099 {
5100         struct btrfs_key key;
5101         struct btrfs_disk_key found_key;
5102         int ret;
5103 
5104         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5105 
5106         if (key.offset > 0) {
5107                 key.offset--;
5108         } else if (key.type > 0) {
5109                 key.type--;
5110                 key.offset = (u64)-1;
5111         } else if (key.objectid > 0) {
5112                 key.objectid--;
5113                 key.type = (u8)-1;
5114                 key.offset = (u64)-1;
5115         } else {
5116                 return 1;
5117         }
5118 
5119         btrfs_release_path(path);
5120         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5121         if (ret < 0)
5122                 return ret;
5123         btrfs_item_key(path->nodes[0], &found_key, 0);
5124         ret = comp_keys(&found_key, &key);
5125         /*
5126          * We might have had an item with the previous key in the tree right
5127          * before we released our path. And after we released our path, that
5128          * item might have been pushed to the first slot (0) of the leaf we
5129          * were holding due to a tree balance. Alternatively, an item with the
5130          * previous key can exist as the only element of a leaf (big fat item).
5131          * Therefore account for these 2 cases, so that our callers (like
5132          * btrfs_previous_item) don't miss an existing item with a key matching
5133          * the previous key we computed above.
5134          */
5135         if (ret <= 0)
5136                 return 0;
5137         return 1;
5138 }
5139 
5140 /*
5141  * A helper function to walk down the tree starting at min_key, and looking
5142  * for nodes or leaves that are have a minimum transaction id.
5143  * This is used by the btree defrag code, and tree logging
5144  *
5145  * This does not cow, but it does stuff the starting key it finds back
5146  * into min_key, so you can call btrfs_search_slot with cow=1 on the
5147  * key and get a writable path.
5148  *
5149  * This honors path->lowest_level to prevent descent past a given level
5150  * of the tree.
5151  *
5152  * min_trans indicates the oldest transaction that you are interested
5153  * in walking through.  Any nodes or leaves older than min_trans are
5154  * skipped over (without reading them).
5155  *
5156  * returns zero if something useful was found, < 0 on error and 1 if there
5157  * was nothing in the tree that matched the search criteria.
5158  */
5159 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5160                          struct btrfs_path *path,
5161                          u64 min_trans)
5162 {
5163         struct btrfs_fs_info *fs_info = root->fs_info;
5164         struct extent_buffer *cur;
5165         struct btrfs_key found_key;
5166         int slot;
5167         int sret;
5168         u32 nritems;
5169         int level;
5170         int ret = 1;
5171         int keep_locks = path->keep_locks;
5172 
5173         path->keep_locks = 1;
5174 again:
5175         cur = btrfs_read_lock_root_node(root);
5176         level = btrfs_header_level(cur);
5177         WARN_ON(path->nodes[level]);
5178         path->nodes[level] = cur;
5179         path->locks[level] = BTRFS_READ_LOCK;
5180 
5181         if (btrfs_header_generation(cur) < min_trans) {
5182                 ret = 1;
5183                 goto out;
5184         }
5185         while (1) {
5186                 nritems = btrfs_header_nritems(cur);
5187                 level = btrfs_header_level(cur);
5188                 sret = btrfs_bin_search(cur, min_key, level, &slot);
5189 
5190                 /* at the lowest level, we're done, setup the path and exit */
5191                 if (level == path->lowest_level) {
5192                         if (slot >= nritems)
5193                                 goto find_next_key;
5194                         ret = 0;
5195                         path->slots[level] = slot;
5196                         btrfs_item_key_to_cpu(cur, &found_key, slot);
5197                         goto out;
5198                 }
5199                 if (sret && slot > 0)
5200                         slot--;
5201                 /*
5202                  * check this node pointer against the min_trans parameters.
5203                  * If it is too old, old, skip to the next one.
5204                  */
5205                 while (slot < nritems) {
5206                         u64 gen;
5207 
5208                         gen = btrfs_node_ptr_generation(cur, slot);
5209                         if (gen < min_trans) {
5210                                 slot++;
5211                                 continue;
5212                         }
5213                         break;
5214                 }
5215 find_next_key:
5216                 /*
5217                  * we didn't find a candidate key in this node, walk forward
5218                  * and find another one
5219                  */
5220                 if (slot >= nritems) {
5221                         path->slots[level] = slot;
5222                         btrfs_set_path_blocking(path);
5223                         sret = btrfs_find_next_key(root, path, min_key, level,
5224                                                   min_trans);
5225                         if (sret == 0) {
5226                                 btrfs_release_path(path);
5227                                 goto again;
5228                         } else {
5229                                 goto out;
5230                         }
5231                 }
5232                 /* save our key for returning back */
5233                 btrfs_node_key_to_cpu(cur, &found_key, slot);
5234                 path->slots[level] = slot;
5235                 if (level == path->lowest_level) {
5236                         ret = 0;
5237                         goto out;
5238                 }
5239                 btrfs_set_path_blocking(path);
5240                 cur = read_node_slot(fs_info, cur, slot);
5241                 if (IS_ERR(cur)) {
5242                         ret = PTR_ERR(cur);
5243                         goto out;
5244                 }
5245 
5246                 btrfs_tree_read_lock(cur);
5247 
5248                 path->locks[level - 1] = BTRFS_READ_LOCK;
5249                 path->nodes[level - 1] = cur;
5250                 unlock_up(path, level, 1, 0, NULL);
5251                 btrfs_clear_path_blocking(path, NULL, 0);
5252         }
5253 out:
5254         path->keep_locks = keep_locks;
5255         if (ret == 0) {
5256                 btrfs_unlock_up_safe(path, path->lowest_level + 1);
5257                 btrfs_set_path_blocking(path);
5258                 memcpy(min_key, &found_key, sizeof(found_key));
5259         }
5260         return ret;
5261 }
5262 
5263 static int tree_move_down(struct btrfs_fs_info *fs_info,
5264                            struct btrfs_path *path,
5265                            int *level)
5266 {
5267         struct extent_buffer *eb;
5268 
5269         BUG_ON(*level == 0);
5270         eb = read_node_slot(fs_info, path->nodes[*level], path->slots[*level]);
5271         if (IS_ERR(eb))
5272                 return PTR_ERR(eb);
5273 
5274         path->nodes[*level - 1] = eb;
5275         path->slots[*level - 1] = 0;
5276         (*level)--;
5277         return 0;
5278 }
5279 
5280 static int tree_move_next_or_upnext(struct btrfs_path *path,
5281                                     int *level, int root_level)
5282 {
5283         int ret = 0;
5284         int nritems;
5285         nritems = btrfs_header_nritems(path->nodes[*level]);
5286 
5287         path->slots[*level]++;
5288 
5289         while (path->slots[*level] >= nritems) {
5290                 if (*level == root_level)
5291                         return -1;
5292 
5293                 /* move upnext */
5294                 path->slots[*level] = 0;
5295                 free_extent_buffer(path->nodes[*level]);
5296                 path->nodes[*level] = NULL;
5297                 (*level)++;
5298                 path->slots[*level]++;
5299 
5300                 nritems = btrfs_header_nritems(path->nodes[*level]);
5301                 ret = 1;
5302         }
5303         return ret;
5304 }
5305 
5306 /*
5307  * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5308  * or down.
5309  */
5310 static int tree_advance(struct btrfs_fs_info *fs_info,
5311                         struct btrfs_path *path,
5312                         int *level, int root_level,
5313                         int allow_down,
5314                         struct btrfs_key *key)
5315 {
5316         int ret;
5317 
5318         if (*level == 0 || !allow_down) {
5319                 ret = tree_move_next_or_upnext(path, level, root_level);
5320         } else {
5321                 ret = tree_move_down(fs_info, path, level);
5322         }
5323         if (ret >= 0) {
5324                 if (*level == 0)
5325                         btrfs_item_key_to_cpu(path->nodes[*level], key,
5326                                         path->slots[*level]);
5327                 else
5328                         btrfs_node_key_to_cpu(path->nodes[*level], key,
5329                                         path->slots[*level]);
5330         }
5331         return ret;
5332 }
5333 
5334 static int tree_compare_item(struct btrfs_path *left_path,
5335                              struct btrfs_path *right_path,
5336                              char *tmp_buf)
5337 {
5338         int cmp;
5339         int len1, len2;
5340         unsigned long off1, off2;
5341 
5342         len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5343         len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5344         if (len1 != len2)
5345                 return 1;
5346 
5347         off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5348         off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5349                                 right_path->slots[0]);
5350 
5351         read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5352 
5353         cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5354         if (cmp)
5355                 return 1;
5356         return 0;
5357 }
5358 
5359 #define ADVANCE 1
5360 #define ADVANCE_ONLY_NEXT -1
5361 
5362 /*
5363  * This function compares two trees and calls the provided callback for
5364  * every changed/new/deleted item it finds.
5365  * If shared tree blocks are encountered, whole subtrees are skipped, making
5366  * the compare pretty fast on snapshotted subvolumes.
5367  *
5368  * This currently works on commit roots only. As commit roots are read only,
5369  * we don't do any locking. The commit roots are protected with transactions.
5370  * Transactions are ended and rejoined when a commit is tried in between.
5371  *
5372  * This function checks for modifications done to the trees while comparing.
5373  * If it detects a change, it aborts immediately.
5374  */
5375 int btrfs_compare_trees(struct btrfs_root *left_root,
5376                         struct btrfs_root *right_root,
5377                         btrfs_changed_cb_t changed_cb, void *ctx)
5378 {
5379         struct btrfs_fs_info *fs_info = left_root->fs_info;
5380         int ret;
5381         int cmp;
5382         struct btrfs_path *left_path = NULL;
5383         struct btrfs_path *right_path = NULL;
5384         struct btrfs_key left_key;
5385         struct btrfs_key right_key;
5386         char *tmp_buf = NULL;
5387         int left_root_level;
5388         int right_root_level;
5389         int left_level;
5390         int right_level;
5391         int left_end_reached;
5392         int right_end_reached;
5393         int advance_left;
5394         int advance_right;
5395         u64 left_blockptr;
5396         u64 right_blockptr;
5397         u64 left_gen;
5398         u64 right_gen;
5399 
5400         left_path = btrfs_alloc_path();
5401         if (!left_path) {
5402                 ret = -ENOMEM;
5403                 goto out;
5404         }
5405         right_path = btrfs_alloc_path();
5406         if (!right_path) {
5407                 ret = -ENOMEM;
5408                 goto out;
5409         }
5410 
5411         tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
5412         if (!tmp_buf) {
5413                 ret = -ENOMEM;
5414                 goto out;
5415         }
5416 
5417         left_path->search_commit_root = 1;
5418         left_path->skip_locking = 1;
5419         right_path->search_commit_root = 1;
5420         right_path->skip_locking = 1;
5421 
5422         /*
5423          * Strategy: Go to the first items of both trees. Then do
5424          *
5425          * If both trees are at level 0
5426          *   Compare keys of current items
5427          *     If left < right treat left item as new, advance left tree
5428          *       and repeat
5429          *     If left > right treat right item as deleted, advance right tree
5430          *       and repeat
5431          *     If left == right do deep compare of items, treat as changed if
5432          *       needed, advance both trees and repeat
5433          * If both trees are at the same level but not at level 0
5434          *   Compare keys of current nodes/leafs
5435          *     If left < right advance left tree and repeat
5436          *     If left > right advance right tree and repeat
5437          *     If left == right compare blockptrs of the next nodes/leafs
5438          *       If they match advance both trees but stay at the same level
5439          *         and repeat
5440          *       If they don't match advance both trees while allowing to go
5441          *         deeper and repeat
5442          * If tree levels are different
5443          *   Advance the tree that needs it and repeat
5444          *
5445          * Advancing a tree means:
5446          *   If we are at level 0, try to go to the next slot. If that's not
5447          *   possible, go one level up and repeat. Stop when we found a level
5448          *   where we could go to the next slot. We may at this point be on a
5449          *   node or a leaf.
5450          *
5451          *   If we are not at level 0 and not on shared tree blocks, go one
5452          *   level deeper.
5453          *
5454          *   If we are not at level 0 and on shared tree blocks, go one slot to
5455          *   the right if possible or go up and right.
5456          */
5457 
5458         down_read(&fs_info->commit_root_sem);
5459         left_level = btrfs_header_level(left_root->commit_root);
5460         left_root_level = left_level;
5461         left_path->nodes[left_level] =
5462                         btrfs_clone_extent_buffer(left_root->commit_root);
5463         if (!left_path->nodes[left_level]) {
5464                 up_read(&fs_info->commit_root_sem);
5465                 ret = -ENOMEM;
5466                 goto out;
5467         }
5468         extent_buffer_get(left_path->nodes[left_level]);
5469 
5470         right_level = btrfs_header_level(right_root->commit_root);
5471         right_root_level = right_level;
5472         right_path->nodes[right_level] =
5473                         btrfs_clone_extent_buffer(right_root->commit_root);
5474         if (!right_path->nodes[right_level]) {
5475                 up_read(&fs_info->commit_root_sem);
5476                 ret = -ENOMEM;
5477                 goto out;
5478         }
5479         extent_buffer_get(right_path->nodes[right_level]);
5480         up_read(&fs_info->commit_root_sem);
5481 
5482         if (left_level == 0)
5483                 btrfs_item_key_to_cpu(left_path->nodes[left_level],
5484                                 &left_key, left_path->slots[left_level]);
5485         else
5486                 btrfs_node_key_to_cpu(left_path->nodes[left_level],
5487                                 &left_key, left_path->slots[left_level]);
5488         if (right_level == 0)
5489                 btrfs_item_key_to_cpu(right_path->nodes[right_level],
5490                                 &right_key, right_path->slots[right_level]);
5491         else
5492                 btrfs_node_key_to_cpu(right_path->nodes[right_level],
5493                                 &right_key, right_path->slots[right_level]);
5494 
5495         left_end_reached = right_end_reached = 0;
5496         advance_left = advance_right = 0;
5497 
5498         while (1) {
5499                 if (advance_left && !left_end_reached) {
5500                         ret = tree_advance(fs_info, left_path, &left_level,
5501                                         left_root_level,
5502                                         advance_left != ADVANCE_ONLY_NEXT,
5503                                         &left_key);
5504                         if (ret == -1)
5505                                 left_end_reached = ADVANCE;
5506                         else if (ret < 0)
5507                                 goto out;
5508                         advance_left = 0;
5509                 }
5510                 if (advance_right && !right_end_reached) {
5511                         ret = tree_advance(fs_info, right_path, &right_level,
5512                                         right_root_level,
5513                                         advance_right != ADVANCE_ONLY_NEXT,
5514                                         &right_key);
5515                         if (ret == -1)
5516                                 right_end_reached = ADVANCE;
5517                         else if (ret < 0)
5518                                 goto out;
5519                         advance_right = 0;
5520                 }
5521 
5522                 if (left_end_reached && right_end_reached) {
5523                         ret = 0;
5524                         goto out;
5525                 } else if (left_end_reached) {
5526                         if (right_level == 0) {
5527                                 ret = changed_cb(left_path, right_path,
5528                                                 &right_key,
5529                                                 BTRFS_COMPARE_TREE_DELETED,
5530                                                 ctx);
5531                                 if (ret < 0)
5532                                         goto out;
5533                         }
5534                         advance_right = ADVANCE;
5535                         continue;
5536                 } else if (right_end_reached) {
5537                         if (left_level == 0) {
5538                                 ret = changed_cb(left_path, right_path,
5539                                                 &left_key,
5540                                                 BTRFS_COMPARE_TREE_NEW,
5541                                                 ctx);
5542                                 if (ret < 0)
5543                                         goto out;
5544                         }
5545                         advance_left = ADVANCE;
5546                         continue;
5547                 }
5548 
5549                 if (left_level == 0 && right_level == 0) {
5550                         cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5551                         if (cmp < 0) {
5552                                 ret = changed_cb(left_path, right_path,
5553                                                 &left_key,
5554                                                 BTRFS_COMPARE_TREE_NEW,
5555                                                 ctx);
5556                                 if (ret < 0)
5557                                         goto out;
5558                                 advance_left = ADVANCE;
5559                         } else if (cmp > 0) {
5560                                 ret = changed_cb(left_path, right_path,
5561                                                 &right_key,
5562                                                 BTRFS_COMPARE_TREE_DELETED,
5563                                                 ctx);
5564                                 if (ret < 0)
5565                                         goto out;
5566                                 advance_right = ADVANCE;
5567                         } else {
5568                                 enum btrfs_compare_tree_result result;
5569 
5570                                 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5571                                 ret = tree_compare_item(left_path, right_path,
5572                                                         tmp_buf);
5573                                 if (ret)
5574                                         result = BTRFS_COMPARE_TREE_CHANGED;
5575                                 else
5576                                         result = BTRFS_COMPARE_TREE_SAME;
5577                                 ret = changed_cb(left_path, right_path,
5578                                                  &left_key, result, ctx);
5579                                 if (ret < 0)
5580                                         goto out;
5581                                 advance_left = ADVANCE;
5582                                 advance_right = ADVANCE;
5583                         }
5584                 } else if (left_level == right_level) {
5585                         cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5586                         if (cmp < 0) {
5587                                 advance_left = ADVANCE;
5588                         } else if (cmp > 0) {
5589                                 advance_right = ADVANCE;
5590                         } else {
5591                                 left_blockptr = btrfs_node_blockptr(
5592                                                 left_path->nodes[left_level],
5593                                                 left_path->slots[left_level]);
5594                                 right_blockptr = btrfs_node_blockptr(
5595                                                 right_path->nodes[right_level],
5596                                                 right_path->slots[right_level]);
5597                                 left_gen = btrfs_node_ptr_generation(
5598                                                 left_path->nodes[left_level],
5599                                                 left_path->slots[left_level]);
5600                                 right_gen = btrfs_node_ptr_generation(
5601                                                 right_path->nodes[right_level],
5602                                                 right_path->slots[right_level]);
5603                                 if (left_blockptr == right_blockptr &&
5604                                     left_gen == right_gen) {
5605                                         /*
5606                                          * As we're on a shared block, don't
5607                                          * allow to go deeper.
5608                                          */
5609                                         advance_left = ADVANCE_ONLY_NEXT;
5610                                         advance_right = ADVANCE_ONLY_NEXT;
5611                                 } else {
5612                                         advance_left = ADVANCE;
5613                                         advance_right = ADVANCE;
5614                                 }
5615