~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/fs/ocfs2/alloc.c

Version: ~ [ linux-4.15-rc8 ] ~ [ linux-4.14.13 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.76 ] ~ [ linux-4.8.17 ] ~ [ linux-4.7.10 ] ~ [ linux-4.6.7 ] ~ [ linux-4.5.7 ] ~ [ linux-4.4.111 ] ~ [ linux-4.3.6 ] ~ [ linux-4.2.8 ] ~ [ linux-4.1.48 ] ~ [ linux-4.0.9 ] ~ [ linux-3.19.8 ] ~ [ linux-3.18.91 ] ~ [ linux-3.17.8 ] ~ [ linux-3.16.53 ] ~ [ linux-3.15.10 ] ~ [ linux-3.14.79 ] ~ [ linux-3.13.11 ] ~ [ linux-3.12.74 ] ~ [ linux-3.11.10 ] ~ [ linux-3.10.108 ] ~ [ linux-3.9.11 ] ~ [ linux-3.8.13 ] ~ [ linux-3.7.10 ] ~ [ linux-3.6.11 ] ~ [ linux-3.5.7 ] ~ [ linux-3.4.113 ] ~ [ linux-3.3.8 ] ~ [ linux-3.2.98 ] ~ [ linux-3.1.10 ] ~ [ linux-3.0.101 ] ~ [ linux-2.6.39.4 ] ~ [ linux-2.6.38.8 ] ~ [ linux-2.6.37.6 ] ~ [ linux-2.6.36.4 ] ~ [ linux-2.6.35.14 ] ~ [ linux-2.6.34.15 ] ~ [ linux-2.6.33.20 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.27.62 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.5 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /* -*- mode: c; c-basic-offset: 8; -*-
  2  * vim: noexpandtab sw=8 ts=8 sts=0:
  3  *
  4  * alloc.c
  5  *
  6  * Extent allocs and frees
  7  *
  8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
  9  *
 10  * This program is free software; you can redistribute it and/or
 11  * modify it under the terms of the GNU General Public
 12  * License as published by the Free Software Foundation; either
 13  * version 2 of the License, or (at your option) any later version.
 14  *
 15  * This program is distributed in the hope that it will be useful,
 16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 18  * General Public License for more details.
 19  *
 20  * You should have received a copy of the GNU General Public
 21  * License along with this program; if not, write to the
 22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 23  * Boston, MA 02111-1307, USA.
 24  */
 25 
 26 #include <linux/fs.h>
 27 #include <linux/types.h>
 28 #include <linux/slab.h>
 29 #include <linux/highmem.h>
 30 #include <linux/swap.h>
 31 #include <linux/quotaops.h>
 32 #include <linux/blkdev.h>
 33 #include <linux/sched/signal.h>
 34 
 35 #include <cluster/masklog.h>
 36 
 37 #include "ocfs2.h"
 38 
 39 #include "alloc.h"
 40 #include "aops.h"
 41 #include "blockcheck.h"
 42 #include "dlmglue.h"
 43 #include "extent_map.h"
 44 #include "inode.h"
 45 #include "journal.h"
 46 #include "localalloc.h"
 47 #include "suballoc.h"
 48 #include "sysfile.h"
 49 #include "file.h"
 50 #include "super.h"
 51 #include "uptodate.h"
 52 #include "xattr.h"
 53 #include "refcounttree.h"
 54 #include "ocfs2_trace.h"
 55 
 56 #include "buffer_head_io.h"
 57 
 58 enum ocfs2_contig_type {
 59         CONTIG_NONE = 0,
 60         CONTIG_LEFT,
 61         CONTIG_RIGHT,
 62         CONTIG_LEFTRIGHT,
 63 };
 64 
 65 static enum ocfs2_contig_type
 66         ocfs2_extent_rec_contig(struct super_block *sb,
 67                                 struct ocfs2_extent_rec *ext,
 68                                 struct ocfs2_extent_rec *insert_rec);
 69 /*
 70  * Operations for a specific extent tree type.
 71  *
 72  * To implement an on-disk btree (extent tree) type in ocfs2, add
 73  * an ocfs2_extent_tree_operations structure and the matching
 74  * ocfs2_init_<thingy>_extent_tree() function.  That's pretty much it
 75  * for the allocation portion of the extent tree.
 76  */
 77 struct ocfs2_extent_tree_operations {
 78         /*
 79          * last_eb_blk is the block number of the right most leaf extent
 80          * block.  Most on-disk structures containing an extent tree store
 81          * this value for fast access.  The ->eo_set_last_eb_blk() and
 82          * ->eo_get_last_eb_blk() operations access this value.  They are
 83          *  both required.
 84          */
 85         void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
 86                                    u64 blkno);
 87         u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
 88 
 89         /*
 90          * The on-disk structure usually keeps track of how many total
 91          * clusters are stored in this extent tree.  This function updates
 92          * that value.  new_clusters is the delta, and must be
 93          * added to the total.  Required.
 94          */
 95         void (*eo_update_clusters)(struct ocfs2_extent_tree *et,
 96                                    u32 new_clusters);
 97 
 98         /*
 99          * If this extent tree is supported by an extent map, insert
100          * a record into the map.
101          */
102         void (*eo_extent_map_insert)(struct ocfs2_extent_tree *et,
103                                      struct ocfs2_extent_rec *rec);
104 
105         /*
106          * If this extent tree is supported by an extent map, truncate the
107          * map to clusters,
108          */
109         void (*eo_extent_map_truncate)(struct ocfs2_extent_tree *et,
110                                        u32 clusters);
111 
112         /*
113          * If ->eo_insert_check() exists, it is called before rec is
114          * inserted into the extent tree.  It is optional.
115          */
116         int (*eo_insert_check)(struct ocfs2_extent_tree *et,
117                                struct ocfs2_extent_rec *rec);
118         int (*eo_sanity_check)(struct ocfs2_extent_tree *et);
119 
120         /*
121          * --------------------------------------------------------------
122          * The remaining are internal to ocfs2_extent_tree and don't have
123          * accessor functions
124          */
125 
126         /*
127          * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
128          * It is required.
129          */
130         void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
131 
132         /*
133          * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
134          * it exists.  If it does not, et->et_max_leaf_clusters is set
135          * to 0 (unlimited).  Optional.
136          */
137         void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et);
138 
139         /*
140          * ->eo_extent_contig test whether the 2 ocfs2_extent_rec
141          * are contiguous or not. Optional. Don't need to set it if use
142          * ocfs2_extent_rec as the tree leaf.
143          */
144         enum ocfs2_contig_type
145                 (*eo_extent_contig)(struct ocfs2_extent_tree *et,
146                                     struct ocfs2_extent_rec *ext,
147                                     struct ocfs2_extent_rec *insert_rec);
148 };
149 
150 
151 /*
152  * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
153  * in the methods.
154  */
155 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
156 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
157                                          u64 blkno);
158 static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
159                                          u32 clusters);
160 static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
161                                            struct ocfs2_extent_rec *rec);
162 static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
163                                              u32 clusters);
164 static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
165                                      struct ocfs2_extent_rec *rec);
166 static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
167 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
168 static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
169         .eo_set_last_eb_blk     = ocfs2_dinode_set_last_eb_blk,
170         .eo_get_last_eb_blk     = ocfs2_dinode_get_last_eb_blk,
171         .eo_update_clusters     = ocfs2_dinode_update_clusters,
172         .eo_extent_map_insert   = ocfs2_dinode_extent_map_insert,
173         .eo_extent_map_truncate = ocfs2_dinode_extent_map_truncate,
174         .eo_insert_check        = ocfs2_dinode_insert_check,
175         .eo_sanity_check        = ocfs2_dinode_sanity_check,
176         .eo_fill_root_el        = ocfs2_dinode_fill_root_el,
177 };
178 
179 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
180                                          u64 blkno)
181 {
182         struct ocfs2_dinode *di = et->et_object;
183 
184         BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
185         di->i_last_eb_blk = cpu_to_le64(blkno);
186 }
187 
188 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
189 {
190         struct ocfs2_dinode *di = et->et_object;
191 
192         BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
193         return le64_to_cpu(di->i_last_eb_blk);
194 }
195 
196 static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
197                                          u32 clusters)
198 {
199         struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
200         struct ocfs2_dinode *di = et->et_object;
201 
202         le32_add_cpu(&di->i_clusters, clusters);
203         spin_lock(&oi->ip_lock);
204         oi->ip_clusters = le32_to_cpu(di->i_clusters);
205         spin_unlock(&oi->ip_lock);
206 }
207 
208 static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
209                                            struct ocfs2_extent_rec *rec)
210 {
211         struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
212 
213         ocfs2_extent_map_insert_rec(inode, rec);
214 }
215 
216 static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
217                                              u32 clusters)
218 {
219         struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
220 
221         ocfs2_extent_map_trunc(inode, clusters);
222 }
223 
224 static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
225                                      struct ocfs2_extent_rec *rec)
226 {
227         struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
228         struct ocfs2_super *osb = OCFS2_SB(oi->vfs_inode.i_sb);
229 
230         BUG_ON(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL);
231         mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
232                         (oi->ip_clusters != le32_to_cpu(rec->e_cpos)),
233                         "Device %s, asking for sparse allocation: inode %llu, "
234                         "cpos %u, clusters %u\n",
235                         osb->dev_str,
236                         (unsigned long long)oi->ip_blkno,
237                         rec->e_cpos, oi->ip_clusters);
238 
239         return 0;
240 }
241 
242 static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et)
243 {
244         struct ocfs2_dinode *di = et->et_object;
245 
246         BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
247         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
248 
249         return 0;
250 }
251 
252 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
253 {
254         struct ocfs2_dinode *di = et->et_object;
255 
256         et->et_root_el = &di->id2.i_list;
257 }
258 
259 
260 static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
261 {
262         struct ocfs2_xattr_value_buf *vb = et->et_object;
263 
264         et->et_root_el = &vb->vb_xv->xr_list;
265 }
266 
267 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
268                                               u64 blkno)
269 {
270         struct ocfs2_xattr_value_buf *vb = et->et_object;
271 
272         vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno);
273 }
274 
275 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
276 {
277         struct ocfs2_xattr_value_buf *vb = et->et_object;
278 
279         return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
280 }
281 
282 static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et,
283                                               u32 clusters)
284 {
285         struct ocfs2_xattr_value_buf *vb = et->et_object;
286 
287         le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
288 }
289 
290 static const struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
291         .eo_set_last_eb_blk     = ocfs2_xattr_value_set_last_eb_blk,
292         .eo_get_last_eb_blk     = ocfs2_xattr_value_get_last_eb_blk,
293         .eo_update_clusters     = ocfs2_xattr_value_update_clusters,
294         .eo_fill_root_el        = ocfs2_xattr_value_fill_root_el,
295 };
296 
297 static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
298 {
299         struct ocfs2_xattr_block *xb = et->et_object;
300 
301         et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
302 }
303 
304 static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct ocfs2_extent_tree *et)
305 {
306         struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
307         et->et_max_leaf_clusters =
308                 ocfs2_clusters_for_bytes(sb, OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
309 }
310 
311 static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
312                                              u64 blkno)
313 {
314         struct ocfs2_xattr_block *xb = et->et_object;
315         struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
316 
317         xt->xt_last_eb_blk = cpu_to_le64(blkno);
318 }
319 
320 static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
321 {
322         struct ocfs2_xattr_block *xb = et->et_object;
323         struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
324 
325         return le64_to_cpu(xt->xt_last_eb_blk);
326 }
327 
328 static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et,
329                                              u32 clusters)
330 {
331         struct ocfs2_xattr_block *xb = et->et_object;
332 
333         le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
334 }
335 
336 static const struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
337         .eo_set_last_eb_blk     = ocfs2_xattr_tree_set_last_eb_blk,
338         .eo_get_last_eb_blk     = ocfs2_xattr_tree_get_last_eb_blk,
339         .eo_update_clusters     = ocfs2_xattr_tree_update_clusters,
340         .eo_fill_root_el        = ocfs2_xattr_tree_fill_root_el,
341         .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
342 };
343 
344 static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et,
345                                           u64 blkno)
346 {
347         struct ocfs2_dx_root_block *dx_root = et->et_object;
348 
349         dx_root->dr_last_eb_blk = cpu_to_le64(blkno);
350 }
351 
352 static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
353 {
354         struct ocfs2_dx_root_block *dx_root = et->et_object;
355 
356         return le64_to_cpu(dx_root->dr_last_eb_blk);
357 }
358 
359 static void ocfs2_dx_root_update_clusters(struct ocfs2_extent_tree *et,
360                                           u32 clusters)
361 {
362         struct ocfs2_dx_root_block *dx_root = et->et_object;
363 
364         le32_add_cpu(&dx_root->dr_clusters, clusters);
365 }
366 
367 static int ocfs2_dx_root_sanity_check(struct ocfs2_extent_tree *et)
368 {
369         struct ocfs2_dx_root_block *dx_root = et->et_object;
370 
371         BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root));
372 
373         return 0;
374 }
375 
376 static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
377 {
378         struct ocfs2_dx_root_block *dx_root = et->et_object;
379 
380         et->et_root_el = &dx_root->dr_list;
381 }
382 
383 static const struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
384         .eo_set_last_eb_blk     = ocfs2_dx_root_set_last_eb_blk,
385         .eo_get_last_eb_blk     = ocfs2_dx_root_get_last_eb_blk,
386         .eo_update_clusters     = ocfs2_dx_root_update_clusters,
387         .eo_sanity_check        = ocfs2_dx_root_sanity_check,
388         .eo_fill_root_el        = ocfs2_dx_root_fill_root_el,
389 };
390 
391 static void ocfs2_refcount_tree_fill_root_el(struct ocfs2_extent_tree *et)
392 {
393         struct ocfs2_refcount_block *rb = et->et_object;
394 
395         et->et_root_el = &rb->rf_list;
396 }
397 
398 static void ocfs2_refcount_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
399                                                 u64 blkno)
400 {
401         struct ocfs2_refcount_block *rb = et->et_object;
402 
403         rb->rf_last_eb_blk = cpu_to_le64(blkno);
404 }
405 
406 static u64 ocfs2_refcount_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
407 {
408         struct ocfs2_refcount_block *rb = et->et_object;
409 
410         return le64_to_cpu(rb->rf_last_eb_blk);
411 }
412 
413 static void ocfs2_refcount_tree_update_clusters(struct ocfs2_extent_tree *et,
414                                                 u32 clusters)
415 {
416         struct ocfs2_refcount_block *rb = et->et_object;
417 
418         le32_add_cpu(&rb->rf_clusters, clusters);
419 }
420 
421 static enum ocfs2_contig_type
422 ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et,
423                                   struct ocfs2_extent_rec *ext,
424                                   struct ocfs2_extent_rec *insert_rec)
425 {
426         return CONTIG_NONE;
427 }
428 
429 static const struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
430         .eo_set_last_eb_blk     = ocfs2_refcount_tree_set_last_eb_blk,
431         .eo_get_last_eb_blk     = ocfs2_refcount_tree_get_last_eb_blk,
432         .eo_update_clusters     = ocfs2_refcount_tree_update_clusters,
433         .eo_fill_root_el        = ocfs2_refcount_tree_fill_root_el,
434         .eo_extent_contig       = ocfs2_refcount_tree_extent_contig,
435 };
436 
437 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
438                                      struct ocfs2_caching_info *ci,
439                                      struct buffer_head *bh,
440                                      ocfs2_journal_access_func access,
441                                      void *obj,
442                                      const struct ocfs2_extent_tree_operations *ops)
443 {
444         et->et_ops = ops;
445         et->et_root_bh = bh;
446         et->et_ci = ci;
447         et->et_root_journal_access = access;
448         if (!obj)
449                 obj = (void *)bh->b_data;
450         et->et_object = obj;
451 
452         et->et_ops->eo_fill_root_el(et);
453         if (!et->et_ops->eo_fill_max_leaf_clusters)
454                 et->et_max_leaf_clusters = 0;
455         else
456                 et->et_ops->eo_fill_max_leaf_clusters(et);
457 }
458 
459 void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
460                                    struct ocfs2_caching_info *ci,
461                                    struct buffer_head *bh)
462 {
463         __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_di,
464                                  NULL, &ocfs2_dinode_et_ops);
465 }
466 
467 void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
468                                        struct ocfs2_caching_info *ci,
469                                        struct buffer_head *bh)
470 {
471         __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_xb,
472                                  NULL, &ocfs2_xattr_tree_et_ops);
473 }
474 
475 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
476                                         struct ocfs2_caching_info *ci,
477                                         struct ocfs2_xattr_value_buf *vb)
478 {
479         __ocfs2_init_extent_tree(et, ci, vb->vb_bh, vb->vb_access, vb,
480                                  &ocfs2_xattr_value_et_ops);
481 }
482 
483 void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
484                                     struct ocfs2_caching_info *ci,
485                                     struct buffer_head *bh)
486 {
487         __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_dr,
488                                  NULL, &ocfs2_dx_root_et_ops);
489 }
490 
491 void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
492                                      struct ocfs2_caching_info *ci,
493                                      struct buffer_head *bh)
494 {
495         __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_rb,
496                                  NULL, &ocfs2_refcount_tree_et_ops);
497 }
498 
499 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
500                                             u64 new_last_eb_blk)
501 {
502         et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
503 }
504 
505 static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
506 {
507         return et->et_ops->eo_get_last_eb_blk(et);
508 }
509 
510 static inline void ocfs2_et_update_clusters(struct ocfs2_extent_tree *et,
511                                             u32 clusters)
512 {
513         et->et_ops->eo_update_clusters(et, clusters);
514 }
515 
516 static inline void ocfs2_et_extent_map_insert(struct ocfs2_extent_tree *et,
517                                               struct ocfs2_extent_rec *rec)
518 {
519         if (et->et_ops->eo_extent_map_insert)
520                 et->et_ops->eo_extent_map_insert(et, rec);
521 }
522 
523 static inline void ocfs2_et_extent_map_truncate(struct ocfs2_extent_tree *et,
524                                                 u32 clusters)
525 {
526         if (et->et_ops->eo_extent_map_truncate)
527                 et->et_ops->eo_extent_map_truncate(et, clusters);
528 }
529 
530 static inline int ocfs2_et_root_journal_access(handle_t *handle,
531                                                struct ocfs2_extent_tree *et,
532                                                int type)
533 {
534         return et->et_root_journal_access(handle, et->et_ci, et->et_root_bh,
535                                           type);
536 }
537 
538 static inline enum ocfs2_contig_type
539         ocfs2_et_extent_contig(struct ocfs2_extent_tree *et,
540                                struct ocfs2_extent_rec *rec,
541                                struct ocfs2_extent_rec *insert_rec)
542 {
543         if (et->et_ops->eo_extent_contig)
544                 return et->et_ops->eo_extent_contig(et, rec, insert_rec);
545 
546         return ocfs2_extent_rec_contig(
547                                 ocfs2_metadata_cache_get_super(et->et_ci),
548                                 rec, insert_rec);
549 }
550 
551 static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et,
552                                         struct ocfs2_extent_rec *rec)
553 {
554         int ret = 0;
555 
556         if (et->et_ops->eo_insert_check)
557                 ret = et->et_ops->eo_insert_check(et, rec);
558         return ret;
559 }
560 
561 static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
562 {
563         int ret = 0;
564 
565         if (et->et_ops->eo_sanity_check)
566                 ret = et->et_ops->eo_sanity_check(et);
567         return ret;
568 }
569 
570 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
571                                          struct ocfs2_extent_block *eb);
572 static void ocfs2_adjust_rightmost_records(handle_t *handle,
573                                            struct ocfs2_extent_tree *et,
574                                            struct ocfs2_path *path,
575                                            struct ocfs2_extent_rec *insert_rec);
576 /*
577  * Reset the actual path elements so that we can re-use the structure
578  * to build another path. Generally, this involves freeing the buffer
579  * heads.
580  */
581 void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
582 {
583         int i, start = 0, depth = 0;
584         struct ocfs2_path_item *node;
585 
586         if (keep_root)
587                 start = 1;
588 
589         for(i = start; i < path_num_items(path); i++) {
590                 node = &path->p_node[i];
591 
592                 brelse(node->bh);
593                 node->bh = NULL;
594                 node->el = NULL;
595         }
596 
597         /*
598          * Tree depth may change during truncate, or insert. If we're
599          * keeping the root extent list, then make sure that our path
600          * structure reflects the proper depth.
601          */
602         if (keep_root)
603                 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
604         else
605                 path_root_access(path) = NULL;
606 
607         path->p_tree_depth = depth;
608 }
609 
/* Release every buffer head held by path and free it.  NULL is a no-op. */
void ocfs2_free_path(struct ocfs2_path *path)
{
	if (!path)
		return;

	ocfs2_reinit_path(path, 0);
	kfree(path);
}
617 
618 /*
619  * All the elements of src into dest. After this call, src could be freed
620  * without affecting dest.
621  *
622  * Both paths should have the same root. Any non-root elements of dest
623  * will be freed.
624  */
625 static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
626 {
627         int i;
628 
629         BUG_ON(path_root_bh(dest) != path_root_bh(src));
630         BUG_ON(path_root_el(dest) != path_root_el(src));
631         BUG_ON(path_root_access(dest) != path_root_access(src));
632 
633         ocfs2_reinit_path(dest, 1);
634 
635         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
636                 dest->p_node[i].bh = src->p_node[i].bh;
637                 dest->p_node[i].el = src->p_node[i].el;
638 
639                 if (dest->p_node[i].bh)
640                         get_bh(dest->p_node[i].bh);
641         }
642 }
643 
644 /*
645  * Make the *dest path the same as src and re-initialize src path to
646  * have a root only.
647  */
648 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
649 {
650         int i;
651 
652         BUG_ON(path_root_bh(dest) != path_root_bh(src));
653         BUG_ON(path_root_access(dest) != path_root_access(src));
654 
655         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
656                 brelse(dest->p_node[i].bh);
657 
658                 dest->p_node[i].bh = src->p_node[i].bh;
659                 dest->p_node[i].el = src->p_node[i].el;
660 
661                 src->p_node[i].bh = NULL;
662                 src->p_node[i].el = NULL;
663         }
664 }
665 
666 /*
667  * Insert an extent block at given index.
668  *
669  * This will not take an additional reference on eb_bh.
670  */
671 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
672                                         struct buffer_head *eb_bh)
673 {
674         struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
675 
676         /*
677          * Right now, no root bh is an extent block, so this helps
678          * catch code errors with dinode trees. The assertion can be
679          * safely removed if we ever need to insert extent block
680          * structures at the root.
681          */
682         BUG_ON(index == 0);
683 
684         path->p_node[index].bh = eb_bh;
685         path->p_node[index].el = &eb->h_list;
686 }
687 
688 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
689                                          struct ocfs2_extent_list *root_el,
690                                          ocfs2_journal_access_func access)
691 {
692         struct ocfs2_path *path;
693 
694         BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
695 
696         path = kzalloc(sizeof(*path), GFP_NOFS);
697         if (path) {
698                 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
699                 get_bh(root_bh);
700                 path_root_bh(path) = root_bh;
701                 path_root_el(path) = root_el;
702                 path_root_access(path) = access;
703         }
704 
705         return path;
706 }
707 
/* Allocate a new root-only path that shares the root of an existing path. */
struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
{
	struct buffer_head *root_bh = path_root_bh(path);
	struct ocfs2_extent_list *root_el = path_root_el(path);

	return ocfs2_new_path(root_bh, root_el, path_root_access(path));
}
713 
714 struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
715 {
716         return ocfs2_new_path(et->et_root_bh, et->et_root_el,
717                               et->et_root_journal_access);
718 }
719 
720 /*
721  * Journal the buffer at depth idx.  All idx>0 are extent_blocks,
722  * otherwise it's the root_access function.
723  *
724  * I don't like the way this function's name looks next to
725  * ocfs2_journal_access_path(), but I don't have a better one.
726  */
727 int ocfs2_path_bh_journal_access(handle_t *handle,
728                                  struct ocfs2_caching_info *ci,
729                                  struct ocfs2_path *path,
730                                  int idx)
731 {
732         ocfs2_journal_access_func access = path_root_access(path);
733 
734         if (!access)
735                 access = ocfs2_journal_access;
736 
737         if (idx)
738                 access = ocfs2_journal_access_eb;
739 
740         return access(handle, ci, path->p_node[idx].bh,
741                       OCFS2_JOURNAL_ACCESS_WRITE);
742 }
743 
744 /*
745  * Convenience function to journal all components in a path.
746  */
747 int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
748                               handle_t *handle,
749                               struct ocfs2_path *path)
750 {
751         int i, ret = 0;
752 
753         if (!path)
754                 goto out;
755 
756         for(i = 0; i < path_num_items(path); i++) {
757                 ret = ocfs2_path_bh_journal_access(handle, ci, path, i);
758                 if (ret < 0) {
759                         mlog_errno(ret);
760                         goto out;
761                 }
762         }
763 
764 out:
765         return ret;
766 }
767 
768 /*
769  * Return the index of the extent record which contains cluster #v_cluster.
770  * -1 is returned if it was not found.
771  *
772  * Should work fine on interior and exterior nodes.
773  */
774 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
775 {
776         int ret = -1;
777         int i;
778         struct ocfs2_extent_rec *rec;
779         u32 rec_end, rec_start, clusters;
780 
781         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
782                 rec = &el->l_recs[i];
783 
784                 rec_start = le32_to_cpu(rec->e_cpos);
785                 clusters = ocfs2_rec_clusters(el, rec);
786 
787                 rec_end = rec_start + clusters;
788 
789                 if (v_cluster >= rec_start && v_cluster < rec_end) {
790                         ret = i;
791                         break;
792                 }
793         }
794 
795         return ret;
796 }
797 
798 /*
799  * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
800  * ocfs2_extent_rec_contig only work properly against leaf nodes!
801  */
802 static int ocfs2_block_extent_contig(struct super_block *sb,
803                                      struct ocfs2_extent_rec *ext,
804                                      u64 blkno)
805 {
806         u64 blk_end = le64_to_cpu(ext->e_blkno);
807 
808         blk_end += ocfs2_clusters_to_blocks(sb,
809                                     le16_to_cpu(ext->e_leaf_clusters));
810 
811         return blkno == blk_end;
812 }
813 
814 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
815                                   struct ocfs2_extent_rec *right)
816 {
817         u32 left_range;
818 
819         left_range = le32_to_cpu(left->e_cpos) +
820                 le16_to_cpu(left->e_leaf_clusters);
821 
822         return (left_range == le32_to_cpu(right->e_cpos));
823 }
824 
825 static enum ocfs2_contig_type
826         ocfs2_extent_rec_contig(struct super_block *sb,
827                                 struct ocfs2_extent_rec *ext,
828                                 struct ocfs2_extent_rec *insert_rec)
829 {
830         u64 blkno = le64_to_cpu(insert_rec->e_blkno);
831 
832         /*
833          * Refuse to coalesce extent records with different flag
834          * fields - we don't want to mix unwritten extents with user
835          * data.
836          */
837         if (ext->e_flags != insert_rec->e_flags)
838                 return CONTIG_NONE;
839 
840         if (ocfs2_extents_adjacent(ext, insert_rec) &&
841             ocfs2_block_extent_contig(sb, ext, blkno))
842                         return CONTIG_RIGHT;
843 
844         blkno = le64_to_cpu(ext->e_blkno);
845         if (ocfs2_extents_adjacent(insert_rec, ext) &&
846             ocfs2_block_extent_contig(sb, insert_rec, blkno))
847                 return CONTIG_LEFT;
848 
849         return CONTIG_NONE;
850 }
851 
/*
 * Whether an insert appends to the tail of the tree.
 *
 * NOTE: contiguousness and appending are independent of each other, so
 * pretty much any combination of the two can show up.
 *
 * APPEND_TAIL is mostly useful because it tells us that the path to
 * that leaf will have to be updated.
 */
enum ocfs2_append_type {
	APPEND_NONE	= 0,
	APPEND_TAIL	= 1,
};
863 
/* Which kind of split, if any, an insert involves. */
enum ocfs2_split_type {
	SPLIT_NONE	= 0,
	SPLIT_LEFT	= 1,
	SPLIT_RIGHT	= 2,
};
869 
870 struct ocfs2_insert_type {
871         enum ocfs2_split_type   ins_split;
872         enum ocfs2_append_type  ins_appending;
873         enum ocfs2_contig_type  ins_contig;
874         int                     ins_contig_index;
875         int                     ins_tree_depth;
876 };
877 
878 struct ocfs2_merge_ctxt {
879         enum ocfs2_contig_type  c_contig_type;
880         int                     c_has_empty_extent;
881         int                     c_split_covers_rec;
882 };
883 
884 static int ocfs2_validate_extent_block(struct super_block *sb,
885                                        struct buffer_head *bh)
886 {
887         int rc;
888         struct ocfs2_extent_block *eb =
889                 (struct ocfs2_extent_block *)bh->b_data;
890 
891         trace_ocfs2_validate_extent_block((unsigned long long)bh->b_blocknr);
892 
893         BUG_ON(!buffer_uptodate(bh));
894 
895         /*
896          * If the ecc fails, we return the error but otherwise
897          * leave the filesystem running.  We know any error is
898          * local to this block.
899          */
900         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
901         if (rc) {
902                 mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
903                      (unsigned long long)bh->b_blocknr);
904                 return rc;
905         }
906 
907         /*
908          * Errors after here are fatal.
909          */
910 
911         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
912                 rc = ocfs2_error(sb,
913                                  "Extent block #%llu has bad signature %.*s\n",
914                                  (unsigned long long)bh->b_blocknr, 7,
915                                  eb->h_signature);
916                 goto bail;
917         }
918 
919         if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
920                 rc = ocfs2_error(sb,
921                                  "Extent block #%llu has an invalid h_blkno of %llu\n",
922                                  (unsigned long long)bh->b_blocknr,
923                                  (unsigned long long)le64_to_cpu(eb->h_blkno));
924                 goto bail;
925         }
926 
927         if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
928                 rc = ocfs2_error(sb,
929                                  "Extent block #%llu has an invalid h_fs_generation of #%u\n",
930                                  (unsigned long long)bh->b_blocknr,
931                                  le32_to_cpu(eb->h_fs_generation));
932                 goto bail;
933         }
934 bail:
935         return rc;
936 }
937 
938 int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
939                             struct buffer_head **bh)
940 {
941         int rc;
942         struct buffer_head *tmp = *bh;
943 
944         rc = ocfs2_read_block(ci, eb_blkno, &tmp,
945                               ocfs2_validate_extent_block);
946 
947         /* If ocfs2_read_block() got us a new bh, pass it up. */
948         if (!rc && !*bh)
949                 *bh = tmp;
950 
951         return rc;
952 }
953 
954 
955 /*
956  * How many free extents have we got before we need more meta data?
957  */
958 int ocfs2_num_free_extents(struct ocfs2_extent_tree *et)
959 {
960         int retval;
961         struct ocfs2_extent_list *el = NULL;
962         struct ocfs2_extent_block *eb;
963         struct buffer_head *eb_bh = NULL;
964         u64 last_eb_blk = 0;
965 
966         el = et->et_root_el;
967         last_eb_blk = ocfs2_et_get_last_eb_blk(et);
968 
969         if (last_eb_blk) {
970                 retval = ocfs2_read_extent_block(et->et_ci, last_eb_blk,
971                                                  &eb_bh);
972                 if (retval < 0) {
973                         mlog_errno(retval);
974                         goto bail;
975                 }
976                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
977                 el = &eb->h_list;
978         }
979 
980         BUG_ON(el->l_tree_depth != 0);
981 
982         retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
983 bail:
984         brelse(eb_bh);
985 
986         trace_ocfs2_num_free_extents(retval);
987         return retval;
988 }
989 
990 /* expects array to already be allocated
991  *
992  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
993  * l_count for you
994  */
995 static int ocfs2_create_new_meta_bhs(handle_t *handle,
996                                      struct ocfs2_extent_tree *et,
997                                      int wanted,
998                                      struct ocfs2_alloc_context *meta_ac,
999                                      struct buffer_head *bhs[])
1000 {
1001         int count, status, i;
1002         u16 suballoc_bit_start;
1003         u32 num_got;
1004         u64 suballoc_loc, first_blkno;
1005         struct ocfs2_super *osb =
1006                 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
1007         struct ocfs2_extent_block *eb;
1008 
1009         count = 0;
1010         while (count < wanted) {
1011                 status = ocfs2_claim_metadata(handle,
1012                                               meta_ac,
1013                                               wanted - count,
1014                                               &suballoc_loc,
1015                                               &suballoc_bit_start,
1016                                               &num_got,
1017                                               &first_blkno);
1018                 if (status < 0) {
1019                         mlog_errno(status);
1020                         goto bail;
1021                 }
1022 
1023                 for(i = count;  i < (num_got + count); i++) {
1024                         bhs[i] = sb_getblk(osb->sb, first_blkno);
1025                         if (bhs[i] == NULL) {
1026                                 status = -ENOMEM;
1027                                 mlog_errno(status);
1028                                 goto bail;
1029                         }
1030                         ocfs2_set_new_buffer_uptodate(et->et_ci, bhs[i]);
1031 
1032                         status = ocfs2_journal_access_eb(handle, et->et_ci,
1033                                                          bhs[i],
1034                                                          OCFS2_JOURNAL_ACCESS_CREATE);
1035                         if (status < 0) {
1036                                 mlog_errno(status);
1037                                 goto bail;
1038                         }
1039 
1040                         memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
1041                         eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
1042                         /* Ok, setup the minimal stuff here. */
1043                         strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
1044                         eb->h_blkno = cpu_to_le64(first_blkno);
1045                         eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
1046                         eb->h_suballoc_slot =
1047                                 cpu_to_le16(meta_ac->ac_alloc_slot);
1048                         eb->h_suballoc_loc = cpu_to_le64(suballoc_loc);
1049                         eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1050                         eb->h_list.l_count =
1051                                 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
1052 
1053                         suballoc_bit_start++;
1054                         first_blkno++;
1055 
1056                         /* We'll also be dirtied by the caller, so
1057                          * this isn't absolutely necessary. */
1058                         ocfs2_journal_dirty(handle, bhs[i]);
1059                 }
1060 
1061                 count += num_got;
1062         }
1063 
1064         status = 0;
1065 bail:
1066         if (status < 0) {
1067                 for(i = 0; i < wanted; i++) {
1068                         brelse(bhs[i]);
1069                         bhs[i] = NULL;
1070                 }
1071                 mlog_errno(status);
1072         }
1073         return status;
1074 }
1075 
1076 /*
1077  * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
1078  *
1079  * Returns the sum of the rightmost extent rec logical offset and
1080  * cluster count.
1081  *
1082  * ocfs2_add_branch() uses this to determine what logical cluster
1083  * value should be populated into the leftmost new branch records.
1084  *
1085  * ocfs2_shift_tree_depth() uses this to determine the # clusters
1086  * value for the new topmost tree record.
1087  */
1088 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
1089 {
1090         int i;
1091 
1092         i = le16_to_cpu(el->l_next_free_rec) - 1;
1093 
1094         return le32_to_cpu(el->l_recs[i].e_cpos) +
1095                 ocfs2_rec_clusters(el, &el->l_recs[i]);
1096 }
1097 
1098 /*
1099  * Change range of the branches in the right most path according to the leaf
1100  * extent block's rightmost record.
1101  */
1102 static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1103                                          struct ocfs2_extent_tree *et)
1104 {
1105         int status;
1106         struct ocfs2_path *path = NULL;
1107         struct ocfs2_extent_list *el;
1108         struct ocfs2_extent_rec *rec;
1109 
1110         path = ocfs2_new_path_from_et(et);
1111         if (!path) {
1112                 status = -ENOMEM;
1113                 return status;
1114         }
1115 
1116         status = ocfs2_find_path(et->et_ci, path, UINT_MAX);
1117         if (status < 0) {
1118                 mlog_errno(status);
1119                 goto out;
1120         }
1121 
1122         status = ocfs2_extend_trans(handle, path_num_items(path));
1123         if (status < 0) {
1124                 mlog_errno(status);
1125                 goto out;
1126         }
1127 
1128         status = ocfs2_journal_access_path(et->et_ci, handle, path);
1129         if (status < 0) {
1130                 mlog_errno(status);
1131                 goto out;
1132         }
1133 
1134         el = path_leaf_el(path);
1135         rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec) - 1];
1136 
1137         ocfs2_adjust_rightmost_records(handle, et, path, rec);
1138 
1139 out:
1140         ocfs2_free_path(path);
1141         return status;
1142 }
1143 
1144 /*
1145  * Add an entire tree branch to our inode. eb_bh is the extent block
1146  * to start at, if we don't want to start the branch at the root
1147  * structure.
1148  *
1149  * last_eb_bh is required as we have to update it's next_leaf pointer
1150  * for the new last extent block.
1151  *
1152  * the new branch will be 'empty' in the sense that every block will
1153  * contain a single record with cluster count == 0.
1154  */
1155 static int ocfs2_add_branch(handle_t *handle,
1156                             struct ocfs2_extent_tree *et,
1157                             struct buffer_head *eb_bh,
1158                             struct buffer_head **last_eb_bh,
1159                             struct ocfs2_alloc_context *meta_ac)
1160 {
1161         int status, new_blocks, i;
1162         u64 next_blkno, new_last_eb_blk;
1163         struct buffer_head *bh;
1164         struct buffer_head **new_eb_bhs = NULL;
1165         struct ocfs2_extent_block *eb;
1166         struct ocfs2_extent_list  *eb_el;
1167         struct ocfs2_extent_list  *el;
1168         u32 new_cpos, root_end;
1169 
1170         BUG_ON(!last_eb_bh || !*last_eb_bh);
1171 
1172         if (eb_bh) {
1173                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
1174                 el = &eb->h_list;
1175         } else
1176                 el = et->et_root_el;
1177 
1178         /* we never add a branch to a leaf. */
1179         BUG_ON(!el->l_tree_depth);
1180 
1181         new_blocks = le16_to_cpu(el->l_tree_depth);
1182 
1183         eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
1184         new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
1185         root_end = ocfs2_sum_rightmost_rec(et->et_root_el);
1186 
1187         /*
1188          * If there is a gap before the root end and the real end
1189          * of the righmost leaf block, we need to remove the gap
1190          * between new_cpos and root_end first so that the tree
1191          * is consistent after we add a new branch(it will start
1192          * from new_cpos).
1193          */
1194         if (root_end > new_cpos) {
1195                 trace_ocfs2_adjust_rightmost_branch(
1196                         (unsigned long long)
1197                         ocfs2_metadata_cache_owner(et->et_ci),
1198                         root_end, new_cpos);
1199 
1200                 status = ocfs2_adjust_rightmost_branch(handle, et);
1201                 if (status) {
1202                         mlog_errno(status);
1203                         goto bail;
1204                 }
1205         }
1206 
1207         /* allocate the number of new eb blocks we need */
1208         new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
1209                              GFP_KERNEL);
1210         if (!new_eb_bhs) {
1211                 status = -ENOMEM;
1212                 mlog_errno(status);
1213                 goto bail;
1214         }
1215 
1216         status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
1217                                            meta_ac, new_eb_bhs);
1218         if (status < 0) {
1219                 mlog_errno(status);
1220                 goto bail;
1221         }
1222 
1223         /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
1224          * linked with the rest of the tree.
1225          * conversly, new_eb_bhs[0] is the new bottommost leaf.
1226          *
1227          * when we leave the loop, new_last_eb_blk will point to the
1228          * newest leaf, and next_blkno will point to the topmost extent
1229          * block. */
1230         next_blkno = new_last_eb_blk = 0;
1231         for(i = 0; i < new_blocks; i++) {
1232                 bh = new_eb_bhs[i];
1233                 eb = (struct ocfs2_extent_block *) bh->b_data;
1234                 /* ocfs2_create_new_meta_bhs() should create it right! */
1235                 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1236                 eb_el = &eb->h_list;
1237 
1238                 status = ocfs2_journal_access_eb(handle, et->et_ci, bh,
1239                                                  OCFS2_JOURNAL_ACCESS_CREATE);
1240                 if (status < 0) {
1241                         mlog_errno(status);
1242                         goto bail;
1243                 }
1244 
1245                 eb->h_next_leaf_blk = 0;
1246                 eb_el->l_tree_depth = cpu_to_le16(i);
1247                 eb_el->l_next_free_rec = cpu_to_le16(1);
1248                 /*
1249                  * This actually counts as an empty extent as
1250                  * c_clusters == 0
1251                  */
1252                 eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
1253                 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
1254                 /*
1255                  * eb_el isn't always an interior node, but even leaf
1256                  * nodes want a zero'd flags and reserved field so
1257                  * this gets the whole 32 bits regardless of use.
1258                  */
1259                 eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
1260                 if (!eb_el->l_tree_depth)
1261                         new_last_eb_blk = le64_to_cpu(eb->h_blkno);
1262 
1263                 ocfs2_journal_dirty(handle, bh);
1264                 next_blkno = le64_to_cpu(eb->h_blkno);
1265         }
1266 
1267         /* This is a bit hairy. We want to update up to three blocks
1268          * here without leaving any of them in an inconsistent state
1269          * in case of error. We don't have to worry about
1270          * journal_dirty erroring as it won't unless we've aborted the
1271          * handle (in which case we would never be here) so reserving
1272          * the write with journal_access is all we need to do. */
1273         status = ocfs2_journal_access_eb(handle, et->et_ci, *last_eb_bh,
1274                                          OCFS2_JOURNAL_ACCESS_WRITE);
1275         if (status < 0) {
1276                 mlog_errno(status);
1277                 goto bail;
1278         }
1279         status = ocfs2_et_root_journal_access(handle, et,
1280                                               OCFS2_JOURNAL_ACCESS_WRITE);
1281         if (status < 0) {
1282                 mlog_errno(status);
1283                 goto bail;
1284         }
1285         if (eb_bh) {
1286                 status = ocfs2_journal_access_eb(handle, et->et_ci, eb_bh,
1287                                                  OCFS2_JOURNAL_ACCESS_WRITE);
1288                 if (status < 0) {
1289                         mlog_errno(status);
1290                         goto bail;
1291                 }
1292         }
1293 
1294         /* Link the new branch into the rest of the tree (el will
1295          * either be on the root_bh, or the extent block passed in. */
1296         i = le16_to_cpu(el->l_next_free_rec);
1297         el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
1298         el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
1299         el->l_recs[i].e_int_clusters = 0;
1300         le16_add_cpu(&el->l_next_free_rec, 1);
1301 
1302         /* fe needs a new last extent block pointer, as does the
1303          * next_leaf on the previously last-extent-block. */
1304         ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
1305 
1306         eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
1307         eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
1308 
1309         ocfs2_journal_dirty(handle, *last_eb_bh);
1310         ocfs2_journal_dirty(handle, et->et_root_bh);
1311         if (eb_bh)
1312                 ocfs2_journal_dirty(handle, eb_bh);
1313 
1314         /*
1315          * Some callers want to track the rightmost leaf so pass it
1316          * back here.
1317          */
1318         brelse(*last_eb_bh);
1319         get_bh(new_eb_bhs[0]);
1320         *last_eb_bh = new_eb_bhs[0];
1321 
1322         status = 0;
1323 bail:
1324         if (new_eb_bhs) {
1325                 for (i = 0; i < new_blocks; i++)
1326                         brelse(new_eb_bhs[i]);
1327                 kfree(new_eb_bhs);
1328         }
1329 
1330         return status;
1331 }
1332 
1333 /*
1334  * adds another level to the allocation tree.
1335  * returns back the new extent block so you can add a branch to it
1336  * after this call.
1337  */
1338 static int ocfs2_shift_tree_depth(handle_t *handle,
1339                                   struct ocfs2_extent_tree *et,
1340                                   struct ocfs2_alloc_context *meta_ac,
1341                                   struct buffer_head **ret_new_eb_bh)
1342 {
1343         int status, i;
1344         u32 new_clusters;
1345         struct buffer_head *new_eb_bh = NULL;
1346         struct ocfs2_extent_block *eb;
1347         struct ocfs2_extent_list  *root_el;
1348         struct ocfs2_extent_list  *eb_el;
1349 
1350         status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
1351                                            &new_eb_bh);
1352         if (status < 0) {
1353                 mlog_errno(status);
1354                 goto bail;
1355         }
1356 
1357         eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
1358         /* ocfs2_create_new_meta_bhs() should create it right! */
1359         BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1360 
1361         eb_el = &eb->h_list;
1362         root_el = et->et_root_el;
1363 
1364         status = ocfs2_journal_access_eb(handle, et->et_ci, new_eb_bh,
1365                                          OCFS2_JOURNAL_ACCESS_CREATE);
1366         if (status < 0) {
1367                 mlog_errno(status);
1368                 goto bail;
1369         }
1370 
1371         /* copy the root extent list data into the new extent block */
1372         eb_el->l_tree_depth = root_el->l_tree_depth;
1373         eb_el->l_next_free_rec = root_el->l_next_free_rec;
1374         for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1375                 eb_el->l_recs[i] = root_el->l_recs[i];
1376 
1377         ocfs2_journal_dirty(handle, new_eb_bh);
1378 
1379         status = ocfs2_et_root_journal_access(handle, et,
1380                                               OCFS2_JOURNAL_ACCESS_WRITE);
1381         if (status < 0) {
1382                 mlog_errno(status);
1383                 goto bail;
1384         }
1385 
1386         new_clusters = ocfs2_sum_rightmost_rec(eb_el);
1387 
1388         /* update root_bh now */
1389         le16_add_cpu(&root_el->l_tree_depth, 1);
1390         root_el->l_recs[0].e_cpos = 0;
1391         root_el->l_recs[0].e_blkno = eb->h_blkno;
1392         root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
1393         for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1394                 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
1395         root_el->l_next_free_rec = cpu_to_le16(1);
1396 
1397         /* If this is our 1st tree depth shift, then last_eb_blk
1398          * becomes the allocated extent block */
1399         if (root_el->l_tree_depth == cpu_to_le16(1))
1400                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
1401 
1402         ocfs2_journal_dirty(handle, et->et_root_bh);
1403 
1404         *ret_new_eb_bh = new_eb_bh;
1405         new_eb_bh = NULL;
1406         status = 0;
1407 bail:
1408         brelse(new_eb_bh);
1409 
1410         return status;
1411 }
1412 
1413 /*
1414  * Should only be called when there is no space left in any of the
1415  * leaf nodes. What we want to do is find the lowest tree depth
1416  * non-leaf extent block with room for new records. There are three
1417  * valid results of this search:
1418  *
1419  * 1) a lowest extent block is found, then we pass it back in
1420  *    *lowest_eb_bh and return ''
1421  *
1422  * 2) the search fails to find anything, but the root_el has room. We
1423  *    pass NULL back in *lowest_eb_bh, but still return ''
1424  *
1425  * 3) the search fails to find anything AND the root_el is full, in
1426  *    which case we return > 0
1427  *
1428  * return status < 0 indicates an error.
1429  */
1430 static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
1431                                     struct buffer_head **target_bh)
1432 {
1433         int status = 0, i;
1434         u64 blkno;
1435         struct ocfs2_extent_block *eb;
1436         struct ocfs2_extent_list  *el;
1437         struct buffer_head *bh = NULL;
1438         struct buffer_head *lowest_bh = NULL;
1439 
1440         *target_bh = NULL;
1441 
1442         el = et->et_root_el;
1443 
1444         while(le16_to_cpu(el->l_tree_depth) > 1) {
1445                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1446                         ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
1447                                     "Owner %llu has empty extent list (next_free_rec == 0)\n",
1448                                     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
1449                         status = -EIO;
1450                         goto bail;
1451                 }
1452                 i = le16_to_cpu(el->l_next_free_rec) - 1;
1453                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1454                 if (!blkno) {
1455                         ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
1456                                     "Owner %llu has extent list where extent # %d has no physical block start\n",
1457                                     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
1458                         status = -EIO;
1459                         goto bail;
1460                 }
1461 
1462                 brelse(bh);
1463                 bh = NULL;
1464 
1465                 status = ocfs2_read_extent_block(et->et_ci, blkno, &bh);
1466                 if (status < 0) {
1467                         mlog_errno(status);
1468                         goto bail;
1469                 }
1470 
1471                 eb = (struct ocfs2_extent_block *) bh->b_data;
1472                 el = &eb->h_list;
1473 
1474                 if (le16_to_cpu(el->l_next_free_rec) <
1475                     le16_to_cpu(el->l_count)) {
1476                         brelse(lowest_bh);
1477                         lowest_bh = bh;
1478                         get_bh(lowest_bh);
1479                 }
1480         }
1481 
1482         /* If we didn't find one and the fe doesn't have any room,
1483          * then return '1' */
1484         el = et->et_root_el;
1485         if (!lowest_bh && (el->l_next_free_rec == el->l_count))
1486                 status = 1;
1487 
1488         *target_bh = lowest_bh;
1489 bail:
1490         brelse(bh);
1491 
1492         return status;
1493 }
1494 
1495 /*
1496  * Grow a b-tree so that it has more records.
1497  *
1498  * We might shift the tree depth in which case existing paths should
1499  * be considered invalid.
1500  *
1501  * Tree depth after the grow is returned via *final_depth.
1502  *
1503  * *last_eb_bh will be updated by ocfs2_add_branch().
1504  */
1505 static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
1506                            int *final_depth, struct buffer_head **last_eb_bh,
1507                            struct ocfs2_alloc_context *meta_ac)
1508 {
1509         int ret, shift;
1510         struct ocfs2_extent_list *el = et->et_root_el;
1511         int depth = le16_to_cpu(el->l_tree_depth);
1512         struct buffer_head *bh = NULL;
1513 
1514         BUG_ON(meta_ac == NULL);
1515 
1516         shift = ocfs2_find_branch_target(et, &bh);
1517         if (shift < 0) {
1518                 ret = shift;
1519                 mlog_errno(ret);
1520                 goto out;
1521         }
1522 
1523         /* We traveled all the way to the bottom of the allocation tree
1524          * and didn't find room for any more extents - we need to add
1525          * another tree level */
1526         if (shift) {
1527                 BUG_ON(bh);
1528                 trace_ocfs2_grow_tree(
1529                         (unsigned long long)
1530                         ocfs2_metadata_cache_owner(et->et_ci),
1531                         depth);
1532 
1533                 /* ocfs2_shift_tree_depth will return us a buffer with
1534                  * the new extent block (so we can pass that to
1535                  * ocfs2_add_branch). */
1536                 ret = ocfs2_shift_tree_depth(handle, et, meta_ac, &bh);
1537                 if (ret < 0) {
1538                         mlog_errno(ret);
1539                         goto out;
1540                 }
1541                 depth++;
1542                 if (depth == 1) {
1543                         /*
1544                          * Special case: we have room now if we shifted from
1545                          * tree_depth 0, so no more work needs to be done.
1546                          *
1547                          * We won't be calling add_branch, so pass
1548                          * back *last_eb_bh as the new leaf. At depth
1549                          * zero, it should always be null so there's
1550                          * no reason to brelse.
1551                          */
1552                         BUG_ON(*last_eb_bh);
1553                         get_bh(bh);
1554                         *last_eb_bh = bh;
1555                         goto out;
1556                 }
1557         }
1558 
1559         /* call ocfs2_add_branch to add the final part of the tree with
1560          * the new data. */
1561         ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
1562                                meta_ac);
1563         if (ret < 0) {
1564                 mlog_errno(ret);
1565                 goto out;
1566         }
1567 
1568 out:
1569         if (final_depth)
1570                 *final_depth = depth;
1571         brelse(bh);
1572         return ret;
1573 }
1574 
1575 /*
1576  * This function will discard the rightmost extent record.
1577  */
1578 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
1579 {
1580         int next_free = le16_to_cpu(el->l_next_free_rec);
1581         int count = le16_to_cpu(el->l_count);
1582         unsigned int num_bytes;
1583 
1584         BUG_ON(!next_free);
1585         /* This will cause us to go off the end of our extent list. */
1586         BUG_ON(next_free >= count);
1587 
1588         num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
1589 
1590         memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
1591 }
1592 
1593 static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1594                               struct ocfs2_extent_rec *insert_rec)
1595 {
1596         int i, insert_index, next_free, has_empty, num_bytes;
1597         u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
1598         struct ocfs2_extent_rec *rec;
1599 
1600         next_free = le16_to_cpu(el->l_next_free_rec);
1601         has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
1602 
1603         BUG_ON(!next_free);
1604 
1605         /* The tree code before us didn't allow enough room in the leaf. */
1606         BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
1607 
1608         /*
1609          * The easiest way to approach this is to just remove the
1610          * empty extent and temporarily decrement next_free.
1611          */
1612         if (has_empty) {
1613                 /*
1614                  * If next_free was 1 (only an empty extent), this
1615                  * loop won't execute, which is fine. We still want
1616                  * the decrement above to happen.
1617                  */
1618                 for(i = 0; i < (next_free - 1); i++)
1619                         el->l_recs[i] = el->l_recs[i+1];
1620 
1621                 next_free--;
1622         }
1623 
1624         /*
1625          * Figure out what the new record index should be.
1626          */
1627         for(i = 0; i < next_free; i++) {
1628                 rec = &el->l_recs[i];
1629 
1630                 if (insert_cpos < le32_to_cpu(rec->e_cpos))
1631                         break;
1632         }
1633         insert_index = i;
1634 
1635         trace_ocfs2_rotate_leaf(insert_cpos, insert_index,
1636                                 has_empty, next_free,
1637                                 le16_to_cpu(el->l_count));
1638 
1639         BUG_ON(insert_index < 0);
1640         BUG_ON(insert_index >= le16_to_cpu(el->l_count));
1641         BUG_ON(insert_index > next_free);
1642 
1643         /*
1644          * No need to memmove if we're just adding to the tail.
1645          */
1646         if (insert_index != next_free) {
1647                 BUG_ON(next_free >= le16_to_cpu(el->l_count));
1648 
1649                 num_bytes = next_free - insert_index;
1650                 num_bytes *= sizeof(struct ocfs2_extent_rec);
1651                 memmove(&el->l_recs[insert_index + 1],
1652                         &el->l_recs[insert_index],
1653                         num_bytes);
1654         }
1655 
1656         /*
1657          * Either we had an empty extent, and need to re-increment or
1658          * there was no empty extent on a non full rightmost leaf node,
1659          * in which case we still need to increment.
1660          */
1661         next_free++;
1662         el->l_next_free_rec = cpu_to_le16(next_free);
1663         /*
1664          * Make sure none of the math above just messed up our tree.
1665          */
1666         BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
1667 
1668         el->l_recs[insert_index] = *insert_rec;
1669 
1670 }
1671 
1672 static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1673 {
1674         int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1675 
1676         BUG_ON(num_recs == 0);
1677 
1678         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1679                 num_recs--;
1680                 size = num_recs * sizeof(struct ocfs2_extent_rec);
1681                 memmove(&el->l_recs[0], &el->l_recs[1], size);
1682                 memset(&el->l_recs[num_recs], 0,
1683                        sizeof(struct ocfs2_extent_rec));
1684                 el->l_next_free_rec = cpu_to_le16(num_recs);
1685         }
1686 }
1687 
1688 /*
1689  * Create an empty extent record .
1690  *
1691  * l_next_free_rec may be updated.
1692  *
1693  * If an empty extent already exists do nothing.
1694  */
1695 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
1696 {
1697         int next_free = le16_to_cpu(el->l_next_free_rec);
1698 
1699         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1700 
1701         if (next_free == 0)
1702                 goto set_and_inc;
1703 
1704         if (ocfs2_is_empty_extent(&el->l_recs[0]))
1705                 return;
1706 
1707         mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
1708                         "Asked to create an empty extent in a full list:\n"
1709                         "count = %u, tree depth = %u",
1710                         le16_to_cpu(el->l_count),
1711                         le16_to_cpu(el->l_tree_depth));
1712 
1713         ocfs2_shift_records_right(el);
1714 
1715 set_and_inc:
1716         le16_add_cpu(&el->l_next_free_rec, 1);
1717         memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1718 }
1719 
1720 /*
1721  * For a rotation which involves two leaf nodes, the "root node" is
1722  * the lowest level tree node which contains a path to both leafs. This
1723  * resulting set of information can be used to form a complete "subtree"
1724  *
1725  * This function is passed two full paths from the dinode down to a
1726  * pair of adjacent leaves. It's task is to figure out which path
1727  * index contains the subtree root - this can be the root index itself
1728  * in a worst-case rotation.
1729  *
1730  * The array index of the subtree root is passed back.
1731  */
1732 int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
1733                             struct ocfs2_path *left,
1734                             struct ocfs2_path *right)
1735 {
1736         int i = 0;
1737 
1738         /*
1739          * Check that the caller passed in two paths from the same tree.
1740          */
1741         BUG_ON(path_root_bh(left) != path_root_bh(right));
1742 
1743         do {
1744                 i++;
1745 
1746                 /*
1747                  * The caller didn't pass two adjacent paths.
1748                  */
1749                 mlog_bug_on_msg(i > left->p_tree_depth,
1750                                 "Owner %llu, left depth %u, right depth %u\n"
1751                                 "left leaf blk %llu, right leaf blk %llu\n",
1752                                 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
1753                                 left->p_tree_depth, right->p_tree_depth,
1754                                 (unsigned long long)path_leaf_bh(left)->b_blocknr,
1755                                 (unsigned long long)path_leaf_bh(right)->b_blocknr);
1756         } while (left->p_node[i].bh->b_blocknr ==
1757                  right->p_node[i].bh->b_blocknr);
1758 
1759         return i - 1;
1760 }
1761 
1762 typedef void (path_insert_t)(void *, struct buffer_head *);
1763 
1764 /*
1765  * Traverse a btree path in search of cpos, starting at root_el.
1766  *
1767  * This code can be called with a cpos larger than the tree, in which
1768  * case it will return the rightmost path.
1769  */
1770 static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
1771                              struct ocfs2_extent_list *root_el, u32 cpos,
1772                              path_insert_t *func, void *data)
1773 {
1774         int i, ret = 0;
1775         u32 range;
1776         u64 blkno;
1777         struct buffer_head *bh = NULL;
1778         struct ocfs2_extent_block *eb;
1779         struct ocfs2_extent_list *el;
1780         struct ocfs2_extent_rec *rec;
1781 
1782         el = root_el;
1783         while (el->l_tree_depth) {
1784                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1785                         ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1786                                     "Owner %llu has empty extent list at depth %u\n",
1787                                     (unsigned long long)ocfs2_metadata_cache_owner(ci),
1788                                     le16_to_cpu(el->l_tree_depth));
1789                         ret = -EROFS;
1790                         goto out;
1791 
1792                 }
1793 
1794                 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1795                         rec = &el->l_recs[i];
1796 
1797                         /*
1798                          * In the case that cpos is off the allocation
1799                          * tree, this should just wind up returning the
1800                          * rightmost record.
1801                          */
1802                         range = le32_to_cpu(rec->e_cpos) +
1803                                 ocfs2_rec_clusters(el, rec);
1804                         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1805                             break;
1806                 }
1807 
1808                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1809                 if (blkno == 0) {
1810                         ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1811                                     "Owner %llu has bad blkno in extent list at depth %u (index %d)\n",
1812                                     (unsigned long long)ocfs2_metadata_cache_owner(ci),
1813                                     le16_to_cpu(el->l_tree_depth), i);
1814                         ret = -EROFS;
1815                         goto out;
1816                 }
1817 
1818                 brelse(bh);
1819                 bh = NULL;
1820                 ret = ocfs2_read_extent_block(ci, blkno, &bh);
1821                 if (ret) {
1822                         mlog_errno(ret);
1823                         goto out;
1824                 }
1825 
1826                 eb = (struct ocfs2_extent_block *) bh->b_data;
1827                 el = &eb->h_list;
1828 
1829                 if (le16_to_cpu(el->l_next_free_rec) >
1830                     le16_to_cpu(el->l_count)) {
1831                         ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1832                                     "Owner %llu has bad count in extent list at block %llu (next free=%u, count=%u)\n",
1833                                     (unsigned long long)ocfs2_metadata_cache_owner(ci),
1834                                     (unsigned long long)bh->b_blocknr,
1835                                     le16_to_cpu(el->l_next_free_rec),
1836                                     le16_to_cpu(el->l_count));
1837                         ret = -EROFS;
1838                         goto out;
1839                 }
1840 
1841                 if (func)
1842                         func(data, bh);
1843         }
1844 
1845 out:
1846         /*
1847          * Catch any trailing bh that the loop didn't handle.
1848          */
1849         brelse(bh);
1850 
1851         return ret;
1852 }
1853 
1854 /*
1855  * Given an initialized path (that is, it has a valid root extent
1856  * list), this function will traverse the btree in search of the path
1857  * which would contain cpos.
1858  *
1859  * The path traveled is recorded in the path structure.
1860  *
1861  * Note that this will not do any comparisons on leaf node extent
1862  * records, so it will work fine in the case that we just added a tree
1863  * branch.
1864  */
1865 struct find_path_data {
1866         int index;
1867         struct ocfs2_path *path;
1868 };
1869 static void find_path_ins(void *data, struct buffer_head *bh)
1870 {
1871         struct find_path_data *fp = data;
1872 
1873         get_bh(bh);
1874         ocfs2_path_insert_eb(fp->path, fp->index, bh);
1875         fp->index++;
1876 }
1877 int ocfs2_find_path(struct ocfs2_caching_info *ci,
1878                     struct ocfs2_path *path, u32 cpos)
1879 {
1880         struct find_path_data data;
1881 
1882         data.index = 1;
1883         data.path = path;
1884         return __ocfs2_find_path(ci, path_root_el(path), cpos,
1885                                  find_path_ins, &data);
1886 }
1887 
1888 static void find_leaf_ins(void *data, struct buffer_head *bh)
1889 {
1890         struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
1891         struct ocfs2_extent_list *el = &eb->h_list;
1892         struct buffer_head **ret = data;
1893 
1894         /* We want to retain only the leaf block. */
1895         if (le16_to_cpu(el->l_tree_depth) == 0) {
1896                 get_bh(bh);
1897                 *ret = bh;
1898         }
1899 }
1900 /*
1901  * Find the leaf block in the tree which would contain cpos. No
1902  * checking of the actual leaf is done.
1903  *
1904  * Some paths want to call this instead of allocating a path structure
1905  * and calling ocfs2_find_path().
1906  *
1907  * This function doesn't handle non btree extent lists.
1908  */
1909 int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
1910                     struct ocfs2_extent_list *root_el, u32 cpos,
1911                     struct buffer_head **leaf_bh)
1912 {
1913         int ret;
1914         struct buffer_head *bh = NULL;
1915 
1916         ret = __ocfs2_find_path(ci, root_el, cpos, find_leaf_ins, &bh);
1917         if (ret) {
1918                 mlog_errno(ret);
1919                 goto out;
1920         }
1921 
1922         *leaf_bh = bh;
1923 out:
1924         return ret;
1925 }
1926 
1927 /*
1928  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1929  *
1930  * Basically, we've moved stuff around at the bottom of the tree and
1931  * we need to fix up the extent records above the changes to reflect
1932  * the new changes.
1933  *
1934  * left_rec: the record on the left.
1935  * right_rec: the record to the right of left_rec
1936  * right_child_el: is the child list pointed to by right_rec
1937  *
1938  * By definition, this only works on interior nodes.
1939  */
1940 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1941                                   struct ocfs2_extent_rec *right_rec,
1942                                   struct ocfs2_extent_list *right_child_el)
1943 {
1944         u32 left_clusters, right_end;
1945 
1946         /*
1947          * Interior nodes never have holes. Their cpos is the cpos of
1948          * the leftmost record in their child list. Their cluster
1949          * count covers the full theoretical range of their child list
1950          * - the range between their cpos and the cpos of the record
1951          * immediately to their right.
1952          */
1953         left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1954         if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) {
1955                 BUG_ON(right_child_el->l_tree_depth);
1956                 BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1957                 left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1958         }
1959         left_clusters -= le32_to_cpu(left_rec->e_cpos);
1960         left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1961 
1962         /*
1963          * Calculate the rightmost cluster count boundary before
1964          * moving cpos - we will need to adjust clusters after
1965          * updating e_cpos to keep the same highest cluster count.
1966          */
1967         right_end = le32_to_cpu(right_rec->e_cpos);
1968         right_end += le32_to_cpu(right_rec->e_int_clusters);
1969 
1970         right_rec->e_cpos = left_rec->e_cpos;
1971         le32_add_cpu(&right_rec->e_cpos, left_clusters);
1972 
1973         right_end -= le32_to_cpu(right_rec->e_cpos);
1974         right_rec->e_int_clusters = cpu_to_le32(right_end);
1975 }
1976 
1977 /*
1978  * Adjust the adjacent root node records involved in a
1979  * rotation. left_el_blkno is passed in as a key so that we can easily
1980  * find it's index in the root list.
1981  */
1982 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1983                                       struct ocfs2_extent_list *left_el,
1984                                       struct ocfs2_extent_list *right_el,
1985                                       u64 left_el_blkno)
1986 {
1987         int i;
1988 
1989         BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1990                le16_to_cpu(left_el->l_tree_depth));
1991 
1992         for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1993                 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1994                         break;
1995         }
1996 
1997         /*
1998          * The path walking code should have never returned a root and
1999          * two paths which are not adjacent.
2000          */
2001         BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
2002 
2003         ocfs2_adjust_adjacent_records(&root_el->l_recs[i],
2004                                       &root_el->l_recs[i + 1], right_el);
2005 }
2006 
2007 /*
2008  * We've changed a leaf block (in right_path) and need to reflect that
2009  * change back up the subtree.
2010  *
2011  * This happens in multiple places:
2012  *   - When we've moved an extent record from the left path leaf to the right
2013  *     path leaf to make room for an empty extent in the left path leaf.
2014  *   - When our insert into the right path leaf is at the leftmost edge
2015  *     and requires an update of the path immediately to it's left. This
2016  *     can occur at the end of some types of rotation and appending inserts.
2017  *   - When we've adjusted the last extent record in the left path leaf and the
2018  *     1st extent record in the right path leaf during cross extent block merge.
2019  */
2020 static void ocfs2_complete_edge_insert(handle_t *handle,
2021                                        struct ocfs2_path *left_path,
2022                                        struct ocfs2_path *right_path,
2023                                        int subtree_index)
2024 {
2025         int i, idx;
2026         struct ocfs2_extent_list *el, *left_el, *right_el;
2027         struct ocfs2_extent_rec *left_rec, *right_rec;
2028         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2029 
2030         /*
2031          * Update the counts and position values within all the
2032          * interior nodes to reflect the leaf rotation we just did.
2033          *
2034          * The root node is handled below the loop.
2035          *
2036          * We begin the loop with right_el and left_el pointing to the
2037          * leaf lists and work our way up.
2038          *
2039          * NOTE: within this loop, left_el and right_el always refer
2040          * to the *child* lists.
2041          */
2042         left_el = path_leaf_el(left_path);
2043         right_el = path_leaf_el(right_path);
2044         for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
2045                 trace_ocfs2_complete_edge_insert(i);
2046 
2047                 /*
2048                  * One nice property of knowing that all of these
2049                  * nodes are below the root is that we only deal with
2050                  * the leftmost right node record and the rightmost
2051                  * left node record.
2052                  */
2053                 el = left_path->p_node[i].el;
2054                 idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
2055                 left_rec = &el->l_recs[idx];
2056 
2057                 el = right_path->p_node[i].el;
2058                 right_rec = &el->l_recs[0];
2059 
2060                 ocfs2_adjust_adjacent_records(left_rec, right_rec, right_el);
2061 
2062                 ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
2063                 ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
2064 
2065                 /*
2066                  * Setup our list pointers now so that the current
2067                  * parents become children in the next iteration.
2068                  */
2069                 left_el = left_path->p_node[i].el;
2070                 right_el = right_path->p_node[i].el;
2071         }
2072 
2073         /*
2074          * At the root node, adjust the two adjacent records which
2075          * begin our path to the leaves.
2076          */
2077 
2078         el = left_path->p_node[subtree_index].el;
2079         left_el = left_path->p_node[subtree_index + 1].el;
2080         right_el = right_path->p_node[subtree_index + 1].el;
2081 
2082         ocfs2_adjust_root_records(el, left_el, right_el,
2083                                   left_path->p_node[subtree_index + 1].bh->b_blocknr);
2084 
2085         root_bh = left_path->p_node[subtree_index].bh;
2086 
2087         ocfs2_journal_dirty(handle, root_bh);
2088 }
2089 
2090 static int ocfs2_rotate_subtree_right(handle_t *handle,
2091                                       struct ocfs2_extent_tree *et,
2092                                       struct ocfs2_path *left_path,
2093                                       struct ocfs2_path *right_path,
2094                                       int subtree_index)
2095 {
2096         int ret, i;
2097         struct buffer_head *right_leaf_bh;
2098         struct buffer_head *left_leaf_bh = NULL;
2099         struct buffer_head *root_bh;
2100         struct ocfs2_extent_list *right_el, *left_el;
2101         struct ocfs2_extent_rec move_rec;
2102 
2103         left_leaf_bh = path_leaf_bh(left_path);
2104         left_el = path_leaf_el(left_path);
2105 
2106         if (left_el->l_next_free_rec != left_el->l_count) {
2107                 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
2108                             "Inode %llu has non-full interior leaf node %llu (next free = %u)\n",
2109                             (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2110                             (unsigned long long)left_leaf_bh->b_blocknr,
2111                             le16_to_cpu(left_el->l_next_free_rec));
2112                 return -EROFS;
2113         }
2114 
2115         /*
2116          * This extent block may already have an empty record, so we
2117          * return early if so.
2118          */
2119         if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
2120                 return 0;
2121 
2122         root_bh = left_path->p_node[subtree_index].bh;
2123         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2124 
2125         ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
2126                                            subtree_index);
2127         if (ret) {
2128                 mlog_errno(ret);
2129                 goto out;
2130         }
2131 
2132         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2133                 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2134                                                    right_path, i);
2135                 if (ret) {
2136                         mlog_errno(ret);
2137                         goto out;
2138                 }
2139 
2140                 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2141                                                    left_path, i);
2142                 if (ret) {
2143                         mlog_errno(ret);
2144                         goto out;
2145                 }
2146         }
2147 
2148         right_leaf_bh = path_leaf_bh(right_path);
2149         right_el = path_leaf_el(right_path);
2150 
2151         /* This is a code error, not a disk corruption. */
2152         mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
2153                         "because rightmost leaf block %llu is empty\n",
2154                         (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2155                         (unsigned long long)right_leaf_bh->b_blocknr);
2156 
2157         ocfs2_create_empty_extent(right_el);
2158 
2159         ocfs2_journal_dirty(handle, right_leaf_bh);
2160 
2161         /* Do the copy now. */
2162         i = le16_to_cpu(left_el->l_next_free_rec) - 1;
2163         move_rec = left_el->l_recs[i];
2164         right_el->l_recs[0] = move_rec;
2165 
2166         /*
2167          * Clear out the record we just copied and shift everything
2168          * over, leaving an empty extent in the left leaf.
2169          *
2170          * We temporarily subtract from next_free_rec so that the
2171          * shift will lose the tail record (which is now defunct).
2172          */
2173         le16_add_cpu(&left_el->l_next_free_rec, -1);
2174         ocfs2_shift_records_right(left_el);
2175         memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2176         le16_add_cpu(&left_el->l_next_free_rec, 1);
2177 
2178         ocfs2_journal_dirty(handle, left_leaf_bh);
2179 
2180         ocfs2_complete_edge_insert(handle, left_path, right_path,
2181                                    subtree_index);
2182 
2183 out:
2184         return ret;
2185 }
2186 
2187 /*
2188  * Given a full path, determine what cpos value would return us a path
2189  * containing the leaf immediately to the left of the current one.
2190  *
2191  * Will return zero if the path passed in is already the leftmost path.
2192  */
2193 int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
2194                                   struct ocfs2_path *path, u32 *cpos)
2195 {
2196         int i, j, ret = 0;
2197         u64 blkno;
2198         struct ocfs2_extent_list *el;
2199 
2200         BUG_ON(path->p_tree_depth == 0);
2201 
2202         *cpos = 0;
2203 
2204         blkno = path_leaf_bh(path)->b_blocknr;
2205 
2206         /* Start at the tree node just above the leaf and work our way up. */
2207         i = path->p_tree_depth - 1;
2208         while (i >= 0) {
2209                 el = path->p_node[i].el;
2210 
2211                 /*
2212                  * Find the extent record just before the one in our
2213                  * path.
2214                  */
2215                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2216                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2217                                 if (j == 0) {
2218                                         if (i == 0) {
2219                                                 /*
2220                                                  * We've determined that the
2221                                                  * path specified is already
2222                                                  * the leftmost one - return a
2223                                                  * cpos of zero.
2224                                                  */
2225                                                 goto out;
2226                                         }
2227                                         /*
2228                                          * The leftmost record points to our
2229                                          * leaf - we need to travel up the
2230                                          * tree one level.
2231                                          */
2232                                         goto next_node;
2233                                 }
2234 
2235                                 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
2236                                 *cpos = *cpos + ocfs2_rec_clusters(el,
2237                                                            &el->l_recs[j - 1]);
2238                                 *cpos = *cpos - 1;
2239                                 goto out;
2240                         }
2241                 }
2242 
2243                 /*
2244                  * If we got here, we never found a valid node where
2245                  * the tree indicated one should be.
2246                  */
2247                 ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
2248                             (unsigned long long)blkno);
2249                 ret = -EROFS;
2250                 goto out;
2251 
2252 next_node:
2253                 blkno = path->p_node[i].bh->b_blocknr;
2254                 i--;
2255         }
2256 
2257 out:
2258         return ret;
2259 }
2260 
2261 /*
2262  * Extend the transaction by enough credits to complete the rotation,
2263  * and still leave at least the original number of credits allocated
2264  * to this transaction.
2265  */
2266 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2267                                            int op_credits,
2268                                            struct ocfs2_path *path)
2269 {
2270         int ret = 0;
2271         int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2272 
2273         if (handle->h_buffer_credits < credits)
2274                 ret = ocfs2_extend_trans(handle,
2275                                          credits - handle->h_buffer_credits);
2276 
2277         return ret;
2278 }
2279 
2280 /*
2281  * Trap the case where we're inserting into the theoretical range past
2282  * the _actual_ left leaf range. Otherwise, we'll rotate a record
2283  * whose cpos is less than ours into the right leaf.
2284  *
2285  * It's only necessary to look at the rightmost record of the left
2286  * leaf because the logic that calls us should ensure that the
2287  * theoretical ranges in the path components above the leaves are
2288  * correct.
2289  */
2290 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
2291                                                  u32 insert_cpos)
2292 {
2293         struct ocfs2_extent_list *left_el;
2294         struct ocfs2_extent_rec *rec;
2295         int next_free;
2296 
2297         left_el = path_leaf_el(left_path);
2298         next_free = le16_to_cpu(left_el->l_next_free_rec);
2299         rec = &left_el->l_recs[next_free - 1];
2300 
2301         if (insert_cpos > le32_to_cpu(rec->e_cpos))
2302                 return 1;
2303         return 0;
2304 }
2305 
2306 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
2307 {
2308         int next_free = le16_to_cpu(el->l_next_free_rec);
2309         unsigned int range;
2310         struct ocfs2_extent_rec *rec;
2311 
2312         if (next_free == 0)
2313                 return 0;
2314 
2315         rec = &el->l_recs[0];
2316         if (ocfs2_is_empty_extent(rec)) {
2317                 /* Empty list. */
2318                 if (next_free == 1)
2319                         return 0;
2320                 rec = &el->l_recs[1];
2321         }
2322 
2323         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2324         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
2325                 return 1;
2326         return 0;
2327 }
2328 
2329 /*
2330  * Rotate all the records in a btree right one record, starting at insert_cpos.
2331  *
2332  * The path to the rightmost leaf should be passed in.
2333  *
2334  * The array is assumed to be large enough to hold an entire path (tree depth).
2335  *
2336  * Upon successful return from this function:
2337  *
2338  * - The 'right_path' array will contain a path to the leaf block
2339  *   whose range contains e_cpos.
2340  * - That leaf block will have a single empty extent in list index 0.
2341  * - In the case that the rotation requires a post-insert update,
2342  *   *ret_left_path will contain a valid path which can be passed to
2343  *   ocfs2_insert_path().
2344  */
2345 static int ocfs2_rotate_tree_right(handle_t *handle,
2346                                    struct ocfs2_extent_tree *et,
2347                                    enum ocfs2_split_type split,
2348                                    u32 insert_cpos,
2349                                    struct ocfs2_path *right_path,
2350                                    struct ocfs2_path **ret_left_path)
2351 {
2352         int ret, start, orig_credits = handle->h_buffer_credits;
2353         u32 cpos;
2354         struct ocfs2_path *left_path = NULL;
2355         struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2356 
2357         *ret_left_path = NULL;
2358 
2359         left_path = ocfs2_new_path_from_path(right_path);
2360         if (!left_path) {
2361                 ret = -ENOMEM;
2362                 mlog_errno(ret);
2363                 goto out;
2364         }
2365 
2366         ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
2367         if (ret) {
2368                 mlog_errno(ret);
2369                 goto out;
2370         }
2371 
2372         trace_ocfs2_rotate_tree_right(
2373                 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2374                 insert_cpos, cpos);
2375 
2376         /*
2377          * What we want to do here is:
2378          *
2379          * 1) Start with the rightmost path.
2380          *
2381          * 2) Determine a path to the leaf block directly to the left
2382          *    of that leaf.
2383          *
2384          * 3) Determine the 'subtree root' - the lowest level tree node
2385          *    which contains a path to both leaves.
2386          *
2387          * 4) Rotate the subtree.
2388          *
2389          * 5) Find the next subtree by considering the left path to be
2390          *    the new right path.
2391          *
2392          * The check at the top of this while loop also accepts
2393          * insert_cpos == cpos because cpos is only a _theoretical_
2394          * value to get us the left path - insert_cpos might very well
2395          * be filling that hole.
2396          *
2397          * Stop at a cpos of '' because we either started at the
2398          * leftmost branch (i.e., a tree with one branch and a
2399          * rotation inside of it), or we've gone as far as we can in
2400          * rotating subtrees.
2401          */
2402         while (cpos && insert_cpos <= cpos) {
2403                 trace_ocfs2_rotate_tree_right(
2404                         (unsigned long long)
2405                         ocfs2_metadata_cache_owner(et->et_ci),
2406                         insert_cpos, cpos);
2407 
2408                 ret = ocfs2_find_path(et->et_ci, left_path, cpos);
2409                 if (ret) {
2410                         mlog_errno(ret);
2411                         goto out;
2412                 }
2413 
2414                 mlog_bug_on_msg(path_leaf_bh(left_path) ==
2415                                 path_leaf_bh(right_path),
2416                                 "Owner %llu: error during insert of %u "
2417                                 "(left path cpos %u) results in two identical "
2418                                 "paths ending at %llu\n",
2419                                 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2420                                 insert_cpos, cpos,
2421                                 (unsigned long long)
2422                                 path_leaf_bh(left_path)->b_blocknr);
2423 
2424                 if (split == SPLIT_NONE &&
2425                     ocfs2_rotate_requires_path_adjustment(left_path,
2426                                                           insert_cpos)) {
2427 
2428                         /*
2429                          * We've rotated the tree as much as we
2430                          * should. The rest is up to
2431                          * ocfs2_insert_path() to complete, after the
2432                          * record insertion. We indicate this
2433                          * situation by returning the left path.
2434                          *
2435                          * The reason we don't adjust the records here
2436                          * before the record insert is that an error
2437                          * later might break the rule where a parent
2438                          * record e_cpos will reflect the actual
2439                          * e_cpos of the 1st nonempty record of the
2440                          * child list.
2441                          */
2442                         *ret_left_path = left_path;
2443                         goto out_ret_path;
2444                 }
2445 
2446                 start = ocfs2_find_subtree_root(et, left_path, right_path);
2447 
2448                 trace_ocfs2_rotate_subtree(start,
2449                         (unsigned long long)
2450                         right_path->p_node[start].bh->b_blocknr,
2451                         right_path->p_tree_depth);
2452 
2453                 ret = ocfs2_extend_rotate_transaction(handle, start,
2454                                                       orig_credits, right_path);
2455                 if (ret) {
2456                         mlog_errno(ret);
2457                         goto out;
2458                 }
2459 
2460                 ret = ocfs2_rotate_subtree_right(handle, et, left_path,
2461                                                  right_path, start);
2462                 if (ret) {
2463                         mlog_errno(ret);
2464                         goto out;
2465                 }
2466 
2467                 if (split != SPLIT_NONE &&
2468                     ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
2469                                                 insert_cpos)) {
2470                         /*
2471                          * A rotate moves the rightmost left leaf
2472                          * record over to the leftmost right leaf
2473                          * slot. If we're doing an extent split
2474                          * instead of a real insert, then we have to
2475                          * check that the extent to be split wasn't
2476                          * just moved over. If it was, then we can
2477                          * exit here, passing left_path back -
2478                          * ocfs2_split_extent() is smart enough to
2479                          * search both leaves.
2480                          */
2481                         *ret_left_path = left_path;
2482                         goto out_ret_path;
2483                 }
2484 
2485                 /*
2486                  * There is no need to re-read the next right path
2487                  * as we know that it'll be our current left
2488                  * path. Optimize by copying values instead.
2489                  */
2490                 ocfs2_mv_path(right_path, left_path);
2491 
2492                 ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
2493                 if (ret) {
2494                         mlog_errno(ret);
2495                         goto out;
2496                 }
2497         }
2498 
2499 out:
2500         ocfs2_free_path(left_path);
2501 
2502 out_ret_path:
2503         return ret;
2504 }
2505 
2506 static int ocfs2_update_edge_lengths(handle_t *handle,
2507                                      struct ocfs2_extent_tree *et,
2508                                      struct ocfs2_path *path)
2509 {
2510         int i, idx, ret;
2511         struct ocfs2_extent_rec *rec;
2512         struct ocfs2_extent_list *el;
2513         struct ocfs2_extent_block *eb;
2514         u32 range;
2515 
2516         ret = ocfs2_journal_access_path(et->et_ci, handle, path);
2517         if (ret) {
2518                 mlog_errno(ret);
2519                 goto out;
2520         }
2521 
2522         /* Path should always be rightmost. */
2523         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2524         BUG_ON(eb->h_next_leaf_blk != 0ULL);
2525 
2526         el = &eb->h_list;
2527         BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
2528         idx = le16_to_cpu(el->l_next_free_rec) - 1;
2529         rec = &el->l_recs[idx];
2530         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2531 
2532         for (i = 0; i < path->p_tree_depth; i++) {
2533                 el = path->p_node[i].el;
2534                 idx = le16_to_cpu(el->l_next_free_rec) - 1;
2535                 rec = &el->l_recs[idx];
2536 
2537                 rec->e_int_clusters = cpu_to_le32(range);
2538                 le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
2539 
2540                 ocfs2_journal_dirty(handle, path->p_node[i].bh);
2541         }
2542 out:
2543         return ret;
2544 }
2545 
2546 static void ocfs2_unlink_path(handle_t *handle,
2547                               struct ocfs2_extent_tree *et,
2548                               struct ocfs2_cached_dealloc_ctxt *dealloc,
2549                               struct ocfs2_path *path, int unlink_start)
2550 {
2551         int ret, i;
2552         struct ocfs2_extent_block *eb;
2553         struct ocfs2_extent_list *el;
2554         struct buffer_head *bh;
2555 
2556         for(i = unlink_start; i < path_num_items(path); i++) {
2557                 bh = path->p_node[i].bh;
2558 
2559                 eb = (struct ocfs2_extent_block *)bh->b_data;
2560                 /*
2561                  * Not all nodes might have had their final count
2562                  * decremented by the caller - handle this here.
2563                  */
2564                 el = &eb->h_list;
2565                 if (le16_to_cpu(el->l_next_free_rec) > 1) {
2566                         mlog(ML_ERROR,
2567                              "Inode %llu, attempted to remove extent block "
2568                              "%llu with %u records\n",
2569                              (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2570                              (unsigned long long)le64_to_cpu(eb->h_blkno),
2571                              le16_to_cpu(el->l_next_free_rec));
2572 
2573                         ocfs2_journal_dirty(handle, bh);
2574                         ocfs2_remove_from_cache(et->et_ci, bh);
2575                         continue;
2576                 }
2577 
2578                 el->l_next_free_rec = 0;
2579                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2580 
2581                 ocfs2_journal_dirty(handle, bh);
2582 
2583                 ret = ocfs2_cache_extent_block_free(dealloc, eb);
2584                 if (ret)
2585                         mlog_errno(ret);
2586 
2587                 ocfs2_remove_from_cache(et->et_ci, bh);
2588         }
2589 }
2590 
2591 static void ocfs2_unlink_subtree(handle_t *handle,
2592                                  struct ocfs2_extent_tree *et,
2593                                  struct ocfs2_path *left_path,
2594                                  struct ocfs2_path *right_path,
2595                                  int subtree_index,
2596                                  struct ocfs2_cached_dealloc_ctxt *dealloc)
2597 {
2598         int i;
2599         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2600         struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2601         struct ocfs2_extent_list *el;
2602         struct ocfs2_extent_block *eb;
2603 
2604         el = path_leaf_el(left_path);
2605 
2606         eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2607 
2608         for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2609                 if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2610                         break;
2611 
2612         BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2613 
2614         memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2615         le16_add_cpu(&root_el->l_next_free_rec, -1);
2616 
2617         eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2618         eb->h_next_leaf_blk = 0;
2619 
2620         ocfs2_journal_dirty(handle, root_bh);
2621         ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2622 
2623         ocfs2_unlink_path(handle, et, dealloc, right_path,
2624                           subtree_index + 1);
2625 }
2626 
2627 static int ocfs2_rotate_subtree_left(handle_t *handle,
2628                                      struct ocfs2_extent_tree *et,
2629                                      struct ocfs2_path *left_path,
2630                                      struct ocfs2_path *right_path,
2631                                      int subtree_index,
2632                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
2633                                      int *deleted)
2634 {
2635         int ret, i, del_right_subtree = 0, right_has_empty = 0;
2636         struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
2637         struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2638         struct ocfs2_extent_block *eb;
2639 
2640         *deleted = 0;
2641 
2642         right_leaf_el = path_leaf_el(right_path);
2643         left_leaf_el = path_leaf_el(left_path);
2644         root_bh = left_path->p_node[subtree_index].bh;
2645         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2646 
2647         if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2648                 return 0;
2649 
2650         eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2651         if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2652                 /*
2653                  * It's legal for us to proceed if the right leaf is
2654                  * the rightmost one and it has an empty extent. There
2655                  * are two cases to handle - whether the leaf will be
2656                  * empty after removal or not. If the leaf isn't empty
2657                  * then just remove the empty extent up front. The
2658                  * next block will handle empty leaves by flagging
2659                  * them for unlink.
2660                  *
2661                  * Non rightmost leaves will throw -EAGAIN and the
2662                  * caller can manually move the subtree and retry.
2663                  */
2664 
2665                 if (eb->h_next_leaf_blk != 0ULL)
2666                         return -EAGAIN;
2667 
2668                 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2669                         ret = ocfs2_journal_access_eb(handle, et->et_ci,
2670                                                       path_leaf_bh(right_path),
2671                                                       OCFS2_JOURNAL_ACCESS_WRITE);
2672                         if (ret) {
2673                                 mlog_errno(ret);
2674                                 goto out;
2675                         }
2676 
2677                         ocfs2_remove_empty_extent(right_leaf_el);
2678                 } else
2679                         right_has_empty = 1;
2680         }
2681 
2682         if (eb->h_next_leaf_blk == 0ULL &&
2683             le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2684                 /*
2685                  * We have to update i_last_eb_blk during the meta
2686                  * data delete.
2687                  */
2688                 ret = ocfs2_et_root_journal_access(handle, et,
2689                                                    OCFS2_JOURNAL_ACCESS_WRITE);
2690                 if (ret) {
2691                         mlog_errno(ret);
2692                         goto out;
2693                 }
2694 
2695                 del_right_subtree = 1;
2696         }
2697 
2698         /*
2699          * Getting here with an empty extent in the right path implies
2700          * that it's the rightmost path and will be deleted.
2701          */
2702         BUG_ON(right_has_empty && !del_right_subtree);
2703 
2704         ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
2705                                            subtree_index);
2706         if (ret) {
2707                 mlog_errno(ret);
2708                 goto out;
2709         }
2710 
2711         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2712                 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2713                                                    right_path, i);
2714                 if (ret) {
2715                         mlog_errno(ret);
2716                         goto out;
2717                 }
2718 
2719                 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2720                                                    left_path, i);
2721                 if (ret) {
2722                         mlog_errno(ret);
2723                         goto out;
2724                 }
2725         }
2726 
2727         if (!right_has_empty) {
2728                 /*
2729                  * Only do this if we're moving a real
2730                  * record. Otherwise, the action is delayed until
2731                  * after removal of the right path in which case we
2732                  * can do a simple shift to remove the empty extent.
2733                  */
2734                 ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2735                 memset(&right_leaf_el->l_recs[0], 0,
2736                        sizeof(struct ocfs2_extent_rec));
2737         }
2738         if (eb->h_next_leaf_blk == 0ULL) {
2739                 /*
2740                  * Move recs over to get rid of empty extent, decrease
2741                  * next_free. This is allowed to remove the last
2742                  * extent in our leaf (setting l_next_free_rec to
2743                  * zero) - the delete code below won't care.
2744                  */
2745                 ocfs2_remove_empty_extent(right_leaf_el);
2746         }
2747 
2748         ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2749         ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2750 
2751         if (del_right_subtree) {
2752                 ocfs2_unlink_subtree(handle, et, left_path, right_path,
2753                                      subtree_index, dealloc);
2754                 ret = ocfs2_update_edge_lengths(handle, et, left_path);
2755                 if (ret) {
2756                         mlog_errno(ret);
2757                         goto out;
2758                 }
2759 
2760                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2761                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2762 
2763                 /*
2764                  * Removal of the extent in the left leaf was skipped
2765                  * above so we could delete the right path
2766                  * 1st.
2767                  */
2768                 if (right_has_empty)
2769                         ocfs2_remove_empty_extent(left_leaf_el);
2770 
2771                 ocfs2_journal_dirty(handle, et_root_bh);
2772 
2773                 *deleted = 1;
2774         } else
2775                 ocfs2_complete_edge_insert(handle, left_path, right_path,
2776                                            subtree_index);
2777 
2778 out:
2779         return ret;
2780 }
2781 
2782 /*
2783  * Given a full path, determine what cpos value would return us a path
2784  * containing the leaf immediately to the right of the current one.
2785  *
2786  * Will return zero if the path passed in is already the rightmost path.
2787  *
2788  * This looks similar, but is subtly different to
2789  * ocfs2_find_cpos_for_left_leaf().
2790  */
2791 int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2792                                    struct ocfs2_path *path, u32 *cpos)
2793 {
2794         int i, j, ret = 0;
2795         u64 blkno;
2796         struct ocfs2_extent_list *el;
2797 
2798         *cpos = 0;
2799 
2800         if (path->p_tree_depth == 0)
2801                 return 0;
2802 
2803         blkno = path_leaf_bh(path)->b_blocknr;
2804 
2805         /* Start at the tree node just above the leaf and work our way up. */
2806         i = path->p_tree_depth - 1;
2807         while (i >= 0) {
2808                 int next_free;
2809 
2810                 el = path->p_node[i].el;
2811 
2812                 /*
2813                  * Find the extent record just after the one in our
2814                  * path.
2815                  */
2816                 next_free = le16_to_cpu(el->l_next_free_rec);
2817                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2818                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2819                                 if (j == (next_free - 1)) {
2820                                         if (i == 0) {
2821                                                 /*
2822                                                  * We've determined that the
2823                                                  * path specified is already
2824                                                  * the rightmost one - return a
2825                                                  * cpos of zero.
2826                                                  */
2827                                                 goto out;
2828                                         }
2829                                         /*
2830                                          * The rightmost record points to our
2831                                          * leaf - we need to travel up the
2832                                          * tree one level.
2833                                          */
2834                                         goto next_node;
2835                                 }
2836 
2837                                 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2838                                 goto out;
2839                         }
2840                 }
2841 
2842                 /*
2843                  * If we got here, we never found a valid node where
2844                  * the tree indicated one should be.
2845                  */
2846                 ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
2847                             (unsigned long long)blkno);
2848                 ret = -EROFS;
2849                 goto out;
2850 
2851 next_node:
2852                 blkno = path->p_node[i].bh->b_blocknr;
2853                 i--;
2854         }
2855 
2856 out:
2857         return ret;
2858 }
2859 
2860 static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
2861                                             struct ocfs2_extent_tree *et,
2862                                             struct ocfs2_path *path)
2863 {
2864         int ret;
2865         struct buffer_head *bh = path_leaf_bh(path);
2866         struct ocfs2_extent_list *el = path_leaf_el(path);
2867 
2868         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2869                 return 0;
2870 
2871         ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
2872                                            path_num_items(path) - 1);
2873         if (ret) {
2874                 mlog_errno(ret);
2875                 goto out;
2876         }
2877 
2878         ocfs2_remove_empty_extent(el);
2879         ocfs2_journal_dirty(handle, bh);
2880 
2881 out:
2882         return ret;
2883 }
2884 
2885 static int __ocfs2_rotate_tree_left(handle_t *handle,
2886                                     struct ocfs2_extent_tree *et,
2887                                     int orig_credits,
2888                                     struct ocfs2_path *path,
2889                                     struct ocfs2_cached_dealloc_ctxt *dealloc,
2890                                     struct ocfs2_path **empty_extent_path)
2891 {
2892         int ret, subtree_root, deleted;
2893         u32 right_cpos;
2894         struct ocfs2_path *left_path = NULL;
2895         struct ocfs2_path *right_path = NULL;
2896         struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2897 
2898         if (!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])))
2899                 return 0;
2900 
2901         *empty_extent_path = NULL;
2902 
2903         ret = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
2904         if (ret) {
2905                 mlog_errno(ret);
2906                 goto out;
2907         }
2908 
2909         left_path = ocfs2_new_path_from_path(path);
2910         if (!left_path) {
2911                 ret = -ENOMEM;
2912                 mlog_errno(ret);
2913                 goto out;
2914         }
2915 
2916         ocfs2_cp_path(left_path, path);
2917 
2918         right_path = ocfs2_new_path_from_path(path);
2919         if (!right_path) {
2920                 ret = -ENOMEM;
2921                 mlog_errno(ret);
2922                 goto out;
2923         }
2924 
2925         while (right_cpos) {
2926                 ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
2927                 if (ret) {
2928                         mlog_errno(ret);
2929                         goto out;
2930                 }
2931 
2932                 subtree_root = ocfs2_find_subtree_root(et, left_path,
2933                                                        right_path);
2934 
2935                 trace_ocfs2_rotate_subtree(subtree_root,
2936                      (unsigned long long)
2937                      right_path->p_node[subtree_root].bh->b_blocknr,
2938                      right_path->p_tree_depth);
2939 
2940                 ret = ocfs2_extend_rotate_transaction(handle, 0,
2941                                                       orig_credits, left_path);
2942                 if (ret) {
2943                         mlog_errno(ret);
2944                         goto out;
2945                 }
2946 
2947                 /*
2948                  * Caller might still want to make changes to the
2949                  * tree root, so re-add it to the journal here.
2950                  */
2951                 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2952                                                    left_path, 0);
2953                 if (ret) {
2954                         mlog_errno(ret);
2955                         goto out;
2956                 }
2957 
2958                 ret = ocfs2_rotate_subtree_left(handle, et, left_path,
2959                                                 right_path, subtree_root,
2960                                                 dealloc, &deleted);
2961                 if (ret == -EAGAIN) {
2962                         /*
2963                          * The rotation has to temporarily stop due to
2964                          * the right subtree having an empty
2965                          * extent. Pass it back to the caller for a
2966                          * fixup.
2967                          */
2968                         *empty_extent_path = right_path;
2969                         right_path = NULL;
2970                         goto out;
2971                 }
2972                 if (ret) {
2973                         mlog_errno(ret);
2974                         goto out;
2975                 }
2976 
2977                 /*
2978                  * The subtree rotate might have removed records on
2979                  * the rightmost edge. If so, then rotation is
2980                  * complete.
2981                  */
2982                 if (deleted)
2983                         break;
2984 
2985                 ocfs2_mv_path(left_path, right_path);
2986 
2987                 ret = ocfs2_find_cpos_for_right_leaf(sb, left_path,
2988                                                      &right_cpos);
2989                 if (ret) {
2990                         mlog_errno(ret);
2991                         goto out;
2992                 }
2993         }
2994 
2995 out:
2996         ocfs2_free_path(right_path);
2997         ocfs2_free_path(left_path);
2998 
2999         return ret;
3000 }
3001 
3002 static int ocfs2_remove_rightmost_path(handle_t *handle,
3003                                 struct ocfs2_extent_tree *et,
3004                                 struct ocfs2_path *path,
3005                                 struct ocfs2_cached_dealloc_ctxt *dealloc)
3006 {
3007         int ret, subtree_index;
3008         u32 cpos;
3009         struct ocfs2_path *left_path = NULL;
3010         struct ocfs2_extent_block *eb;
3011         struct ocfs2_extent_list *el;
3012 
3013         ret = ocfs2_et_sanity_check(et);
3014         if (ret)
3015                 goto out;
3016 
3017         ret = ocfs2_journal_access_path(et->et_ci, handle, path);
3018         if (ret) {
3019                 mlog_errno(ret);
3020                 goto out;
3021         }
3022 
3023         ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3024                                             path, &cpos);
3025         if (ret) {
3026                 mlog_errno(ret);
3027                 goto out;
3028         }
3029 
3030         if (cpos) {
3031                 /*
3032                  * We have a path to the left of this one - it needs
3033                  * an update too.
3034                  */
3035                 left_path = ocfs2_new_path_from_path(path);
3036                 if (!left_path) {
3037                         ret = -ENOMEM;
3038                         mlog_errno(ret);
3039                         goto out;
3040                 }
3041 
3042                 ret = ocfs2_find_path(et->et_ci, left_path, cpos);
3043                 if (ret) {
3044                         mlog_errno(ret);
3045                         goto out;
3046                 }
3047 
3048                 ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
3049                 if (ret) {
3050                         mlog_errno(ret);
3051                         goto out;
3052                 }
3053 
3054                 subtree_index = ocfs2_find_subtree_root(et, left_path, path);
3055 
3056                 ocfs2_unlink_subtree(handle, et, left_path, path,
3057                                      subtree_index, dealloc);
3058                 ret = ocfs2_update_edge_lengths(handle, et, left_path);
3059                 if (ret) {
3060                         mlog_errno(ret);
3061                         goto out;
3062                 }
3063 
3064                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
3065                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
3066         } else {
3067                 /*
3068                  * 'path' is also the leftmost path which
3069                  * means it must be the only one. This gets
3070                  * handled differently because we want to
3071                  * revert the root back to having extents
3072                  * in-line.
3073                  */
3074                 ocfs2_unlink_path(handle, et, dealloc, path, 1);
3075 
3076                 el = et->et_root_el;
3077                 el->l_tree_depth = 0;
3078                 el->l_next_free_rec = 0;
3079                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
3080 
3081                 ocfs2_et_set_last_eb_blk(et, 0);
3082         }
3083 
3084         ocfs2_journal_dirty(handle, path_root_bh(path));
3085 
3086 out:
3087         ocfs2_free_path(left_path);
3088         return ret;
3089 }
3090 
3091 static int ocfs2_remove_rightmost_empty_extent(struct ocfs2_super *osb,
3092                                 struct ocfs2_extent_tree *et,
3093                                 struct ocfs2_path *path,
3094                                 struct ocfs2_cached_dealloc_ctxt *dealloc)
3095 {
3096         handle_t *handle;
3097         int ret;
3098         int credits = path->p_tree_depth * 2 + 1;
3099 
3100         handle = ocfs2_start_trans(osb, credits);
3101         if (IS_ERR(handle)) {
3102                 ret = PTR_ERR(handle);
3103                 mlog_errno(ret);
3104                 return ret;
3105         }
3106 
3107         ret = ocfs2_remove_rightmost_path(handle, et, path, dealloc);
3108         if (ret)
3109                 mlog_errno(ret);
3110 
3111         ocfs2_commit_trans(osb, handle);
3112         return ret;
3113 }
3114 
3115 /*
3116  * Left rotation of btree records.
3117  *
3118  * In many ways, this is (unsurprisingly) the opposite of right
3119  * rotation. We start at some non-rightmost path containing an empty
3120  * extent in the leaf block. The code works its way to the rightmost
3121  * path by rotating records to the left in every subtree.
3122  *
3123  * This is used by any code which reduces the number of extent records
3124  * in a leaf. After removal, an empty record should be placed in the
3125  * leftmost list position.
3126  *
3127  * This won't handle a length update of the rightmost path records if
3128  * the rightmost tree leaf record is removed so the caller is
3129  * responsible for detecting and correcting that.
      *
      * Return value: 0 when the leaf's first record is not an empty extent
      * (nothing to rotate) or when the rotation succeeded. -EIO when the
      * rightmost leaf claims to hold no records at all. Otherwise the error
      * reported by ocfs2_rotate_rightmost_leaf_left(),
      * ocfs2_remove_rightmost_path() or __ocfs2_rotate_tree_left().
      * -EAGAIN from __ocfs2_rotate_tree_left() is consumed by the restart
      * loop below and is never returned to the caller.
      *
      * If the rightmost path had to be deleted, "path" must not be trusted
      * by the caller afterwards (see the XXX comment in the body).
3130  */
3131 static int ocfs2_rotate_tree_left(handle_t *handle,
3132                                   struct ocfs2_extent_tree *et,
3133                                   struct ocfs2_path *path,
3134                                   struct ocfs2_cached_dealloc_ctxt *dealloc)
3135 {
             /*
              * orig_credits snapshots the handle's credits on entry; it is
              * handed unchanged to every __ocfs2_rotate_tree_left() call.
              */
3136         int ret, orig_credits = handle->h_buffer_credits;
             /*
              * restart_path: filled in by __ocfs2_rotate_tree_left() when it
              * asks for a restart. tmp_path: the restart path currently
              * being retried. Both are released at "out".
              */
3137         struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
3138         struct ocfs2_extent_block *eb;
3139         struct ocfs2_extent_list *el;
3140 
             /* Nothing to do unless the leaf starts with an empty extent. */
3141         el = path_leaf_el(path);
3142         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
3143                 return 0;
3144 
3145         if (path->p_tree_depth == 0) {
             /*
              * This label sits inside the depth-0 branch on purpose: the
              * rightmost-leaf case below jumps here when records remain
              * after the rotation, sharing the same simple handling.
              */
3146 rightmost_no_delete:
3147                 /*
3148                  * Inline extents. This is trivially handled, so do
3149                  * it up front.
3150                  */
3151                 ret = ocfs2_rotate_rightmost_leaf_left(handle, et, path);
3152                 if (ret)
3153                         mlog_errno(ret);
3154                 goto out;
3155         }
3156 
3157         /*
3158          * Handle rightmost branch now. There's several cases:
3159          *  1) simple rotation leaving records in there. That's trivial.
3160          *  2) rotation requiring a branch delete - there's no more
3161          *     records left. Two cases of this:
3162          *     a) There are branches to the left.
3163          *     b) This is also the leftmost (the only) branch.
3164          *
3165          *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
3166          *  2a) we need the left branch so that we can update it with the unlink
3167          *  2b) we need to bring the root back to inline extents.
3168          */
3169 
             /*
              * From here on the tree has depth > 0, so the leaf lives in an
              * extent block; re-point el at that block's list. A zero
              * h_next_leaf_blk is what selects the rightmost-leaf handling.
              */
3170         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
3171         el = &eb->h_list;
3172         if (eb->h_next_leaf_blk == 0) {
3173                 /*
3174                  * This gets a bit tricky if we're going to delete the
3175                  * rightmost path. Get the other cases out of the way
3176                  * 1st.
3177                  */
                     /* More than the empty record present: case 1 above. */
3178                 if (le16_to_cpu(el->l_next_free_rec) > 1)
3179                         goto rightmost_no_delete;
3180 
                     /*
                      * l_recs[0] was just seen to be an empty extent, so a
                      * used-record count of zero is inconsistent metadata.
                      */
3181                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
3182                         ret = -EIO;
3183                         ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
3184                                     "Owner %llu has empty extent block at %llu\n",
3185                                     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
3186                                     (unsigned long long)le64_to_cpu(eb->h_blkno));
3187                         goto out;
3188                 }
3189 
3190                 /*
3191                  * XXX: The caller can not trust "path" any more after
3192                  * this as it will have been deleted. What do we do?
3193                  *
3194                  * In theory the rotate-for-merge code will never get
3195                  * here because it'll always ask for a rotate in a
3196                  * nonempty list.
3197                  */
3198 
                     /* Exactly one record and it is empty: drop the branch. */
3199                 ret = ocfs2_remove_rightmost_path(handle, et, path,
3200                                                   dealloc);
3201                 if (ret)
3202                         mlog_errno(ret);
3203                 goto out;
3204         }
3205 
3206         /*
3207          * Now we can loop, remembering the path we get from -EAGAIN
3208          * and restarting from there.
3209          */
3210 try_rotate:
3211         ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, path,
3212                                        dealloc, &restart_path);
3213         if (ret && ret != -EAGAIN) {
3214                 mlog_errno(ret);
3215                 goto out;
3216         }
3217 
3218         while (ret == -EAGAIN) {
                     /*
                      * Take over the restart path and clear restart_path so
                      * the callee can hand back a fresh one.
                      */
3219                 tmp_path = restart_path;
3220                 restart_path = NULL;
3221 
3222                 ret = __ocfs2_rotate_tree_left(handle, et, orig_credits,
3223                                                tmp_path, dealloc,
3224                                                &restart_path);
                     /* On a hard error tmp_path is released at "out". */
3225                 if (ret && ret != -EAGAIN) {
3226                         mlog_errno(ret);
3227                         goto out;
3228                 }
3229 
3230                 ocfs2_free_path(tmp_path);
3231                 tmp_path = NULL;
3232 
                     /*
                      * The restarted rotation finished; go back and retry
                      * the rotation from the caller's original path.
                      */
3233                 if (ret == 0)
3234                         goto try_rotate;
3235         }
3236 
3237 out:
             /*
              * Either pointer may still be NULL here, so this relies on
              * ocfs2_free_path() tolerating NULL - TODO confirm.
              */
3238         ocfs2_free_path(tmp_path);
3239         ocfs2_free_path(restart_path);
3240         return ret;
3241 }
3242 
     /*
      * Tidy up a leaf list after a merge has taken clusters away from the
      * record at "index".
      *
      * If that record still has clusters, nothing happens. If it has been
      * reduced to zero clusters, records 0..index-1 are shifted one slot to
      * the right (overwriting slot "index") and slot 0 is zeroed, so that the
      * now-empty extent ends up in the first array position.
      *
      * l_next_free_rec is not modified here.
      */
3243 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
3244                                 int index)
3245 {
3246         struct ocfs2_extent_rec *rec = &el->l_recs[index];
3247         unsigned int size;
3248 
             /*
              * The field is compared without a byte-order conversion; zero
              * looks the same in either byte order.
              */
3249         if (rec->e_leaf_clusters == 0) {
3250                 /*
3251                  * We consumed all of the merged-from record. An empty
3252                  * extent cannot exist anywhere but the 1st array
3253                  * position, so move things over if the merged-from
3254                  * record doesn't occupy that position.
3255                  *
3256                  * This creates a new empty extent so the caller
3257                  * should be smart enough to have removed any existing
3258                  * ones.
3259                  */
3260                 if (index > 0) {
3261                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
                             /*
                              * Move "index" records from slot 0 to slot 1;
                              * source and destination overlap, hence
                              * memmove() rather than memcpy().
                              */
3262                         size = index * sizeof(struct ocfs2_extent_rec);
3263                         memmove(&el->l_recs[1], &el->l_recs[0], size);
3264                 }
3265 
3266                 /*
3267                  * Always memset - the caller doesn't check whether it
3268                  * created an empty extent, so there could be junk in
3269                  * the other fields.
3270                  */
3271                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
3272         }
3273 }
3274 
     /*
      * Build a path to the leaf immediately to the right of "left_path".
      *
      * On success 0 is returned and *ret_right_path holds a newly allocated
      * path which the caller must release with ocfs2_free_path(). On failure
      * a non-zero error is returned, the new path (if any) is freed here and
      * *ret_right_path is left NULL.
      *
      * BUGs if the tree has depth 0, if the left leaf is not full, or if no
      * leaf to the right is found (right_cpos == 0).
      */
3275 static int ocfs2_get_right_path(struct ocfs2_extent_tree *et,
3276                                 struct ocfs2_path *left_path,
3277                                 struct ocfs2_path **ret_right_path)
3278 {
3279         int ret;
3280         u32 right_cpos;
3281         struct ocfs2_path *right_path = NULL;
3282         struct ocfs2_extent_list *left_el;
3283 
             /* Make the out-parameter well defined on every error path. */
3284         *ret_right_path = NULL;
3285 
3286         /* This function shouldn't be called for non-trees. */
3287         BUG_ON(left_path->p_tree_depth == 0);
3288 
             /*
              * The left leaf must be full. The two fields are compared in
              * their stored form; an equality test does not depend on byte
              * order.
              */
3289         left_el = path_leaf_el(left_path);
3290         BUG_ON(left_el->l_next_free_rec != left_el->l_count);
3291 
3292         ret = ocfs2_find_cpos_for_right_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3293                                              left_path, &right_cpos);
3294         if (ret) {
3295                 mlog_errno(ret);
3296                 goto out;
3297         }
3298 
3299         /* This function shouldn't be called for the rightmost leaf. */
3300         BUG_ON(right_cpos == 0);
3301 
3302         right_path = ocfs2_new_path_from_path(left_path);
3303         if (!right_path) {
3304                 ret = -ENOMEM;
3305                 mlog_errno(ret);
3306                 goto out;
3307         }
3308 
             /* Walk the tree down to the leaf that covers right_cpos. */
3309         ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
3310         if (ret) {
3311                 mlog_errno(ret);
3312                 goto out;
3313         }
3314 
3315         *ret_right_path = right_path;
3316 out:
             /* right_path may still be NULL when an early step failed. */
3317         if (ret)
3318                 ocfs2_free_path(right_path);
3319         return ret;
3320 }
3321 
3322 /*
3323  * Remove split_rec clusters from the record at index and merge them
3324  * onto the beginning of the record "next" to it.
3325  * For index < l_count - 1, the next means the extent rec at index + 1.
3326  * For index == l_count - 1, the "next" means the 1st extent rec of the
3327  * next extent block.
      *
      * Only the cluster count of split_rec is used. The record at index is
      * shortened by that many clusters and the "next" record is extended
      * backwards by the same amount (its e_cpos and e_blkno move down, its
      * length grows). If the record at index ends up with no clusters,
      * ocfs2_cleanup_merge() turns it into the leaf's empty extent.
      *
      * Returns 0 on success or the error from looking up the right path,
      * extending the transaction or requesting journal access.
3328  */
3329 static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
3330                                  handle_t *handle,
3331                                  struct ocfs2_extent_tree *et,
3332                                  struct ocfs2_extent_rec *split_rec,
3333                                  int index)
3334 {
3335         int ret, next_free, i;
3336         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3337         struct ocfs2_extent_rec *left_rec;
3338         struct ocfs2_extent_rec *right_rec;
3339         struct ocfs2_extent_list *right_el;
             /* Stays NULL unless the merge crosses into the next leaf. */
3340         struct ocfs2_path *right_path = NULL;
3341         int subtree_index = 0;
3342         struct ocfs2_extent_list *el = path_leaf_el(left_path);
3343         struct buffer_head *bh = path_leaf_bh(left_path);
3344         struct buffer_head *root_bh = NULL;
3345 
3346         BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
3347         left_rec = &el->l_recs[index];
3348 
             /*
              * index is the last used record and the leaf is completely
              * full: the neighbour to merge into lives in the next leaf.
              */
3349         if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
3350             le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
3351                 /* we meet with a cross extent block merge. */
3352                 ret = ocfs2_get_right_path(et, left_path, &right_path);
3353                 if (ret) {
3354                         mlog_errno(ret);
3355                         return ret;
3356                 }
3357 
                     /*
                      * Pick the first real record of the right leaf,
                      * skipping a leading empty extent if there is one.
                      */
3358                 right_el = path_leaf_el(right_path);
3359                 next_free = le16_to_cpu(right_el->l_next_free_rec);
3360                 BUG_ON(next_free <= 0);
3361                 right_rec = &right_el->l_recs[0];
3362                 if (ocfs2_is_empty_extent(right_rec)) {
3363                         BUG_ON(next_free <= 1);
3364                         right_rec = &right_el->l_recs[1];
3365                 }
3366 
                     /* The two records must be logically adjacent. */
3367                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3368                        le16_to_cpu(left_rec->e_leaf_clusters) !=
3369                        le32_to_cpu(right_rec->e_cpos));
3370 
3371                 subtree_index = ocfs2_find_subtree_root(et, left_path,
3372                                                         right_path);
3373 
3374                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3375                                                       handle->h_buffer_credits,
3376                                                       right_path);
3377                 if (ret) {
3378                         mlog_errno(ret);
3379                         goto out;
3380                 }
3381 
                     /* Both paths must share the buffer at the subtree root. */
3382                 root_bh = left_path->p_node[subtree_index].bh;
3383                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3384 
                     /*
                      * Request journal access for the subtree root and then
                      * for every node below it on both paths, before any of
                      * them is modified.
                      */
3385                 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3386                                                    subtree_index);
3387                 if (ret) {
3388                         mlog_errno(ret);
3389                         goto out;
3390                 }
3391 
3392                 for (i = subtree_index + 1;
3393                      i < path_num_items(right_path); i++) {
3394                         ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3395                                                            right_path, i);
3396                         if (ret) {
3397                                 mlog_errno(ret);
3398                                 goto out;
3399                         }
3400 
3401                         ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3402                                                            left_path, i);
3403                         if (ret) {
3404                                 mlog_errno(ret);
3405                                 goto out;
3406                         }
3407                 }
3408 
3409         } else {
                     /*
                      * In-leaf merge: there has to be a record after index
                      * in this same leaf.
                      */
3410                 BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
3411                 right_rec = &el->l_recs[index + 1];
3412         }
3413 
             /* The left leaf is modified in both cases. */
3414         ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path,
3415                                            path_num_items(left_path) - 1);
3416         if (ret) {
3417                 mlog_errno(ret);
3418                 goto out;
3419         }
3420 
             /* Take split_clusters off the tail of the left record ... */
3421         le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
3422 
             /*
              * ... and prepend them to the right record: its logical and
              * physical start both move back, and its length grows.
              */
3423         le32_add_cpu(&right_rec->e_cpos, -split_clusters);
3424         le64_add_cpu(&right_rec->e_blkno,
3425                      -ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
3426                                                split_clusters));
3427         le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
3428 
             /* If the left record is now empty, move it to slot 0. */
3429         ocfs2_cleanup_merge(el, index);
3430 
3431         ocfs2_journal_dirty(handle, bh);
3432         if (right_path) {
3433                 ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3434                 ocfs2_complete_edge_insert(handle, left_path, right_path,
3435                                            subtree_index);
3436         }
3437 out:
             /* right_path is NULL for an in-leaf merge. */
3438         ocfs2_free_path(right_path);
3439         return ret;
3440 }
3441 
     /*
      * Build a path to the leaf immediately to the left of "right_path".
      *
      * On success 0 is returned and *ret_left_path holds a newly allocated
      * path which the caller must release with ocfs2_free_path(). On failure
      * a non-zero error is returned, the new path (if any) is freed here and
      * *ret_left_path is left NULL.
      *
      * BUGs if the tree has depth 0 or if no leaf to the left is found
      * (left_cpos == 0).
      */
3442 static int ocfs2_get_left_path(struct ocfs2_extent_tree *et,
3443                                struct ocfs2_path *right_path,
3444                                struct ocfs2_path **ret_left_path)
3445 {
3446         int ret;
3447         u32 left_cpos;
3448         struct ocfs2_path *left_path = NULL;
3449 
             /* Make the out-parameter well defined on every error path. */
3450         *ret_left_path = NULL;
3451 
3452         /* This function shouldn't be called for non-trees. */
3453         BUG_ON(right_path->p_tree_depth == 0);
3454 
3455         ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3456                                             right_path, &left_cpos);
3457         if (ret) {
3458                 mlog_errno(ret);
3459                 goto out;
3460         }
3461 
3462         /* This function shouldn't be called for the leftmost leaf. */
3463         BUG_ON(left_cpos == 0);
3464 
3465         left_path = ocfs2_new_path_from_path(right_path);
3466         if (!left_path) {
3467                 ret = -ENOMEM;
3468                 mlog_errno(ret);
3469                 goto out;
3470         }
3471 
             /* Walk the tree down to the leaf that covers left_cpos. */
3472         ret = ocfs2_find_path(et->et_ci, left_path, left_cpos);
3473         if (ret) {
3474                 mlog_errno(ret);
3475                 goto out;
3476         }
3477 
3478         *ret_left_path = left_path;
3479 out:
             /* left_path may still be NULL when an early step failed. */
3480         if (ret)
3481                 ocfs2_free_path(left_path);
3482         return ret;
3483 }
3484 
3485 /*
3486  * Remove split_rec clusters from the record at index and merge them
3487  * onto the tail of the record "before" it.
3488  * For index > 0, the "before" means the extent rec at index - 1.
3489  *
3490  * For index == 0, the "before" means the last record of the previous
3491  * extent block. And there is also a situation that we may need to
3492  * remove the rightmost leaf extent block in the right_path and change
3493  * the right path to indicate the new rightmost path.
      *
      * The record at index has its start (e_cpos, e_blkno) advanced and its
      * length reduced by split_rec's cluster count; the "before" record is
      * lengthened by the same amount, or simply overwritten with split_rec
      * when it is the leaf's empty extent.
      *
      * Returns 0 on success or the error from looking up the left path,
      * extending the transaction, requesting journal access or removing
      * the rightmost path.
3494  */
3495 static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
3496                                 handle_t *handle,
3497                                 struct ocfs2_extent_tree *et,
3498                                 struct ocfs2_extent_rec *split_rec,
3499                                 struct ocfs2_cached_dealloc_ctxt *dealloc,
3500                                 int index)
3501 {
3502         int ret, i, subtree_index = 0, has_empty_extent = 0;
3503         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3504         struct ocfs2_extent_rec *left_rec;
3505         struct ocfs2_extent_rec *right_rec;
3506         struct ocfs2_extent_list *el = path_leaf_el(right_path);
3507         struct buffer_head *bh = path_leaf_bh(right_path);
3508         struct buffer_head *root_bh = NULL;
             /* Stays NULL unless the merge crosses into the previous leaf. */
3509         struct ocfs2_path *left_path = NULL;
3510         struct ocfs2_extent_list *left_el;
3511 
3512         BUG_ON(index < 0);
3513 
3514         right_rec = &el->l_recs[index];
3515         if (index == 0) {
3516                 /* we meet with a cross extent block merge. */
3517                 ret = ocfs2_get_left_path(et, right_path, &left_path);
3518                 if (ret) {
3519                         mlog_errno(ret);
3520                         return ret;
3521                 }
3522 
                     /* The previous leaf is required to be full. */
3523                 left_el = path_leaf_el(left_path);
3524                 BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
3525                        le16_to_cpu(left_el->l_count));
3526 
                     /*
                      * Merge target is the last record of the previous leaf;
                      * it must end exactly where split_rec begins.
                      */
3527                 left_rec = &left_el->l_recs[
3528                                 le16_to_cpu(left_el->l_next_free_rec) - 1];
3529                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3530                        le16_to_cpu(left_rec->e_leaf_clusters) !=
3531                        le32_to_cpu(split_rec->e_cpos));
3532 
3533                 subtree_index = ocfs2_find_subtree_root(et, left_path,
3534                                                         right_path);
3535 
3536                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3537                                                       handle->h_buffer_credits,
3538                                                       left_path);
3539                 if (ret) {
3540                         mlog_errno(ret);
3541                         goto out;
3542                 }
3543 
                     /* Both paths must share the buffer at the subtree root. */
3544                 root_bh = left_path->p_node[subtree_index].bh;
3545                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3546 
                     /*
                      * Request journal access for the subtree root and then
                      * for every node below it on both paths, before any of
                      * them is modified.
                      */
3547                 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3548                                                    subtree_index);
3549                 if (ret) {
3550                         mlog_errno(ret);
3551                         goto out;
3552                 }
3553 
3554                 for (i = subtree_index + 1;
3555                      i < path_num_items(right_path); i++) {
3556                         ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3557                                                            right_path, i);
3558                         if (ret) {
3559                                 mlog_errno(ret);
3560                                 goto out;
3561                         }
3562 
3563                         ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3564                                                            left_path, i);
3565                         if (ret) {
3566                                 mlog_errno(ret);
3567                                 goto out;
3568                         }
3569                 }
3570         } else {
                     /*
                      * In-leaf merge. Remember whether slot 0 holds an empty
                      * extent; with index == 1 that slot is the merge target.
                      */
3571                 left_rec = &el->l_recs[index - 1];
3572                 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3573                         has_empty_extent = 1;
3574         }
3575 
             /* The right leaf is modified in both cases. */
3576         ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3577                                            path_num_items(right_path) - 1);
3578         if (ret) {
3579                 mlog_errno(ret);
3580                 goto out;
3581         }
3582 
3583         if (has_empty_extent && index == 1) {
3584                 /*
3585                  * The easy case - we can just plop the record right in.
3586                  */
3587                 *left_rec = *split_rec;
3588         } else
3589                 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
3590 
             /*
              * Trim split_clusters off the front of the right record: its
              * logical and physical start move forward, its length shrinks.
              */
3591         le32_add_cpu(&right_rec->e_cpos, split_clusters);
3592         le64_add_cpu(&right_rec->e_blkno,
3593                      ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
3594                                               split_clusters));
3595         le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
3596 
             /* If the right record is now empty, move it to slot 0. */
3597         ocfs2_cleanup_merge(el, index);
3598 
3599         ocfs2_journal_dirty(handle, bh);
3600         if (left_path) {
3601                 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3602 
3603                 /*
3604                  * In the situation that the right_rec is empty and the extent
3605                  * block is empty also,  ocfs2_complete_edge_insert can't handle
3606                  * it and we need to delete the right extent block.
3607                  */
3608                 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3609                     le16_to_cpu(el->l_next_free_rec) == 1) {
3610                         /* extend credit for ocfs2_remove_rightmost_path */
3611                         ret = ocfs2_extend_rotate_transaction(handle, 0,
3612                                         handle->h_buffer_credits,
3613                                         right_path);
3614                         if (ret) {
3615                                 mlog_errno(ret);
3616                                 goto out;
3617                         }
3618 
3619                         ret = ocfs2_remove_rightmost_path(handle, et,
3620                                                           right_path,
3621                                                           dealloc);
3622                         if (ret) {
3623                                 mlog_errno(ret);
3624                                 goto out;
3625                         }
3626 
3627                         /* Now the rightmost extent block has been deleted.
3628                          * So we use the new rightmost path.
3629                          */
                             /*
                              * NOTE(review): left_path is cleared so that the
                              * cleanup at "out" does not touch it. If
                              * ocfs2_mv_path() only moves the contents and
                              * leaves the ocfs2_path structure allocated,
                              * that structure is never freed - verify
                              * against ocfs2_mv_path().
                              */
3630                         ocfs2_mv_path(right_path, left_path);
3631                         left_path = NULL;
3632                 } else
3633                         ocfs2_complete_edge_insert(handle, left_path,
3634                                                    right_path, subtree_index);
3635         }
3636 out:
             /* left_path is NULL for an in-leaf merge. */
3637         ocfs2_free_path(left_path);
3638         return ret;
3639 }
3640 
     /*
      * Merge split_rec, which overlaps the record at split_index in the leaf
      * of "path", into its contiguous neighbour(s) as described by "ctxt".
      *
      * ctxt->c_contig_type must not be CONTIG_NONE. For CONTIG_LEFTRIGHT the
      * record is merged to the right first and then to the left; otherwise
      * CONTIG_RIGHT selects a left merge and anything else a right merge.
      * Empty extents produced along the way are rotated out of the leaf.
      *
      * Returns 0 on success or the error of the first failing step. Failures
      * of the final clean-up rotation (and, in the single-direction case, of
      * the credit extension preceding it) are logged but reported as 0.
      */
3641 static int ocfs2_try_to_merge_extent(handle_t *handle,
3642                                      struct ocfs2_extent_tree *et,
3643                                      struct ocfs2_path *path,
3644                                      int split_index,
3645                                      struct ocfs2_extent_rec *split_rec,
3646                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
3647                                      struct ocfs2_merge_ctxt *ctxt)
3648 {
3649         int ret = 0;
3650         struct ocfs2_extent_list *el = path_leaf_el(path);
             /*
              * rec is only consumed by the CONTIG_LEFTRIGHT branch, which
              * re-reads it after its first rotation.
              */
3651         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3652 
3653         BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
3654 
3655         if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
3656                 /* extend credit for ocfs2_remove_rightmost_path */
3657                 ret = ocfs2_extend_rotate_transaction(handle, 0,
3658                                 handle->h_buffer_credits,
3659                                 path);
3660                 if (ret) {
3661                         mlog_errno(ret);
3662                         goto out;
3663                 }
3664                 /*
3665                  * The merge code will need to create an empty
3666                  * extent to take the place of the newly
3667                  * emptied slot. Remove any pre-existing empty
3668                  * extents - having more than one in a leaf is
3669                  * illegal.
3670                  */
3671                 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3672                 if (ret) {
3673                         mlog_errno(ret);
3674                         goto out;
3675                 }
                     /*
                      * Removing the leading empty extent moved every record
                      * one slot to the left, so follow our record.
                      */
3676                 split_index--;
3677                 rec = &el->l_recs[split_index];
3678         }
3679 
3680         if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
3681                 /*
3682                  * Left-right contig implies this.
3683                  */
3684                 BUG_ON(!ctxt->c_split_covers_rec);
3685 
3686                 /*
3687                  * Since the leftright insert always covers the entire
3688                  * extent, this call will delete the insert record
3689                  * entirely, resulting in an empty extent record added to
3690                  * the extent block.
3691                  *
3692                  * Since the adding of an empty extent shifts
3693                  * everything back to the right, there's no need to
3694                  * update split_index here.
3695                  *
3696                  * When the split_index is zero, we need to merge it to the
3697                  * previous extent block. It is more efficient and easier
3698                  * if we do merge_right first and merge_left later.
3699                  */
3700                 ret = ocfs2_merge_rec_right(path, handle, et, split_rec,
3701                                             split_index);
3702                 if (ret) {
3703                         mlog_errno(ret);
3704                         goto out;
3705                 }
3706 
3707                 /*
3708                  * We can only get this from logic error above.
3709                  */
3710                 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3711 
3712                 /* extend credit for ocfs2_remove_rightmost_path */
3713                 ret = ocfs2_extend_rotate_transaction(handle, 0,
3714                                         handle->h_buffer_credits,
3715                                         path);
3716                 if (ret) {
3717                         mlog_errno(ret);
3718                         goto out;
3719                 }
3720 
3721                 /* The merge left us with an empty extent, remove it. */
3722                 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3723                 if (ret) {
3724                         mlog_errno(ret);
3725                         goto out;
3726                 }
3727 
                     /*
                      * Re-read the record at split_index now that the leaf
                      * has been rotated; it is the one merged to the left.
                      */
3728                 rec = &el->l_recs[split_index];
3729 
3730                 /*
3731                  * Note that we don't pass split_rec here on purpose -
3732                  * we've merged it into the rec already.
3733                  */
3734                 ret = ocfs2_merge_rec_left(path, handle, et, rec,
3735                                            dealloc, split_index);
3736 
3737                 if (ret) {
3738                         mlog_errno(ret);
3739                         goto out;
3740                 }
3741 
3742                 /* extend credit for ocfs2_remove_rightmost_path */
3743                 ret = ocfs2_extend_rotate_transaction(handle, 0,
3744                                 handle->h_buffer_credits,
3745                                 path);
3746                 if (ret) {
3747                         mlog_errno(ret);
3748                         goto out;
3749                 }
3750 
3751                 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3752                 /*
3753                  * Error from this last rotate is not critical, so
3754                  * print but don't bubble it up.
3755                  */
3756                 if (ret)
3757                         mlog_errno(ret);
3758                 ret = 0;
3759         } else {
3760                 /*
3761                  * Merge a record to the left or right.
3762                  *
3763                  * 'contig_type' is relative to the existing record,
3764                  * so for example, if we're "right contig", it's to
3765                  * the record on the left (hence the left merge).
3766                  */
3767                 if (ctxt->c_contig_type == CONTIG_RIGHT) {
3768                         ret = ocfs2_merge_rec_left(path, handle, et,
3769                                                    split_rec, dealloc,
3770                                                    split_index);
3771                         if (ret) {
3772                                 mlog_errno(ret);
3773                                 goto out;
3774                         }
3775                 } else {
3776                         ret = ocfs2_merge_rec_right(path, handle,
3777                                                     et, split_rec,
3778                                                     split_index);
3779                         if (ret) {
3780                                 mlog_errno(ret);
3781                                 goto out;
3782                         }
3783                 }
3784 
3785                 if (ctxt->c_split_covers_rec) {
3786                         /* extend credit for ocfs2_remove_rightmost_path */
3787                         ret = ocfs2_extend_rotate_transaction(handle, 0,
3788                                         handle->h_buffer_credits,
3789                                         path);
                             /*
                              * The merge itself already succeeded, so a
                              * failure here is logged and reported as
                              * success; the clean-up rotation is skipped.
                              */
3790                         if (ret) {
3791                                 mlog_errno(ret);
3792                                 ret = 0;
3793                                 goto out;
3794                         }
3795 
3796                         /*
3797                          * The merge may have left an empty extent in
3798                          * our leaf. Try to rotate it away.
3799                          */
3800                         ret = ocfs2_rotate_tree_left(handle, et, path,
3801                                                      dealloc);
                             /* Best effort only: log, then report success. */
3802                         if (ret)
3803                                 mlog_errno(ret);
3804                         ret = 0;
3805                 }
3806         }
3807 
3808 out:
3809         return ret;
3810 }
3811 
     /*
      * Shrink "rec" by the number of clusters held in "split_rec".
      *
      * With split == SPLIT_LEFT the clusters are taken from the front of
      * rec: e_cpos and e_blkno move forward and the length drops. For any
      * other split value they are taken from the tail: only the length
      * drops. "sb" is used solely to convert clusters to blocks.
      */
3812 static void ocfs2_subtract_from_rec(struct super_block *sb,
3813                                     enum ocfs2_split_type split,
3814                                     struct ocfs2_extent_rec *rec,
3815                                     struct ocfs2_extent_rec *split_rec)
3816 {
3817         u64 len_blocks;
3818 
             /* Size of the removed region in blocks; used for SPLIT_LEFT only. */
3819         len_blocks = ocfs2_clusters_to_blocks(sb,
3820                                 le16_to_cpu(split_rec->e_leaf_clusters));
3821 
3822         if (split == SPLIT_LEFT) {
3823                 /*
3824                  * Region is on the left edge of the existing
3825                  * record.
3826                  */
3827                 le32_add_cpu(&rec->e_cpos,
3828                              le16_to_cpu(split_rec->e_leaf_clusters));
3829                 le64_add_cpu(&rec->e_blkno, len_blocks);
3830                 le16_add_cpu(&rec->e_leaf_clusters,
3831                              -le16_to_cpu(split_rec->e_leaf_clusters));
3832         } else {
3833                 /*
3834                  * Region is on the right edge of the existing
3835                  * record.
3836                  */
3837                 le16_add_cpu(&rec->e_leaf_clusters,
3838                              -le16_to_cpu(split_rec->e_leaf_clusters));
3839         }
3840 }
3841 
3842 /*
3843  * Do the final bits of extent record insertion at the target leaf
3844  * list. If this leaf is part of an allocation tree, it is assumed
3845  * that the tree above has been prepared.
3846  */
3847 static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et,
3848                                  struct ocfs2_extent_rec *insert_rec,
3849                                  struct ocfs2_extent_list *el,
3850                                  struct ocfs2_insert_type *insert)
3851 {
3852         int i = insert->ins_contig_index;
3853         unsigned int range;
3854         struct ocfs2_extent_rec *rec;
3855 
3856         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3857 
3858         if (insert->ins_split != SPLIT_NONE) {
3859                 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3860                 BUG_ON(i == -1);
3861                 rec = &el->l_recs[i];
3862                 ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
3863                                         insert->ins_split, rec,
3864                                         insert_rec);
3865                 goto rotate;
3866         }
3867 
3868         /*
3869          * Contiguous insert - either left or right.
3870          */
3871         if (insert->ins_contig != CONTIG_NONE) {
3872                 rec = &el->l_recs[i];
3873                 if (insert->ins_contig == CONTIG_LEFT) {
3874                         rec->e_blkno = insert_rec->e_blkno;
3875                         rec->e_cpos = insert_rec->e_cpos;
3876                 }
3877                 le16_add_cpu(&rec->e_leaf_clusters,
3878                              le16_to_cpu(insert_rec->e_leaf_clusters));
3879                 return;
3880         }
3881 
3882         /*
3883          * Handle insert into an empty leaf.
3884          */
3885         if (le16_to_cpu(el->l_next_free_rec) == 0 ||
3886             ((le16_to_cpu(el->l_next_free_rec) == 1) &&
3887              ocfs2_is_empty_extent(&el->l_recs[0]))) {
3888                 el->l_recs[0] = *insert_rec;
3889                 el->l_next_free_rec = cpu_to_le16(1);
3890                 return;
3891         }
3892 
3893         /*
3894          * Appending insert.
3895          */
3896         if (insert->ins_appending == APPEND_TAIL) {
3897                 i = le16_to_cpu(el->l_next_free_rec) - 1;
3898                 rec = &el->l_recs[i];
3899                 range = le32_to_cpu(rec->e_cpos)
3900                         + le16_to_cpu(rec->e_leaf_clusters);
3901                 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3902 
3903                 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3904                                 le16_to_cpu(el->l_count),
3905                                 "owner %llu, depth %u, count %u, next free %u, "
3906                                 "rec.cpos %u, rec.clusters %u, "
3907                                 "insert.cpos %u, insert.clusters %u\n",
3908                                 ocfs2_metadata_cache_owner(et->et_ci),
3909                                 le16_to_cpu(el->l_tree_depth),
3910                                 le16_to_cpu(el->l_count),
3911                                 le16_to_cpu(el->l_next_free_rec),
3912                                 le32_to_cpu(el->l_recs[i].e_cpos),
3913                                 le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3914                                 le32_to_cpu(insert_rec->e_cpos),
3915                                 le16_to_cpu(insert_rec->e_leaf_clusters));
3916                 i++;
3917                 el->l_recs[i] = *insert_rec;
3918                 le16_add_cpu(&el->l_next_free_rec, 1);
3919                 return;
3920         }
3921 
3922 rotate:
3923         /*
3924          * Ok, we have to rotate.
3925          *
3926          * At this point, it is safe to assume that inserting into an
3927          * empty leaf and appending to a leaf have both been handled
3928          * above.
3929          *
3930          * This leaf needs to have space, either by the empty 1st
3931          * extent record, or by virtue of an l_next_rec < l_count.
3932          */
3933         ocfs2_rotate_leaf(el, insert_rec);
3934 }
3935 
3936 static void ocfs2_adjust_rightmost_records(handle_t *handle,
3937                                            struct ocfs2_extent_tree *et,
3938                                            struct ocfs2_path *path,
3939                                            struct ocfs2_extent_rec *insert_rec)
3940 {
3941         int ret, i, next_free;
3942         struct buffer_head *bh;
3943         struct ocfs2_extent_list *el;
3944         struct ocfs2_extent_rec *rec;
3945 
3946         /*
3947          * Update everything except the leaf block.
3948          */
3949         for (i = 0; i < path->p_tree_depth; i++) {
3950                 bh = path->p_node[i].bh;
3951                 el = path->p_node[i].el;
3952 
3953                 next_free = le16_to_cpu(el->l_next_free_rec);
3954                 if (next_free == 0) {
3955                         ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
3956                                     "Owner %llu has a bad extent list\n",
3957                                     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
3958                         ret = -EIO;
3959                         return;
3960                 }
3961 
3962                 rec = &el->l_recs[next_free - 1];
3963 
3964                 rec->e_int_clusters = insert_rec->e_cpos;
3965                 le32_add_cpu(&rec->e_int_clusters,
3966                              le16_to_cpu(insert_rec->e_leaf_clusters));
3967                 le32_add_cpu(&rec->e_int_clusters,
3968                              -le32_to_cpu(rec->e_cpos));
3969 
3970                 ocfs2_journal_dirty(handle, bh);
3971         }
3972 }
3973 
3974 static int ocfs2_append_rec_to_path(handle_t *handle,
3975                                     struct ocfs2_extent_tree *et,
3976                                     struct ocfs2_extent_rec *insert_rec,
3977                                     struct ocfs2_path *right_path,
3978                                     struct ocfs2_path **ret_left_path)
3979 {
3980         int ret, next_free;
3981         struct ocfs2_extent_list *el;
3982         struct ocfs2_path *left_path = NULL;
3983 
3984         *ret_left_path = NULL;
3985 
3986         /*
3987          * This shouldn't happen for non-trees. The extent rec cluster
3988          * count manipulation below only works for interior nodes.
3989          */
3990         BUG_ON(right_path->p_tree_depth == 0);
3991 
3992         /*
3993          * If our appending insert is at the leftmost edge of a leaf,
3994          * then we might need to update the rightmost records of the
3995          * neighboring path.
3996          */
3997         el = path_leaf_el(right_path);
3998         next_free = le16_to_cpu(el->l_next_free_rec);
3999         if (next_free == 0 ||
4000             (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
4001                 u32 left_cpos;
4002 
4003                 ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
4004                                                     right_path, &left_cpos);
4005                 if (ret) {
4006                         mlog_errno(ret);
4007                         goto out;
4008                 }
4009 
4010                 trace_ocfs2_append_rec_to_path(
4011                         (unsigned long long)
4012                         ocfs2_metadata_cache_owner(et->et_ci),
4013                         le32_to_cpu(insert_rec->e_cpos),
4014                         left_cpos);
4015 
4016                 /*
4017                  * No need to worry if the append is already in the
4018                  * leftmost leaf.
4019                  */
4020                 if (left_cpos) {
4021                         left_path = ocfs2_new_path_from_path(right_path);
4022                         if (!left_path) {
4023                                 ret = -ENOMEM;
4024                                 mlog_errno(ret);
4025                                 goto out;
4026                         }
4027 
4028                         ret = ocfs2_find_path(et->et_ci, left_path,
4029                                               left_cpos);
4030                         if (ret) {
4031                                 mlog_errno(ret);
4032                                 goto out;
4033                         }
4034 
4035                         /*
4036                          * ocfs2_insert_path() will pass the left_path to the
4037                          * journal for us.
4038                          */
4039                 }
4040         }
4041 
4042         ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
4043         if (ret) {
4044                 mlog_errno(ret);
4045                 goto out;
4046         }
4047 
4048         ocfs2_adjust_rightmost_records(handle, et, right_path, insert_rec);
4049 
4050         *ret_left_path = left_path;
4051         ret = 0;
4052 out:
4053         if (ret != 0)
4054                 ocfs2_free_path(left_path);
4055 
4056         return ret;
4057 }
4058 
4059 static void ocfs2_split_record(struct ocfs2_extent_tree *et,
4060                                struct ocfs2_path *left_path,
4061                                struct ocfs2_path *right_path,
4062                                struct ocfs2_extent_rec *split_rec,
4063                                enum ocfs2_split_type split)
4064 {
4065         int index;
4066         u32 cpos = le32_to_cpu(split_rec->e_cpos);
4067         struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
4068         struct ocfs2_extent_rec *rec, *tmprec;
4069 
4070         right_el = path_leaf_el(right_path);
4071         if (left_path)
4072                 left_el = path_leaf_el(left_path);
4073 
4074         el = right_el;
4075         insert_el = right_el;
4076         index = ocfs2_search_extent_list(el, cpos);
4077         if (index != -1) {
4078                 if (index == 0 && left_path) {
4079                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
4080 
4081                         /*
4082                          * This typically means that the record
4083                          * started in the left path but moved to the
4084                          * right as a result of rotation. We either
4085                          * move the existing record to the left, or we
4086                          * do the later insert there.
4087                          *
4088                          * In this case, the left path should always
4089                          * exist as the rotate code will have passed
4090                          * it back for a post-insert update.
4091                          */
4092 
4093                         if (split == SPLIT_LEFT) {
4094                                 /*
4095                                  * It's a left split. Since we know
4096                                  * that the rotate code gave us an
4097                                  * empty extent in the left path, we
4098                                  * can just do the insert there.
4099                                  */
4100                                 insert_el = left_el;
4101                         } else {
4102                                 /*
4103                                  * Right split - we have to move the
4104                                  * existing record over to the left
4105                                  * leaf. The insert will be into the
4106                                  * newly created empty extent in the
4107                                  * right leaf.
4108                                  */
4109                                 tmprec = &right_el->l_recs[index];
4110                                 ocfs2_rotate_leaf(left_el, tmprec);
4111                                 el = left_el;
4112 
4113                                 memset(tmprec, 0, sizeof(*tmprec));
4114                                 index = ocfs2_search_extent_list(left_el, cpos);
4115                                 BUG_ON(index == -1);
4116                         }
4117                 }
4118         } else {
4119                 BUG_ON(!left_path);
4120                 BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
4121                 /*
4122                  * Left path is easy - we can just allow the insert to
4123                  * happen.
4124                  */
4125                 el = left_el;
4126                 insert_el = left_el;
4127                 index = ocfs2_search_extent_list(el, cpos);
4128                 BUG_ON(index == -1);
4129         }
4130 
4131         rec = &el->l_recs[index];
4132         ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
4133                                 split, rec, split_rec);
4134         ocfs2_rotate_leaf(insert_el, split_rec);
4135 }
4136 
4137 /*
4138  * This function only does inserts on an allocation b-tree. For tree
4139  * depth = 0, ocfs2_insert_at_leaf() is called directly.
4140  *
4141  * right_path is the path we want to do the actual insert
4142  * in. left_path should only be passed in if we need to update that
4143  * portion of the tree after an edge insert.
4144  */
4145 static int ocfs2_insert_path(handle_t *handle,
4146                              struct ocfs2_extent_tree *et,
4147                              struct ocfs2_path *left_path,
4148                              struct ocfs2_path *right_path,
4149                              struct ocfs2_extent_rec *insert_rec,
4150                              struct ocfs2_insert_type *insert)
4151 {
4152         int ret, subtree_index;
4153         struct buffer_head *leaf_bh = path_leaf_bh(right_path);
4154 
4155         if (left_path) {
4156                 /*
4157                  * There's a chance that left_path got passed back to
4158                  * us without being accounted for in the
4159                  * journal. Extend our transaction here to be sure we
4160                  * can change those blocks.
4161                  */
4162                 ret = ocfs2_extend_trans(handle, left_path->p_tree_depth);
4163                 if (ret < 0) {
4164                         mlog_errno(ret);
4165                         goto out;
4166                 }
4167 
4168                 ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
4169                 if (ret < 0) {
4170                         mlog_errno(ret);
4171                         goto out;
4172                 }
4173         }
4174 
4175         /*
4176          * Pass both paths to the journal. The majority of inserts
4177          * will be touching all components anyway.
4178          */
4179         ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
4180         if (ret < 0) {
4181                 mlog_errno(ret);
4182                 goto out;
4183         }
4184 
4185         if (insert->ins_split != SPLIT_NONE) {
4186                 /*
4187                  * We could call ocfs2_insert_at_leaf() for some types
4188                  * of splits, but it's easier to just let one separate
4189                  * function sort it all out.
4190                  */
4191                 ocfs2_split_record(et, left_path, right_path,
4192                                    insert_rec, insert->ins_split);
4193 
4194                 /*
4195                  * Split might have modified either leaf and we don't
4196                  * have a guarantee that the later edge insert will
4197                  * dirty this for us.
4198                  */
4199                 if (left_path)
4200                         ocfs2_journal_dirty(handle,
4201                                             path_leaf_bh(left_path));
4202         } else
4203                 ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
4204                                      insert);
4205 
4206         ocfs2_journal_dirty(handle, leaf_bh);
4207 
4208         if (left_path) {
4209                 /*
4210                  * The rotate code has indicated that we need to fix
4211                  * up portions of the tree after the insert.
4212                  *
4213                  * XXX: Should we extend the transaction here?
4214                  */
4215                 subtree_index = ocfs2_find_subtree_root(et, left_path,
4216                                                         right_path);
4217                 ocfs2_complete_edge_insert(handle, left_path, right_path,
4218                                            subtree_index);
4219         }
4220 
4221         ret = 0;
4222 out:
4223         return ret;
4224 }
4225 
4226 static int ocfs2_do_insert_extent(handle_t *handle,
4227                                   struct ocfs2_extent_tree *et,
4228                                   struct ocfs2_extent_rec *insert_rec,
4229                                   struct ocfs2_insert_type *type)
4230 {
4231         int ret, rotate = 0;
4232         u32 cpos;
4233         struct ocfs2_path *right_path = NULL;
4234         struct ocfs2_path *left_path = NULL;
4235         struct ocfs2_extent_list *el;
4236 
4237         el = et->et_root_el;
4238 
4239         ret = ocfs2_et_root_journal_access(handle, et,
4240                                            OCFS2_JOURNAL_ACCESS_WRITE);
4241         if (ret) {
4242                 mlog_errno(ret);
4243                 goto out;
4244         }
4245 
4246         if (le16_to_cpu(el->l_tree_depth) == 0) {
4247                 ocfs2_insert_at_leaf(et, insert_rec, el, type);
4248                 goto out_update_clusters;
4249         }
4250 
4251         right_path = ocfs2_new_path_from_et(et);
4252         if (!right_path) {
4253                 ret = -ENOMEM;
4254                 mlog_errno(ret);
4255                 goto out;
4256         }
4257 
4258         /*
4259          * Determine the path to start with. Rotations need the
4260          * rightmost path, everything else can go directly to the
4261          * target leaf.
4262          */
4263         cpos = le32_to_cpu(insert_rec->e_cpos);
4264         if (type->ins_appending == APPEND_NONE &&
4265             type->ins_contig == CONTIG_NONE) {
4266                 rotate = 1;
4267                 cpos = UINT_MAX;
4268         }
4269 
4270         ret = ocfs2_find_path(et->et_ci, right_path, cpos);
4271         if (ret) {
4272                 mlog_errno(ret);
4273                 goto out;
4274         }
4275 
4276         /*
4277          * Rotations and appends need special treatment - they modify
4278          * parts of the tree's above them.
4279          *
4280          * Both might pass back a path immediate to the left of the
4281          * one being inserted to. This will be cause
4282          * ocfs2_insert_path() to modify the rightmost records of
4283          * left_path to account for an edge insert.
4284          *
4285          * XXX: When modifying this code, keep in mind that an insert
4286          * can wind up skipping both of these two special cases...
4287          */
4288         if (rotate) {
4289                 ret = ocfs2_rotate_tree_right(handle, et, type->ins_split,
4290                                               le32_to_cpu(insert_rec->e_cpos),
4291                                               right_path, &left_path);
4292                 if (ret) {
4293                         mlog_errno(ret);
4294                         goto out;
4295                 }
4296 
4297                 /*
4298                  * ocfs2_rotate_tree_right() might have extended the
4299                  * transaction without re-journaling our tree root.
4300                  */
4301                 ret = ocfs2_et_root_journal_access(handle, et,
4302                                                    OCFS2_JOURNAL_ACCESS_WRITE);
4303                 if (ret) {
4304                         mlog_errno(ret);
4305                         goto out;
4306                 }
4307         } else if (type->ins_appending == APPEND_TAIL
4308                    && type->ins_contig != CONTIG_LEFT) {
4309                 ret = ocfs2_append_rec_to_path(handle, et, insert_rec,
4310                                                right_path, &left_path);
4311                 if (ret) {
4312                         mlog_errno(ret);
4313                         goto out;
4314                 }
4315         }
4316 
4317         ret = ocfs2_insert_path(handle, et, left_path, right_path,
4318                                 insert_rec, type);
4319         if (ret) {
4320                 mlog_errno(ret);
4321                 goto out;
4322         }
4323 
4324 out_update_clusters:
4325         if (type->ins_split == SPLIT_NONE)
4326                 ocfs2_et_update_clusters(et,
4327                                          le16_to_cpu(insert_rec->e_leaf_clusters));
4328 
4329         ocfs2_journal_dirty(handle, et->et_root_bh);
4330 
4331 out:
4332         ocfs2_free_path(left_path);
4333         ocfs2_free_path(right_path);
4334 
4335         return ret;
4336 }
4337 
4338 static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4339                                struct ocfs2_path *path,
4340                                struct ocfs2_extent_list *el, int index,
4341                                struct ocfs2_extent_rec *split_rec,
4342                                struct ocfs2_merge_ctxt *ctxt)
4343 {
4344         int status = 0;
4345         enum ocfs2_contig_type ret = CONTIG_NONE;
4346         u32 left_cpos, right_cpos;
4347         struct ocfs2_extent_rec *rec = NULL;
4348         struct ocfs2_extent_list *new_el;
4349         struct ocfs2_path *left_path = NULL, *right_path = NULL;
4350         struct buffer_head *bh;
4351         struct ocfs2_extent_block *eb;
4352         struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
4353 
4354         if (index > 0) {
4355                 rec = &el->l_recs[index - 1];
4356         } else if (path->p_tree_depth > 0) {
4357                 status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
4358                 if (status)
4359                         goto exit;
4360 
4361                 if (left_cpos != 0) {
4362                         left_path = ocfs2_new_path_from_path(path);
4363                         if (!left_path) {
4364                                 status = -ENOMEM;
4365                                 mlog_errno(status);
4366                                 goto exit;
4367                         }
4368 
4369                         status = ocfs2_find_path(et->et_ci, left_path,
4370                                                  left_cpos);
4371                         if (status)
4372                                 goto free_left_path;
4373 
4374                         new_el = path_leaf_el(left_path);
4375 
4376                         if (le16_to_cpu(new_el->l_next_free_rec) !=
4377                             le16_to_cpu(new_el->l_count)) {
4378                                 bh = path_leaf_bh(left_path);
4379                                 eb = (struct ocfs2_extent_block *)bh->b_data;
4380                                 ocfs2_error(sb,
4381                                             "Extent block #%llu has an invalid l_next_free_rec of %d.  It should have matched the l_count of %d\n",
4382                                             (unsigned long long)le64_to_cpu(eb->h_blkno),
4383                                             le16_to_cpu(new_el->l_next_free_rec),
4384                                             le16_to_cpu(new_el->l_count));
4385                                 status = -EINVAL;
4386                                 goto free_left_path;
4387                         }
4388                         rec = &new_el->l_recs[
4389                                 le16_to_cpu(new_el->l_next_free_rec) - 1];
4390                 }
4391         }
4392 
4393         /*
4394          * We're careful to check for an empty extent record here -
4395          * the merge code will know what to do if it sees one.
4396          */
4397         if (rec) {
4398                 if (index == 1 && ocfs2_is_empty_extent(rec)) {
4399                         if (split_rec->e_cpos == el->l_recs[index].e_cpos)
4400                                 ret = CONTIG_RIGHT;
4401                 } else {
4402                         ret = ocfs2_et_extent_contig(et, rec, split_rec);
4403                 }
4404         }
4405 
4406         rec = NULL;
4407         if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
4408                 rec = &el->l_recs[index + 1];
4409         else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
4410                  path->p_tree_depth > 0) {
4411                 status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
4412                 if (status)
4413                         goto free_left_path;
4414 
4415                 if (right_cpos == 0)
4416                         goto free_left_path;
4417 
4418                 right_path = ocfs2_new_path_from_path(path);
4419                 if (!right_path) {
4420                         status = -ENOMEM;
4421                         mlog_errno(status);
4422                         goto free_left_path;
4423                 }
4424 
4425                 status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
4426                 if (status)
4427                         goto free_right_path;
4428 
4429                 new_el = path_leaf_el(right_path);
4430                 rec = &new_el->l_recs[0];
4431                 if (ocfs2_is_empty_extent(rec)) {
4432                         if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4433                                 bh = path_leaf_bh(right_path);
4434                                 eb = (struct ocfs2_extent_block *)bh->b_data;
4435                                 ocfs2_error(sb,
4436                                             "Extent block #%llu has an invalid l_next_free_rec of %d\n",
4437                                             (unsigned long long)le64_to_cpu(eb->h_blkno),
4438                                             le16_to_cpu(new_el->l_next_free_rec));
4439                                 status = -EINVAL;
4440                                 goto free_right_path;
4441                         }
4442                         rec = &new_el->l_recs[1];
4443                 }
4444         }
4445 
4446         if (rec) {
4447                 enum ocfs2_contig_type contig_type;
4448 
4449                 contig_type = ocfs2_et_extent_contig(et, rec, split_rec);
4450 
4451                 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
4452                         ret = CONTIG_LEFTRIGHT;
4453                 else if (ret == CONTIG_NONE)
4454                         ret = contig_type;
4455         }
4456 
4457 free_right_path:
4458         ocfs2_free_path(right_path);
4459 free_left_path:
4460         ocfs2_free_path(left_path);
4461 exit:
4462         if (status == 0)
4463                 ctxt->c_contig_type = ret;
4464 
4465         return status;
4466 }
4467 
4468 static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
4469                                      struct ocfs2_insert_type *insert,
4470                                      struct ocfs2_extent_list *el,
4471                                      struct ocfs2_extent_rec *insert_rec)
4472 {
4473         int i;
4474         enum ocfs2_contig_type contig_type = CONTIG_NONE;
4475 
4476         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4477 
4478         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4479                 contig_type = ocfs2_et_extent_contig(et, &el->l_recs[i],
4480                                                      insert_rec);
4481                 if (contig_type != CONTIG_NONE) {
4482                         insert->ins_contig_index = i;
4483                         break;
4484                 }
4485         }
4486         insert->ins_contig = contig_type;
4487 
4488         if (insert->ins_contig != CONTIG_NONE) {
4489                 struct ocfs2_extent_rec *rec =
4490                                 &el->l_recs[insert->ins_contig_index];
4491                 unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
4492                                    le16_to_cpu(insert_rec->e_leaf_clusters);
4493 
4494                 /*
4495                  * Caller might want us to limit the size of extents, don't
4496                  * calculate contiguousness if we might exceed that limit.
4497                  */
4498                 if (et->et_max_leaf_clusters &&
4499                     (len > et->et_max_leaf_clusters))
4500                         insert->ins_contig = CONTIG_NONE;
4501         }
4502 }
4503 
4504 /*
4505  * This should only be called against the righmost leaf extent list.
4506  *
4507  * ocfs2_figure_appending_type() will figure out whether we'll have to
4508  * insert at the tail of the rightmost leaf.
4509  *
4510  * This should also work against the root extent list for tree's with 0
4511  * depth. If we consider the root extent list to be the rightmost leaf node
4512  * then the logic here makes sense.
4513  */
4514 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
4515                                         struct ocfs2_extent_list *el,
4516                                         struct ocfs2_extent_rec *insert_rec)
4517 {
4518         int i;
4519         u32 cpos = le32_to_cpu(insert_rec->e_cpos);
4520         struct ocfs2_extent_rec *rec;
4521 
4522         insert->ins_appending = APPEND_NONE;
4523 
4524         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4525 
4526         if (!el->l_next_free_rec)
4527                 goto set_tail_append;
4528 
4529         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
4530                 /* Were all records empty? */
4531                 if (le16_to_cpu(el->l_next_free_rec) == 1)
4532                         goto set_tail_append;
4533         }
4534 
4535         i = le16_to_cpu(el->l_next_free_rec) - 1;
4536         rec = &el->l_recs[i];
4537 
4538         if (cpos >=
4539             (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
4540                 goto set_tail_append;
4541 
4542         return;
4543 
4544 set_tail_append:
4545         insert->ins_appending = APPEND_TAIL;
4546 }
4547 
4548 /*
4549  * Helper function called at the beginning of an insert.
4550  *
4551  * This computes a few things that are commonly used in the process of
4552  * inserting into the btree:
4553  *   - Whether the new extent is contiguous with an existing one.
4554  *   - The current tree depth.
4555  *   - Whether the insert is an appending one.
4556  *   - The total # of free records in the tree.
4557  *
4558  * All of the information is stored on the ocfs2_insert_type
4559  * structure.
4560  */
4561 static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et,
4562                                     struct buffer_head **last_eb_bh,
4563                                     struct ocfs2_extent_rec *insert_rec,
4564                                     int *free_records,
4565                                     struct ocfs2_insert_type *insert)
4566 {
             /*
              * Classify the insert of insert_rec into et and record the
              * result in *insert. *free_records receives l_count minus
              * l_next_free_rec of the list examined below.
              *
              * For a tree with depth, the block named by
              * ocfs2_et_get_last_eb_blk() is read up front. On success that
              * buffer is handed to the caller through *last_eb_bh; on
              * failure it is released here. The depth-0 path returns 0
              * without writing *last_eb_bh.
              */
4567         int ret;
4568         struct ocfs2_extent_block *eb;
4569         struct ocfs2_extent_list *el;
4570         struct ocfs2_path *path = NULL;
4571         struct buffer_head *bh = NULL;
4572 
4573         insert->ins_split = SPLIT_NONE;
4574 
4575         el = et->et_root_el;
4576         insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
4577 
4578         if (el->l_tree_depth) {
4579                 /*
4580                  * If we have tree depth, we read in the
4581                  * rightmost extent block ahead of time as
4582                  * ocfs2_figure_insert_type() and ocfs2_add_branch()
4583                  * may want it later.
4584                  */
4585                 ret = ocfs2_read_extent_block(et->et_ci,
4586                                               ocfs2_et_get_last_eb_blk(et),
4587                                               &bh);
4588                 if (ret) {
4589                         mlog_errno(ret);
4590                         goto out;
4591                 }
4592                 eb = (struct ocfs2_extent_block *) bh->b_data;
4593                 el = &eb->h_list;
4594         }
4595 
4596         /*
4597          * Unless we have a contiguous insert, we'll need to know if
4598          * there is room left in our allocation tree for another
4599          * extent record.
4600          *
4601          * XXX: This test is simplistic, we can search for empty
4602          * extent records too.
4603          */
4604         *free_records = le16_to_cpu(el->l_count) -
4605                 le16_to_cpu(el->l_next_free_rec);
4606 
             /*
              * Depth 0: el is still the root list. No path was allocated
              * and no buffer was read, so return directly instead of going
              * through "out".
              */
4607         if (!insert->ins_tree_depth) {
4608                 ocfs2_figure_contig_type(et, insert, el, insert_rec);
4609                 ocfs2_figure_appending_type(insert, el, insert_rec);
4610                 return 0;
4611         }
4612 
4613         path = ocfs2_new_path_from_et(et);
4614         if (!path) {
4615                 ret = -ENOMEM;
4616                 mlog_errno(ret);
4617                 goto out;
4618         }
4619 
4620         /*
4621          * In the case that we're inserting past what the tree
4622          * currently accounts for, ocfs2_find_path() will return for
4623          * us the rightmost tree path. This is accounted for below in
4624          * the appending code.
4625          */
4626         ret = ocfs2_find_path(et->et_ci, path, le32_to_cpu(insert_rec->e_cpos));
4627         if (ret) {
4628                 mlog_errno(ret);
4629                 goto out;
4630         }
4631 
             /*
              * From here on el is the leaf list of the path found for
              * e_cpos, which may differ from the list used for
              * *free_records above.
              */
4632         el = path_leaf_el(path);
4633 
4634         /*
4635          * Now that we have the path, there's two things we want to determine:
4636          * 1) Contiguousness (also set contig_index if this is so)
4637          *
4638          * 2) Are we doing an append? We can trivially break this up
4639          *     into two types of appends: simple record append, or a
4640          *     rotate inside the tail leaf.
4641          */
4642         ocfs2_figure_contig_type(et, insert, el, insert_rec);
4643 
4644         /*
4645          * The insert code isn't quite ready to deal with all cases of
4646          * left contiguousness. Specifically, if it's an insert into
4647          * the 1st record in a leaf, it will require the adjustment of
4648          * cluster count on the last record of the path directly to its
4649          * left. For now, just catch that case and fool the layers
4650          * above us. This works just fine for tree_depth == 0, which
4651          * is why we allow that above.
4652          */
4653         if (insert->ins_contig == CONTIG_LEFT &&
4654             insert->ins_contig_index == 0)
4655                 insert->ins_contig = CONTIG_NONE;
4656 
4657         /*
4658          * Ok, so we can simply compare against last_eb to figure out
4659          * whether the path doesn't exist. This will only happen in
4660          * the case that we're doing a tail append, so maybe we can
4661          * take advantage of that information somehow.
4662          */
4663         if (ocfs2_et_get_last_eb_blk(et) ==
4664             path_leaf_bh(path)->b_blocknr) {
4665                 /*
4666                  * Ok, ocfs2_find_path() returned us the rightmost
4667                  * tree path. This might be an appending insert. There are
4668                  * two cases:
4669                  *    1) We're doing a true append at the tail:
4670                  *      -This might even be off the end of the leaf
4671                  *    2) We're "appending" by rotating in the tail
4672                  */
4673                 ocfs2_figure_appending_type(insert, el, insert_rec);
4674         }
4675 
4676 out:
             /*
              * NOTE(review): path is still NULL here when the extent block
              * read above failed; this relies on ocfs2_free_path()
              * accepting NULL - confirm.
              */
4677         ocfs2_free_path(path);
4678 
             /*
              * Success transfers the buffer reference to the caller;
              * failure drops it here.
              */
4679         if (ret == 0)
4680                 *last_eb_bh = bh;
4681         else
4682                 brelse(bh);
4683         return ret;
4684 }
4685 
4686 /*
4687  * Insert an extent into a btree.
4688  *
4689  * The caller needs to update the owning btree's cluster count.
      *
      * A record is built on the stack from cpos, start_blk, new_clusters
      * and flags, checked with ocfs2_et_insert_check(), classified by
      * ocfs2_figure_insert_type() and then handed to
      * ocfs2_do_insert_extent(). The tree is grown first with
      * ocfs2_grow_tree() (using meta_ac) when the insert is not
      * contiguous and no free record is left. On success the record is
      * also passed to ocfs2_et_extent_map_insert().
      *
      * Returns 0 on success, otherwise the status of the step that failed.
      *
      * NOTE(review): new_clusters is a u32 but is stored with
      * cpu_to_le16(); presumably callers keep it within 16 bits - confirm.
4690  */
4691 int ocfs2_insert_extent(handle_t *handle,
4692                         struct ocfs2_extent_tree *et,
4693                         u32 cpos,
4694                         u64 start_blk,
4695                         u32 new_clusters,
4696                         u8 flags,
4697                         struct ocfs2_alloc_context *meta_ac)
4698 {
4699         int status;
             /* Written by ocfs2_figure_insert_type() before it is read. */
4700         int uninitialized_var(free_records);
4701         struct buffer_head *last_eb_bh = NULL;
4702         struct ocfs2_insert_type insert = {0, };
4703         struct ocfs2_extent_rec rec;
4704 
4705         trace_ocfs2_insert_extent_start(
4706                 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
4707                 cpos, new_clusters);
4708 
             /* Build the record in on-disk (little-endian) form. */
4709         memset(&rec, 0, sizeof(rec));
4710         rec.e_cpos = cpu_to_le32(cpos);
4711         rec.e_blkno = cpu_to_le64(start_blk);
4712         rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4713         rec.e_flags = flags;
4714         status = ocfs2_et_insert_check(et, &rec);
4715         if (status) {
4716                 mlog_errno(status);
4717                 goto bail;
4718         }
4719 
4720         status = ocfs2_figure_insert_type(et, &last_eb_bh, &rec,
4721                                           &free_records, &insert);
4722         if (status < 0) {
4723                 mlog_errno(status);
4724                 goto bail;
4725         }
4726 
4727         trace_ocfs2_insert_extent(insert.ins_appending, insert.ins_contig,
4728                                   insert.ins_contig_index, free_records,
4729                                   insert.ins_tree_depth);
4730 
             /*
              * A contiguous insert needs no new record slot, so the tree is
              * only grown for a non-contiguous insert with nothing free.
              * ocfs2_grow_tree() is given pointers to the depth and to
              * last_eb_bh, so it can update both.
              */
4731         if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4732                 status = ocfs2_grow_tree(handle, et,
4733                                          &insert.ins_tree_depth, &last_eb_bh,
4734                                          meta_ac);
4735                 if (status) {
4736                         mlog_errno(status);
4737                         goto bail;
4738                 }
4739         }
4740 
4741         /* Finally, we can add clusters. This might rotate the tree for us. */
4742         status = ocfs2_do_insert_extent(handle, et, &rec, &insert);
4743         if (status < 0)
4744                 mlog_errno(status);
4745         else
4746                 ocfs2_et_extent_map_insert(et, &rec);
4747 
4748 bail:
             /* last_eb_bh is still NULL here if no buffer was stored in it. */
4749         brelse(last_eb_bh);
4750 
4751         return status;
4752 }
4753 
4754 /*
4755  * Allocate and add clusters into the extent b-tree.
4756  * The new clusters(clusters_to_add) will be inserted at logical_offset.
4757  * The extent b-tree's root is specified by et, and
4758  * it is not limited to the file storage. Any extent tree can use this
4759  * function if it implements the proper ocfs2_extent_tree.
      *
      * One __ocfs2_claim_clusters() call is made per invocation; it yields
      * num_bits clusters starting at bit_off (num_bits never exceeds
      * clusters_to_add). Those clusters are inserted as a single extent at
      * *logical_offset, flagged OCFS2_EXT_UNWRITTEN when mark_unwritten is
      * set, and *logical_offset is then advanced by num_bits.
      *
      * Returns:
      *   0        all of clusters_to_add was claimed and inserted.
      *   -EAGAIN  with reason RESTART_META when the tree has no free
      *            extents and meta_ac is NULL or holds fewer bits than
      *            ocfs2_extend_meta_needed() asks for (nothing is
      *            claimed in that case); with reason RESTART_TRANS when
      *            an extent was inserted but fewer than clusters_to_add
      *            clusters were claimed.
      *   other    negative status from a helper. If the failure comes
      *            after the claim, the claimed clusters are freed again.
      *
      * *reason_ret, when reason_ret is non-NULL, is always written on
      * return (RESTART_NONE unless one of the -EAGAIN cases applies).
4760  */
4761 int ocfs2_add_clusters_in_btree(handle_t *handle,
4762                                 struct ocfs2_extent_tree *et,
4763                                 u32 *logical_offset,
4764                                 u32 clusters_to_add,
4765                                 int mark_unwritten,
4766                                 struct ocfs2_alloc_context *data_ac,
4767                                 struct ocfs2_alloc_context *meta_ac,
4768                                 enum ocfs2_alloc_restarted *reason_ret)
4769 {
             /* err is only reported through the exit tracepoint. */
4770         int status = 0, err = 0;
4771         int need_free = 0;
4772         int free_extents;
4773         enum ocfs2_alloc_restarted reason = RESTART_NONE;
4774         u32 bit_off, num_bits;
4775         u64 block;
4776         u8 flags = 0;
4777         struct ocfs2_super *osb =
4778                 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
4779 
4780         BUG_ON(!clusters_to_add);
4781 
4782         if (mark_unwritten)
4783                 flags = OCFS2_EXT_UNWRITTEN;
4784 
4785         free_extents = ocfs2_num_free_extents(et);
4786         if (free_extents < 0) {
4787                 status = free_extents;
4788                 mlog_errno(status);
4789                 goto leave;
4790         }
4791 
4792         /* there are two cases which could cause us to EAGAIN in the
4793          * we-need-more-metadata case:
4794          * 1) we haven't reserved *any*
4795          * 2) we are so fragmented, we've needed to add metadata too
4796          *    many times. */
4797         if (!free_extents && !meta_ac) {
4798                 err = -1;
4799                 status = -EAGAIN;
4800                 reason = RESTART_META;
4801                 goto leave;
             /* meta_ac is non-NULL here: the branch above took the NULL case. */
4802         } else if ((!free_extents)
4803                    && (ocfs2_alloc_context_bits_left(meta_ac)
4804                        < ocfs2_extend_meta_needed(et->et_root_el))) {
4805                 err = -2;
4806                 status = -EAGAIN;
4807                 reason = RESTART_META;
4808                 goto leave;
4809         }
4810 
4811         status = __ocfs2_claim_clusters(handle, data_ac, 1,
4812                                         clusters_to_add, &bit_off, &num_bits);
4813         if (status < 0) {
                     /* -ENOSPC is passed up without being logged. */
4814                 if (status != -ENOSPC)
4815                         mlog_errno(status);
4816                 goto leave;
4817         }
4818 
4819         BUG_ON(num_bits > clusters_to_add);
4820 
4821         /* reserve our write early -- insert_extent may update the tree root */
4822         status = ocfs2_et_root_journal_access(handle, et,
4823                                               OCFS2_JOURNAL_ACCESS_WRITE);
4824         if (status < 0) {
4825                 mlog_errno(status);
4826                 need_free = 1;
4827                 goto bail;
4828         }
4829 
4830         block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4831         trace_ocfs2_add_clusters_in_btree(
4832              (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
4833              bit_off, num_bits);
4834         status = ocfs2_insert_extent(handle, et, *logical_offset, block,
4835                                      num_bits, flags, meta_ac);
4836         if (status < 0) {
4837                 mlog_errno(status);
4838                 need_free = 1;
4839                 goto bail;
4840         }
4841 
4842         ocfs2_journal_dirty(handle, et->et_root_bh);
4843 
4844         clusters_to_add -= num_bits;
4845         *logical_offset += num_bits;
4846 
             /*
              * Fewer clusters than requested were claimed: report the
              * remainder through err and ask the caller to restart.
              */
4847         if (clusters_to_add) {
4848                 err = clusters_to_add;
4849                 status = -EAGAIN;
4850                 reason = RESTART_TRANS;
4851         }
4852 
4853 bail:
             /*
              * Give back clusters that were claimed but could not be
              * inserted; which free routine is used depends on
              * data_ac->ac_which.
              */
4854         if (need_free) {
4855                 if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
4856                         ocfs2_free_local_alloc_bits(osb, handle, data_ac,
4857                                         bit_off, num_bits);
4858                 else
4859                         ocfs2_free_clusters(handle,
4860                                         data_ac->ac_inode,
4861                                         data_ac->ac_bh,
4862                                         ocfs2_clusters_to_blocks(osb->sb, bit_off),
4863                                         num_bits);
4864         }
4865 
4866 leave:
4867         if (reason_ret)
4868                 *reason_ret = reason;
4869         trace_ocfs2_add_clusters_in_btree_ret(status, reason, err);
4870         return status;
4871 }
4872 
4873 static void ocfs2_make_right_split_rec(struct super_block *sb,
4874                                        struct ocfs2_extent_rec *split_rec,
4875                                        u32 cpos,
4876                                        struct ocfs2_extent_rec *rec)
4877 {
4878         u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4879         u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4880 
4881         memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4882 
4883         split_rec->e_cpos = cpu_to_le32(cpos);
4884         split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4885 
4886         split_rec->e_blkno = rec->e_blkno;
4887         le64_add_cpu(&split_rec->e_blkno,
4888                      ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4889 
4890         split_rec->e_flags = rec->e_flags;
4891 }
4892 
4893 static int ocfs2_split_and_insert(handle_t *handle,
4894                                   struct ocfs2_extent_tree *et,
4895                                   struct ocfs2_path *path,
4896                                   struct buffer_head **last_eb_bh,
4897                                   int split_index,
4898                                   struct ocfs2_extent_rec *orig_split_rec,
4899                                   struct ocfs2_alloc_context *meta_ac)
4900 {
             /*
              * Insert orig_split_rec over part of the record at split_index
              * in the leaf of path. The kind of split is chosen by comparing
              * the two records: same start -> SPLIT_LEFT, same end ->
              * SPLIT_RIGHT, otherwise two passes through the "leftright"
              * label are made (right remainder first, then the caller's
              * record). Returns 0 or the first non-zero helper status.
              */
4901         int ret = 0, depth;
4902         unsigned int insert_range, rec_range, do_leftright = 0;
4903         struct ocfs2_extent_rec tmprec;
4904         struct ocfs2_extent_list *rightmost_el;
4905         struct ocfs2_extent_rec rec;
4906         struct ocfs2_extent_rec split_rec = *orig_split_rec;
4907         struct ocfs2_insert_type insert;
4908         struct ocfs2_extent_block *eb;
4909 
4910 leftright:
4911         /*
4912          * Store a copy of the record on the stack - it might move
4913          * around as the tree is manipulated below.
4914          */
4915         rec = path_leaf_el(path)->l_recs[split_index];
4916 
             /*
              * The rightmost list is the root list at depth 0, otherwise
              * the list inside *last_eb_bh, which must then be set.
              */
4917         rightmost_el = et->et_root_el;
4918 
4919         depth = le16_to_cpu(rightmost_el->l_tree_depth);
4920         if (depth) {
4921                 BUG_ON(!(*last_eb_bh));
4922                 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
4923                 rightmost_el = &eb->h_list;
4924         }
4925 
             /* No free record in the rightmost list: grow the tree first. */
4926         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4927             le16_to_cpu(rightmost_el->l_count)) {
4928                 ret = ocfs2_grow_tree(handle, et,
4929                                       &depth, last_eb_bh, meta_ac);
4930                 if (ret) {
4931                         mlog_errno(ret);
4932                         goto out;
4933                 }
4934         }
4935 
4936         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4937         insert.ins_appending = APPEND_NONE;
4938         insert.ins_contig = CONTIG_NONE;
4939         insert.ins_tree_depth = depth;
4940 
             /* End offsets (start + length) of the new and existing records. */
4941         insert_range = le32_to_cpu(split_rec.e_cpos) +
4942                 le16_to_cpu(split_rec.e_leaf_clusters);
4943         rec_range = le32_to_cpu(rec.e_cpos) +
4944                 le16_to_cpu(rec.e_leaf_clusters);
4945 
             /*
              * Both e_cpos fields are little-endian values; an equality
              * test needs no byte-order conversion.
              */
4946         if (split_rec.e_cpos == rec.e_cpos) {
4947                 insert.ins_split = SPLIT_LEFT;
4948         } else if (insert_range == rec_range) {
4949                 insert.ins_split = SPLIT_RIGHT;
4950         } else {
4951                 /*
4952                  * Left/right split. We fake this as a right split
4953                  * first and then make a second pass as a left split.
4954                  */
4955                 insert.ins_split = SPLIT_RIGHT;
4956 
                     /*
                      * tmprec covers [insert_range, rec_range): the part of
                      * rec to the right of the requested range. It is what
                      * gets inserted on this first pass.
                      */
4957                 ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
4958                                            &tmprec, insert_range, &rec);
4959 
4960                 split_rec = tmprec;
4961 
                     /* Reaching this branch on the second pass is a bug. */
4962                 BUG_ON(do_leftright);
4963                 do_leftright = 1;
4964         }
4965 
4966         ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
4967         if (ret) {
4968                 mlog_errno(ret);
4969                 goto out;
4970         }
4971 
             /*
              * Second pass of a left/right split: restore the caller's
              * record, then look the path and record index up again before
              * jumping back. do_leftright becomes 2, so this block runs at
              * most once.
              */
4972         if (do_leftright == 1) {
4973                 u32 cpos;
4974                 struct ocfs2_extent_list *el;
4975 
4976                 do_leftright++;
4977                 split_rec = *orig_split_rec;
4978 
4979                 ocfs2_reinit_path(path, 1);
4980 
4981                 cpos = le32_to_cpu(split_rec.e_cpos);
4982                 ret = ocfs2_find_path(et->et_ci, path, cpos);
4983                 if (ret) {
4984                         mlog_errno(ret);
4985                         goto out;
4986                 }
4987 
4988                 el = path_leaf_el(path);
4989                 split_index = ocfs2_search_extent_list(el, cpos);
4990                 if (split_index == -1) {
4991                         ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
4992                                     "Owner %llu has an extent at cpos %u which can no longer be found\n",
4993                                     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
4994                                     cpos);
4995                         ret = -EROFS;
4996                         goto out;
4997                 }
4998                 goto leftright;
4999         }
5000 out:
5001 
5002         return ret;
5003 }
5004 
5005 static int ocfs2_replace_extent_rec(handle_t *handle,
5006                                     struct ocfs2_extent_tree *et,
5007                                     struct ocfs2_path *path,
5008                                     struct ocfs2_extent_list *el,
5009                                     int split_index,
5010                                     struct ocfs2_extent_rec *split_rec)
5011 {
5012         int ret;
5013 
5014         ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
5015                                            path_num_items(path) - 1);
5016         if (ret) {
5017                 mlog_errno(ret);
5018                 goto out;
5019         }
5020 
5021         el->l_recs[split_index] = *split_rec;
5022 
5023         ocfs2_journal_dirty(handle, path_leaf_bh(path));
5024 out:
5025         return ret;
5026 }
5027 
5028 /*
5029  * Split part or all of the extent record at split_index in the leaf
5030  * pointed to by path. Merge with the contiguous extent record if needed.
5031  *
5032  * Care is taken to handle contiguousness so as to not grow the tree.
5033  *
5034  * meta_ac is not strictly necessary - we only truly need it if growth
5035  * of the tree is required. All other cases will degrade into a less
5036  * optimal tree layout.
5037  *
5038  * last_eb_bh should be the rightmost leaf block for any extent
5039  * btree. Since a split may grow the tree or a merge might shrink it,
5040  * the caller cannot trust the contents of that buffer after this call.
5041  *
5042  * This code is optimized for readability - several passes might be
5043  * made over certain portions of the tree. All of those blocks will
5044  * have been brought into cache (and pinned via the journal), so the
5045  * extra overhead is not expressed in terms of disk reads.
5046  */
5047 int ocfs2_split_extent(handle_t *handle,
5048                        struct ocfs2_extent_tree *et,
5049                        struct ocfs2_path *path,
5050                        int split_index,
5051                        struct ocfs2_extent_rec *split_rec,
5052                        struct ocfs2_alloc_context *meta_ac,
5053                        struct ocfs2_cached_dealloc_ctxt *dealloc)
5054 {
5055         int ret = 0;
5056         struct ocfs2_extent_list *el = path_leaf_el(path);
5057         struct buffer_head *last_eb_bh = NULL;
5058         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
5059         struct ocfs2_merge_ctxt ctxt;
5060         struct ocfs2_extent_list *rightmost_el;
5061 
5062         if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
5063             ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
5064              (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
5065                 ret = -EIO;
5066                 mlog_errno(ret);
5067                 goto out;
5068         }
5069 
5070         ret = ocfs2_figure_merge_contig_type(et, path, el,
5071                                              split_index,
5072                                              split_rec,
5073                                              &ctxt);
5074         if (ret) {
5075                 mlog_errno(ret);
5076                 goto out;
5077         }
5078 
5079         /*
5080          * The core merge / split code wants to know how much room is
5081          * left in this allocation tree, so we pass the
5082          * rightmost extent list.
5083          */
5084         if (path->p_tree_depth) {
5085                 struct ocfs2_extent_block *eb;
5086 
5087                 ret = ocfs2_read_extent_block(et->et_ci,
5088                                               ocfs2_et_get_last_eb_blk(et),
5089                                               &last_eb_bh);
5090                 if (ret) {
5091                         mlog_errno(ret);
5092                         goto out;
5093                 }
5094 
5095                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
5096                 rightmost_el = &eb->h_list;
5097         } else
5098                 rightmost_el = path_root_el(path);
5099 
5100         if (rec->e_cpos == split_rec->e_cpos &&
5101             rec->e_leaf_clusters == split_rec->e_leaf_clusters)
5102                 ctxt.c_split_covers_rec = 1;
5103         else
5104                 ctxt.c_split_covers_rec = 0;
5105 
5106         ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
5107 
5108         trace_ocfs2_split_extent(split_index, ctxt.c_contig_type,
5109                                  ctxt.c_has_empty_extent,
5110                                  ctxt.c_split_covers_rec);
5111 
5112         if (ctxt.c_contig_type == CONTIG_NONE) {
5113                 if (ctxt.c_split_covers_rec)
5114                         ret = ocfs2_replace_extent_rec(handle, et, path, el,
5115                                                        split_index, split_rec);
5116                 else
5117                         ret = ocfs2_split_and_insert(handle, et, path,
5118                                                      &last_eb_bh, split_index,
5119                                                      split_rec, meta_ac);
5120                 if (ret)
5121                         mlog_errno(ret);
5122         } else {
5123                 ret = ocfs2_try_to_merge_extent(handle, et, path,
5124                                                 split_index, split_rec,
5125                                                 dealloc, &ctxt);
5126                 if (ret)
5127                         mlog_errno(ret);
5128         }
5129 
5130 out:
5131         brelse(last_eb_bh);
5132         return ret;
5133 }
5134 
5135 /*
5136  * Change the flags of the already-existing extent at cpos for len clusters.
5137  *
5138  * new_flags: the flags we want to set.
5139  * clear_flags: the flags we want to clear.
5140  * phys: the new physical offset we want this new extent starts from.
5141  *
5142  * If the existing extent is larger than the request, initiate a
5143  * split. An attempt will be made at merging with adjacent extents.
5144  *
5145  * The caller is responsible for passing down meta_ac if we'll need it.
      *
      * The record containing cpos is looked up, a record with the adjusted
      * flags is built for cpos/len at the block that phys converts to via
      * ocfs2_clusters_to_blocks(), and the rest is left to
      * ocfs2_split_extent().
      *
      * Returns 0 on success; -ENOMEM if no path could be allocated; -EROFS
      * (after ocfs2_error()) if no record is found at cpos; -EIO if any
      * bit of new_flags is already set on that record, or if none of the
      * bits of clear_flags is set on it; otherwise the status of
      * ocfs2_find_path() or ocfs2_split_extent().
5146  */
5147 int ocfs2_change_extent_flag(handle_t *handle,
5148                              struct ocfs2_extent_tree *et,
5149                              u32 cpos, u32 len, u32 phys,
5150                              struct ocfs2_alloc_context *meta_ac,
5151                              struct ocfs2_cached_dealloc_ctxt *dealloc,
5152                              int new_flags, int clear_flags)
5153 {
5154         int ret, index;
5155         struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
5156         u64 start_blkno = ocfs2_clusters_to_blocks(sb, phys);
5157         struct ocfs2_extent_rec split_rec;
5158         struct ocfs2_path *left_path = NULL;
5159         struct ocfs2_extent_list *el;
5160         struct ocfs2_extent_rec *rec;
5161 
5162         left_path = ocfs2_new_path_from_et(et);
5163         if (!left_path) {
5164                 ret = -ENOMEM;
5165                 mlog_errno(ret);
5166                 goto out;
5167         }
5168 
5169         ret = ocfs2_find_path(et->et_ci, left_path, cpos);
5170         if (ret) {
5171                 mlog_errno(ret);
5172                 goto out;
5173         }
5174         el = path_leaf_el(left_path);
5175 
5176         index = ocfs2_search_extent_list(el, cpos);
5177         if (index == -1) {
5178                 ocfs2_error(sb,
5179                             "Owner %llu has an extent at cpos %u which can no longer be found\n",
5180                             (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5181                             cpos);
5182                 ret = -EROFS;
5183                 goto out;
5184         }
5185 
             /* Result shared by the two flag sanity checks below. */
5186         ret = -EIO;
5187         rec = &el->l_recs[index];
5188         if (new_flags && (rec->e_flags & new_flags)) {
5189                 mlog(ML_ERROR, "Owner %llu tried to set %d flags on an "
5190                      "extent that already had them\n",
5191                      (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5192                      new_flags);
5193                 goto out;
5194         }
5195 
5196         if (clear_flags && !(rec->e_flags & clear_flags)) {
5197                 mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an "
5198                      "extent that didn't have them\n",
5199                      (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5200                      clear_flags);
5201                 goto out;
5202         }
5203 
             /*
              * Describe the requested range with the record's flags plus
              * new_flags and minus clear_flags.
              * NOTE(review): len is a u32 stored with cpu_to_le16();
              * presumably callers keep it within 16 bits - confirm.
              */
5204         memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
5205         split_rec.e_cpos = cpu_to_le32(cpos);
5206         split_rec.e_leaf_clusters = cpu_to_le16(len);
5207         split_rec.e_blkno = cpu_to_le64(start_blkno);
5208         split_rec.e_flags = rec->e_flags;
5209         if (new_flags)
5210                 split_rec.e_flags |= new_flags;
5211         if (clear_flags)
5212                 split_rec.e_flags &= ~clear_flags;
5213 
5214         ret = ocfs2_split_extent(handle, et, left_path,
5215                                  index, &split_rec, meta_ac,
5216                                  dealloc);
5217         if (ret)
5218                 mlog_errno(ret);
5219 
5220 out:
             /* left_path is NULL here only when its allocation failed. */
5221         ocfs2_free_path(left_path);
5222         return ret;
5223 
5224 }
5225 
5226 /*
5227  * Mark the already-existing extent at cpos as written for len clusters.
5228  * This removes the unwritten extent flag.
5229  *
5230  * If the existing extent is larger than the request, initiate a
5231  * split. An attempt will be made at merging with adjacent extents.
5232  *
5233  * The caller is responsible for passing down meta_ac if we'll need it.
5234  */
5235 int ocfs2_mark_extent_written(struct inode *inode,
5236                               struct ocfs2_extent_tree *et,
5237                               handle_t *handle, u32 cpos, u32 len, u32 phys,
5238                               struct ocfs2_alloc_context *meta_ac,
5239                               struct ocfs2_cached_dealloc_ctxt *dealloc)
5240 {
5241         int ret;
5242 
5243         trace_ocfs2_mark_extent_written(
5244                 (unsigned long long)OCFS2_I(inode)->ip_blkno,
5245                 cpos, len, phys);
5246 
5247         if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
5248                 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents that are being written to, but the feature bit is not set in the super block\n",
5249                             (unsigned long long)OCFS2_I(inode)->ip_blkno);
5250                 ret = -EROFS;
5251                 goto out;
5252         }
5253 
5254         /*
5255          * XXX: This should be fixed up so that we just re-insert the
5256          * next extent records.
5257          */
5258         ocfs2_et_extent_map_truncate(et, 0);
5259 
5260         ret = ocfs2_change_extent_flag(handle, et, cpos,
5261                                        len, phys, meta_ac, dealloc,
5262                                        0, OCFS2_EXT_UNWRITTEN);
5263         if (ret)
5264                 mlog_errno(ret);
5265 
5266 out:
5267         return ret;
5268 }
5269 
5270 static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5271                             struct ocfs2_path *path,
5272                             int index, u32 new_range,
5273                             struct ocfs2_alloc_context *meta_ac)
5274 {
             /*
              * Split the record at index in the leaf of path at cluster
              * offset new_range: a record built by
              * ocfs2_make_right_split_rec(), starting at new_range, is
              * inserted with SPLIT_RIGHT. The tree is grown first when the
              * rightmost list has no free record. Returns 0 or the status
              * of the helper that failed.
              */
5275         int ret, depth, credits;
5276         struct buffer_head *last_eb_bh = NULL;
5277         struct ocfs2_extent_block *eb;
5278         struct ocfs2_extent_list *rightmost_el, *el;
5279         struct ocfs2_extent_rec split_rec;
5280         struct ocfs2_extent_rec *rec;
5281         struct ocfs2_insert_type insert;
5282 
5283         /*
5284          * Setup the record to split before we grow the tree.
5285          */
5286         el = path_leaf_el(path);
5287         rec = &el->l_recs[index];
5288         ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
5289                                    &split_rec, new_range, rec);
5290 
             /*
              * Find the rightmost list: read the block named by
              * ocfs2_et_get_last_eb_blk() for a tree with depth, or use
              * the leaf list of path at depth 0.
              */
5291         depth = path->p_tree_depth;
5292         if (depth > 0) {
5293                 ret = ocfs2_read_extent_block(et->et_ci,
5294                                               ocfs2_et_get_last_eb_blk(et),
5295                                               &last_eb_bh);
5296                 if (ret < 0) {
5297                         mlog_errno(ret);
5298                         goto out;
5299                 }
5300 
5301                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
5302                 rightmost_el = &eb->h_list;
5303         } else
5304                 rightmost_el = path_leaf_el(path);
5305 
             /*
              * Extend the transaction by the tree depth plus whatever
              * ocfs2_extend_meta_needed() reports, before the tree is
              * modified below.
              */
5306         credits = path->p_tree_depth +
5307                   ocfs2_extend_meta_needed(et->et_root_el);
5308         ret = ocfs2_extend_trans(handle, credits);
5309         if (ret) {
5310                 mlog_errno(ret);
5311                 goto out;
5312         }
5313 
             /*
              * No free record in the rightmost list: grow the tree.
              * ocfs2_grow_tree() is given pointers to depth and
              * last_eb_bh, so both can change; the insert below uses
              * depth as left by that call.
              */
5314         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
5315             le16_to_cpu(rightmost_el->l_count)) {
5316                 ret = ocfs2_grow_tree(handle, et, &depth, &last_eb_bh,
5317                                       meta_ac);
5318                 if (ret) {
5319                         mlog_errno(ret);
5320                         goto out;
5321                 }
5322         }
5323 
5324         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
5325         insert.ins_appending = APPEND_NONE;
5326         insert.ins_contig = CONTIG_NONE;
5327         insert.ins_split = SPLIT_RIGHT;
5328         insert.ins_tree_depth = depth;
5329 
5330         ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
5331         if (ret)
5332                 mlog_errno(ret);
5333 
5334 out:
             /* last_eb_bh is still NULL here if no buffer was stored in it. */
5335         brelse(last_eb_bh);
5336         return ret;
5337 }
5338 
/*
 * ocfs2_truncate_rec() - drop the cluster range [cpos, cpos + len) from
 * the leaf record at @index of @path.
 *
 * The range has to share at least one edge with the record: it may cover
 * the whole record, its leftmost part or its rightmost part.  Any other
 * range reaches the BUG() in the last else branch.
 *
 * Order of work as written below:
 *   - if slot 0 of the leaf is an empty extent and @index > 0, rotate the
 *     tree left first and follow the record to @index - 1;
 *   - note whether the record is the last one of a leaf that has no
 *     h_next_leaf_blk (is_rightmost_tree_rec);
 *   - if the start of the leaf's first record is about to change, the
 *     path of the leaf to the left may be looked up so that the subtree
 *     shared by both leaves can be updated;
 *   - journal both paths, edit the record, dirty the leaf and finally
 *     call ocfs2_rotate_tree_left().
 *
 * Returns the status of that final rotate, or the first error hit on the
 * way (-ENOMEM when the left path cannot be allocated).
 */
5339 static int ocfs2_truncate_rec(handle_t *handle,
5340                               struct ocfs2_extent_tree *et,
5341                               struct ocfs2_path *path, int index,
5342                               struct ocfs2_cached_dealloc_ctxt *dealloc,
5343                               u32 cpos, u32 len)
5344 {
5345         int ret;
5346         u32 left_cpos, rec_range, trunc_range;
5347         int is_rightmost_tree_rec = 0;
5348         struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
5349         struct ocfs2_path *left_path = NULL;
5350         struct ocfs2_extent_list *el = path_leaf_el(path);
5351         struct ocfs2_extent_rec *rec;
5352         struct ocfs2_extent_block *eb;
5353 
             /*
              * With an empty extent in slot 0, rotate the tree left before
              * anything else and follow the record to the previous slot.
              */
5354         if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
5355                 /* extend credit for ocfs2_remove_rightmost_path */
5356                 ret = ocfs2_extend_rotate_transaction(handle, 0,
5357                                 handle->h_buffer_credits,
5358                                 path);
5359                 if (ret) {
5360                         mlog_errno(ret);
5361                         goto out;
5362                 }
5363 
5364                 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
5365                 if (ret) {
5366                         mlog_errno(ret);
5367                         goto out;
5368                 }
5369 
5370                 index--;
5371         }
5372 
5373         if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
5374             path->p_tree_depth) {
5375                 /*
5376                  * Check whether this is the rightmost tree record. If
5377                  * we remove all of this record or part of its right
5378                  * edge then an update of the record lengths above it
5379                  * will be required.
5380                  */
5381                 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
5382                 if (eb->h_next_leaf_blk == 0)
5383                         is_rightmost_tree_rec = 1;
5384         }
5385 
5386         rec = &el->l_recs[index];
5387         if (index == 0 && path->p_tree_depth &&
5388             le32_to_cpu(rec->e_cpos) == cpos) {
5389                 /*
5390                  * Changing the leftmost offset (via partial or whole
5391                  * record truncate) of an interior (or rightmost) path
5392                  * means we have to update the subtree that is formed
5393                  * by this leaf and the one to it's left.
5394                  *
5395                  * There are two cases we can skip:
5396                  *   1) Path is the leftmost one in our btree.
5397                  *   2) The leaf is rightmost and will be empty after
5398                  *      we remove the extent record - the rotate code
5399                  *      knows how to update the newly formed edge.
5400                  */
5401 
5402                 ret = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
5403                 if (ret) {
5404                         mlog_errno(ret);
5405                         goto out;
5406                 }
5407 
5408                 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
5409                         left_path = ocfs2_new_path_from_path(path);
5410                         if (!left_path) {
5411                                 ret = -ENOMEM;
5412                                 mlog_errno(ret);
5413                                 goto out;
5414                         }
5415 
5416                         ret = ocfs2_find_path(et->et_ci, left_path,
5417                                               left_cpos);
5418                         if (ret) {
5419                                 mlog_errno(ret);
5420                                 goto out;
5421                         }
5422                 }
5423         }
5424 
5425         ret = ocfs2_extend_rotate_transaction(handle, 0,
5426                                               handle->h_buffer_credits,
5427                                               path);
5428         if (ret) {
5429                 mlog_errno(ret);
5430                 goto out;
5431         }
5432 
5433         ret = ocfs2_journal_access_path(et->et_ci, handle, path);
5434         if (ret) {
5435                 mlog_errno(ret);
5436                 goto out;
5437         }
5438 
             /*
              * left_path is still NULL unless the block above looked it up.
              * NOTE(review): this relies on ocfs2_journal_access_path()
              * tolerating a NULL path - confirm.
              */
5439         ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
5440         if (ret) {
5441                 mlog_errno(ret);
5442                 goto out;
5443         }
5444 
5445         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5446         trunc_range = cpos + len;
5447 
             /* First branch: the range covers the whole record. */
5448         if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
5449                 int next_free;
5450 
5451                 memset(rec, 0, sizeof(*rec));
5452                 ocfs2_cleanup_merge(el, index);
5453 
5454                 next_free = le16_to_cpu(el->l_next_free_rec);
5455                 if (is_rightmost_tree_rec && next_free > 1) {
5456                         /*
5457                          * We skip the edge update if this path will
5458                          * be deleted by the rotate code.
5459                          */
5460                         rec = &el->l_recs[next_free - 1];
5461                         ocfs2_adjust_rightmost_records(handle, et, path,
5462                                                        rec);
5463                 }
5464         } else if (le32_to_cpu(rec->e_cpos) == cpos) {
5465                 /* Remove leftmost portion of the record. */
5466                 le32_add_cpu(&rec->e_cpos, len);
5467                 le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
5468                 le16_add_cpu(&rec->e_leaf_clusters, -len);
5469         } else if (rec_range == trunc_range) {
5470                 /* Remove rightmost portion of the record */
5471                 le16_add_cpu(&rec->e_leaf_clusters, -len);
5472                 if (is_rightmost_tree_rec)
5473                         ocfs2_adjust_rightmost_records(handle, et, path, rec);
5474         } else {
5475                 /* Caller should have trapped this. */
5476                 mlog(ML_ERROR, "Owner %llu: Invalid record truncate: (%u, %u) "
5477                      "(%u, %u)\n",
5478                      (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5479                      le32_to_cpu(rec->e_cpos),
5480                      le16_to_cpu(rec->e_leaf_clusters), cpos, len);
5481                 BUG();
5482         }
5483 
             /* Only set when the left neighbour's path was looked up above. */
5484         if (left_path) {
5485                 int subtree_index;
5486 
5487                 subtree_index = ocfs2_find_subtree_root(et, left_path, path);
5488                 ocfs2_complete_edge_insert(handle, left_path, path,
5489                                            subtree_index);
5490         }
5491 
5492         ocfs2_journal_dirty(handle, path_leaf_bh(path));
5493 
5494         ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
5495         if (ret) {
5496                 mlog_errno(ret);
5497                 goto out;
5498         }
5499 
5500 out:
             /*
              * Reached on every exit, so left_path may still be NULL here.
              * NOTE(review): presumably ocfs2_free_path() accepts NULL -
              * confirm.
              */
5501         ocfs2_free_path(left_path);
5502         return ret;
5503 }
5504 
/*
 * ocfs2_remove_extent() - remove the cluster range [cpos, cpos + len)
 * from the extent tree @et inside the transaction @handle.
 *
 * The range must lie within the single extent record found for @cpos;
 * anything else trips the BUG_ON() below.  A range that touches an edge
 * of that record is handed straight to ocfs2_truncate_rec().  A range in
 * the middle is first split off at cpos + len with ocfs2_split_tree()
 * (which is given @meta_ac), the record is looked up again and then
 * truncated.  @dealloc is passed through to ocfs2_truncate_rec().
 *
 * Returns 0 on success, -ENOMEM if no path can be allocated, -EROFS
 * (after ocfs2_error()) if the record cannot be found or fails the
 * post-split check, or the error of the failing helper.
 */
5505 int ocfs2_remove_extent(handle_t *handle,
5506                         struct ocfs2_extent_tree *et,
5507                         u32 cpos, u32 len,
5508                         struct ocfs2_alloc_context *meta_ac,
5509                         struct ocfs2_cached_dealloc_ctxt *dealloc)
5510 {
5511         int ret, index;
5512         u32 rec_range, trunc_range;
5513         struct ocfs2_extent_rec *rec;
5514         struct ocfs2_extent_list *el;
5515         struct ocfs2_path *path = NULL;
5516 
5517         /*
5518          * XXX: Why are we truncating to 0 instead of wherever this
5519          * affects us?
5520          */
5521         ocfs2_et_extent_map_truncate(et, 0);
5522 
5523         path = ocfs2_new_path_from_et(et);
5524         if (!path) {
5525                 ret = -ENOMEM;
5526                 mlog_errno(ret);
5527                 goto out;
5528         }
5529 
5530         ret = ocfs2_find_path(et->et_ci, path, cpos);
5531         if (ret) {
5532                 mlog_errno(ret);
5533                 goto out;
5534         }
5535 
5536         el = path_leaf_el(path);
5537         index = ocfs2_search_extent_list(el, cpos);
5538         if (index == -1) {
5539                 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5540                             "Owner %llu has an extent at cpos %u which can no longer be found\n",
5541                             (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5542                             cpos);
5543                 ret = -EROFS;
5544                 goto out;
5545         }
5546 
5547         /*
5548          * We have 3 cases of extent removal:
5549          *   1) Range covers the entire extent rec
5550          *   2) Range begins or ends on one edge of the extent rec
5551          *   3) Range is in the middle of the extent rec (no shared edges)
5552          *
5553          * For case 1 we remove the extent rec and left rotate to
5554          * fill the hole.
5555          *
5556          * For case 2 we just shrink the existing extent rec, with a
5557          * tree update if the shrinking edge is also the edge of an
5558          * extent block.
5559          *
5560          * For case 3 we do a right split to turn the extent rec into
5561          * something case 2 can handle.
5562          */
5563         rec = &el->l_recs[index];
5564         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5565         trunc_range = cpos + len;
5566 
             /* The range must sit entirely inside the record that was found. */
5567         BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5568 
5569         trace_ocfs2_remove_extent(
5570                 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5571                 cpos, len, index, le32_to_cpu(rec->e_cpos),
5572                 ocfs2_rec_clusters(el, rec));
5573 
             /* Cases 1 and 2: the range shares an edge with the record. */
5574         if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5575                 ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
5576                                          cpos, len);
5577                 if (ret) {
5578                         mlog_errno(ret);
5579                         goto out;
5580                 }
5581         } else {
                     /* Case 3: split at trunc_range, then truncate. */
5582                 ret = ocfs2_split_tree(handle, et, path, index,
5583                                        trunc_range, meta_ac);
5584                 if (ret) {
5585                         mlog_errno(ret);
5586                         goto out;
5587                 }
5588 
5589                 /*
5590                  * The split could have manipulated the tree enough to
5591                  * move the record location, so we have to look for it again.
5592                  */
5593                 ocfs2_reinit_path(path, 1);
5594 
5595                 ret = ocfs2_find_path(et->et_ci, path, cpos);
5596                 if (ret) {
5597                         mlog_errno(ret);
5598                         goto out;
5599                 }
5600 
5601                 el = path_leaf_el(path);
5602                 index = ocfs2_search_extent_list(el, cpos);
5603                 if (index == -1) {
5604                         ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5605                                     "Owner %llu: split at cpos %u lost record\n",
5606                                     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5607                                     cpos);
5608                         ret = -EROFS;
5609                         goto out;
5610                 }
5611 
5612                 /*
5613                  * Double check our values here. If anything is fishy,
5614                  * it's easier to catch it at the top level.
5615                  */
5616                 rec = &el->l_recs[index];
5617                 rec_range = le32_to_cpu(rec->e_cpos) +
5618                         ocfs2_rec_clusters(el, rec);
5619                 if (rec_range != trunc_range) {
5620                         ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5621                                     "Owner %llu: error after split at cpos %u trunc len %u, existing record is (%u,%u)\n",
5622                                     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5623                                     cpos, len, le32_to_cpu(rec->e_cpos),
5624                                     ocfs2_rec_clusters(el, rec));
5625                         ret = -EROFS;
5626                         goto out;
5627                 }
5628 
5629                 ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
5630                                          cpos, len);
5631                 if (ret) {
5632                         mlog_errno(ret);
5633                         goto out;
5634                 }
5635         }
5636 
5637 out:
             /* path is NULL here only when its allocation failed above. */
5638         ocfs2_free_path(path);
5639         return ret;
5640 }
5641 
5642 /*
5643  * ocfs2_reserve_blocks_for_rec_trunc() looks basically the same as
5644  * ocfs2_lock_allocators(), except that it accepts a number of extra
5645  * blocks to reserve, and it only handles metadata allocations.
5646  * On failure *ac is left NULL.
5647  *
5648  * Currently, only ocfs2_remove_btree_range() uses it for truncating
5649  * and punching holes.
5650  */
5651 static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
5652                                               struct ocfs2_extent_tree *et,
5653                                               u32 extents_to_split,
5654                                               struct ocfs2_alloc_context **ac,
5655                                               int extra_blocks)
5656 {
5657         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5658         unsigned int max_recs_needed = 2 * extents_to_split;
5659         int free_recs;
5660         int ret = 0;
5661 
5662         *ac = NULL;
5663 
5664         free_recs = ocfs2_num_free_extents(et);
5665         if (free_recs < 0) {
5666                 ret = free_recs;
5667                 mlog_errno(ret);
5668                 goto out;
5669         }
5670 
5671         /* Add ocfs2_extend_meta_needed() blocks when free recs are short. */
5672         if (free_recs == 0 ||
5673             (ocfs2_sparse_alloc(osb) && free_recs < max_recs_needed))
5674                 extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
5675 
5676         /* Nothing to reserve: leave *ac NULL and report success. */
5677         if (!extra_blocks)
5678                 goto out;
5679 
5680         ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac);
5681         if (ret < 0 && ret != -ENOSPC)
5682                 mlog_errno(ret);
5683 
5684 out:
5685         /* Never hand a context back together with a non-zero status. */
5686         if (ret && *ac) {
5687                 ocfs2_free_alloc_context(*ac);
5688                 *ac = NULL;
5689         }
5690 
5691         return ret;
5692 }
5693 
/*
 * ocfs2_remove_btree_range() - remove @len clusters at logical offset
 * @cpos from the extent tree @et of @inode in a transaction of its own.
 *
 * If OCFS2_EXT_REFCOUNTED is set in @flags and @len is non-zero, the
 * refcount tree at @refcount_loc is locked here (unless
 * @refcount_tree_locked says the caller already did) and extra credits
 * and metadata blocks are requested for the refcount change.
 *
 * The truncate log inode lock is held from before the transaction starts
 * until after it is committed, and the log is flushed first when
 * ocfs2_truncate_log_needs_flush() reports it full.
 *
 * Once the extent is removed, a non-zero physical block number (derived
 * from @phys_cpos) is passed either to ocfs2_decrease_refcount() when
 * OCFS2_EXT_REFCOUNTED is set, or to ocfs2_truncate_log_append().
 *
 * Returns the status of the last helper that ran; errors are also
 * logged with mlog_errno().
 */
5694 int ocfs2_remove_btree_range(struct inode *inode,
5695                              struct ocfs2_extent_tree *et,
5696                              u32 cpos, u32 phys_cpos, u32 len, int flags,
5697                              struct ocfs2_cached_dealloc_ctxt *dealloc,
5698                              u64 refcount_loc, bool refcount_tree_locked)
5699 {
5700         int ret, credits = 0, extra_blocks = 0;
5701         u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
5702         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5703         struct inode *tl_inode = osb->osb_tl_inode;
5704         handle_t *handle;
5705         struct ocfs2_alloc_context *meta_ac = NULL;
5706         struct ocfs2_refcount_tree *ref_tree = NULL;
5707 
5708         if ((flags & OCFS2_EXT_REFCOUNTED) && len) {
5709                 BUG_ON(!ocfs2_is_refcount_inode(inode));
5710 
5711                 if (!refcount_tree_locked) {
5712                         ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
5713                                                        &ref_tree, NULL);
5714                         if (ret) {
5715                                 mlog_errno(ret);
5716                                 goto bail;
5717                         }
5718                 }
5719 
5720                 ret = ocfs2_prepare_refcount_change_for_del(inode,
5721                                                             refcount_loc,
5722                                                             phys_blkno,
5723                                                             len,
5724                                                             &credits,
5725                                                             &extra_blocks);
5726                 if (ret < 0) {
5727                         mlog_errno(ret);
5728                         goto bail;
5729                 }
5730         }
5731 
             /* One extent to split; extra_blocks is non-zero only if set above. */
5732         ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac,
5733                                                  extra_blocks);
5734         if (ret) {
5735                 mlog_errno(ret);
5736                 goto bail;
5737         }
5738 
5739         inode_lock(tl_inode);
5740 
5741         if (ocfs2_truncate_log_needs_flush(osb)) {
5742                 ret = __ocfs2_flush_truncate_log(osb);
5743                 if (ret < 0) {
5744                         mlog_errno(ret);
5745                         goto out;
5746                 }
5747         }
5748 
5749         handle = ocfs2_start_trans(osb,
5750                         ocfs2_remove_extent_credits(osb->sb) + credits);
5751         if (IS_ERR(handle)) {
5752                 ret = PTR_ERR(handle);
5753                 mlog_errno(ret);
5754                 goto out;
5755         }
5756 
5757         ret = ocfs2_et_root_journal_access(handle, et,
5758                                            OCFS2_JOURNAL_ACCESS_WRITE);
5759         if (ret) {
5760                 mlog_errno(ret);
5761                 goto out_commit;
5762         }
5763 
             /*
              * NOTE(review): the quota is released before the extent is
              * removed and is not restored if ocfs2_remove_extent() fails
              * below - confirm this is intended.
              */
5764         dquot_free_space_nodirty(inode,
5765                                   ocfs2_clusters_to_bytes(inode->i_sb, len));
5766 
5767         ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc);
5768         if (ret) {
5769                 mlog_errno(ret);
5770                 goto out_commit;
5771         }
5772 
5773         ocfs2_et_update_clusters(et, -len);
5774         ocfs2_update_inode_fsync_trans(handle, inode, 1);
5775 
5776         ocfs2_journal_dirty(handle, et->et_root_bh);
5777 
             /* A zero physical block number means nothing is handed on. */
5778         if (phys_blkno) {
5779                 if (flags & OCFS2_EXT_REFCOUNTED)
5780                         ret = ocfs2_decrease_refcount(inode, handle,
5781                                         ocfs2_blocks_to_clusters(osb->sb,
5782                                                                  phys_blkno),
5783                                         len, meta_ac,
5784                                         dealloc, 1);
5785                 else
5786                         ret = ocfs2_truncate_log_append(osb, handle,
5787                                                         phys_blkno, len);
5788                 if (ret)
5789                         mlog_errno(ret);
5790 
5791         }
5792 
             /*
              * Unwind in reverse order of setup: out_commit needs a started
              * handle, out needs the truncate log inode lock, bail only
              * releases the reservation and the refcount tree lock.
              */
5793 out_commit:
5794         ocfs2_commit_trans(osb, handle);
5795 out:
5796         inode_unlock(tl_inode);
5797 bail:
5798         if (meta_ac)
5799                 ocfs2_free_alloc_context(meta_ac);
5800 
             /* ref_tree is only set when the refcount tree was locked here. */
5801         if (ref_tree)
5802                 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
5803 
5804         return ret;
5805 }
5806 
/*
 * Report whether the truncate log attached to @osb is full, i.e. whether
 * tl_used has reached tl_count.  A tl_used above tl_count is treated as
 * invalid and triggers mlog_bug_on_msg().
 */
5807 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
5808 {
5809         struct ocfs2_dinode *tl_di =
5810                 (struct ocfs2_dinode *)osb->osb_tl_bh->b_data;
5811         struct ocfs2_truncate_log *tl = &tl_di->id2.i_dealloc;
5812         int log_is_full;
5813 
5814         /* tl_used must never run past tl_count. */
5815         mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
5816                         "slot %d, invalid truncate log parameters: used = "
5817                         "%u, count = %u\n", osb->slot_num,
5818                         le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
5819         log_is_full = le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
5820         return log_is_full;
5821 }
5822 
/*
 * Return non-zero when a range starting at cluster @new_start begins
 * exactly where the last used record of @tl ends (t_start + t_clusters),
 * and 0 otherwise, including when the log has no used records.
 */
5823 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
5824                                            unsigned int new_start)
5825 {
5826         unsigned int used = le16_to_cpu(tl->tl_used);
5827         struct ocfs2_truncate_rec *last;
5828         unsigned int last_end;
5829 
5830         /* An empty log has no tail record to extend. */
5831         if (used == 0)
5832                 return 0;
5833 
5834         last = &tl->tl_recs[used - 1];
5835         last_end = le32_to_cpu(last->t_start) + le32_to_cpu(last->t_clusters);
5836 
5837         return last_end == new_start;
5838 }
5839 
/*
 * ocfs2_truncate_log_append() - record @num_clusters clusters starting at
 * block @start_blk in the truncate log of @osb.
 *
 * The caller must hold the truncate log inode lock (enforced by the
 * BUG_ON() on inode_trylock()) and pass a running transaction @handle.
 * If the new range starts exactly where the last used record ends, that
 * record is extended; otherwise the next free record is filled in.
 *
 * osb->truncated_clusters grows by exactly @num_clusters.  The clusters
 * of a record that is merely extended were already counted when that
 * record was logged, so they must not be added a second time.
 *
 * Returns -ENOSPC when every record slot is in use, otherwise the status
 * of ocfs2_journal_access_di().
 */
5840 int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5841                               handle_t *handle,
5842                               u64 start_blk,
5843                               unsigned int num_clusters)
5844 {
5845         int status, index;
5846         unsigned int start_cluster, tl_count;
             /* Size of the new range only; num_clusters may grow below. */
             unsigned int new_clusters = num_clusters;
5847         struct inode *tl_inode = osb->osb_tl_inode;
5848         struct buffer_head *tl_bh = osb->osb_tl_bh;
5849         struct ocfs2_dinode *di;
5850         struct ocfs2_truncate_log *tl;
5851 
5852         BUG_ON(inode_trylock(tl_inode));
5853 
5854         start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
5855 
5856         di = (struct ocfs2_dinode *) tl_bh->b_data;
5857 
5858         /* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
5859          * by the underlying call to ocfs2_read_inode_block(), so any
5860          * corruption is a code bug */
5861         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5862 
5863         tl = &di->id2.i_dealloc;
5864         tl_count = le16_to_cpu(tl->tl_count);
5865         mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
5866                         tl_count == 0,
5867                         "Truncate record count on #%llu invalid "
5868                         "wanted %u, actual %u\n",
5869                         (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5870                         ocfs2_truncate_recs_per_inode(osb->sb),
5871                         le16_to_cpu(tl->tl_count));
5872 
5873         /* Caller should have known to flush before calling us. */
5874         index = le16_to_cpu(tl->tl_used);
5875         if (index >= tl_count) {
5876                 status = -ENOSPC;
5877                 mlog_errno(status);
5878                 goto bail;
5879         }
5880 
5881         status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
5882                                          OCFS2_JOURNAL_ACCESS_WRITE);
5883         if (status < 0) {
5884                 mlog_errno(status);
5885                 goto bail;
5886         }
5887 
5888         trace_ocfs2_truncate_log_append(
5889                 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index,
5890                 start_cluster, num_clusters);
5891         if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
5892                 /*
5893                  * Move index back to the record we are coalescing with.
5894                  * ocfs2_truncate_log_can_coalesce() guarantees nonzero
5895                  */
5896                 index--;
5897 
                     /* From here on num_clusters is the merged record's length. */
5898                 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
5899                 trace_ocfs2_truncate_log_append(
5900                         (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5901                         index, le32_to_cpu(tl->tl_recs[index].t_start),
5902                         num_clusters);
5903         } else {
5904                 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
5905                 tl->tl_used = cpu_to_le16(index + 1);
5906         }
5907         tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5908 
5909         ocfs2_journal_dirty(handle, tl_bh);
5910 
             /* Count only the newly logged clusters, not the merged total. */
5911         osb->truncated_clusters += new_clusters;
5912 bail:
5913         return status;
5914 }
5915 
/*
 * ocfs2_replay_truncate_records() - free the clusters of every truncate
 * log record of @osb, walking from the last used record down to record 0.
 *
 * Each record gets a transaction of its own: tl_used is lowered so the
 * record leaves the log, then its clusters are released with
 * ocfs2_free_clusters() against @data_alloc_inode / @data_alloc_bh.
 * Records whose start block is 0 are skipped.  When the loop completes,
 * osb->truncated_clusters is reset to 0.
 *
 * Every handle that was started is committed, on the error paths as
 * well, so a failure never leaves a transaction open.
 *
 * Returns the first negative status encountered, otherwise the status of
 * the last helper call (0 for an empty log).
 */
5916 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5917                                          struct inode *data_alloc_inode,
5918                                          struct buffer_head *data_alloc_bh)
5919 {
5920         int status = 0;
5921         int i;
5922         unsigned int num_clusters;
5923         u64 start_blk;
5924         struct ocfs2_truncate_rec rec;
5925         struct ocfs2_dinode *di;
5926         struct ocfs2_truncate_log *tl;
5927         struct inode *tl_inode = osb->osb_tl_inode;
5928         struct buffer_head *tl_bh = osb->osb_tl_bh;
5929         handle_t *handle;
5930 
5931         di = (struct ocfs2_dinode *) tl_bh->b_data;
5932         tl = &di->id2.i_dealloc;
5933         i = le16_to_cpu(tl->tl_used) - 1;
5934         while (i >= 0) {
5935                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5936                 if (IS_ERR(handle)) {
5937                         status = PTR_ERR(handle);
5938                         mlog_errno(status);
5939                         goto bail;
5940                 }
5941 
5942                 /* The handle started above is sized for one record
5943                  * (OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC) */
5944                 status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
5945                                                  OCFS2_JOURNAL_ACCESS_WRITE);
5946                 if (status < 0) {
                             /* The handle is live; close it before bailing. */
                             ocfs2_commit_trans(osb, handle);
5947                         mlog_errno(status);
5948                         goto bail;
5949                 }
5950 
5951                 tl->tl_used = cpu_to_le16(i);
5952 
5953                 ocfs2_journal_dirty(handle, tl_bh);
5954 
5955                 rec = tl->tl_recs[i];
5956                 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
5957                                                     le32_to_cpu(rec.t_start));
5958                 num_clusters = le32_to_cpu(rec.t_clusters);
5959 
5960                 /* if start_blk is not set, we ignore the record as
5961                  * invalid. */
5962                 if (start_blk) {
5963                         trace_ocfs2_replay_truncate_records(
5964                                 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5965                                 i, le32_to_cpu(rec.t_start), num_clusters);
5966 
5967                         status = ocfs2_free_clusters(handle, data_alloc_inode,
5968                                                      data_alloc_bh, start_blk,
5969                                                      num_clusters);
5970                         if (status < 0) {
                                     /* Same here: never leave the handle open. */
                                     ocfs2_commit_trans(osb, handle);
5971                                 mlog_errno(status);
5972                                 goto bail;
5973                         }
5974                 }
5975 
5976                 ocfs2_commit_trans(osb, handle);
5977                 i--;
5978         }
5979 
5980         osb->truncated_clusters = 0;
5981 
5982 bail:
5983         return status;
5984 }
5985 
5986 /* Expects you to already be holding the truncate log inode lock */
5987 int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5988 {
5989         struct inode *tl_inode = osb->osb_tl_inode;
5990         struct buffer_head *tl_bh = osb->osb_tl_bh;
5991         struct buffer_head *bitmap_bh = NULL;
5992         struct inode *bitmap_inode;
5993         struct ocfs2_truncate_log *tl;
5994         struct ocfs2_dinode *di;
5995         unsigned int used;
5996         int ret;
5997 
5998         BUG_ON(inode_trylock(tl_inode));
5999 
6000         /*
6001          * tl_bh comes from ocfs2_truncate_log_init() and was validated
6002          * by the underlying ocfs2_read_inode_block() call, so an
6003          * invalid dinode here is a code bug.
6004          */
6005         di = (struct ocfs2_dinode *)tl_bh->b_data;
6006         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
6007 
6008         tl = &di->id2.i_dealloc;
6009         used = le16_to_cpu(tl->tl_used);
6010         trace_ocfs2_flush_truncate_log(
6011                 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, used);
6012 
6013         /* An empty log needs no work. */
6014         if (used == 0)
6015                 return 0;
6016 
6017         bitmap_inode = ocfs2_get_system_file_inode(osb,
6018                                                    GLOBAL_BITMAP_SYSTEM_INODE,
6019                                                    OCFS2_INVALID_SLOT);
6020         if (!bitmap_inode) {
6021                 mlog(ML_ERROR, "Could not get bitmap inode!\n");
6022                 return -EINVAL;
6023         }
6024 
6025         /*
6026          * Lock order: the bitmap inode's inode_lock() first, then
6027          * ocfs2_inode_lock(), which is also given &bitmap_bh to fill in.
6028          */
6029         inode_lock(bitmap_inode);
6030 
6031         ret = ocfs2_inode_lock(bitmap_inode, &bitmap_bh, 1);
6032         if (ret < 0) {
6033                 mlog_errno(ret);
6034                 goto unlock_bitmap;
6035         }
6036 
6037         ret = ocfs2_replay_truncate_records(osb, bitmap_inode, bitmap_bh);
6038         if (ret < 0)
6039                 mlog_errno(ret);
6040 
6041         brelse(bitmap_bh);
6042         ocfs2_inode_unlock(bitmap_inode, 1);
6043 
6044 unlock_bitmap:
6045         inode_unlock(bitmap_inode);
6046         iput(bitmap_inode);
6047         return ret;
6048 }
6049 
/*
 * Locked wrapper around __ocfs2_flush_truncate_log(): takes the truncate
 * log inode lock, flushes, drops the lock and returns the flush status.
 */
6050 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
6051 {
6052         struct inode *log_inode = osb->osb_tl_inode;
6053         int ret;
6054 
6055         inode_lock(log_inode);
6056         ret = __ocfs2_flush_truncate_log(osb);
6057         inode_unlock(log_inode);
6058 
6059         return ret;
6060 }
6061 
/*
 * Delayed-work handler: flush the truncate log of the ocfs2_super that
 * embeds @work.  A failed flush is only logged; after a successful one
 * ocfs2_init_steal_slots() is called.
 */
6062 static void ocfs2_truncate_log_worker(struct work_struct *work)
6063 {
6064         struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
6065                                                osb_truncate_log_wq.work);
6066         int ret = ocfs2_flush_truncate_log(osb);
6067 
6068         if (ret < 0) {
6069                 mlog_errno(ret);
6070                 return;
6071         }
6072 
6073         ocfs2_init_steal_slots(osb);
6074 }
6075 
6076 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
     /*
      * Queue the delayed truncate log flush on osb->ocfs2_wq, to run after
      * OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL.  With @cancel set, pending work
      * is cancelled first.  Does nothing without a truncate log inode or
      * while osb_tl_disable is non-zero.
      */
6077 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, int cancel)
6078 {
6079         /* Nothing to schedule without a log inode or while disabled. */
6080         if (!osb->osb_tl_inode || atomic_read(&osb->osb_tl_disable) != 0)
6081                 return;
6082 
6083         /* We want to push off log flushes while truncates are
6084          * still running, so drop the pending work before requeueing. */
6085         if (cancel)
6086                 cancel_delayed_work(&osb->osb_truncate_log_wq);
6087 
6088         queue_delayed_work(osb->ocfs2_wq, &osb->osb_truncate_log_wq,
6089                            OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
6090 }
6091 
6092 /*
6093  * Flush the truncate log if it holds at least @needed clusters.
6094  * Returns "< 0" on error, "0" when nothing was freed for the caller and
6095  * "1" once a started journal commit was waited for and the caller may try to allocate again.
6096  */
6097 int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
6098                                         unsigned int needed)
6099 {
6100         unsigned int logged_clusters;
6101         tid_t commit_tid;
6102         int ret;
6103 
6104         inode_lock(osb->osb_tl_inode);
6105         logged_clusters = osb->truncated_clusters;
6106         inode_unlock(osb->osb_tl_inode);
6107 
6108         /* Flushing cannot help when the log holds too few clusters. */
6109         if (logged_clusters < needed)
6110                 return 0;
6111 
6112         ret = ocfs2_flush_truncate_log(osb);
6113         if (ret) {
6114                 mlog_errno(ret);
6115                 return ret;
6116         }
6117 
6118         /*
6119          * Success is only reported when a journal commit was actually
6120          * started; in that case wait for it before returning.
6121          */
6122         if (!jbd2_journal_start_commit(osb->journal->j_journal, &commit_tid))
6123                 return 0;
6124 
6125         jbd2_log_wait_commit(osb->journal->j_journal, commit_tid);
6126         return 1;
6127 }
6128 
/*
 * Look up the truncate log system inode for @slot_num and read its inode
 * block.  On success the inode and buffer are stored in *tl_inode and
 * *tl_bh; on failure neither output is written.
 *
 * Returns -EINVAL if the inode cannot be obtained, otherwise the status
 * of ocfs2_read_inode_block().
 */
6129 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
6130                                        int slot_num,
6131                                        struct inode **tl_inode,
6132                                        struct buffer_head **tl_bh)
6133 {
6134         struct buffer_head *di_bh = NULL;
6135         struct inode *log_inode;
6136         int ret;
6137 
6138         log_inode = ocfs2_get_system_file_inode(osb,
6139                                                 TRUNCATE_LOG_SYSTEM_INODE,
6140                                                 slot_num);
6141         if (!log_inode) {
6142                 mlog(ML_ERROR, "Could not get load truncate log inode!\n");
6143                 return -EINVAL;
6144         }
6145 
6146         ret = ocfs2_read_inode_block(log_inode, &di_bh);
6147         if (ret < 0) {
6148                 /* Release the inode obtained above before failing. */
6149                 iput(log_inode);
6150                 mlog_errno(ret);
6151                 return ret;
6152         }
6153 
6154         /* Outputs are only written once both steps succeeded. */
6155         *tl_inode = log_inode;
6156         *tl_bh = di_bh;
6157         return ret;
6158 }
6159 
6160 /* called during the 1st stage of node recovery. we stamp a clean
6161  * truncate log and pass back a copy for processing later. if the
6162  * truncate log does not require processing, a *tl_copy is set to
6163  * NULL. */
6164 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
6165                                       int slot_num,
6166                                       struct ocfs2_dinode **tl_copy)
6167 {
6168         int status;
6169         struct inode *tl_inode = NULL;
6170         struct buffer_head *tl_bh = NULL;
6171         struct ocfs2_dinode *di;
6172         struct ocfs2_truncate_log *tl;
6173 
6174         *tl_copy = NULL;
6175 
6176         trace_ocfs2_begin_truncate_log_recovery(slot_num);
6177 
6178         status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
6179         if (status < 0) {
6180                 mlog_errno(status);
6181                 goto bail;
6182         }
6183 
6184         di = (struct ocfs2_dinode *) tl_bh->b_data;
6185 
6186         /* tl_bh is loaded from ocfs2_get_truncate_log_info().  It's
6187          * validated by the underlying call to ocfs2_read_inode_block(),
6188          * so any corruption is a code bug */
6189         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
6190 
6191         tl = &di->id2.i_dealloc;
6192         if (le16_to_cpu(tl->tl_used)) {
6193                 trace_ocfs2_truncate_log_recovery_num(le16_to_cpu(tl->tl_used));
6194 
6195                 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
6196                 if (!(*tl_copy)) {
6197                         status = -ENOMEM;
6198                         mlog_errno(status);
6199                         goto bail;
6200                 }
6201 
6202                 /* Assuming the write-out below goes well, this copy
6203                  * will be passed back to recovery for processing. */
6204                 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
6205 
6206                 /* All we need to do to clear the truncate log is set
6207                  * tl_used. */
6208                 tl->tl_used = 0;
6209 
6210                 ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
6211                 status = ocfs2_write_block(osb, tl_bh, INODE_CACHE(tl_inode));
6212                 if (status < 0) {
6213                         mlog_errno(status);
6214                         goto bail;
6215                 }
6216         }
6217 
6218 bail:
6219         iput(tl_inode);
6220         brelse(tl_bh);
6221 
6222         if (status < 0) {
6223                 kfree(*tl_copy);
6224                 *tl_copy = NULL;
6225                 mlog_errno(status);
6226         }
6227 
6228         return status;
6229 }
6230 
6231 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
6232                                          struct ocfs2_dinode *tl_copy)
6233 {
6234         int status = 0;
6235         int i;
6236         unsigned int clusters, num_recs, start_cluster;
6237         u64 start_blk;
6238         handle_t *handle;
6239         struct inode *tl_inode = osb->osb_tl_inode;
6240         struct ocfs2_truncate_log *tl;
6241 
6242         if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
6243                 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
6244                 return -EINVAL;
6245         }
6246 
6247         tl = &tl_copy->id2.i_dealloc;
6248         num_recs = le16_to_cpu(tl->tl_used);
6249         trace_ocfs2_complete_truncate_log_recovery(
6250                 (unsigned long long)le64_to_cpu(tl_copy->i_blkno),
6251                 num_recs);
6252 
6253         inode_lock(tl_inode);
6254         for(i = 0; i < num_recs; i++) {
6255                 if (ocfs2_truncate_log_needs_flush(osb)) {
6256                         status = __ocfs2_flush_truncate_log(osb);
6257                         if (status < 0) {
6258                                 mlog_errno(status);
6259                                 goto bail_up;
6260                         }
6261                 }
6262 
6263                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
6264                 if (IS_ERR(handle)) {
6265                         status = PTR_ERR(handle);
6266                         mlog_errno(status);
6267                         goto bail_up;
6268                 }
6269 
6270                 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
6271                 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
6272                 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
6273 
6274                 status = ocfs2_truncate_log_append(osb, handle,
6275                                                    start_blk, clusters);
6276                 ocfs2_commit_trans(osb, handle);
6277                 if (status < 0) {
6278                         mlog_errno(status);
6279                         goto bail_up;
6280                 }
6281         }
6282 
6283 bail_up:
6284         inode_unlock(tl_inode);
6285 
6286         return status;
6287 }
6288 
6289 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
6290 {
6291         int status;
6292         struct inode *tl_inode = osb->osb_tl_inode;
6293 
6294         atomic_set(&osb->osb_tl_disable, 1);
6295 
6296         if (tl_inode) {
6297                 cancel_delayed_work(&osb->osb_truncate_log_wq);
6298                 flush_workqueue(osb->ocfs2_wq);
6299 
6300                 status = ocfs2_flush_truncate_log(osb);
6301                 if (status < 0)
6302                         mlog_errno(status);
6303 
6304                 brelse(osb->osb_tl_bh);
6305                 iput(osb->osb_tl_inode);
6306         }
6307 }
6308 
6309 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
6310 {
6311         int status;
6312         struct inode *tl_inode = NULL;
6313         struct buffer_head *tl_bh = NULL;
6314 
6315         status = ocfs2_get_truncate_log_info(osb,
6316                                              osb->slot_num,
6317                                              &tl_inode,
6318                                              &tl_bh);
6319         if (status < 0)
6320                 mlog_errno(status);
6321 
6322         /* ocfs2_truncate_log_shutdown keys on the existence of
6323          * osb->osb_tl_inode so we don't set any of the osb variables
6324          * until we're sure all is well. */
6325         INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
6326                           ocfs2_truncate_log_worker);
6327         atomic_set(&osb->osb_tl_disable, 0);
6328         osb->osb_tl_bh    = tl_bh;
6329         osb->osb_tl_inode = tl_inode;
6330 
6331         return status;
6332 }
6333 
6334 /*
6335  * Delayed de-allocation of suballocator blocks.
6336  *
6337  * Some sets of block de-allocations might involve multiple suballocator inodes.
6338  *
6339  * The locking for this can get extremely complicated, especially when
6340  * the suballocator inodes to delete from aren't known until deep
6341  * within an unrelated codepath.
6342  *
6343  * ocfs2_extent_block structures are a good example of this - an inode
6344  * btree could have been grown by any number of nodes each allocating
6345  * out of their own suballoc inode.
6346  *
6347  * These structures allow the delay of block de-allocation until a
6348  * later time, when locking of multiple cluster inodes won't cause
6349  * deadlock.
6350  */
6351 
6352 /*
6353  * Describe a single bit freed from a suballocator.  For the block
6354  * suballocators, it represents one block.  For the global cluster
6355  * allocator, it represents some clusters and free_bit indicates
6356  * clusters number.
6357  */
6358 struct ocfs2_cached_block_free {
6359         struct ocfs2_cached_block_free          *free_next;
6360         u64                                     free_bg;
6361         u64                                     free_blk;
6362         unsigned int                            free_bit;
6363 };
6364 
/*
 * All pending block frees that target one suballocator system file,
 * identified by its inode type and slot.
 */
struct ocfs2_per_slot_free_list {
	/* next per-suballocator list in the dealloc context, or NULL */
	struct ocfs2_per_slot_free_list		*f_next_suballocator;
	/* system inode type of the suballocator */
	int					f_inode_type;
	/* slot the suballocator belongs to */
	int					f_slot;
	/* head of the list of blocks to free, or NULL when empty */
	struct ocfs2_cached_block_free		*f_first;
};
6371 
6372 static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
6373                                     int sysfile_type,
6374                                     int slot,
6375                                     struct ocfs2_cached_block_free *head)
6376 {
6377         int ret;
6378         u64 bg_blkno;
6379         handle_t *handle;
6380         struct inode *inode;
6381         struct buffer_head *di_bh = NULL;
6382         struct ocfs2_cached_block_free *tmp;
6383 
6384         inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
6385         if (!inode) {
6386                 ret = -EINVAL;
6387                 mlog_errno(ret);
6388                 goto out;
6389         }
6390 
6391         inode_lock(inode);
6392 
6393         ret = ocfs2_inode_lock(inode, &di_bh, 1);
6394         if (ret) {
6395                 mlog_errno(ret);
6396                 goto out_mutex;
6397         }
6398 
6399         while (head) {
6400                 if (head->free_bg)
6401                         bg_blkno = head->free_bg;
6402                 else
6403                         bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
6404                                                               head->free_bit);
6405                 handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
6406                 if (IS_ERR(handle)) {
6407                         ret = PTR_ERR(handle);
6408                         mlog_errno(ret);
6409                         goto out_unlock;
6410                 }
6411 
6412                 trace_ocfs2_free_cached_blocks(
6413                      (unsigned long long)head->free_blk, head->free_bit);
6414 
6415                 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
6416                                                head->free_bit, bg_blkno, 1);
6417                 if (ret)
6418                         mlog_errno(ret);
6419 
6420                 ocfs2_commit_trans(osb, handle);
6421 
6422                 tmp = head;
6423                 head = head->free_next;
6424                 kfree(tmp);
6425         }
6426 
6427 out_unlock:
6428         ocfs2_inode_unlock(inode, 1);
6429         brelse(di_bh);
6430 out_mutex:
6431         inode_unlock(inode);
6432         iput(inode);
6433 out:
6434         while(head) {
6435                 /* Premature exit may have left some dangling items. */
6436                 tmp = head;
6437                 head = head->free_next;
6438                 kfree(tmp);
6439         }
6440 
6441         return ret;
6442 }
6443 
6444 int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6445                                 u64 blkno, unsigned int bit)
6446 {
6447         int ret = 0;
6448         struct ocfs2_cached_block_free *item;
6449 
6450         item = kzalloc(sizeof(*item), GFP_NOFS);
6451         if (item == NULL) {
6452                 ret = -ENOMEM;
6453                 mlog_errno(ret);
6454                 return ret;
6455         }
6456 
6457         trace_ocfs2_cache_cluster_dealloc((unsigned long long)blkno, bit);
6458 
6459         item->free_blk = blkno;
6460         item->free_bit = bit;
6461         item->free_next = ctxt->c_global_allocator;
6462 
6463         ctxt->c_global_allocator = item;
6464         return ret;
6465 }
6466 
6467 static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
6468                                       struct ocfs2_cached_block_free *head)
6469 {
6470         struct ocfs2_cached_block_free *tmp;
6471         struct inode *tl_inode = osb->osb_tl_inode;
6472         handle_t *handle;
6473         int ret = 0;
6474 
6475         inode_lock(tl_inode);
6476 
6477         while (head) {
6478                 if (ocfs2_truncate_log_needs_flush(osb)) {
6479                         ret = __ocfs2_flush_truncate_log(osb);
6480                         if (ret < 0) {
6481                                 mlog_errno(ret);
6482                                 break;
6483                         }
6484                 }
6485 
6486                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
6487                 if (IS_ERR(handle)) {
6488                         ret = PTR_ERR(handle);
6489                         mlog_errno(ret);
6490                         break;
6491                 }
6492 
6493                 ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
6494                                                 head->free_bit);
6495 
6496                 ocfs2_commit_trans(osb, handle);
6497                 tmp = head;
6498                 head = head->free_next;
6499                 kfree(tmp);
6500 
6501                 if (ret < 0) {
6502                         mlog_errno(ret);
6503                         break;
6504                 }
6505         }
6506 
6507         inode_unlock(tl_inode);
6508 
6509         while (head) {
6510                 /* Premature exit may have left some dangling items. */
6511                 tmp = head;
6512                 head = head->free_next;
6513                 kfree(tmp);
6514         }
6515 
6516         return ret;
6517 }
6518 
6519 int ocfs2_run_deallocs(struct ocfs2_super *osb,
6520                        struct ocfs2_cached_dealloc_ctxt *ctxt)
6521 {
6522         int ret = 0, ret2;
6523         struct ocfs2_per_slot_free_list *fl;
6524 
6525         if (!ctxt)
6526                 return 0;
6527 
6528         while (ctxt->c_first_suballocator) {
6529                 fl = ctxt->c_first_suballocator;
6530 
6531                 if (fl->f_first) {
6532                         trace_ocfs2_run_deallocs(fl->f_inode_type,
6533                                                  fl->f_slot);
6534                         ret2 = ocfs2_free_cached_blocks(osb,
6535                                                         fl->f_inode_type,
6536                                                         fl->f_slot,
6537                                                         fl->f_first);
6538                         if (ret2)
6539                                 mlog_errno(ret2);
6540                         if (!ret)
6541                                 ret = ret2;
6542                 }
6543 
6544                 ctxt->c_first_suballocator = fl->f_next_suballocator;
6545                 kfree(fl);
6546         }
6547 
6548         if (ctxt->c_global_allocator) {
6549                 ret2 = ocfs2_free_cached_clusters(osb,
6550                                                   ctxt->c_global_allocator);
6551                 if (ret2)
6552                         mlog_errno(ret2);
6553                 if (!ret)
6554                         ret = ret2;
6555 
6556                 ctxt->c_global_allocator = NULL;
6557         }
6558 
6559         return ret;
6560 }
6561 
6562 static struct ocfs2_per_slot_free_list *
6563 ocfs2_find_per_slot_free_list(int type,
6564                               int slot,
6565                               struct ocfs2_cached_dealloc_ctxt *ctxt)
6566 {
6567         struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
6568 
6569         while (fl) {
6570                 if (fl->f_inode_type == type && fl->f_slot == slot)
6571                         return fl;
6572 
6573                 fl = fl->f_next_suballocator;
6574         }
6575 
6576         fl = kmalloc(sizeof(*fl), GFP_NOFS);
6577         if (fl) {
6578                 fl->f_inode_type = type;
6579                 fl->f_slot = slot;
6580                 fl->f_first = NULL;
6581                 fl->f_next_suballocator = ctxt->c_first_suballocator;
6582 
6583                 ctxt->c_first_suballocator = fl;
6584         }
6585         return fl;
6586 }
6587 
6588 int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6589                               int type, int slot, u64 suballoc,
6590                               u64 blkno, unsigned int bit)
6591 {
6592         int ret;
6593         struct ocfs2_per_slot_free_list *fl;
6594         struct ocfs2_cached_block_free *item;
6595 
6596         fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
6597         if (fl == NULL) {
6598                 ret = -ENOMEM;
6599                 mlog_errno(ret);
6600                 goto out;
6601         }
6602 
6603         item = kzalloc(sizeof(*item), GFP_NOFS);
6604         if (item == NULL) {
6605                 ret = -ENOMEM;
6606                 mlog_errno(ret);
6607                 goto out;
6608         }
6609 
6610         trace_ocfs2_cache_block_dealloc(type, slot,
6611                                         (unsigned long long)suballoc,
6612                                         (unsigned long long)blkno, bit);
6613 
6614         item->free_bg = suballoc;
6615         item->free_blk = blkno;
6616         item->free_bit = bit;
6617         item->free_next = fl->f_first;
6618 
6619         fl->f_first = item;
6620 
6621         ret = 0;
6622 out:
6623         return ret;
6624 }
6625 
6626 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
6627                                          struct ocfs2_extent_block *eb)
6628 {
6629         return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
6630                                          le16_to_cpu(eb->h_suballoc_slot),
6631                                          le64_to_cpu(eb->h_suballoc_loc),
6632                                          le64_to_cpu(eb->h_blkno),
6633                                          le16_to_cpu(eb->h_suballoc_bit));
6634 }
6635 
6636 static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
6637 {
6638         set_buffer_uptodate(bh);
6639         mark_buffer_dirty(bh);
6640         return 0;
6641 }
6642 
6643 void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6644                               unsigned int from, unsigned int to,
6645                               struct page *page, int zero, u64 *phys)
6646 {
6647         int ret, partial = 0;
6648 
6649         ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
6650         if (ret)
6651                 mlog_errno(ret);
6652 
6653         if (zero)
6654                 zero_user_segment(page, from, to);
6655 
6656         /*
6657          * Need to set the buffers we zero'd into uptodate
6658          * here if they aren't - ocfs2_map_page_blocks()
6659          * might've skipped some
6660          */
6661         ret = walk_page_buffers(handle, page_buffers(page),
6662                                 from, to, &partial,
6663                                 ocfs2_zero_func);
6664         if (ret < 0)
6665                 mlog_errno(ret);
6666         else if (ocfs2_should_order_data(inode)) {
6667                 ret = ocfs2_jbd2_file_inode(handle, inode);
6668                 if (ret < 0)
6669                         mlog_errno(ret);
6670         }
6671 
6672         if (!partial)
6673                 SetPageUptodate(page);
6674 
6675         flush_dcache_page(page);
6676 }
6677 
6678 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
6679                                      loff_t end, struct page **pages,
6680                                      int numpages, u64 phys, handle_t *handle)
6681 {
6682         int i;
6683         struct page *page;
6684         unsigned int from, to = PAGE_SIZE;
6685         struct super_block *sb = inode->i_sb;
6686 
6687         BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
6688 
6689         if (numpages == 0)
6690                 goto out;
6691 
6692         to = PAGE_SIZE;
6693         for(i = 0; i < numpages; i++) {
6694                 page = pages[i];
6695 
6696                 from = start & (PAGE_SIZE - 1);
6697                 if ((end >> PAGE_SHIFT) == page->index)
6698                         to = end & (PAGE_SIZE - 1);
6699 
6700                 BUG_ON(from > PAGE_SIZE);
6701                 BUG_ON(to > PAGE_SIZE);
6702 
6703                 ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
6704                                          &phys);
6705 
6706                 start = (page->index + 1) << PAGE_SHIFT;
6707         }
6708 out:
6709         if (pages)
6710                 ocfs2_unlock_and_free_pages(pages, numpages);
6711 }
6712 
6713 int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
6714                      struct page **pages, int *num)
6715 {
6716         int numpages, ret = 0;
6717         struct address_space *mapping = inode->i_mapping;
6718         unsigned long index;
6719         loff_t last_page_bytes;
6720 
6721         BUG_ON(start > end);
6722 
6723         numpages = 0;
6724         last_page_bytes = PAGE_ALIGN(end);
6725         index = start >> PAGE_SHIFT;
6726         do {
6727                 pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS);
6728                 if (!pages[numpages]) {
6729                         ret = -ENOMEM;
6730                         mlog_errno(ret);
6731                         goto out;
6732                 }
6733 
6734                 numpages++;
6735                 index++;
6736         } while (index < (last_page_bytes >> PAGE_SHIFT));
6737 
6738 out:
6739         if (ret != 0) {
6740                 if (pages)
6741                         ocfs2_unlock_and_free_pages(pages, numpages);
6742                 numpages = 0;
6743         }
6744 
6745         *num = numpages;
6746 
6747         return ret;
6748 }
6749 
6750 static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
6751                                 struct page **pages, int *num)
6752 {
6753         struct super_block *sb = inode->i_sb;
6754 
6755         BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
6756                (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
6757 
6758         return ocfs2_grab_pages(inode, start, end, pages, num);
6759 }
6760 
6761 /*
6762  * Zero the area past i_size but still within an allocated
6763  * cluster. This avoids exposing nonzero data on subsequent file
6764  * extends.
6765  *
6766  * We need to call this before i_size is updated on the inode because
6767  * otherwise block_write_full_page() will skip writeout of pages past
6768  * i_size. The new_i_size parameter is passed for this reason.
6769  */
6770 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
6771                                   u64 range_start, u64 range_end)
6772 {
6773         int ret = 0, numpages;
6774         struct page **pages = NULL;
6775         u64 phys;
6776         unsigned int ext_flags;
6777         struct super_block *sb = inode->i_sb;
6778 
6779         /*
6780          * File systems which don't support sparse files zero on every
6781          * extend.
6782          */
6783         if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
6784                 return 0;
6785 
6786         pages = kcalloc(ocfs2_pages_per_cluster(sb),
6787                         sizeof(struct page *), GFP_NOFS);
6788         if (pages == NULL) {
6789                 ret = -ENOMEM;
6790                 mlog_errno(ret);
6791                 goto out;
6792         }
6793 
6794         if (range_start == range_end)
6795                 goto out;
6796 
6797         ret = ocfs2_extent_map_get_blocks(inode,
6798                                           range_start >> sb->s_blocksize_bits,
6799                                           &phys, NULL, &ext_flags);
6800         if (ret) {
6801                 mlog_errno(ret);
6802                 goto out;
6803         }
6804 
6805         /*
6806          * Tail is a hole, or is marked unwritten. In either case, we
6807          * can count on read and write to return/push zero's.
6808          */
6809         if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
6810                 goto out;
6811 
6812         ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
6813                                    &numpages);
6814         if (ret) {
6815                 mlog_errno(ret);
6816                 goto out;
6817         }
6818 
6819         ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
6820                                  numpages, phys, handle);
6821 
6822         /*
6823          * Initiate writeout of the pages we zero'd here. We don't
6824          * wait on them - the truncate_inode_pages() call later will
6825          * do that for us.
6826          */
6827         ret = filemap_fdatawrite_range(inode->i_mapping, range_start,
6828                                        range_end - 1);
6829         if (ret)
6830                 mlog_errno(ret);
6831 
6832 out:
6833         kfree(pages);
6834 
6835         return ret;
6836 }
6837 
6838 static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
6839                                              struct ocfs2_dinode *di)
6840 {
6841         unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
6842         unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
6843 
6844         if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
6845                 memset(&di->id2, 0, blocksize -
6846                                     offsetof(struct ocfs2_dinode, id2) -
6847                                     xattrsize);
6848         else
6849                 memset(&di->id2, 0, blocksize -
6850                                     offsetof(struct ocfs2_dinode, id2));
6851 }
6852 
6853 void ocfs2_dinode_new_extent_list(struct inode *inode,
6854                                   struct ocfs2_dinode *di)
6855 {
6856         ocfs2_zero_dinode_id2_with_xattr(inode, di);
6857         di->id2.i_list.l_tree_depth = 0;
6858         di->id2.i_list.l_next_free_rec = 0;
6859         di->id2.i_list.l_count = cpu_to_le16(
6860                 ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
6861 }
6862 
6863 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
6864 {
6865         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6866         struct ocfs2_inline_data *idata = &di->id2.i_data;
6867 
6868         spin_lock(&oi->ip_lock);
6869         oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
6870         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6871         spin_unlock(&oi->ip_lock);
6872 
6873         /*
6874          * We clear the entire i_data structure here so that all
6875          * fields can be properly initialized.
6876          */
6877         ocfs2_zero_dinode_id2_with_xattr(inode, di);
6878 
6879         idata->id_count = cpu_to_le16(
6880                         ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
6881 }
6882 
6883 int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6884                                          struct buffer_head *di_bh)
6885 {
6886         int ret, i, has_data, num_pages = 0;
6887         int need_free = 0;
6888         u32 bit_off, num;
6889         handle_t *handle;
6890         u64 uninitialized_var(block);
6891         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6892         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6893         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
6894         struct ocfs2_alloc_context *data_ac = NULL;
6895         struct page **pages = NULL;
6896         loff_t end = osb->s_clustersize;
6897         struct ocfs2_extent_tree et;
6898         int did_quota = 0;
6899 
6900         has_data = i_size_read(inode) ? 1 : 0;
6901 
6902         if (has_data) {
6903                 pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
6904                                 sizeof(struct page *), GFP_NOFS);
6905                 if (pages == NULL) {
6906                         ret = -ENOMEM;
6907                         mlog_errno(ret);
6908                         return ret;
6909                 }
6910 
6911                 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
6912                 if (ret) {
6913                         mlog_errno(ret);
6914                         goto free_pages;
6915                 }
6916         }
6917 
6918         handle = ocfs2_start_trans(osb,
6919                                    ocfs2_inline_to_extents_credits(osb->sb));
6920         if (IS_ERR(handle)) {
6921                 ret = PTR_ERR(handle);
6922                 mlog_errno(ret);
6923                 goto out;
6924         }
6925 
6926         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
6927                                       OCFS2_JOURNAL_ACCESS_WRITE);
6928         if (ret) {
6929                 mlog_errno(ret);
6930                 goto out_commit;
6931         }
6932 
6933         if (has_data) {
6934                 unsigned int page_end;
6935                 u64 phys;
6936 
6937                 ret = dquot_alloc_space_nodirty(inode,
6938                                        ocfs2_clusters_to_bytes(osb->sb, 1));
6939                 if (ret)
6940                         goto out_commit;
6941                 did_quota = 1;
6942 
6943                 data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
6944 
6945                 ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
6946                                            &num);
6947                 if (ret) {
6948                         mlog_errno(ret);
6949                         goto out_commit;
6950                 }
6951 
6952                 /*
6953                  * Save two copies, one for insert, and one that can
6954                  * be changed by ocfs2_map_and_dirty_page() below.
6955                  */
6956                 block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
6957 
6958                 /*
6959                  * Non sparse file systems zero on extend, so no need
6960                  * to do that now.
6961                  */
6962                 if (!ocfs2_sparse_alloc(osb) &&
6963                     PAGE_SIZE < osb->s_clustersize)
6964                         end = PAGE_SIZE;
6965 
6966                 ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
6967                 if (ret) {
6968                         mlog_errno(ret);
6969                         need_free = 1;
6970                         goto out_commit;
6971                 }
6972 
6973                 /*
6974                  * This should populate the 1st page for us and mark
6975                  * it up to date.
6976                  */
6977                 ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
6978                 if (ret) {
6979                         mlog_errno(ret);
6980                         need_free = 1;
6981                         goto out_unlock;
6982                 }
6983 
6984                 page_end = PAGE_SIZE;
6985                 if (PAGE_SIZE > osb->s_clustersize)
6986                         page_end = osb->s_clustersize;
6987 
6988                 for (i = 0; i < num_pages; i++)
6989                         ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
6990                                                  pages[i], i > 0, &phys);
6991         }
6992 
6993         spin_lock(&oi->ip_lock);
6994         oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
6995         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6996         spin_unlock(&oi->ip_lock);
6997 
6998         ocfs2_update_inode_fsync_trans(handle, inode, 1);
6999         ocfs2_dinode_new_extent_list(inode, di);
7000 
7001         ocfs2_journal_dirty(handle, di_bh);
7002 
7003         if (has_data) {
7004                 /*
7005                  * An error at this point should be extremely rare. If
7006                  * this proves to be false, we could always re-build
7007                  * the in-inode data from our pages.
7008                  */
7009                 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
7010                 ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL);
7011                 if (ret) {
7012                         mlog_errno(ret);
7013                         need_free = 1;
7014                         goto out_unlock;
7015                 }
7016 
7017                 inode->i_blocks = ocfs2_inode_sector_count(inode);
7018         }
7019 
7020 out_unlock:
7021         if (pages)
7022                 ocfs2_unlock_and_free_pages(pages, num_pages);
7023 
7024 out_commit:
7025         if (ret < 0 && did_quota)
7026                 dquot_free_space_nodirty(inode,
7027                                           ocfs2_clusters_to_bytes(osb->sb, 1));
7028 
7029         if (need_free) {
7030                 if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
7031                         ocfs2_free_local_alloc_bits(osb, handle, data_ac,
7032                                         bit_off, num);
7033                 else
7034                         ocfs2_free_clusters(handle,
7035                                         data_ac->ac_inode,
7036                                         data_ac->ac_bh,
7037                                         ocfs2_clusters_to_blocks(osb->sb, bit_off),
7038                                         num);
7039         }
7040 
7041         ocfs2_commit_trans(osb, handle);
7042 
7043 out:
7044         if (data_ac)
7045                 ocfs2_free_alloc_context(data_ac);
7046 free_pages:
7047         kfree(pages);
7048         return ret;
7049 }
7050 
7051 /*
      * Remove the allocation of @inode that lies beyond the number of clusters
      * needed to hold i_size_read(inode).
      *
      * Each pass looks up the rightmost leaf of the inode's extent tree,
      * examines only its last record and removes all or part of it, then
      * starts over. The loop ends when the inode has no clusters left, when
      * the last record ends at or below the cutoff, or on the first error.
      *
7052  * It is expected, that by the time you call this function,
7053  * inode->i_size and fe->i_size have been adjusted.
7054  *
7055  * WARNING: This will kfree the truncate context
      *
      * NOTE(review): no truncate context appears in this function's
      * parameters or body; the warning above looks stale - confirm.
      *
      * Returns 0 on success or a negative errno: -ENOMEM if the path cannot
      * be allocated, -EROFS (after ocfs2_error()) if the rightmost leaf has
      * no records, or whatever ocfs2_find_path(),
      * ocfs2_remove_rightmost_empty_extent(), ocfs2_lock_refcount_tree() or
      * ocfs2_remove_btree_range() returned.
      *
      * On every exit path the refcount tree is unlocked if it was locked, a
      * truncate log flush is scheduled, the local dealloc context is run and
      * the path is freed.
7056  */
7057 int ocfs2_commit_truncate(struct ocfs2_super *osb,
7058                           struct inode *inode,
7059                           struct buffer_head *di_bh)
7060 {
7061         int status = 0, i, flags = 0;
7062         u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff;
7063         u64 blkno = 0;
7064         struct ocfs2_extent_list *el;
7065         struct ocfs2_extent_rec *rec;
7066         struct ocfs2_path *path = NULL;
7067         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7068         struct ocfs2_extent_list *root_el = &(di->id2.i_list);
7069         u64 refcount_loc = le64_to_cpu(di->i_refcount_loc);
7070         struct ocfs2_extent_tree et;
7071         struct ocfs2_cached_dealloc_ctxt dealloc;
7072         struct ocfs2_refcount_tree *ref_tree = NULL;
7073 
7074         ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
7075         ocfs2_init_dealloc_ctxt(&dealloc);
7076 
             /*
              * Cutoff: the cluster count that covers the current i_size.
              * Records starting at or beyond it go away entirely; a record
              * straddling it is shortened.
              */
7077         new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
7078                                                      i_size_read(inode));
7079 
7080         path = ocfs2_new_path(di_bh, &di->id2.i_list,
7081                               ocfs2_journal_access_di);
7082         if (!path) {
7083                 status = -ENOMEM;
7084                 mlog_errno(status);
7085                 goto bail;
7086         }
7087 
7088         ocfs2_extent_map_trunc(inode, new_highest_cpos);
7089 
             /* Top of the retry loop; re-entered after every removal. */
7090 start:
7091         /*
7092          * Check that we still have allocation to delete.
7093          */
7094         if (OCFS2_I(inode)->ip_clusters == 0) {
7095                 status = 0;
7096                 goto bail;
7097         }
7098 
7099         /*
7100          * Truncate always works against the rightmost tree branch.
7101          */
7102         status = ocfs2_find_path(INODE_CACHE(inode), path, UINT_MAX);
7103         if (status) {
7104                 mlog_errno(status);
7105                 goto bail;
7106         }
7107 
7108         trace_ocfs2_commit_truncate(
7109                 (unsigned long long)OCFS2_I(inode)->ip_blkno,
7110                 new_highest_cpos,
7111                 OCFS2_I(inode)->ip_clusters,
7112                 path->p_tree_depth);
7113 
7114         /*
7115          * By now, el will point to the extent list on the bottom most
7116          * portion of this tree. Only the tail record is considered in
7117          * each pass.
7118          *
7119          * We handle the following cases, in order:
7120          * - empty extent: delete the remaining branch
7121          * - remove the entire record
7122          * - remove a partial record
7123          * - no record needs to be removed (truncate has completed)
7124          */
7125         el = path_leaf_el(path);
7126         if (le16_to_cpu(el->l_next_free_rec) == 0) {
7127                 ocfs2_error(inode->i_sb,
7128                             "Inode %llu has empty extent block at %llu\n",
7129                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
7130                             (unsigned long long)path_leaf_bh(path)->b_blocknr);
7131                 status = -EROFS;
7132                 goto bail;
7133         }
7134 
             /*
              * Tail record of the leaf. 'range' is the first cluster offset
              * past the end of that record.
              */
7135         i = le16_to_cpu(el->l_next_free_rec) - 1;
7136         rec = &el->l_recs[i];
7137         flags = rec->e_flags;
7138         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
7139 
7140         if (i == 0 && ocfs2_is_empty_extent(rec)) {
7141                 /*
7142                  * Lower levels depend on this never happening, but it's best
7143                  * to check it up here before changing the tree.
7144                 */
7145                 if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
7146                         mlog(ML_ERROR, "Inode %lu has an empty "
7147                                     "extent record, depth %u\n", inode->i_ino,
7148                                     le16_to_cpu(root_el->l_tree_depth));
7149                         status = ocfs2_remove_rightmost_empty_extent(osb,
7150                                         &et, path, &dealloc);
7151                         if (status) {
7152                                 mlog_errno(status);
7153                                 goto bail;
7154                         }
7155 
7156                         ocfs2_reinit_path(path, 1);
7157                         goto start;
7158                 } else {
                             /*
                              * Depth-0 tree, or a record whose e_int_clusters
                              * is non-zero: request a zero-length removal at
                              * the record's offset.
                              */
7159                         trunc_cpos = le32_to_cpu(rec->e_cpos);
7160                         trunc_len = 0;
7161                         blkno = 0;
7162                 }
7163         } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
7164                 /*
7165                  * Truncate entire record.
7166                  */
7167                 trunc_cpos = le32_to_cpu(rec->e_cpos);
7168                 trunc_len = ocfs2_rec_clusters(el, rec);
7169                 blkno = le64_to_cpu(rec->e_blkno);
7170         } else if (range > new_highest_cpos) {
7171                 /*
7172                  * Partial truncate. it also should be
7173                  * the last truncate we're doing.
7174                  */
                     /*
                      * Keep the first 'coff' clusters of the record; the
                      * removed part starts 'coff' clusters into its
                      * physical extent.
                      */
7175                 trunc_cpos = new_highest_cpos;
7176                 trunc_len = range - new_highest_cpos;
7177                 coff = new_highest_cpos - le32_to_cpu(rec->e_cpos);
7178                 blkno = le64_to_cpu(rec->e_blkno) +
7179                                 ocfs2_clusters_to_blocks(inode->i_sb, coff);
7180         } else {
7181                 /*
7182                  * Truncate completed, leave happily.
7183                  */
7184                 status = 0;
7185                 goto bail;
7186         }
7187 
7188         phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
7189 
             /*
              * Lock the refcount tree the first time a refcounted record
              * with clusters to remove is seen. The lock is kept across
              * later passes and released at 'bail'.
              */
7190         if ((flags & OCFS2_EXT_REFCOUNTED) && trunc_len && !ref_tree) {
7191                 status = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
7192                                 &ref_tree, NULL);
7193                 if (status) {
7194                         mlog_errno(status);
7195                         goto bail;
7196                 }
7197         }
7198 
7199         status = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
7200                                           phys_cpos, trunc_len, flags, &dealloc,
7201                                           refcount_loc, true);
7202         if (status < 0) {
7203                 mlog_errno(status);
7204                 goto bail;
7205         }
7206 
7207         ocfs2_reinit_path(path, 1);
7208 
7209         /*
7210          * The check above will catch the case where we've truncated
7211          * away all allocation.
7212          */
7213         goto start;
7214 
             /* Common exit: reached on success and on every error. */
7215 bail:
7216         if (ref_tree)
7217                 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
7218 
7219         ocfs2_schedule_truncate_log_flush(osb, 1);
7220 
7221         ocfs2_run_deallocs(osb, &dealloc);
7222 
7223         ocfs2_free_path(path);
7224 
7225         return status;
7226 }
7227 
7228 /*
7229  * 'start' is inclusive, 'end' is not.
7230  */
7231 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7232                           unsigned int start, unsigned int end, int trunc)
7233 {
7234         int ret;
7235         unsigned int numbytes;
7236         handle_t *handle;
7237         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
7238         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7239         struct ocfs2_inline_data *idata = &di->id2.i_data;
7240 
7241         if (end > i_size_read(inode))
7242                 end = i_size_read(inode);
7243 
7244         BUG_ON(start > end);
7245 
7246         if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
7247             !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
7248             !ocfs2_supports_inline_data(osb)) {
7249                 ocfs2_error(inode->i_sb,
7250                             "Inline data flags for inode %llu don't agree! Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
7251                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
7252                             le16_to_cpu(di->i_dyn_features),
7253                             OCFS2_I(inode)->ip_dyn_features,
7254                             osb->s_feature_incompat);
7255                 ret = -EROFS;
7256                 goto out;
7257         }
7258 
7259         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
7260         if (IS_ERR(handle)) {
7261                 ret = PTR_ERR(handle);
7262                 mlog_errno(ret);
7263                 goto out;
7264         }
7265 
7266         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
7267                                       OCFS2_JOURNAL_ACCESS_WRITE);
7268         if (ret) {
7269                 mlog_errno(ret);
7270                 goto out_commit;
7271         }
7272 
7273         numbytes = end - start;
7274         memset(idata->id_data + start, 0, numbytes);
7275 
7276         /*
7277          * No need to worry about the data page here - it's been
7278          * truncated already and inline data doesn't need it for
7279          * pushing zero's to disk, so we'll let readpage pick it up
7280          * later.
7281          */
7282         if (trunc) {
7283                 i_size_write(inode, start);
7284                 di->i_size = cpu_to_le64(start);
7285         }
7286 
7287         inode->i_blocks = ocfs2_inode_sector_count(inode);
7288         inode->i_ctime = inode->i_mtime = current_time(inode);
7289 
7290         di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
7291         di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
7292 
7293         ocfs2_update_inode_fsync_trans(handle, inode, 1);
7294         ocfs2_journal_dirty(handle, di_bh);
7295 
7296 out_commit:
7297         ocfs2_commit_trans(osb, handle);
7298 
7299 out:
7300         return ret;
7301 }
7302 
7303 static int ocfs2_trim_extent(struct super_block *sb,
7304                              struct ocfs2_group_desc *gd,
7305                              u64 group, u32 start, u32 count)
7306 {
7307         u64 discard, bcount;
7308         struct ocfs2_super *osb = OCFS2_SB(sb);
7309 
7310         bcount = ocfs2_clusters_to_blocks(sb, count);
7311         discard = ocfs2_clusters_to_blocks(sb, start);
7312 
7313         /*
7314          * For the first cluster group, the gd->bg_blkno is not at the start
7315          * of the group, but at an offset from the start. If we add it while
7316          * calculating discard for first group, we will wrongly start fstrim a
7317          * few blocks after the desried start block and the range can cross
7318          * over into the next cluster group. So, add it only if this is not
7319          * the first cluster group.
7320          */
7321         if (group != osb->first_cluster_group_blkno)
7322                 discard += le64_to_cpu(gd->bg_blkno);
7323 
7324         trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount);
7325 
7326         return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0);
7327 }
7328 
7329 static int ocfs2_trim_group(struct super_block *sb,
7330                             struct ocfs2_group_desc *gd, u64 group,
7331                             u32 start, u32 max, u32 minbits)
7332 {
7333         int ret = 0, count = 0, next;
7334         void *bitmap = gd->bg_bitmap;
7335 
7336         if (le16_to_cpu(gd->bg_free_bits_count) < minbits)
7337                 return 0;
7338 
7339         trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno),
7340                                start, max, minbits);
7341 
7342         while (start < max) {
7343                 start = ocfs2_find_next_zero_bit(bitmap, max, start);
7344                 if (start >= max)
7345                         break;
7346                 next = ocfs2_find_next_bit(bitmap, max, start);
7347 
7348                 if ((next - start) >= minbits) {
7349                         ret = ocfs2_trim_extent(sb, gd, group,
7350                                                 start, next - start);
7351                         if (ret < 0) {
7352                                 mlog_errno(ret);
7353                                 break;
7354                         }
7355                         count += next - start;
7356                 }
7357                 start = next + 1;
7358 
7359                 if (fatal_signal_pending(current)) {
7360                         count = -ERESTARTSYS;
7361                         break;
7362                 }
7363 
7364                 if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
7365                         break;
7366         }
7367 
7368         if (ret < 0)
7369                 count = ret;
7370 
7371         return count;
7372 }
7373 
7374 int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7375 {
7376         struct ocfs2_super *osb = OCFS2_SB(sb);
7377         u64 start, len, trimmed, first_group, last_group, group;
7378         int ret, cnt;
7379         u32 first_bit, last_bit, minlen;
7380         struct buffer_head *main_bm_bh = NULL;
7381         struct inode *main_bm_inode = NULL;
7382         struct buffer_head *gd_bh = NULL;
7383         struct ocfs2_dinode *main_bm;
7384         struct ocfs2_group_desc *gd = NULL;
7385 
7386         start = range->start >> osb->s_clustersize_bits;
7387         len = range->len >> osb->s_clustersize_bits;
7388         minlen = range->minlen >> osb->s_clustersize_bits;
7389 
7390         if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
7391                 return -EINVAL;
7392 
7393         main_bm_inode = ocfs2_get_system_file_inode(osb,
7394                                                     GLOBAL_BITMAP_SYSTEM_INODE,
7395                                                     OCFS2_INVALID_SLOT);
7396         if (!main_bm_inode) {
7397                 ret = -EIO;
7398                 mlog_errno(ret);
7399                 goto out;
7400         }
7401 
7402         inode_lock(main_bm_inode);
7403 
7404         ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
7405         if (ret < 0) {
7406                 mlog_errno(ret);
7407                 goto out_mutex;
7408         }
7409         main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
7410 
7411         if (start >= le32_to_cpu(main_bm->i_clusters)) {
7412                 ret = -EINVAL;
7413                 goto out_unlock;
7414         }
7415 
7416         len = range->len >> osb->s_clustersize_bits;
7417         if (start + len > le32_to_cpu(main_bm->i_clusters))
7418                 len = le32_to_cpu(main_bm->i_clusters) - start;
7419 
7420         trace_ocfs2_trim_fs(start, len, minlen);
7421 
7422         /* Determine first and last group to examine based on start and len */
7423         first_group = ocfs2_which_cluster_group(main_bm_inode, start);
7424         if (first_group == osb->first_cluster_group_blkno)
7425                 first_bit = start;
7426         else
7427                 first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
7428         last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
7429         last_bit = osb->bitmap_cpg;
7430 
7431         trimmed = 0;
7432         for (group = first_group; group <= last_group;) {
7433                 if (first_bit + len >= osb->bitmap_cpg)
7434                         last_bit = osb->bitmap_cpg;
7435                 else
7436                         last_bit = first_bit + len;
7437 
7438                 ret = ocfs2_read_group_descriptor(main_bm_inode,
7439                                                   main_bm, group,
7440                                                   &gd_bh);
7441                 if (ret < 0) {
7442                         mlog_errno(ret);
7443                         break;
7444                 }
7445 
7446                 gd = (struct ocfs2_group_desc *)gd_bh->b_data;
7447                 cnt = ocfs2_trim_group(sb, gd, group,
7448                                        first_bit, last_bit, minlen);
7449                 brelse(gd_bh);
7450                 gd_bh = NULL;
7451                 if (cnt < 0) {
7452                         ret = cnt;
7453                         mlog_errno(ret);
7454                         break;
7455                 }
7456 
7457                 trimmed += cnt;
7458                 len -= osb->bitmap_cpg - first_bit;
7459                 first_bit = 0;
7460                 if (group == osb->first_cluster_group_blkno)
7461                         group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
7462                 else
7463                         group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
7464         }
7465         range->len = trimmed * sb->s_blocksize;
7466 out_unlock:
7467         ocfs2_inode_unlock(main_bm_inode, 0);
7468         brelse(main_bm_bh);
7469 out_mutex:
7470         inode_unlock(main_bm_inode);
7471         iput(main_bm_inode);
7472 out:
7473         return ret;
7474 }
7475 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | Wiki (Japanese) | Wiki (English) | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

osdn.jp