TOMOYO Linux Cross Reference
Linux/fs/ext4/extents.c


  1 // SPDX-License-Identifier: GPL-2.0
  2 /*
  3  * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
  4  * Written by Alex Tomas <alex@clusterfs.com>
  5  *
  6  * Architecture independence:
  7  *   Copyright (c) 2005, Bull S.A.
  8  *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
  9  */
 10 
 11 /*
 12  * Extents support for EXT4
 13  *
 14  * TODO:
 15  *   - ext4*_error() should be used in some situations
 16  *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
 17  *   - smart tree reduction
 18  */
 19 
 20 #include <linux/fs.h>
 21 #include <linux/time.h>
 22 #include <linux/jbd2.h>
 23 #include <linux/highuid.h>
 24 #include <linux/pagemap.h>
 25 #include <linux/quotaops.h>
 26 #include <linux/string.h>
 27 #include <linux/slab.h>
 28 #include <linux/uaccess.h>
 29 #include <linux/fiemap.h>
 30 #include <linux/backing-dev.h>
 31 #include <linux/iomap.h>
 32 #include "ext4_jbd2.h"
 33 #include "ext4_extents.h"
 34 #include "xattr.h"
 35 
 36 #include <trace/events/ext4.h>
 37 
 38 /*
 39  * used by extent splitting.
 40  */
 41 #define EXT4_EXT_MAY_ZEROOUT    0x1  /* safe to zeroout if split fails \
 42                                         due to ENOSPC */
 43 #define EXT4_EXT_MARK_UNWRIT1   0x2  /* mark first half unwritten */
 44 #define EXT4_EXT_MARK_UNWRIT2   0x4  /* mark second half unwritten */
 45 
 46 #define EXT4_EXT_DATA_VALID1    0x8  /* first half contains valid data */
 47 #define EXT4_EXT_DATA_VALID2    0x10 /* second half contains valid data */
 48 
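/*
 * Compute the checksum of an extent tree block (header plus entries),
 * as stored in the ext4_extent_tail of non-root tree blocks.
 */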
 49 static __le32 ext4_extent_block_csum(struct inode *inode,
 50                                      struct ext4_extent_header *eh)
 51 {
 52         struct ext4_inode_info *ei = EXT4_I(inode);
 53         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 54         __u32 csum;
 55 
 56         csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
 57                            EXT4_EXTENT_TAIL_OFFSET(eh));
 58         return cpu_to_le32(csum);
 59 }
 60 
 61 static int ext4_extent_block_csum_verify(struct inode *inode,
 62                                          struct ext4_extent_header *eh)
 63 {
 64         struct ext4_extent_tail *et;
 65 
 66         if (!ext4_has_metadata_csum(inode->i_sb))
 67                 return 1;
 68 
 69         et = find_ext4_extent_tail(eh);
 70         if (et->et_checksum != ext4_extent_block_csum(inode, eh))
 71                 return 0;
 72         return 1;
 73 }
 74 
 75 static void ext4_extent_block_csum_set(struct inode *inode,
 76                                        struct ext4_extent_header *eh)
 77 {
 78         struct ext4_extent_tail *et;
 79 
 80         if (!ext4_has_metadata_csum(inode->i_sb))
 81                 return;
 82 
 83         et = find_ext4_extent_tail(eh);
 84         et->et_checksum = ext4_extent_block_csum(inode, eh);
 85 }
 86 
 87 static int ext4_split_extent_at(handle_t *handle,
 88                              struct inode *inode,
 89                              struct ext4_ext_path **ppath,
 90                              ext4_lblk_t split,
 91                              int split_flag,
 92                              int flags);
 93 
 94 static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
 95 {
 96         /*
 97          * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
 98          * moment, get_block can be called only for blocks inside i_size since
 99          * the page cache has already been dropped and writes are blocked by
100          * i_mutex. So we can safely drop the i_data_sem here.
101          */
102         BUG_ON(EXT4_JOURNAL(inode) == NULL);
103         ext4_discard_preallocations(inode);
104         up_write(&EXT4_I(inode)->i_data_sem);
105         *dropped = 1;
106         return 0;
107 }
108 
109 /*
110  * Make sure 'handle' has at least 'check_cred' credits. If not, restart
111  * the transaction with 'restart_cred' credits. The function drops i_data_sem
112  * when restarting the transaction and re-acquires it once it is restarted.
113  *
114  * The function returns 0 on success, 1 if transaction had to be restarted,
115  * and < 0 in case of fatal error.
116  */
117 int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
118                                 int check_cred, int restart_cred,
119                                 int revoke_cred)
120 {
121         int ret;
122         int dropped = 0;
123 
124         ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred,
125                 revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped));
126         if (dropped)
127                 down_write(&EXT4_I(inode)->i_data_sem);
128         return ret;
129 }
130 
131 /*
132  * could return:
133  *  - EROFS
134  *  - ENOMEM
135  */
136 static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
137                                 struct ext4_ext_path *path)
138 {
139         if (path->p_bh) {
140                 /* path points to block */
141                 BUFFER_TRACE(path->p_bh, "get_write_access");
142                 return ext4_journal_get_write_access(handle, path->p_bh);
143         }
144         /* path points to leaf/index in inode body */
145         /* we use in-core data, no need to protect them */
146         return 0;
147 }
148 
149 /*
150  * could return:
151  *  - EROFS
152  *  - ENOMEM
153  *  - EIO
154  */
155 static int __ext4_ext_dirty(const char *where, unsigned int line,
156                             handle_t *handle, struct inode *inode,
157                             struct ext4_ext_path *path)
158 {
159         int err;
160 
161         WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
162         if (path->p_bh) {
163                 ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
164                 /* path points to block */
165                 err = __ext4_handle_dirty_metadata(where, line, handle,
166                                                    inode, path->p_bh);
167         } else {
168                 /* path points to leaf/index in inode body */
169                 err = ext4_mark_inode_dirty(handle, inode);
170         }
171         return err;
172 }
173 
174 #define ext4_ext_dirty(handle, inode, path) \
175                 __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
176 
177 static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
178                               struct ext4_ext_path *path,
179                               ext4_lblk_t block)
180 {
181         if (path) {
182                 int depth = path->p_depth;
183                 struct ext4_extent *ex;
184 
185                 /*
186                  * Try to predict block placement assuming that we are
187                  * filling in a file which will eventually be
188                  * non-sparse --- i.e., in the case of libbfd writing
 189  *   an ELF object's sections out-of-order but in a way
 190  *   that eventually results in a contiguous object or
191                  * executable file, or some database extending a table
192                  * space file.  However, this is actually somewhat
193                  * non-ideal if we are writing a sparse file such as
194                  * qemu or KVM writing a raw image file that is going
195                  * to stay fairly sparse, since it will end up
196                  * fragmenting the file system's free space.  Maybe we
 197  *   should have some heuristics or some way to allow
 198  *   userspace to pass a hint to the file system,
199                  * especially if the latter case turns out to be
200                  * common.
201                  */
202                 ex = path[depth].p_ext;
203                 if (ex) {
204                         ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
205                         ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
206 
207                         if (block > ext_block)
208                                 return ext_pblk + (block - ext_block);
209                         else
210                                 return ext_pblk - (ext_block - block);
211                 }
212 
 213                 /* it looks like the index is empty;
 214                  * try to find the starting block from the index itself */
215                 if (path[depth].p_bh)
216                         return path[depth].p_bh->b_blocknr;
217         }
218 
219         /* OK. use inode's group */
220         return ext4_inode_to_goal_block(inode);
221 }
222 
223 /*
 224  * Allocation for a metadata block
225  */
226 static ext4_fsblk_t
227 ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
228                         struct ext4_ext_path *path,
229                         struct ext4_extent *ex, int *err, unsigned int flags)
230 {
231         ext4_fsblk_t goal, newblock;
232 
233         goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
234         newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
235                                         NULL, err);
236         return newblock;
237 }
238 
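/*
 * The ext4_ext_space_*() helpers below return how many extent or index
 * entries fit in an on-disk tree block or in the in-inode root.
 * AGGRESSIVE_TEST artificially shrinks these limits to exercise tree growth.
 */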
239 static inline int ext4_ext_space_block(struct inode *inode, int check)
240 {
241         int size;
242 
243         size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
244                         / sizeof(struct ext4_extent);
245 #ifdef AGGRESSIVE_TEST
246         if (!check && size > 6)
247                 size = 6;
248 #endif
249         return size;
250 }
251 
252 static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
253 {
254         int size;
255 
256         size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
257                         / sizeof(struct ext4_extent_idx);
258 #ifdef AGGRESSIVE_TEST
259         if (!check && size > 5)
260                 size = 5;
261 #endif
262         return size;
263 }
264 
265 static inline int ext4_ext_space_root(struct inode *inode, int check)
266 {
267         int size;
268 
269         size = sizeof(EXT4_I(inode)->i_data);
270         size -= sizeof(struct ext4_extent_header);
271         size /= sizeof(struct ext4_extent);
272 #ifdef AGGRESSIVE_TEST
273         if (!check && size > 3)
274                 size = 3;
275 #endif
276         return size;
277 }
278 
279 static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
280 {
281         int size;
282 
283         size = sizeof(EXT4_I(inode)->i_data);
284         size -= sizeof(struct ext4_extent_header);
285         size /= sizeof(struct ext4_extent_idx);
286 #ifdef AGGRESSIVE_TEST
287         if (!check && size > 4)
288                 size = 4;
289 #endif
290         return size;
291 }
292 
293 static inline int
294 ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
295                            struct ext4_ext_path **ppath, ext4_lblk_t lblk,
296                            int nofail)
297 {
298         struct ext4_ext_path *path = *ppath;
299         int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
300 
301         return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
302                         EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
303                         EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO |
304                         (nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0));
305 }
306 
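/*
 * Return the maximum number of entries a tree node at the given depth
 * may hold; the in-inode root has a smaller capacity than on-disk blocks.
 */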
307 static int
308 ext4_ext_max_entries(struct inode *inode, int depth)
309 {
310         int max;
311 
312         if (depth == ext_depth(inode)) {
313                 if (depth == 0)
314                         max = ext4_ext_space_root(inode, 1);
315                 else
316                         max = ext4_ext_space_root_idx(inode, 1);
317         } else {
318                 if (depth == 0)
319                         max = ext4_ext_space_block(inode, 1);
320                 else
321                         max = ext4_ext_space_block_idx(inode, 1);
322         }
323 
324         return max;
325 }
326 
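/*
 * Check that an extent has a non-zero length, that its logical range
 * does not wrap, and that its physical blocks lie within the filesystem.
 */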
327 static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
328 {
329         ext4_fsblk_t block = ext4_ext_pblock(ext);
330         int len = ext4_ext_get_actual_len(ext);
331         ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
332 
333         /*
334          * We allow neither:
335          *  - zero length
336          *  - overflow/wrap-around
337          */
338         if (lblock + len <= lblock)
339                 return 0;
340         return ext4_inode_block_valid(inode, block, len);
341 }
342 
343 static int ext4_valid_extent_idx(struct inode *inode,
344                                 struct ext4_extent_idx *ext_idx)
345 {
346         ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
347 
348         return ext4_inode_block_valid(inode, block, 1);
349 }
350 
351 static int ext4_valid_extent_entries(struct inode *inode,
352                                      struct ext4_extent_header *eh,
353                                      ext4_fsblk_t *pblk, int depth)
354 {
355         unsigned short entries;
356         if (eh->eh_entries == 0)
357                 return 1;
358 
359         entries = le16_to_cpu(eh->eh_entries);
360 
361         if (depth == 0) {
362                 /* leaf entries */
363                 struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
364                 ext4_lblk_t lblock = 0;
365                 ext4_lblk_t prev = 0;
366                 int len = 0;
367                 while (entries) {
368                         if (!ext4_valid_extent(inode, ext))
369                                 return 0;
370 
371                         /* Check for overlapping extents */
372                         lblock = le32_to_cpu(ext->ee_block);
373                         len = ext4_ext_get_actual_len(ext);
374                         if ((lblock <= prev) && prev) {
375                                 *pblk = ext4_ext_pblock(ext);
376                                 return 0;
377                         }
378                         ext++;
379                         entries--;
380                         prev = lblock + len - 1;
381                 }
382         } else {
383                 struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
384                 while (entries) {
385                         if (!ext4_valid_extent_idx(inode, ext_idx))
386                                 return 0;
387                         ext_idx++;
388                         entries--;
389                 }
390         }
391         return 1;
392 }
393 
394 static int __ext4_ext_check(const char *function, unsigned int line,
395                             struct inode *inode, struct ext4_extent_header *eh,
396                             int depth, ext4_fsblk_t pblk)
397 {
398         const char *error_msg;
399         int max = 0, err = -EFSCORRUPTED;
400 
401         if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
402                 error_msg = "invalid magic";
403                 goto corrupted;
404         }
405         if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
406                 error_msg = "unexpected eh_depth";
407                 goto corrupted;
408         }
409         if (unlikely(eh->eh_max == 0)) {
410                 error_msg = "invalid eh_max";
411                 goto corrupted;
412         }
413         max = ext4_ext_max_entries(inode, depth);
414         if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
415                 error_msg = "too large eh_max";
416                 goto corrupted;
417         }
418         if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
419                 error_msg = "invalid eh_entries";
420                 goto corrupted;
421         }
422         if (!ext4_valid_extent_entries(inode, eh, &pblk, depth)) {
423                 error_msg = "invalid extent entries";
424                 goto corrupted;
425         }
426         if (unlikely(depth > 32)) {
427                 error_msg = "too large eh_depth";
428                 goto corrupted;
429         }
430         /* Verify checksum on non-root extent tree nodes */
431         if (ext_depth(inode) != depth &&
432             !ext4_extent_block_csum_verify(inode, eh)) {
433                 error_msg = "extent tree corrupted";
434                 err = -EFSBADCRC;
435                 goto corrupted;
436         }
437         return 0;
438 
439 corrupted:
440         ext4_error_inode_err(inode, function, line, 0, -err,
441                              "pblk %llu bad header/extent: %s - magic %x, "
442                              "entries %u, max %u(%u), depth %u(%u)",
443                              (unsigned long long) pblk, error_msg,
444                              le16_to_cpu(eh->eh_magic),
445                              le16_to_cpu(eh->eh_entries),
446                              le16_to_cpu(eh->eh_max),
447                              max, le16_to_cpu(eh->eh_depth), depth);
448         return err;
449 }
450 
451 #define ext4_ext_check(inode, eh, depth, pblk)                  \
452         __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))
453 
454 int ext4_ext_check_inode(struct inode *inode)
455 {
456         return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
457 }
458 
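/*
 * Cache the extents of a verified leaf block in the extent status tree,
 * recording the holes between adjacent extents as well.
 */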
459 static void ext4_cache_extents(struct inode *inode,
460                                struct ext4_extent_header *eh)
461 {
462         struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
463         ext4_lblk_t prev = 0;
464         int i;
465 
466         for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
467                 unsigned int status = EXTENT_STATUS_WRITTEN;
468                 ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
469                 int len = ext4_ext_get_actual_len(ex);
470 
471                 if (prev && (prev != lblk))
472                         ext4_es_cache_extent(inode, prev, lblk - prev, ~0,
473                                              EXTENT_STATUS_HOLE);
474 
475                 if (ext4_ext_is_unwritten(ex))
476                         status = EXTENT_STATUS_UNWRITTEN;
477                 ext4_es_cache_extent(inode, lblk, len,
478                                      ext4_ext_pblock(ex), status);
479                 prev = lblk + len;
480         }
481 }
482 
483 static struct buffer_head *
484 __read_extent_tree_block(const char *function, unsigned int line,
485                          struct inode *inode, ext4_fsblk_t pblk, int depth,
486                          int flags)
487 {
488         struct buffer_head              *bh;
489         int                             err;
490 
491         bh = sb_getblk_gfp(inode->i_sb, pblk, __GFP_MOVABLE | GFP_NOFS);
492         if (unlikely(!bh))
493                 return ERR_PTR(-ENOMEM);
494 
495         if (!bh_uptodate_or_lock(bh)) {
496                 trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
497                 err = bh_submit_read(bh);
498                 if (err < 0)
499                         goto errout;
500         }
501         if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
502                 return bh;
503         err = __ext4_ext_check(function, line, inode,
504                                ext_block_hdr(bh), depth, pblk);
505         if (err)
506                 goto errout;
507         set_buffer_verified(bh);
508         /*
509          * If this is a leaf block, cache all of its entries
510          */
511         if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
512                 struct ext4_extent_header *eh = ext_block_hdr(bh);
513                 ext4_cache_extents(inode, eh);
514         }
515         return bh;
516 errout:
517         put_bh(bh);
518         return ERR_PTR(err);
519 
520 }
521 
522 #define read_extent_tree_block(inode, pblk, depth, flags)               \
523         __read_extent_tree_block(__func__, __LINE__, (inode), (pblk),   \
524                                  (depth), (flags))
525 
526 /*
527  * This function is called to cache a file's extent information in the
528  * extent status tree
529  */
530 int ext4_ext_precache(struct inode *inode)
531 {
532         struct ext4_inode_info *ei = EXT4_I(inode);
533         struct ext4_ext_path *path = NULL;
534         struct buffer_head *bh;
535         int i = 0, depth, ret = 0;
536 
537         if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
538                 return 0;       /* not an extent-mapped inode */
539 
540         down_read(&ei->i_data_sem);
541         depth = ext_depth(inode);
542 
543         /* Don't cache anything if there are no external extent blocks */
544         if (!depth) {
545                 up_read(&ei->i_data_sem);
546                 return ret;
547         }
548 
549         path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
550                        GFP_NOFS);
551         if (path == NULL) {
552                 up_read(&ei->i_data_sem);
553                 return -ENOMEM;
554         }
555 
556         path[0].p_hdr = ext_inode_hdr(inode);
557         ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
558         if (ret)
559                 goto out;
560         path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
561         while (i >= 0) {
562                 /*
563                  * If this is a leaf block or we've reached the end of
564                  * the index block, go up
565                  */
566                 if ((i == depth) ||
567                     path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
568                         brelse(path[i].p_bh);
569                         path[i].p_bh = NULL;
570                         i--;
571                         continue;
572                 }
573                 bh = read_extent_tree_block(inode,
574                                             ext4_idx_pblock(path[i].p_idx++),
575                                             depth - i - 1,
576                                             EXT4_EX_FORCE_CACHE);
577                 if (IS_ERR(bh)) {
578                         ret = PTR_ERR(bh);
579                         break;
580                 }
581                 i++;
582                 path[i].p_bh = bh;
583                 path[i].p_hdr = ext_block_hdr(bh);
584                 path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
585         }
586         ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
587 out:
588         up_read(&ei->i_data_sem);
589         ext4_ext_drop_refs(path);
590         kfree(path);
591         return ret;
592 }
593 
594 #ifdef EXT_DEBUG
595 static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
596 {
597         int k, l = path->p_depth;
598 
599         ext_debug("path:");
600         for (k = 0; k <= l; k++, path++) {
601                 if (path->p_idx) {
602                         ext_debug("  %d->%llu",
603                                   le32_to_cpu(path->p_idx->ei_block),
604                                   ext4_idx_pblock(path->p_idx));
605                 } else if (path->p_ext) {
606                         ext_debug("  %d:[%d]%d:%llu ",
607                                   le32_to_cpu(path->p_ext->ee_block),
608                                   ext4_ext_is_unwritten(path->p_ext),
609                                   ext4_ext_get_actual_len(path->p_ext),
610                                   ext4_ext_pblock(path->p_ext));
611                 } else
612                         ext_debug("  []");
613         }
614         ext_debug("\n");
615 }
616 
617 static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
618 {
619         int depth = ext_depth(inode);
620         struct ext4_extent_header *eh;
621         struct ext4_extent *ex;
622         int i;
623 
624         if (!path)
625                 return;
626 
627         eh = path[depth].p_hdr;
628         ex = EXT_FIRST_EXTENT(eh);
629 
630         ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
631 
632         for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
633                 ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
634                           ext4_ext_is_unwritten(ex),
635                           ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
636         }
637         ext_debug("\n");
638 }
639 
640 static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
641                         ext4_fsblk_t newblock, int level)
642 {
643         int depth = ext_depth(inode);
644         struct ext4_extent *ex;
645 
646         if (depth != level) {
647                 struct ext4_extent_idx *idx;
648                 idx = path[level].p_idx;
649                 while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
650                         ext_debug("%d: move %d:%llu in new index %llu\n", level,
651                                         le32_to_cpu(idx->ei_block),
652                                         ext4_idx_pblock(idx),
653                                         newblock);
654                         idx++;
655                 }
656 
657                 return;
658         }
659 
660         ex = path[depth].p_ext;
661         while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
662                 ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
663                                 le32_to_cpu(ex->ee_block),
664                                 ext4_ext_pblock(ex),
665                                 ext4_ext_is_unwritten(ex),
666                                 ext4_ext_get_actual_len(ex),
667                                 newblock);
668                 ex++;
669         }
670 }
671 
672 #else
673 #define ext4_ext_show_path(inode, path)
674 #define ext4_ext_show_leaf(inode, path)
675 #define ext4_ext_show_move(inode, path, newblock, level)
676 #endif
677 
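/* Release the buffer heads referenced by each level of the path. */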
678 void ext4_ext_drop_refs(struct ext4_ext_path *path)
679 {
680         int depth, i;
681 
682         if (!path)
683                 return;
684         depth = path->p_depth;
685         for (i = 0; i <= depth; i++, path++) {
686                 if (path->p_bh) {
687                         brelse(path->p_bh);
688                         path->p_bh = NULL;
689                 }
690         }
691 }
692 
693 /*
694  * ext4_ext_binsearch_idx:
695  * binary search for the closest index of the given block
696  * the header must be checked before calling this
697  */
698 static void
699 ext4_ext_binsearch_idx(struct inode *inode,
700                         struct ext4_ext_path *path, ext4_lblk_t block)
701 {
702         struct ext4_extent_header *eh = path->p_hdr;
703         struct ext4_extent_idx *r, *l, *m;
704 
705 
706         ext_debug("binsearch for %u(idx):  ", block);
707 
708         l = EXT_FIRST_INDEX(eh) + 1;
709         r = EXT_LAST_INDEX(eh);
710         while (l <= r) {
711                 m = l + (r - l) / 2;
712                 if (block < le32_to_cpu(m->ei_block))
713                         r = m - 1;
714                 else
715                         l = m + 1;
716                 ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
717                                 m, le32_to_cpu(m->ei_block),
718                                 r, le32_to_cpu(r->ei_block));
719         }
720 
721         path->p_idx = l - 1;
722         ext_debug("  -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
723                   ext4_idx_pblock(path->p_idx));
724 
725 #ifdef CHECK_BINSEARCH
726         {
727                 struct ext4_extent_idx *chix, *ix;
728                 int k;
729 
730                 chix = ix = EXT_FIRST_INDEX(eh);
731                 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
732                         if (k != 0 && le32_to_cpu(ix->ei_block) <=
733                             le32_to_cpu(ix[-1].ei_block)) {
734                                 printk(KERN_DEBUG "k=%d, ix=0x%p, "
735                                        "first=0x%p\n", k,
736                                        ix, EXT_FIRST_INDEX(eh));
737                                 printk(KERN_DEBUG "%u <= %u\n",
738                                        le32_to_cpu(ix->ei_block),
739                                        le32_to_cpu(ix[-1].ei_block));
740                         }
741                         BUG_ON(k && le32_to_cpu(ix->ei_block)
742                                            <= le32_to_cpu(ix[-1].ei_block));
743                         if (block < le32_to_cpu(ix->ei_block))
744                                 break;
745                         chix = ix;
746                 }
747                 BUG_ON(chix != path->p_idx);
748         }
749 #endif
750 
751 }
752 
753 /*
754  * ext4_ext_binsearch:
 755  * binary search for the closest extent of the given block
756  * the header must be checked before calling this
757  */
758 static void
759 ext4_ext_binsearch(struct inode *inode,
760                 struct ext4_ext_path *path, ext4_lblk_t block)
761 {
762         struct ext4_extent_header *eh = path->p_hdr;
763         struct ext4_extent *r, *l, *m;
764 
765         if (eh->eh_entries == 0) {
766                 /*
767                  * this leaf is empty:
768                  * we get such a leaf in split/add case
769                  */
770                 return;
771         }
772 
773         ext_debug("binsearch for %u:  ", block);
774 
775         l = EXT_FIRST_EXTENT(eh) + 1;
776         r = EXT_LAST_EXTENT(eh);
777 
778         while (l <= r) {
779                 m = l + (r - l) / 2;
780                 if (block < le32_to_cpu(m->ee_block))
781                         r = m - 1;
782                 else
783                         l = m + 1;
784                 ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
785                                 m, le32_to_cpu(m->ee_block),
786                                 r, le32_to_cpu(r->ee_block));
787         }
788 
789         path->p_ext = l - 1;
790         ext_debug("  -> %d:%llu:[%d]%d ",
791                         le32_to_cpu(path->p_ext->ee_block),
792                         ext4_ext_pblock(path->p_ext),
793                         ext4_ext_is_unwritten(path->p_ext),
794                         ext4_ext_get_actual_len(path->p_ext));
795 
796 #ifdef CHECK_BINSEARCH
797         {
798                 struct ext4_extent *chex, *ex;
799                 int k;
800 
801                 chex = ex = EXT_FIRST_EXTENT(eh);
802                 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
803                         BUG_ON(k && le32_to_cpu(ex->ee_block)
804                                           <= le32_to_cpu(ex[-1].ee_block));
805                         if (block < le32_to_cpu(ex->ee_block))
806                                 break;
807                         chex = ex;
808                 }
809                 BUG_ON(chex != path->p_ext);
810         }
811 #endif
812 
813 }
814 
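/* Initialize an empty extent tree rooted in the inode body. */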
815 void ext4_ext_tree_init(handle_t *handle, struct inode *inode)
816 {
817         struct ext4_extent_header *eh;
818 
819         eh = ext_inode_hdr(inode);
820         eh->eh_depth = 0;
821         eh->eh_entries = 0;
822         eh->eh_magic = EXT4_EXT_MAGIC;
823         eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
824         ext4_mark_inode_dirty(handle, inode);
825 }
826 
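/*
 * Find the path from the extent tree root down to the leaf that should
 * contain @block. The caller must release the returned path with
 * ext4_ext_drop_refs() and kfree() once it is no longer needed.
 */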
827 struct ext4_ext_path *
828 ext4_find_extent(struct inode *inode, ext4_lblk_t block,
829                  struct ext4_ext_path **orig_path, int flags)
830 {
831         struct ext4_extent_header *eh;
832         struct buffer_head *bh;
833         struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
834         short int depth, i, ppos = 0;
835         int ret;
836 
837         eh = ext_inode_hdr(inode);
838         depth = ext_depth(inode);
839         if (depth < 0 || depth > EXT4_MAX_EXTENT_DEPTH) {
840                 EXT4_ERROR_INODE(inode, "inode has invalid extent depth: %d",
841                                  depth);
842                 ret = -EFSCORRUPTED;
843                 goto err;
844         }
845 
846         if (path) {
847                 ext4_ext_drop_refs(path);
848                 if (depth > path[0].p_maxdepth) {
849                         kfree(path);
850                         *orig_path = path = NULL;
851                 }
852         }
853         if (!path) {
854                 /* account possible depth increase */
855                 path = kcalloc(depth + 2, sizeof(struct ext4_ext_path),
856                                 GFP_NOFS);
857                 if (unlikely(!path))
858                         return ERR_PTR(-ENOMEM);
859                 path[0].p_maxdepth = depth + 1;
860         }
861         path[0].p_hdr = eh;
862         path[0].p_bh = NULL;
863 
864         i = depth;
865         if (!(flags & EXT4_EX_NOCACHE) && depth == 0)
866                 ext4_cache_extents(inode, eh);
867         /* walk through the tree */
868         while (i) {
869                 ext_debug("depth %d: num %d, max %d\n",
870                           ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
871 
872                 ext4_ext_binsearch_idx(inode, path + ppos, block);
873                 path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
874                 path[ppos].p_depth = i;
875                 path[ppos].p_ext = NULL;
876 
877                 bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
878                                             flags);
879                 if (IS_ERR(bh)) {
880                         ret = PTR_ERR(bh);
881                         goto err;
882                 }
883 
884                 eh = ext_block_hdr(bh);
885                 ppos++;
886                 path[ppos].p_bh = bh;
887                 path[ppos].p_hdr = eh;
888         }
889 
890         path[ppos].p_depth = i;
891         path[ppos].p_ext = NULL;
892         path[ppos].p_idx = NULL;
893 
894         /* find extent */
895         ext4_ext_binsearch(inode, path + ppos, block);
896         /* if not an empty leaf */
897         if (path[ppos].p_ext)
898                 path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
899 
900         ext4_ext_show_path(inode, path);
901 
902         return path;
903 
904 err:
905         ext4_ext_drop_refs(path);
906         kfree(path);
907         if (orig_path)
908                 *orig_path = NULL;
909         return ERR_PTR(ret);
910 }
911 
912 /*
913  * ext4_ext_insert_index:
914  * insert new index [@logical;@ptr] into the block at @curp;
915  * check where to insert: before @curp or after @curp
916  */
917 static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
918                                  struct ext4_ext_path *curp,
919                                  int logical, ext4_fsblk_t ptr)
920 {
921         struct ext4_extent_idx *ix;
922         int len, err;
923 
924         err = ext4_ext_get_access(handle, inode, curp);
925         if (err)
926                 return err;
927 
928         if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
929                 EXT4_ERROR_INODE(inode,
930                                  "logical %d == ei_block %d!",
931                                  logical, le32_to_cpu(curp->p_idx->ei_block));
932                 return -EFSCORRUPTED;
933         }
934 
935         if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
936                              >= le16_to_cpu(curp->p_hdr->eh_max))) {
937                 EXT4_ERROR_INODE(inode,
938                                  "eh_entries %d >= eh_max %d!",
939                                  le16_to_cpu(curp->p_hdr->eh_entries),
940                                  le16_to_cpu(curp->p_hdr->eh_max));
941                 return -EFSCORRUPTED;
942         }
943 
944         if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
945                 /* insert after */
946                 ext_debug("insert new index %d after: %llu\n", logical, ptr);
947                 ix = curp->p_idx + 1;
948         } else {
949                 /* insert before */
950                 ext_debug("insert new index %d before: %llu\n", logical, ptr);
951                 ix = curp->p_idx;
952         }
953 
954         len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
955         BUG_ON(len < 0);
956         if (len > 0) {
957                 ext_debug("insert new index %d: "
958                                 "move %d indices from 0x%p to 0x%p\n",
959                                 logical, len, ix, ix + 1);
960                 memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
961         }
962 
963         if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
964                 EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
965                 return -EFSCORRUPTED;
966         }
967 
968         ix->ei_block = cpu_to_le32(logical);
969         ext4_idx_store_pblock(ix, ptr);
970         le16_add_cpu(&curp->p_hdr->eh_entries, 1);
971 
972         if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
973                 EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
974                 return -EFSCORRUPTED;
975         }
976 
977         err = ext4_ext_dirty(handle, inode, curp);
978         ext4_std_error(inode->i_sb, err);
979 
980         return err;
981 }
982 
983 /*
984  * ext4_ext_split:
985  * inserts new subtree into the path, using free index entry
986  * at depth @at:
987  * - allocates all needed blocks (new leaf and all intermediate index blocks)
988  * - makes decision where to split
989  * - moves remaining extents and index entries (right to the split point)
990  *   into the newly allocated blocks
991  * - initializes subtree
992  */
993 static int ext4_ext_split(handle_t *handle, struct inode *inode,
994                           unsigned int flags,
995                           struct ext4_ext_path *path,
996                           struct ext4_extent *newext, int at)
997 {
998         struct buffer_head *bh = NULL;
999         int depth = ext_depth(inode);
1000         struct ext4_extent_header *neh;
1001         struct ext4_extent_idx *fidx;
1002         int i = at, k, m, a;
1003         ext4_fsblk_t newblock, oldblock;
1004         __le32 border;
1005         ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
1006         int err = 0;
1007         size_t ext_size = 0;
1008 
1009         /* make decision: where to split? */
1010         /* FIXME: now decision is simplest: at current extent */
1011 
1012         /* if the current leaf will be split, then we should use
1013          * the border from the split point */
1014         if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
1015                 EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
1016                 return -EFSCORRUPTED;
1017         }
1018         if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
1019                 border = path[depth].p_ext[1].ee_block;
1020                 ext_debug("leaf will be split."
1021                                 " next leaf starts at %d\n",
1022                                   le32_to_cpu(border));
1023         } else {
1024                 border = newext->ee_block;
1025                 ext_debug("leaf will be added."
1026                                 " next leaf starts at %d\n",
1027                                 le32_to_cpu(border));
1028         }
1029 
1030         /*
1031          * If an error occurs, then we stop processing and mark the
1032          * filesystem read-only. The index won't be inserted and the
1033          * tree will remain in a consistent state. The next mount
1034          * will repair the buffers too.
1035          */
1036 
1037         /*
1038          * Get array to track all allocated blocks.
1039          * We need this to handle errors and free blocks
1040          * upon them.
1041          */
1042         ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), GFP_NOFS);
1043         if (!ablocks)
1044                 return -ENOMEM;
1045 
1046         /* allocate all needed blocks */
1047         ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
1048         for (a = 0; a < depth - at; a++) {
1049                 newblock = ext4_ext_new_meta_block(handle, inode, path,
1050                                                    newext, &err, flags);
1051                 if (newblock == 0)
1052                         goto cleanup;
1053                 ablocks[a] = newblock;
1054         }
1055 
1056         /* initialize new leaf */
1057         newblock = ablocks[--a];
1058         if (unlikely(newblock == 0)) {
1059                 EXT4_ERROR_INODE(inode, "newblock == 0!");
1060                 err = -EFSCORRUPTED;
1061                 goto cleanup;
1062         }
1063         bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
1064         if (unlikely(!bh)) {
1065                 err = -ENOMEM;
1066                 goto cleanup;
1067         }
1068         lock_buffer(bh);
1069 
1070         err = ext4_journal_get_create_access(handle, bh);
1071         if (err)
1072                 goto cleanup;
1073 
1074         neh = ext_block_hdr(bh);
1075         neh->eh_entries = 0;
1076         neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1077         neh->eh_magic = EXT4_EXT_MAGIC;
1078         neh->eh_depth = 0;
1079 
1080         /* move remainder of path[depth] to the new leaf */
1081         if (unlikely(path[depth].p_hdr->eh_entries !=
1082                      path[depth].p_hdr->eh_max)) {
1083                 EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
1084                                  path[depth].p_hdr->eh_entries,
1085                                  path[depth].p_hdr->eh_max);
1086                 err = -EFSCORRUPTED;
1087                 goto cleanup;
1088         }
1089         /* start copy from next extent */
1090         m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
1091         ext4_ext_show_move(inode, path, newblock, depth);
1092         if (m) {
1093                 struct ext4_extent *ex;
1094                 ex = EXT_FIRST_EXTENT(neh);
1095                 memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
1096                 le16_add_cpu(&neh->eh_entries, m);
1097         }
1098 
1099         /* zero out unused area in the extent block */
1100         ext_size = sizeof(struct ext4_extent_header) +
1101                 sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries);
1102         memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
1103         ext4_extent_block_csum_set(inode, neh);
1104         set_buffer_uptodate(bh);
1105         unlock_buffer(bh);
1106 
1107         err = ext4_handle_dirty_metadata(handle, inode, bh);
1108         if (err)
1109                 goto cleanup;
1110         brelse(bh);
1111         bh = NULL;
1112 
1113         /* correct old leaf */
1114         if (m) {
1115                 err = ext4_ext_get_access(handle, inode, path + depth);
1116                 if (err)
1117                         goto cleanup;
1118                 le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
1119                 err = ext4_ext_dirty(handle, inode, path + depth);
1120                 if (err)
1121                         goto cleanup;
1122 
1123         }
1124 
1125         /* create intermediate indexes */
1126         k = depth - at - 1;
1127         if (unlikely(k < 0)) {
1128                 EXT4_ERROR_INODE(inode, "k %d < 0!", k);
1129                 err = -EFSCORRUPTED;
1130                 goto cleanup;
1131         }
1132         if (k)
1133                 ext_debug("create %d intermediate indices\n", k);
1134         /* insert new index into current index block */
1135         /* current depth stored in i var */
1136         i = depth - 1;
1137         while (k--) {
1138                 oldblock = newblock;
1139                 newblock = ablocks[--a];
1140                 bh = sb_getblk(inode->i_sb, newblock);
1141                 if (unlikely(!bh)) {
1142                         err = -ENOMEM;
1143                         goto cleanup;
1144                 }
1145                 lock_buffer(bh);
1146 
1147                 err = ext4_journal_get_create_access(handle, bh);
1148                 if (err)
1149                         goto cleanup;
1150 
1151                 neh = ext_block_hdr(bh);
1152                 neh->eh_entries = cpu_to_le16(1);
1153                 neh->eh_magic = EXT4_EXT_MAGIC;
1154                 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1155                 neh->eh_depth = cpu_to_le16(depth - i);
1156                 fidx = EXT_FIRST_INDEX(neh);
1157                 fidx->ei_block = border;
1158                 ext4_idx_store_pblock(fidx, oldblock);
1159 
1160                 ext_debug("int.index at %d (block %llu): %u -> %llu\n",
1161                                 i, newblock, le32_to_cpu(border), oldblock);
1162 
1163                 /* move remainder of path[i] to the new index block */
1164                 if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
1165                                         EXT_LAST_INDEX(path[i].p_hdr))) {
1166                         EXT4_ERROR_INODE(inode,
1167                                          "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
1168                                          le32_to_cpu(path[i].p_ext->ee_block));
1169                         err = -EFSCORRUPTED;
1170                         goto cleanup;
1171                 }
1172                 /* start copy indexes */
1173                 m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
1174                 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
1175                                 EXT_MAX_INDEX(path[i].p_hdr));
1176                 ext4_ext_show_move(inode, path, newblock, i);
1177                 if (m) {
1178                         memmove(++fidx, path[i].p_idx,
1179                                 sizeof(struct ext4_extent_idx) * m);
1180                         le16_add_cpu(&neh->eh_entries, m);
1181                 }
1182                 /* zero out unused area in the extent block */
1183                 ext_size = sizeof(struct ext4_extent_header) +
1184                    (sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries));
1185                 memset(bh->b_data + ext_size, 0,
1186                         inode->i_sb->s_blocksize - ext_size);
1187                 ext4_extent_block_csum_set(inode, neh);
1188                 set_buffer_uptodate(bh);
1189                 unlock_buffer(bh);
1190 
1191                 err = ext4_handle_dirty_metadata(handle, inode, bh);
1192                 if (err)
1193                         goto cleanup;
1194                 brelse(bh);
1195                 bh = NULL;
1196 
1197                 /* correct old index */
1198                 if (m) {
1199                         err = ext4_ext_get_access(handle, inode, path + i);
1200                         if (err)
1201                                 goto cleanup;
1202                         le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
1203                         err = ext4_ext_dirty(handle, inode, path + i);
1204                         if (err)
1205                                 goto cleanup;
1206                 }
1207 
1208                 i--;
1209         }
1210 
1211         /* insert new index */
1212         err = ext4_ext_insert_index(handle, inode, path + at,
1213                                     le32_to_cpu(border), newblock);
1214 
1215 cleanup:
1216         if (bh) {
1217                 if (buffer_locked(bh))
1218                         unlock_buffer(bh);
1219                 brelse(bh);
1220         }
1221 
1222         if (err) {
1223                 /* free all allocated blocks in error case */
1224                 for (i = 0; i < depth; i++) {
1225                         if (!ablocks[i])
1226                                 continue;
1227                         ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
1228                                          EXT4_FREE_BLOCKS_METADATA);
1229                 }
1230         }
1231         kfree(ablocks);
1232 
1233         return err;
1234 }
1235 
1236 /*
1237  * ext4_ext_grow_indepth:
1238  * implements tree growing procedure:
1239  * - allocates new block
1240  * - moves top-level data (index block or leaf) into the new block
1241  * - initializes new top-level, creating index that points to the
1242  *   just created block
1243  */
1244 static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1245                                  unsigned int flags)
1246 {
1247         struct ext4_extent_header *neh;
1248         struct buffer_head *bh;
1249         ext4_fsblk_t newblock, goal = 0;
1250         struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
1251         int err = 0;
1252         size_t ext_size = 0;
1253 
1254         /* Try to prepend new index to old one */
1255         if (ext_depth(inode))
1256                 goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode)));
1257         if (goal > le32_to_cpu(es->s_first_data_block)) {
1258                 flags |= EXT4_MB_HINT_TRY_GOAL;
1259                 goal--;
1260         } else
1261                 goal = ext4_inode_to_goal_block(inode);
1262         newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
1263                                         NULL, &err);
1264         if (newblock == 0)
1265                 return err;
1266 
1267         bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
1268         if (unlikely(!bh))
1269                 return -ENOMEM;
1270         lock_buffer(bh);
1271 
1272         err = ext4_journal_get_create_access(handle, bh);
1273         if (err) {
1274                 unlock_buffer(bh);
1275                 goto out;
1276         }
1277 
1278         ext_size = sizeof(EXT4_I(inode)->i_data);
1279         /* move top-level index/leaf into new block */
1280         memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size);
1281         /* zero out unused area in the extent block */
1282         memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
1283 
1284         /* set size of new block */
1285         neh = ext_block_hdr(bh);
1286         /* old root could have indexes or leaves,
1287          * so calculate eh_max the right way */
1288         if (ext_depth(inode))
1289                 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1290         else
1291                 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1292         neh->eh_magic = EXT4_EXT_MAGIC;
1293         ext4_extent_block_csum_set(inode, neh);
1294         set_buffer_uptodate(bh);
1295         unlock_buffer(bh);
1296 
1297         err = ext4_handle_dirty_metadata(handle, inode, bh);
1298         if (err)
1299                 goto out;
1300 
1301         /* Update top-level index: num,max,pointer */
1302         neh = ext_inode_hdr(inode);
1303         neh->eh_entries = cpu_to_le16(1);
1304         ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
1305         if (neh->eh_depth == 0) {
1306                 /* Root extent block becomes index block */
1307                 neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1308                 EXT_FIRST_INDEX(neh)->ei_block =
1309                         EXT_FIRST_EXTENT(neh)->ee_block;
1310         }
1311         ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1312                   le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1313                   le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1314                   ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1315 
1316         le16_add_cpu(&neh->eh_depth, 1);
1317         err = ext4_mark_inode_dirty(handle, inode);
1318 out:
1319         brelse(bh);
1320 
1321         return err;
1322 }
1323 
1324 /*
1325  * ext4_ext_create_new_leaf:
1326  * finds an empty index and adds a new leaf.
1327  * if no free index is found, then it requests in-depth growing.
1328  */
1329 static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1330                                     unsigned int mb_flags,
1331                                     unsigned int gb_flags,
1332                                     struct ext4_ext_path **ppath,
1333                                     struct ext4_extent *newext)
1334 {
1335         struct ext4_ext_path *path = *ppath;
1336         struct ext4_ext_path *curp;
1337         int depth, i, err = 0;
1338 
1339 repeat:
1340         i = depth = ext_depth(inode);
1341 
1342         /* walk up the tree and look for a free index entry */
1343         curp = path + depth;
1344         while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1345                 i--;
1346                 curp--;
1347         }
1348 
1349         /* we use an already allocated block for the index block,
1350          * so subsequent data blocks should be contiguous */
1351         if (EXT_HAS_FREE_INDEX(curp)) {
1352                 /* if we found index with free entry, then use that
1353                  * entry: create all needed subtree and add new leaf */
1354                 err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
1355                 if (err)
1356                         goto out;
1357 
1358                 /* refill path */
1359                 path = ext4_find_extent(inode,
1360                                     (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1361                                     ppath, gb_flags);
1362                 if (IS_ERR(path))
1363                         err = PTR_ERR(path);
1364         } else {
1365                 /* tree is full, time to grow in depth */
1366                 err = ext4_ext_grow_indepth(handle, inode, mb_flags);
1367                 if (err)
1368                         goto out;
1369 
1370                 /* refill path */
1371                 path = ext4_find_extent(inode,
1372                                    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1373                                     ppath, gb_flags);
1374                 if (IS_ERR(path)) {
1375                         err = PTR_ERR(path);
1376                         goto out;
1377                 }
1378 
1379                 /*
1380                  * only first (depth 0 -> 1) produces free space;
1381                  * in all other cases we have to split the grown tree
1382                  */
1383                 depth = ext_depth(inode);
1384                 if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1385                         /* now we need to split */
1386                         goto repeat;
1387                 }
1388         }
1389 
1390 out:
1391         return err;
1392 }
1393 
1394 /*
1395  * search for the closest allocated block to the left of *logical
1396  * and return it at @logical + its physical address at @phys;
1397  * if *logical is the smallest allocated block, the function
1398  * returns 0 at @phys.
1399  * The return value contains 0 (success) or an error code.
1400  */
1401 static int ext4_ext_search_left(struct inode *inode,
1402                                 struct ext4_ext_path *path,
1403                                 ext4_lblk_t *logical, ext4_fsblk_t *phys)
1404 {
1405         struct ext4_extent_idx *ix;
1406         struct ext4_extent *ex;
1407         int depth, ee_len;
1408 
1409         if (unlikely(path == NULL)) {
1410                 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1411                 return -EFSCORRUPTED;
1412         }
1413         depth = path->p_depth;
1414         *phys = 0;
1415 
1416         if (depth == 0 && path->p_ext == NULL)
1417                 return 0;
1418 
1419         /* usually the extent in the path covers blocks smaller
1420          * than *logical, but it can be that the extent is the
1421          * first one in the file */
1422 
1423         ex = path[depth].p_ext;
1424         ee_len = ext4_ext_get_actual_len(ex);
1425         if (*logical < le32_to_cpu(ex->ee_block)) {
1426                 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1427                         EXT4_ERROR_INODE(inode,
1428                                          "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1429                                          *logical, le32_to_cpu(ex->ee_block));
1430                         return -EFSCORRUPTED;
1431                 }
1432                 while (--depth >= 0) {
1433                         ix = path[depth].p_idx;
1434                         if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1435                                 EXT4_ERROR_INODE(inode,
1436                                   "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1437                                   ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
1438                                   EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1439                 le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
1440                                   depth);
1441                                 return -EFSCORRUPTED;
1442                         }
1443                 }
1444                 return 0;
1445         }
1446 
1447         if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1448                 EXT4_ERROR_INODE(inode,
1449                                  "logical %d < ee_block %d + ee_len %d!",
1450                                  *logical, le32_to_cpu(ex->ee_block), ee_len);
1451                 return -EFSCORRUPTED;
1452         }
1453 
1454         *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1455         *phys = ext4_ext_pblock(ex) + ee_len - 1;
1456         return 0;
1457 }
1458 
1459 /*
1460  * search for the closest allocated block to the right of *logical
1461  * and return it at @logical, plus its physical address at @phys;
1462  * if *logical is the largest allocated block, the function
1463  * returns 0 at @phys
1464  * return value contains 0 (success) or an error code
1465  */
1466 static int ext4_ext_search_right(struct inode *inode,
1467                                  struct ext4_ext_path *path,
1468                                  ext4_lblk_t *logical, ext4_fsblk_t *phys,
1469                                  struct ext4_extent **ret_ex)
1470 {
1471         struct buffer_head *bh = NULL;
1472         struct ext4_extent_header *eh;
1473         struct ext4_extent_idx *ix;
1474         struct ext4_extent *ex;
1475         ext4_fsblk_t block;
1476         int depth;      /* Note, NOT eh_depth; depth from top of tree */
1477         int ee_len;
1478 
1479         if (unlikely(path == NULL)) {
1480                 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1481                 return -EFSCORRUPTED;
1482         }
1483         depth = path->p_depth;
1484         *phys = 0;
1485 
1486         if (depth == 0 && path->p_ext == NULL)
1487                 return 0;
1488 
1489         /* usually the extent in the path covers blocks smaller
1490          * than *logical, but it can be that the extent is the
1491          * first one in the file */
1492 
1493         ex = path[depth].p_ext;
1494         ee_len = ext4_ext_get_actual_len(ex);
1495         if (*logical < le32_to_cpu(ex->ee_block)) {
1496                 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1497                         EXT4_ERROR_INODE(inode,
1498                                          "first_extent(path[%d].p_hdr) != ex",
1499                                          depth);
1500                         return -EFSCORRUPTED;
1501                 }
1502                 while (--depth >= 0) {
1503                         ix = path[depth].p_idx;
1504                         if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1505                                 EXT4_ERROR_INODE(inode,
1506                                                  "ix != EXT_FIRST_INDEX *logical %d!",
1507                                                  *logical);
1508                                 return -EFSCORRUPTED;
1509                         }
1510                 }
1511                 goto found_extent;
1512         }
1513 
1514         if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1515                 EXT4_ERROR_INODE(inode,
1516                                  "logical %d < ee_block %d + ee_len %d!",
1517                                  *logical, le32_to_cpu(ex->ee_block), ee_len);
1518                 return -EFSCORRUPTED;
1519         }
1520 
1521         if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1522                 /* next allocated block in this leaf */
1523                 ex++;
1524                 goto found_extent;
1525         }
1526 
1527         /* go up and search for index to the right */
1528         while (--depth >= 0) {
1529                 ix = path[depth].p_idx;
1530                 if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1531                         goto got_index;
1532         }
1533 
1534         /* we've gone up to the root and found no index to the right */
1535         return 0;
1536 
1537 got_index:
1538         /* we've found an index to the right, let's
1539          * follow it and find the closest allocated
1540          * block to the right */
1541         ix++;
1542         block = ext4_idx_pblock(ix);
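             /*
              * descend through the interior index blocks, always following
              * the first (leftmost) entry, until the next block to read is
              * a leaf
              */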
1543         while (++depth < path->p_depth) {
1544                 /* subtract from p_depth to get proper eh_depth */
1545                 bh = read_extent_tree_block(inode, block,
1546                                             path->p_depth - depth, 0);
1547                 if (IS_ERR(bh))
1548                         return PTR_ERR(bh);
1549                 eh = ext_block_hdr(bh);
1550                 ix = EXT_FIRST_INDEX(eh);
1551                 block = ext4_idx_pblock(ix);
1552                 put_bh(bh);
1553         }
1554 
1555         bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
1556         if (IS_ERR(bh))
1557                 return PTR_ERR(bh);
1558         eh = ext_block_hdr(bh);
1559         ex = EXT_FIRST_EXTENT(eh);
1560 found_extent:
1561         *logical = le32_to_cpu(ex->ee_block);
1562         *phys = ext4_ext_pblock(ex);
1563         *ret_ex = ex;
1564         if (bh)
1565                 put_bh(bh);
1566         return 0;
1567 }
1568 
1569 /*
1570  * ext4_ext_next_allocated_block:
1571  * returns the allocated block in the subsequent extent or EXT_MAX_BLOCKS.
1572  * NOTE: it considers the block number from an index entry as an
1573  * allocated block. Thus, index entries have to be consistent
1574  * with the leaves.
1575  */
1576 ext4_lblk_t
1577 ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1578 {
1579         int depth;
1580 
1581         BUG_ON(path == NULL);
1582         depth = path->p_depth;
1583 
1584         if (depth == 0 && path->p_ext == NULL)
1585                 return EXT_MAX_BLOCKS;
1586 
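             /*
              * walk from the leaf towards the root; the first level whose
              * current entry is not the last one in its block has a right
              * neighbour, and that neighbour's starting block is the answer
              */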
1587         while (depth >= 0) {
1588                 struct ext4_ext_path *p = &path[depth];
1589 
1590                 if (depth == path->p_depth) {
1591                         /* leaf */
1592                         if (p->p_ext && p->p_ext != EXT_LAST_EXTENT(p->p_hdr))
1593                                 return le32_to_cpu(p->p_ext[1].ee_block);
1594                 } else {
1595                         /* index */
1596                         if (p->p_idx != EXT_LAST_INDEX(p->p_hdr))
1597                                 return le32_to_cpu(p->p_idx[1].ei_block);
1598                 }
1599                 depth--;
1600         }
1601 
1602         return EXT_MAX_BLOCKS;
1603 }
1604 
1605 /*
1606  * ext4_ext_next_leaf_block:
1607  * returns the first allocated block of the next leaf, or EXT_MAX_BLOCKS
1608  */
1609 static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
1610 {
1611         int depth;
1612 
1613         BUG_ON(path == NULL);
1614         depth = path->p_depth;
1615 
1616         /* a zero-depth tree has no leaf blocks at all */
1617         if (depth == 0)
1618                 return EXT_MAX_BLOCKS;
1619 
1620         /* go to index block */
1621         depth--;
1622 
1623         while (depth >= 0) {
1624                 if (path[depth].p_idx !=
1625                                 EXT_LAST_INDEX(path[depth].p_hdr))
1626                         return (ext4_lblk_t)
1627                                 le32_to_cpu(path[depth].p_idx[1].ei_block);
1628                 depth--;
1629         }
1630 
1631         return EXT_MAX_BLOCKS;
1632 }
1633 
1634 /*
1635  * ext4_ext_correct_indexes:
1636  * if the leaf gets modified and the modified extent is first in the leaf,
1637  * then we have to correct all indexes above it.
1638  * TODO: do we need to correct the tree in all cases?
1639  */
1640 static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1641                                 struct ext4_ext_path *path)
1642 {
1643         struct ext4_extent_header *eh;
1644         int depth = ext_depth(inode);
1645         struct ext4_extent *ex;
1646         __le32 border;
1647         int k, err = 0;
1648 
1649         eh = path[depth].p_hdr;
1650         ex = path[depth].p_ext;
1651 
1652         if (unlikely(ex == NULL || eh == NULL)) {
1653                 EXT4_ERROR_INODE(inode,
1654                                  "ex %p == NULL or eh %p == NULL", ex, eh);
1655                 return -EFSCORRUPTED;
1656         }
1657 
1658         if (depth == 0) {
1659                 /* there is no tree at all */
1660                 return 0;
1661         }
1662 
1663         if (ex != EXT_FIRST_EXTENT(eh)) {
1664                 /* we only correct the tree if the first extent in the leaf got modified */
1665                 return 0;
1666         }
1667 
1668         /*
1669          * TODO: we need correction if the border is smaller than the current one
1670          */
1671         k = depth - 1;
1672         border = path[depth].p_ext->ee_block;
1673         err = ext4_ext_get_access(handle, inode, path + k);
1674         if (err)
1675                 return err;
1676         path[k].p_idx->ei_block = border;
1677         err = ext4_ext_dirty(handle, inode, path + k);
1678         if (err)
1679                 return err;
1680 
1681         while (k--) {
1682                 /* change all left-side indexes */
1683                 if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1684                         break;
1685                 err = ext4_ext_get_access(handle, inode, path + k);
1686                 if (err)
1687                         break;
1688                 path[k].p_idx->ei_block = border;
1689                 err = ext4_ext_dirty(handle, inode, path + k);
1690                 if (err)
1691                         break;
1692         }
1693 
1694         return err;
1695 }
1696 
1697 static int ext4_can_extents_be_merged(struct inode *inode,
1698                                       struct ext4_extent *ex1,
1699                                       struct ext4_extent *ex2)
1700 {
1701         unsigned short ext1_ee_len, ext2_ee_len;
1702 
1703         if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
1704                 return 0;
1705 
1706         ext1_ee_len = ext4_ext_get_actual_len(ex1);
1707         ext2_ee_len = ext4_ext_get_actual_len(ex2);
1708 
1709         if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1710                         le32_to_cpu(ex2->ee_block))
1711                 return 0;
1712 
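             /*
              * the combined length must still fit in the on-disk ee_len
              * encoding: at most EXT_INIT_MAX_LEN blocks for an initialized
              * extent and EXT_UNWRITTEN_MAX_LEN for an unwritten one
              */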
1713         if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
1714                 return 0;
1715 
1716         if (ext4_ext_is_unwritten(ex1) &&
1717             ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)
1718                 return 0;
1719 #ifdef AGGRESSIVE_TEST
1720         if (ext1_ee_len >= 4)
1721                 return 0;
1722 #endif
1723 
1724         if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1725                 return 1;
1726         return 0;
1727 }
1728 
1729 /*
1730  * This function tries to merge the "ex" extent with the next extent in the tree.
1731  * It always tries to merge towards the right. If you want to merge towards
1732  * the left, pass "ex - 1" as the argument instead of "ex".
1733  * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1734  * 1 if they got merged.
1735  */
1736 static int ext4_ext_try_to_merge_right(struct inode *inode,
1737                                  struct ext4_ext_path *path,
1738                                  struct ext4_extent *ex)
1739 {
1740         struct ext4_extent_header *eh;
1741         unsigned int depth, len;
1742         int merge_done = 0, unwritten;
1743 
1744         depth = ext_depth(inode);
1745         BUG_ON(path[depth].p_hdr == NULL);
1746         eh = path[depth].p_hdr;
1747 
1748         while (ex < EXT_LAST_EXTENT(eh)) {
1749                 if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1750                         break;
1751                 /* merge with next extent! */
1752                 unwritten = ext4_ext_is_unwritten(ex);
1753                 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1754                                 + ext4_ext_get_actual_len(ex + 1));
1755                 if (unwritten)
1756                         ext4_ext_mark_unwritten(ex);
1757 
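                     /*
                      * the extent at ex+1 has been absorbed into ex; close the
                      * gap by shifting the remaining entries of the leaf left
                      */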
1758                 if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1759                         len = (EXT_LAST_EXTENT(eh) - ex - 1)
1760                                 * sizeof(struct ext4_extent);
1761                         memmove(ex + 1, ex + 2, len);
1762                 }
1763                 le16_add_cpu(&eh->eh_entries, -1);
1764                 merge_done = 1;
1765                 WARN_ON(eh->eh_entries == 0);
1766                 if (!eh->eh_entries)
1767                         EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1768         }
1769 
1770         return merge_done;
1771 }
1772 
1773 /*
1774  * This function does a very simple check to see if we can collapse
1775  * an extent tree with a single extent tree leaf block into the inode.
1776  */
1777 static void ext4_ext_try_to_merge_up(handle_t *handle,
1778                                      struct inode *inode,
1779                                      struct ext4_ext_path *path)
1780 {
1781         size_t s;
1782         unsigned max_root = ext4_ext_space_root(inode, 0);
1783         ext4_fsblk_t blk;
1784 
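             /*
              * the tree can be collapsed only if it has depth 1, the root
              * holds exactly one index entry, and all of the single leaf's
              * entries fit into the inode's root header
              */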
1785         if ((path[0].p_depth != 1) ||
1786             (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
1787             (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
1788                 return;
1789 
1790         /*
1791          * We need to modify the block allocation bitmap and the block
1792          * group descriptor to release the extent tree block.  If we
1793          * can't get the journal credits, give up.
1794          */
1795         if (ext4_journal_extend(handle, 2,
1796                         ext4_free_metadata_revoke_credits(inode->i_sb, 1)))
1797                 return;
1798 
1799         /*
1800          * Copy the extent data up to the inode
1801          */
1802         blk = ext4_idx_pblock(path[0].p_idx);
1803         s = le16_to_cpu(path[1].p_hdr->eh_entries) *
1804                 sizeof(struct ext4_extent_idx);
1805         s += sizeof(struct ext4_extent_header);
1806 
1807         path[1].p_maxdepth = path[0].p_maxdepth;
1808         memcpy(path[0].p_hdr, path[1].p_hdr, s);
1809         path[0].p_depth = 0;
1810         path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
1811                 (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
1812         path[0].p_hdr->eh_max = cpu_to_le16(max_root);
1813 
1814         brelse(path[1].p_bh);
1815         ext4_free_blocks(handle, inode, NULL, blk, 1,
1816                          EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
1817 }
1818 
1819 /*
1820  * This function tries to merge the @ex extent to neighbours in the tree, then
1821  * tries to collapse the extent tree into the inode.
1822  */
1823 static void ext4_ext_try_to_merge(handle_t *handle,
1824                                   struct inode *inode,
1825                                   struct ext4_ext_path *path,
1826                                   struct ext4_extent *ex)
1827 {
1828         struct ext4_extent_header *eh;
1829         unsigned int depth;
1830         int merge_done = 0;
1831 
1832         depth = ext_depth(inode);
1833         BUG_ON(path[depth].p_hdr == NULL);
1834         eh = path[depth].p_hdr;
1835 
1836         if (ex > EXT_FIRST_EXTENT(eh))
1837                 merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
1838 
1839         if (!merge_done)
1840                 (void) ext4_ext_try_to_merge_right(inode, path, ex);
1841 
1842         ext4_ext_try_to_merge_up(handle, inode, path);
1843 }
1844 
1845 /*
1846  * check if a portion of the "newext" extent overlaps with an
1847  * existing extent.
1848  *
1849  * If an overlap is discovered, it updates the length of newext
1850  * such that there will be no overlap, and then returns 1.
1851  * If no overlap is found, it returns 0.
1852  */
1853 static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
1854                                            struct inode *inode,
1855                                            struct ext4_extent *newext,
1856                                            struct ext4_ext_path *path)
1857 {
1858         ext4_lblk_t b1, b2;
1859         unsigned int depth, len1;
1860         unsigned int ret = 0;
1861 
1862         b1 = le32_to_cpu(newext->ee_block);
1863         len1 = ext4_ext_get_actual_len(newext);
1864         depth = ext_depth(inode);
1865         if (!path[depth].p_ext)
1866                 goto out;
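             /*
              * start of the extent found in the path, rounded down to its
              * cluster boundary (only matters on bigalloc file systems)
              */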
1867         b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));
1868 
1869         /*
1870          * get the next allocated block if the extent in the path
1871          * is before the requested block(s)
1872          */
1873         if (b2 < b1) {
1874                 b2 = ext4_ext_next_allocated_block(path);
1875                 if (b2 == EXT_MAX_BLOCKS)
1876                         goto out;
1877                 b2 = EXT4_LBLK_CMASK(sbi, b2);
1878         }
1879 
1880         /* check for wrap through zero on the extent's logical start block */
1881         if (b1 + len1 < b1) {
1882                 len1 = EXT_MAX_BLOCKS - b1;
1883                 newext->ee_len = cpu_to_le16(len1);
1884                 ret = 1;
1885         }
1886 
1887         /* check for overlap */
1888         if (b1 + len1 > b2) {
1889                 newext->ee_len = cpu_to_le16(b2 - b1);
1890                 ret = 1;
1891         }
1892 out:
1893         return ret;
1894 }
1895 
1896 /*
1897  * ext4_ext_insert_extent:
1898  * tries to merge the requested extent into an existing extent or
1899  * inserts the requested extent as a new one into the tree,
1900  * creating a new leaf in the no-space case.
1901  */
1902 int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1903                                 struct ext4_ext_path **ppath,
1904                                 struct ext4_extent *newext, int gb_flags)
1905 {
1906         struct ext4_ext_path *path = *ppath;
1907         struct ext4_extent_header *eh;
1908         struct ext4_extent *ex, *fex;
1909         struct ext4_extent *nearex; /* nearest extent */
1910         struct ext4_ext_path *npath = NULL;
1911         int depth, len, err;
1912         ext4_lblk_t next;
1913         int mb_flags = 0, unwritten;
1914 
1915         if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1916                 mb_flags |= EXT4_MB_DELALLOC_RESERVED;
1917         if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1918                 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1919                 return -EFSCORRUPTED;
1920         }
1921         depth = ext_depth(inode);
1922         ex = path[depth].p_ext;
1923         eh = path[depth].p_hdr;
1924         if (unlikely(path[depth].p_hdr == NULL)) {
1925                 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1926                 return -EFSCORRUPTED;
1927         }
1928 
1929         /* try to insert block into found extent and return */
1930         if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {
1931 
1932                 /*
1933                  * Try to see whether we should rather test the extent to
1934                  * the right of ex, or to the left of ex. This is because
1935                  * ext4_find_extent() can return either the extent on the
1936                  * left or on the right of the searched position. This
1937                  * will make merging more effective.
1938                  */
1939                 if (ex < EXT_LAST_EXTENT(eh) &&
1940                     (le32_to_cpu(ex->ee_block) +
1941                     ext4_ext_get_actual_len(ex) <
1942                     le32_to_cpu(newext->ee_block))) {
1943                         ex += 1;
1944                         goto prepend;
1945                 } else if ((ex > EXT_FIRST_EXTENT(eh)) &&
1946                            (le32_to_cpu(newext->ee_block) +
1947                            ext4_ext_get_actual_len(newext) <
1948                            le32_to_cpu(ex->ee_block)))
1949                         ex -= 1;
1950 
1951                 /* Try to append newex to the ex */
1952                 if (ext4_can_extents_be_merged(inode, ex, newext)) {
1953                         ext_debug("append [%d]%d block to %u:[%d]%d"
1954                                   "(from %llu)\n",
1955                                   ext4_ext_is_unwritten(newext),
1956                                   ext4_ext_get_actual_len(newext),
1957                                   le32_to_cpu(ex->ee_block),
1958                                   ext4_ext_is_unwritten(ex),
1959                                   ext4_ext_get_actual_len(ex),
1960                                   ext4_ext_pblock(ex));
1961                         err = ext4_ext_get_access(handle, inode,
1962                                                   path + depth);
1963                         if (err)
1964                                 return err;
1965                         unwritten = ext4_ext_is_unwritten(ex);
1966                         ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1967                                         + ext4_ext_get_actual_len(newext));
1968                         if (unwritten)
1969                                 ext4_ext_mark_unwritten(ex);
1970                         eh = path[depth].p_hdr;
1971                         nearex = ex;
1972                         goto merge;
1973                 }
1974 
1975 prepend:
1976                 /* Try to prepend newex to the ex */
1977                 if (ext4_can_extents_be_merged(inode, newext, ex)) {
1978                         ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
1979                                   "(from %llu)\n",
1980                                   le32_to_cpu(newext->ee_block),
1981                                   ext4_ext_is_unwritten(newext),
1982                                   ext4_ext_get_actual_len(newext),
1983                                   le32_to_cpu(ex->ee_block),
1984                                   ext4_ext_is_unwritten(ex),
1985                                   ext4_ext_get_actual_len(ex),
1986                                   ext4_ext_pblock(ex));
1987                         err = ext4_ext_get_access(handle, inode,
1988                                                   path + depth);
1989                         if (err)
1990                                 return err;
1991 
1992                         unwritten = ext4_ext_is_unwritten(ex);
1993                         ex->ee_block = newext->ee_block;
1994                         ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
1995                         ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1996                                         + ext4_ext_get_actual_len(newext));
1997                         if (unwritten)
1998                                 ext4_ext_mark_unwritten(ex);
1999                         eh = path[depth].p_hdr;
2000                         nearex = ex;
2001                         goto merge;
2002                 }
2003         }
2004 
2005         depth = ext_depth(inode);
2006         eh = path[depth].p_hdr;
2007         if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
2008                 goto has_space;
2009 
2010         /* probably next leaf has space for us? */
2011         fex = EXT_LAST_EXTENT(eh);
2012         next = EXT_MAX_BLOCKS;
2013         if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
2014                 next = ext4_ext_next_leaf_block(path);
2015         if (next != EXT_MAX_BLOCKS) {
2016                 ext_debug("next leaf block - %u\n", next);
2017                 BUG_ON(npath != NULL);
2018                 npath = ext4_find_extent(inode, next, NULL, 0);
2019                 if (IS_ERR(npath))
2020                         return PTR_ERR(npath);
2021                 BUG_ON(npath->p_depth != path->p_depth);
2022                 eh = npath[depth].p_hdr;
2023                 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
2024                         ext_debug("next leaf isn't full(%d)\n",
2025                                   le16_to_cpu(eh->eh_entries));
2026                         path = npath;
2027                         goto has_space;
2028                 }
2029                 ext_debug("next leaf has no free space(%d,%d)\n",
2030                           le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
2031         }
2032 
2033         /*
2034          * There is no free space in the found leaf.
2035          * We're going to add a new leaf to the tree.
2036          */
2037         if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
2038                 mb_flags |= EXT4_MB_USE_RESERVED;
2039         err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
2040                                        ppath, newext);
2041         if (err)
2042                 goto cleanup;
2043         depth = ext_depth(inode);
2044         eh = path[depth].p_hdr;
2045 
2046 has_space:
2047         nearex = path[depth].p_ext;
2048 
2049         err = ext4_ext_get_access(handle, inode, path + depth);
2050         if (err)
2051                 goto cleanup;
2052 
2053         if (!nearex) {
2054                 /* there is no extent in this leaf, create the first one */
2055                 ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
2056                                 le32_to_cpu(newext->ee_block),
2057                                 ext4_ext_pblock(newext),
2058                                 ext4_ext_is_unwritten(newext),
2059                                 ext4_ext_get_actual_len(newext));
2060                 nearex = EXT_FIRST_EXTENT(eh);
2061         } else {
2062                 if (le32_to_cpu(newext->ee_block)
2063                            > le32_to_cpu(nearex->ee_block)) {
2064                         /* Insert after */
2065                         ext_debug("insert %u:%llu:[%d]%d before: "
2066                                         "nearest %p\n",
2067                                         le32_to_cpu(newext->ee_block),
2068                                         ext4_ext_pblock(newext),
2069                                         ext4_ext_is_unwritten(newext),
2070                                         ext4_ext_get_actual_len(newext),
2071                                         nearex);
2072                         nearex++;
2073                 } else {
2074                         /* Insert before */
2075                         BUG_ON(newext->ee_block == nearex->ee_block);
2076                         ext_debug("insert %u:%llu:[%d]%d after: "
2077                                         "nearest %p\n",
2078                                         le32_to_cpu(newext->ee_block),
2079                                         ext4_ext_pblock(newext),
2080                                         ext4_ext_is_unwritten(newext),
2081                                         ext4_ext_get_actual_len(newext),
2082                                         nearex);
2083                 }
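                     /*
                      * make room for the new extent by shifting every entry
                      * from nearex to the end of the leaf one slot to the right
                      */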
2084                 len = EXT_LAST_EXTENT(eh) - nearex + 1;
2085                 if (len > 0) {
2086                         ext_debug("insert %u:%llu:[%d]%d: "
2087                                         "move %d extents from 0x%p to 0x%p\n",
2088                                         le32_to_cpu(newext->ee_block),
2089                                         ext4_ext_pblock(newext),
2090                                         ext4_ext_is_unwritten(newext),
2091                                         ext4_ext_get_actual_len(newext),
2092                                         len, nearex, nearex + 1);
2093                         memmove(nearex + 1, nearex,
2094                                 len * sizeof(struct ext4_extent));
2095                 }
2096         }
2097 
2098         le16_add_cpu(&eh->eh_entries, 1);
2099         path[depth].p_ext = nearex;
2100         nearex->ee_block = newext->ee_block;
2101         ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
2102         nearex->ee_len = newext->ee_len;
2103 
2104 merge:
2105         /* try to merge extents */
2106         if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
2107                 ext4_ext_try_to_merge(handle, inode, path, nearex);
2108 
2109 
2110         /* time to correct all indexes above */
2111         err = ext4_ext_correct_indexes(handle, inode, path);
2112         if (err)
2113                 goto cleanup;
2114 
2115         err = ext4_ext_dirty(handle, inode, path + path->p_depth);
2116 
2117 cleanup:
2118         ext4_ext_drop_refs(npath);
2119         kfree(npath);
2120         return err;
2121 }
2122 
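     /*
      * ext4_fill_es_cache_info:
      * walk the extent status cache for the range [block, block + num) and
      * report each cached extent through fiemap_fill_next_extent(), mapping
      * extent-status flags to FIEMAP flags; stop at the first block that has
      * no entry in the cache or when the fiemap buffer is full.
      */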
2123 static int ext4_fill_es_cache_info(struct inode *inode,
2124                                    ext4_lblk_t block, ext4_lblk_t num,
2125                                    struct fiemap_extent_info *fieinfo)
2126 {
2127         ext4_lblk_t next, end = block + num - 1;
2128         struct extent_status es;
2129         unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
2130         unsigned int flags;
2131         int err;
2132 
2133         while (block <= end) {
2134                 next = 0;
2135                 flags = 0;
2136                 if (!ext4_es_lookup_extent(inode, block, &next, &es))
2137                         break;
2138                 if (ext4_es_is_unwritten(&es))
2139                         flags |= FIEMAP_EXTENT_UNWRITTEN;
2140                 if (ext4_es_is_delayed(&es))
2141                         flags |= (FIEMAP_EXTENT_DELALLOC |
2142                                   FIEMAP_EXTENT_UNKNOWN);
2143                 if (ext4_es_is_hole(&es))
2144                         flags |= EXT4_FIEMAP_EXTENT_HOLE;
2145                 if (next == 0)
2146                         flags |= FIEMAP_EXTENT_LAST;
2147                 if (flags & (FIEMAP_EXTENT_DELALLOC|
2148                              EXT4_FIEMAP_EXTENT_HOLE))
2149                         es.es_pblk = 0;
2150                 else
2151                         es.es_pblk = ext4_es_pblock(&es);
2152                 err = fiemap_fill_next_extent(fieinfo,
2153                                 (__u64)es.es_lblk << blksize_bits,
2154                                 (__u64)es.es_pblk << blksize_bits,
2155                                 (__u64)es.es_len << blksize_bits,
2156                                 flags);
2157                 if (next == 0)
2158                         break;
2159                 block = next;
2160                 if (err < 0)
2161                         return err;
2162                 if (err == 1)
2163                         return 0;
2164         }
2165         return 0;
2166 }
2167 
2168 
2169 /*
2170  * ext4_ext_determine_hole - determine hole around given block
2171  * @inode:      inode we lookup in
2172  * @path:       path in extent tree to @lblk
2173  * @lblk:       pointer to logical block around which we want to determine hole
2174  *
2175  * Determine hole length (and start, if easily possible) around a given logical
2176  * block. We don't try too hard to find the beginning of the hole, but if @path
2177  * actually points to the extent before @lblk, we provide it.
2178  *
2179  * The function returns the length of a hole starting at @lblk. We update @lblk
2180  * to the beginning of the hole if we managed to find it.
2181  */
2182 static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
2183                                            struct ext4_ext_path *path,
2184                                            ext4_lblk_t *lblk)
2185 {
2186         int depth = ext_depth(inode);
2187         struct ext4_extent *ex;
2188         ext4_lblk_t len;
2189 
2190         ex = path[depth].p_ext;
2191         if (ex == NULL) {
2192                 /* there is no extent yet, so gap is [0;-] */
2193                 *lblk = 0;
2194                 len = EXT_MAX_BLOCKS;
2195         } else if (*lblk < le32_to_cpu(ex->ee_block)) {
2196                 len = le32_to_cpu(ex->ee_block) - *lblk;
2197         } else if (*lblk >= le32_to_cpu(ex->ee_block)
2198                         + ext4_ext_get_actual_len(ex)) {
2199                 ext4_lblk_t next;
2200 
2201                 *lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
2202                 next = ext4_ext_next_allocated_block(path);
2203                 BUG_ON(next == *lblk);
2204                 len = next - *lblk;
2205         } else {
2206                 BUG();
2207         }
2208         return len;
2209 }
2210 
2211 /*
2212  * ext4_ext_put_gap_in_cache:
2213  * calculate boundaries of the gap that the requested block fits into
2214  * and cache this gap
2215  */
2216 static void
2217 ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
2218                           ext4_lblk_t hole_len)
2219 {
2220         struct extent_status es;
2221 
2222         ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start,
2223                                   hole_start + hole_len - 1, &es);
2224         if (es.es_len) {
2225                 /* Is there a delayed extent containing lblock? */
2226                 if (es.es_lblk <= hole_start)
2227                         return;
2228                 hole_len = min(es.es_lblk - hole_start, hole_len);
2229         }
2230         ext_debug(" -> %u:%u\n", hole_start, hole_len);
2231         ext4_es_insert_extent(inode, hole_start, hole_len, ~0,
2232                               EXTENT_STATUS_HOLE);
2233 }
2234 
2235 /*
2236  * ext4_ext_rm_idx:
2237  * removes the index entry from the index block.
2238  */
2239 static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2240                         struct ext4_ext_path *path, int depth)
2241 {
2242         int err;
2243         ext4_fsblk_t leaf;
2244 
2245         /* free index block */
2246         depth--;
2247         path = path + depth;
2248         leaf = ext4_idx_pblock(path->p_idx);
2249         if (unlikely(path->p_hdr->eh_entries == 0)) {
2250                 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2251                 return -EFSCORRUPTED;
2252         }
2253         err = ext4_ext_get_access(handle, inode, path);
2254         if (err)
2255                 return err;
2256 
2257         if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
2258                 int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
2259                 len *= sizeof(struct ext4_extent_idx);
2260                 memmove(path->p_idx, path->p_idx + 1, len);
2261         }
2262 
2263         le16_add_cpu(&path->p_hdr->eh_entries, -1);
2264         err = ext4_ext_dirty(handle, inode, path);
2265         if (err)
2266                 return err;
2267         ext_debug("index is empty, remove it, free block %llu\n", leaf);
2268         trace_ext4_ext_rm_idx(inode, leaf);
2269 
2270         ext4_free_blocks(handle, inode, NULL, leaf, 1,
2271                          EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2272 
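             /*
              * if the entry we removed was the first one in its index block,
              * the block's starting logical block changed; propagate the new
              * value up through the parent index entries until we hit a level
              * where the entry is not the first one in its block
              */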
2273         while (--depth >= 0) {
2274                 if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
2275                         break;
2276                 path--;
2277                 err = ext4_ext_get_access(handle, inode, path);
2278                 if (err)
2279                         break;
2280                 path->p_idx->ei_block = (path+1)->p_idx->ei_block;
2281                 err = ext4_ext_dirty(handle, inode, path);
2282                 if (err)
2283                         break;
2284         }
2285         return err;
2286 }
2287 
2288 /*
2289  * ext4_ext_calc_credits_for_single_extent:
2290  * This routine returns the max. credits needed to insert an extent
2291  * into the extent tree.
2292  * When passing the actual path, the caller should calculate the credits
2293  * under i_data_sem.
2294  */
2295 int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2296                                                 struct ext4_ext_path *path)
2297 {
2298         if (path) {
2299                 int depth = ext_depth(inode);
2300                 int ret = 0;
2301 
2302                 /* perhaps there is space in the leaf? */
2303                 if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2304                                 < le16_to_cpu(path[depth].p_hdr->eh_max)) {
2305 
2306                         /*
2307                          *  There is some space in the leaf, so there is no
2308                          *  need to account for the leaf block credit.
2309                          *
2310                          *  Bitmaps, block group descriptor blocks
2311                          *  and other metadata blocks still need to be
2312                          *  accounted for.
2313                          */
2314                         /* 1 bitmap, 1 block group descriptor */
2315                         ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2316                         return ret;
2317                 }
2318         }
2319 
2320         return ext4_chunk_trans_blocks(inode, nrblocks);
2321 }
2322 
2323 /*
2324  * How many index/leaf blocks need to change/allocate to add @extents extents?
2325  *
2326  * If we add a single extent, then in the worst case, each tree level
2327  * index/leaf needs to be changed in case of a tree split.
2328  *
2329  * If more extents are inserted, they could cause the whole tree to split more
2330  * than once, but this is really rare.
2331  */
2332 int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
2333 {
2334         int index;
2335         int depth;
2336 
2337         /* If we are converting inline data, only one block is needed here. */
2338         if (ext4_has_inline_data(inode))
2339                 return 1;
2340 
2341         depth = ext_depth(inode);
2342 
2343         if (extents <= 1)
2344                 index = depth * 2;
2345         else
2346                 index = depth * 3;
2347 
2348         return index;
2349 }
2350 
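     /*
      * get_default_free_blocks_flags:
      * choose the ext4_free_blocks() flags for this inode: directories,
      * symlinks and EA inodes get EXT4_FREE_BLOCKS_METADATA | FORGET,
      * data-journalled files get FORGET only, and ordinary data gets none.
      */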
2351 static inline int get_default_free_blocks_flags(struct inode *inode)
2352 {
2353         if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
2354             ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
2355                 return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
2356         else if (ext4_should_journal_data(inode))
2357                 return EXT4_FREE_BLOCKS_FORGET;
2358         return 0;
2359 }
2360 
2361 /*
2362  * ext4_rereserve_cluster - increment the reserved cluster count when
2363  *                          freeing a cluster with a pending reservation
2364  *
2365  * @inode - file containing the cluster
2366  * @lblk - logical block in cluster to be reserved
2367  *
2368  * Increments the reserved cluster count and adjusts quota in a bigalloc
2369  * file system when freeing a partial cluster containing at least one
2370  * delayed and unwritten block.  A partial cluster meeting that
2371  * requirement will have a pending reservation.  If so, the
2372  * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to
2373  * defer reserved and allocated space accounting to a subsequent call
2374  * to this function.
2375  */
2376 static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk)
2377 {
2378         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2379         struct ext4_inode_info *ei = EXT4_I(inode);
2380 
2381         dquot_reclaim_block(inode, EXT4_C2B(sbi, 1));
2382 
2383         spin_lock(&ei->i_block_reservation_lock);
2384         ei->i_reserved_data_blocks++;
2385         percpu_counter_add(&sbi->s_dirtyclusters_counter, 1);
2386         spin_unlock(&ei->i_block_reservation_lock);
2387 
2388         percpu_counter_add(&sbi->s_freeclusters_counter, 1);
2389         ext4_remove_pending(inode, lblk);
2390 }
2391 
2392 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2393                               struct ext4_extent *ex,
2394                               struct partial_cluster *partial,
2395                               ext4_lblk_t from, ext4_lblk_t to)
2396 {
2397         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2398         unsigned short ee_len = ext4_ext_get_actual_len(ex);
2399         ext4_fsblk_t last_pblk, pblk;
2400         ext4_lblk_t num;
2401         int flags;
2402 
2403         /* only extent tail removal is allowed */
2404         if (from < le32_to_cpu(ex->ee_block) ||
2405             to != le32_to_cpu(ex->ee_block) + ee_len - 1) {
2406                 ext4_error(sbi->s_sb,
2407                            "strange request: removal(2) %u-%u from %u:%u",
2408                            from, to, le32_to_cpu(ex->ee_block), ee_len);
2409                 return 0;
2410         }
2411 
2412 #ifdef EXTENTS_STATS
2413         spin_lock(&sbi->s_ext_stats_lock);
2414         sbi->s_ext_blocks += ee_len;
2415         sbi->s_ext_extents++;
2416         if (ee_len < sbi->s_ext_min)
2417                 sbi->s_ext_min = ee_len;
2418         if (ee_len > sbi->s_ext_max)
2419                 sbi->s_ext_max = ee_len;
2420         if (ext_depth(inode) > sbi->s_depth_max)
2421                 sbi->s_depth_max = ext_depth(inode);
2422         spin_unlock(&sbi->s_ext_stats_lock);
2423 #endif
2424 
2425         trace_ext4_remove_blocks(inode, ex, from, to, partial);
2426 
2427         /*
2428          * if we have a partial cluster, and it's different from the
2429          * cluster of the last block in the extent, we free it
2430          */
2431         last_pblk = ext4_ext_pblock(ex) + ee_len - 1;
2432 
2433         if (partial->state != initial &&
2434             partial->pclu != EXT4_B2C(sbi, last_pblk)) {
2435                 if (partial->state == tofree) {
2436                         flags = get_default_free_blocks_flags(inode);
2437                         if (ext4_is_pending(inode, partial->lblk))
2438                                 flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2439                         ext4_free_blocks(handle, inode, NULL,
2440                                          EXT4_C2B(sbi, partial->pclu),
2441                                          sbi->s_cluster_ratio, flags);
2442                         if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2443                                 ext4_rereserve_cluster(inode, partial->lblk);
2444                 }
2445                 partial->state = initial;
2446         }
2447 
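             /*
              * number of blocks being removed from the tail of the extent,
              * and the physical block where that tail begins
              */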
2448         num = le32_to_cpu(ex->ee_block) + ee_len - from;
2449         pblk = ext4_ext_pblock(ex) + ee_len - num;
2450 
2451         /*
2452          * We free the partial cluster at the end of the extent (if any),
2453          * unless the cluster is used by another extent (partial_cluster
2454          * state is nofree).  If a partial cluster exists here, it must be
2455          * shared with the last block in the extent.
2456          */
2457         flags = get_default_free_blocks_flags(inode);
2458 
2459         /* partial, left end cluster aligned, right end unaligned */
2460         if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) &&
2461             (EXT4_LBLK_CMASK(sbi, to) >= from) &&
2462             (partial->state != nofree)) {
2463                 if (ext4_is_pending(inode, to))
2464                         flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2465                 ext4_free_blocks(handle, inode, NULL,
2466                                  EXT4_PBLK_CMASK(sbi, last_pblk),
2467                                  sbi->s_cluster_ratio, flags);
2468                 if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2469                         ext4_rereserve_cluster(inode, to);
2470                 partial->state = initial;
2471                 flags = get_default_free_blocks_flags(inode);
2472         }
2473 
2474         flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
2475 
2476         /*
2477          * For bigalloc file systems, we never free a partial cluster
2478          * at the beginning of the extent.  Instead, we check to see if we
2479          * need to free it on a subsequent call to ext4_remove_blocks,
2480          * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
2481          */
2482         flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
2483         ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
2484 
2485         /* reset the partial cluster if we've freed past it */
2486         if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk))
2487                 partial->state = initial;
2488 
2489         /*
2490          * If we've freed the entire extent but the beginning is not left
2491          * cluster aligned and is not marked as ineligible for freeing, we
2492          * record the partial cluster at the beginning of the extent.  It
2493          * wasn't freed by the preceding ext4_free_blocks() call, and we
2494          * need to look farther to the left to determine if it's to be freed
2495          * (not shared with another extent). Else, reset the partial
2496          * cluster - we're either done freeing or the beginning of the
2497          * extent is left cluster aligned.
2498          */
2499         if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) {
2500                 if (partial->state == initial) {
2501                         partial->pclu = EXT4_B2C(sbi, pblk);
2502                         partial->lblk = from;
2503                         partial->state = tofree;
2504                 }
2505         } else {
2506                 partial->state = initial;
2507         }
2508 
2509         return 0;
2510 }
2511 
2512 /*
2513  * ext4_ext_rm_leaf() removes the extents associated with the
2514  * blocks appearing between "start" and "end".  Both "start"
2515  * and "end" must appear in the same extent or EIO is returned.
2516  *
2517  * @handle: The journal handle
2518  * @inode:  The file's inode
2519  * @path:   The path to the leaf
2520  * @partial: The cluster which we'll have to free if all extents
2521  *           have been released from it.  However, if its state is
2522  *           nofree, it is a cluster just to the right of the
2523  *           punched region and it must not be freed.
2524  * @start:  The first block to remove
2525  * @end:    The last block to remove
2526  */
2527 static int
2528 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2529                  struct ext4_ext_path *path,
2530                  struct partial_cluster *partial,
2531                  ext4_lblk_t start, ext4_lblk_t end)
2532 {
2533         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2534         int err = 0, correct_index = 0;
2535         int depth = ext_depth(inode), credits, revoke_credits;
2536         struct ext4_extent_header *eh;
2537         ext4_lblk_t a, b;
2538         unsigned num;
2539         ext4_lblk_t ex_ee_block;
2540         unsigned short ex_ee_len;
2541         unsigned unwritten = 0;
2542         struct ext4_extent *ex;
2543         ext4_fsblk_t pblk;
2544 
2545         /* the header must have been checked already in ext4_ext_remove_space() */
2546         ext_debug("truncate since %u in leaf to %u\n", start, end);
2547         if (!path[depth].p_hdr)
2548                 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2549         eh = path[depth].p_hdr;
2550         if (unlikely(path[depth].p_hdr == NULL)) {
2551                 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2552                 return -EFSCORRUPTED;
2553         }
2554         /* find where to start removing */
2555         ex = path[depth].p_ext;
2556         if (!ex)
2557                 ex = EXT_LAST_EXTENT(eh);
2558 
2559         ex_ee_block = le32_to_cpu(ex->ee_block);
2560         ex_ee_len = ext4_ext_get_actual_len(ex);
2561 
2562         trace_ext4_ext_rm_leaf(inode, start, ex, partial);
2563 
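             /*
              * walk the leaf backwards, trimming or removing every extent
              * that overlaps the range being removed
              */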
2564         while (ex >= EXT_FIRST_EXTENT(eh) &&
2565                         ex_ee_block + ex_ee_len > start) {
2566 
2567                 if (ext4_ext_is_unwritten(ex))
2568                         unwritten = 1;
2569                 else
2570                         unwritten = 0;
2571 
2572                 ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2573                           unwritten, ex_ee_len);
2574                 path[depth].p_ext = ex;
2575 
2576                 a = ex_ee_block > start ? ex_ee_block : start;
2577                 b = ex_ee_block+ex_ee_len - 1 < end ?
2578                         ex_ee_block+ex_ee_len - 1 : end;
2579 
2580                 ext_debug("  border %u:%u\n", a, b);
2581 
2582                 /* If this extent is beyond the end of the hole, skip it */
2583                 if (end < ex_ee_block) {
2584                         /*
2585                          * We're going to skip this extent and move to another,
2586                          * so note that its first cluster is in use to avoid
2587                          * freeing it when removing blocks.  Eventually, the
2588                          * right edge of the truncated/punched region will
2589                          * be just to the left.
2590                          */
2591                         if (sbi->s_cluster_ratio > 1) {
2592                                 pblk = ext4_ext_pblock(ex);
2593                                 partial->pclu = EXT4_B2C(sbi, pblk);
2594                                 partial->state = nofree;
2595                         }
2596                         ex--;
2597                         ex_ee_block = le32_to_cpu(ex->ee_block);
2598                         ex_ee_len = ext4_ext_get_actual_len(ex);
2599                         continue;
2600                 } else if (b != ex_ee_block + ex_ee_len - 1) {
2601                         EXT4_ERROR_INODE(inode,
2602                                          "can not handle truncate %u:%u "
2603                                          "on extent %u:%u",
2604                                          start, end, ex_ee_block,
2605                                          ex_ee_block + ex_ee_len - 1);
2606                         err = -EFSCORRUPTED;
2607                         goto out;
2608                 } else if (a != ex_ee_block) {
2609                         /* remove tail of the extent */
2610                         num = a - ex_ee_block;
2611                 } else {
2612                         /* remove whole extent: excellent! */
2613                         num = 0;
2614                 }
2615                 /*
2616                  * 3 for leaf, sb, and inode plus 2 (bmap and group
2617                  * descriptor) for each block group; assume two block
2618                  * groups plus ex_ee_len/blocks_per_block_group for
2619                  * the worst case
2620                  */
2621                 credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2622                 if (ex == EXT_FIRST_EXTENT(eh)) {
2623                         correct_index = 1;
2624                         credits += (ext_depth(inode)) + 1;
2625                 }
2626                 credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2627                 /*
2628                  * We may end up freeing some index blocks and data from the
2629                  * punched range. Note that partial clusters are accounted for
2630                  * by ext4_free_data_revoke_credits().
2631                  */
2632                 revoke_credits =
2633                         ext4_free_metadata_revoke_credits(inode->i_sb,
2634                                                           ext_depth(inode)) +
2635                         ext4_free_data_revoke_credits(inode, b - a + 1);
2636 
2637                 err = ext4_datasem_ensure_credits(handle, inode, credits,
2638                                                   credits, revoke_credits);
2639                 if (err) {
2640                         if (err > 0)
2641                                 err = -EAGAIN;
2642                         goto out;
2643                 }
2644 
2645                 err = ext4_ext_get_access(handle, inode, path + depth);
2646                 if (err)
2647                         goto out;
2648 
2649                 err = ext4_remove_blocks(handle, inode, ex, partial, a, b);
2650                 if (err)
2651                         goto out;
2652 
2653                 if (num == 0)
2654                         /* this extent is removed; mark slot entirely unused */
2655                         ext4_ext_store_pblock(ex, 0);
2656 
2657                 ex->ee_len = cpu_to_le16(num);
2658                 /*
2659                  * Do not mark unwritten if all the blocks in the
2660                  * extent have been removed.
2661                  */
2662                 if (unwritten && num)
2663                         ext4_ext_mark_unwritten(ex);
2664                 /*
2665                  * If the extent was completely released,
2666                  * we need to remove it from the leaf
2667                  */
2668                 if (num == 0) {
2669                         if (end != EXT_MAX_BLOCKS - 1) {
2670                                 /*
2671                                  * For hole punching, we need to scoot all the
2672                                  * extents up when an extent is removed so that
2673                                  * we don't have blank extents in the middle
2674                                  */
2675                                 memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
2676                                         sizeof(struct ext4_extent));
2677 
2678                                 /* Now get rid of the one at the end */
2679                                 memset(EXT_LAST_EXTENT(eh), 0,
2680                                         sizeof(struct ext4_extent));
2681                         }
2682                         le16_add_cpu(&eh->eh_entries, -1);
2683                 }
2684 
2685                 err = ext4_ext_dirty(handle, inode, path + depth);
2686                 if (err)
2687                         goto out;
2688 
2689                 ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
2690                                 ext4_ext_pblock(ex));
2691                 ex--;
2692                 ex_ee_block = le32_to_cpu(ex->ee_block);
2693                 ex_ee_len = ext4_ext_get_actual_len(ex);
2694         }
2695 
2696         if (correct_index && eh->eh_entries)
2697                 err = ext4_ext_correct_indexes(handle, inode, path);
2698 
2699         /*
2700          * If there's a partial cluster and at least one extent remains in
2701          * the leaf, free the partial cluster if it isn't shared with the
2702          * current extent.  If it is shared with the current extent
2703          * we reset the partial cluster because we've reached the start of the
2704          * truncated/punched region and we're done removing blocks.
2705          */
2706         if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) {
2707                 pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
2708                 if (partial->pclu != EXT4_B2C(sbi, pblk)) {
2709                         int flags = get_default_free_blocks_flags(inode);
2710 
2711                         if (ext4_is_pending(inode, partial->lblk))
2712                                 flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2713                         ext4_free_blocks(handle, inode, NULL,
2714                                          EXT4_C2B(sbi, partial->pclu),
2715                                          sbi->s_cluster_ratio, flags);
2716                         if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2717                                 ext4_rereserve_cluster(inode, partial->lblk);
2718                 }
2719                 partial->state = initial;
2720         }
2721 
2722         /* if this leaf is now empty, we should
2723          * remove it from the index block above */
2724         if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2725                 err = ext4_ext_rm_idx(handle, inode, path, depth);
2726 
2727 out:
2728         return err;
2729 }
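
/*
 * Illustrative sketch, not part of the original source: the compaction step
 * used above when a whole extent is removed from a leaf.  Entries after the
 * removed slot are shifted one position to the left, the now-duplicated last
 * slot is cleared, and the entry count is dropped, so a leaf never ends up
 * with a hole in the middle.  All identifiers below come from the surrounding
 * file; only the helper name is hypothetical.
 */
static void ext4_leaf_compact_sketch(struct ext4_extent_header *eh,
                                     struct ext4_extent *ex)
{
        /* shift the extents following 'ex' down over it */
        memmove(ex, ex + 1,
                (EXT_LAST_EXTENT(eh) - ex) * sizeof(struct ext4_extent));
        /* wipe the trailing slot that is now duplicated */
        memset(EXT_LAST_EXTENT(eh), 0, sizeof(struct ext4_extent));
        /* one fewer live entry in this leaf */
        le16_add_cpu(&eh->eh_entries, -1);
}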
2730 
2731 /*
2732  * ext4_ext_more_to_rm:
2733  * returns 1 if current index has to be freed (even partial)
2734  */
2735 static int
2736 ext4_ext_more_to_rm(struct ext4_ext_path *path)
2737 {
2738         BUG_ON(path->p_idx == NULL);
2739 
2740         if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2741                 return 0;
2742 
2743         /*
2744          * if truncate on deeper level happened, it wasn't partial,
2745          * so we have to consider current index for truncation
2746          */
2747         if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2748                 return 0;
2749         return 1;
2750 }
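
/*
 * Illustrative sketch, not in the original source: the descent state that
 * ext4_ext_more_to_rm() inspects.  p_idx walks backwards from
 * EXT_LAST_INDEX() towards EXT_FIRST_INDEX(), so the number of index entries
 * still to visit at a level can be read off the pointers directly.  The
 * helper name below is hypothetical.
 */
static inline int ext4_ext_idx_remaining_sketch(struct ext4_ext_path *path)
{
        /* entries from EXT_FIRST_INDEX() up to and including p_idx */
        return path->p_idx - EXT_FIRST_INDEX(path->p_hdr) + 1;
}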
2751 
2752 int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2753                           ext4_lblk_t end)
2754 {
2755         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2756         int depth = ext_depth(inode);
2757         struct ext4_ext_path *path = NULL;
2758         struct partial_cluster partial;
2759         handle_t *handle;
2760         int i = 0, err = 0;
2761 
2762         partial.pclu = 0;
2763         partial.lblk = 0;
2764         partial.state = initial;
2765 
2766         ext_debug("truncate since %u to %u\n", start, end);
2767 
2768         /* the first extent we free will probably be the last one in its block */
2769         handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE,
2770                         depth + 1,
2771                         ext4_free_metadata_revoke_credits(inode->i_sb, depth));
2772         if (IS_ERR(handle))
2773                 return PTR_ERR(handle);
2774 
2775 again:
2776         trace_ext4_ext_remove_space(inode, start, end, depth);
2777 
2778         /*
2779          * Check if we are removing extents inside the extent tree. If that
2780          * is the case, we are going to punch a hole inside the extent tree
2781          * so we have to check whether we need to split the extent covering
2782          * the last block to remove so we can easily remove the part of it
2783          * in ext4_ext_rm_leaf().
2784          */
2785         if (end < EXT_MAX_BLOCKS - 1) {
2786                 struct ext4_extent *ex;
2787                 ext4_lblk_t ee_block, ex_end, lblk;
2788                 ext4_fsblk_t pblk;
2789 
2790                 /* find extent for or closest extent to this block */
2791                 path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
2792                 if (IS_ERR(path)) {
2793                         ext4_journal_stop(handle);
2794                         return PTR_ERR(path);
2795                 }
2796                 depth = ext_depth(inode);
2797                 /* The leaf may be missing only if the inode has no blocks at all */
2798                 ex = path[depth].p_ext;
2799                 if (!ex) {
2800                         if (depth) {
2801                                 EXT4_ERROR_INODE(inode,
2802                                                  "path[%d].p_hdr == NULL",
2803                                                  depth);
2804                                 err = -EFSCORRUPTED;
2805                         }
2806                         goto out;
2807                 }
2808 
2809                 ee_block = le32_to_cpu(ex->ee_block);
2810                 ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1;
2811 
2812                 /*
2813                  * See if the last block is inside the extent, if so split
2814                  * the extent at 'end' block so we can easily remove the
2815                  * tail of the first part of the split extent in
2816                  * ext4_ext_rm_leaf().
2817                  */
2818                 if (end >= ee_block && end < ex_end) {
2819 
2820                         /*
2821                          * If we're going to split the extent, note that
2822                          * the cluster containing the block after 'end' is
2823                          * in use to avoid freeing it when removing blocks.
2824                          */
2825                         if (sbi->s_cluster_ratio > 1) {
2826                                 pblk = ext4_ext_pblock(ex) + end - ee_block + 1;
2827                                 partial.pclu = EXT4_B2C(sbi, pblk);
2828                                 partial.state = nofree;
2829                         }
2830 
2831                         /*
2832                          * Split the extent in two so that 'end' is the last
2833                          * block in the first new extent. Also we should not
2834                          * fail removing space due to ENOSPC so try to use
2835                          * reserved block if that happens.
2836                          */
2837                         err = ext4_force_split_extent_at(handle, inode, &path,
2838                                                          end + 1, 1);
2839                         if (err < 0)
2840                                 goto out;
2841 
2842                 } else if (sbi->s_cluster_ratio > 1 && end >= ex_end &&
2843                            partial.state == initial) {
2844                         /*
2845                          * If we're punching, there's an extent to the right.
2846                          * If the partial cluster hasn't been set, set it to
2847                          * that extent's first cluster and its state to nofree
2848                          * so it won't be freed should it contain blocks to be
2849                          * removed. If it's already set (tofree/nofree), we're
2850                          * retrying and keep the original partial cluster info
2851                          * so a cluster marked tofree as a result of earlier
2852                          * extent removal is not lost.
2853                          */
2854                         lblk = ex_end + 1;
2855                         err = ext4_ext_search_right(inode, path, &lblk, &pblk,
2856                                                     &ex);
2857                         if (err)
2858                                 goto out;
2859                         if (pblk) {
2860                                 partial.pclu = EXT4_B2C(sbi, pblk);
2861                                 partial.state = nofree;
2862                         }
2863                 }
2864         }
2865         /*
2866          * We start scanning from the right side, freeing all the blocks
2867          * after i_size and walking into the tree depth-wise.
2868          */
2869         depth = ext_depth(inode);
2870         if (path) {
2871                 int k = i = depth;
2872                 while (--k > 0)
2873                         path[k].p_block =
2874                                 le16_to_cpu(path[k].p_hdr->eh_entries)+1;
2875         } else {
2876                 path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
2877                                GFP_NOFS);
2878                 if (path == NULL) {
2879                         ext4_journal_stop(handle);
2880                         return -ENOMEM;
2881                 }
2882                 path[0].p_maxdepth = path[0].p_depth = depth;
2883                 path[0].p_hdr = ext_inode_hdr(inode);
2884                 i = 0;
2885 
2886                 if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
2887                         err = -EFSCORRUPTED;
2888                         goto out;
2889                 }
2890         }
2891         err = 0;
2892 
2893         while (i >= 0 && err == 0) {
2894                 if (i == depth) {
2895                         /* this is leaf block */
2896                         err = ext4_ext_rm_leaf(handle, inode, path,
2897                                                &partial, start, end);
2898                         /* root level has p_bh == NULL, brelse() eats this */
2899                         brelse(path[i].p_bh);
2900                         path[i].p_bh = NULL;
2901                         i--;
2902                         continue;
2903                 }
2904 
2905                 /* this is index block */
2906                 if (!path[i].p_hdr) {
2907                         ext_debug("initialize header\n");
2908                         path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2909                 }
2910 
2911                 if (!path[i].p_idx) {
2912                         /* this level hasn't been touched yet */
2913                         path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2914                         path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2915                         ext_debug("init index ptr: hdr 0x%p, num %d\n",
2916                                   path[i].p_hdr,
2917                                   le16_to_cpu(path[i].p_hdr->eh_entries));
2918                 } else {
2919                         /* we were already here, see at next index */
2920                         path[i].p_idx--;
2921                 }
2922 
2923                 ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
2924                                 i, EXT_FIRST_INDEX(path[i].p_hdr),
2925                                 path[i].p_idx);
2926                 if (ext4_ext_more_to_rm(path + i)) {
2927                         struct buffer_head *bh;
2928                         /* go to the next level */
2929                         ext_debug("move to level %d (block %llu)\n",
2930                                   i + 1, ext4_idx_pblock(path[i].p_idx));
2931                         memset(path + i + 1, 0, sizeof(*path));
2932                         bh = read_extent_tree_block(inode,
2933                                 ext4_idx_pblock(path[i].p_idx), depth - i - 1,
2934                                 EXT4_EX_NOCACHE);
2935                         if (IS_ERR(bh)) {
2936                                 /* should we reset i_size? */
2937                                 err = PTR_ERR(bh);
2938                                 break;
2939                         }
2940                         /* Yield here to deal with large extent trees.
2941                          * Should be a no-op if we did IO above. */
2942                         cond_resched();
2943                         if (WARN_ON(i + 1 > depth)) {
2944                                 err = -EFSCORRUPTED;
2945                                 break;
2946                         }
2947                         path[i + 1].p_bh = bh;
2948 
2949                         /* save actual number of indexes since this
2950                          * number is changed at the next iteration */
2951                         path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
2952                         i++;
2953                 } else {
2954                         /* we finished processing this index, go up */
2955                         if (path[i].p_hdr->eh_entries == 0 && i > 0) {
2956                                 /* index is empty, remove it;
2957                                  * the handle must already have been prepared
2958                                  * by truncatei_leaf() */
2959                                 err = ext4_ext_rm_idx(handle, inode, path, i);
2960                         }
2961                         /* root level has p_bh == NULL, brelse() eats this */
2962                         brelse(path[i].p_bh);
2963                         path[i].p_bh = NULL;
2964                         i--;
2965                         ext_debug("return to level %d\n", i);
2966                 }
2967         }
2968 
2969         trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial,
2970                                          path->p_hdr->eh_entries);
2971 
2972         /*
2973          * if there's a partial cluster and we have removed the first extent
2974          * in the file, then we also free the partial cluster, if any
2975          */
2976         if (partial.state == tofree && err == 0) {
2977                 int flags = get_default_free_blocks_flags(inode);
2978 
2979                 if (ext4_is_pending(inode, partial.lblk))
2980                         flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2981                 ext4_free_blocks(handle, inode, NULL,
2982                                  EXT4_C2B(sbi, partial.pclu),
2983                                  sbi->s_cluster_ratio, flags);
2984                 if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2985                         ext4_rereserve_cluster(inode, partial.lblk);
2986                 partial.state = initial;
2987         }
2988 
2989         /* TODO: flexible tree reduction should be here */
2990         if (path->p_hdr->eh_entries == 0) {
2991                 /*
2992                  * truncate to zero freed all the tree,
2993                  * so we need to correct eh_depth
2994                  */
2995                 err = ext4_ext_get_access(handle, inode, path);
2996                 if (err == 0) {
2997                         ext_inode_hdr(inode)->eh_depth = 0;
2998                         ext_inode_hdr(inode)->eh_max =
2999                                 cpu_to_le16(ext4_ext_space_root(inode, 0));
3000                         err = ext4_ext_dirty(handle, inode, path);
3001                 }
3002         }
3003 out:
3004         ext4_ext_drop_refs(path);
3005         kfree(path);
3006         path = NULL;
3007         if (err == -EAGAIN)
3008                 goto again;
3009         ext4_journal_stop(handle);
3010 
3011         return err;
3012 }
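
/*
 * Illustrative sketch, not part of the original source: the partial-cluster
 * decisions above come down to cluster arithmetic.  With a cluster ratio of
 * 8 blocks per cluster, physical blocks 0-7 form cluster 0, blocks 8-15 form
 * cluster 1, and so on; a partial cluster is freed only when no surviving
 * extent maps a block into that same cluster.  The helper name below is
 * hypothetical.
 */
static inline int ext4_pblks_share_cluster_sketch(struct ext4_sb_info *sbi,
                                                  ext4_fsblk_t pblk1,
                                                  ext4_fsblk_t pblk2)
{
        /* two physical blocks share a cluster iff EXT4_B2C() agrees on both */
        return EXT4_B2C(sbi, pblk1) == EXT4_B2C(sbi, pblk2);
}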
3013 
3014 /*
3015  * called at mount time
3016  */
3017 void ext4_ext_init(struct super_block *sb)
3018 {
3019         /*
3020          * possible initialization would be here
3021          */
3022 
3023         if (ext4_has_feature_extents(sb)) {
3024 #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
3025                 printk(KERN_INFO "EXT4-fs: file extents enabled"
3026 #ifdef AGGRESSIVE_TEST
3027                        ", aggressive tests"
3028 #endif
3029 #ifdef CHECK_BINSEARCH
3030                        ", check binsearch"
3031 #endif
3032 #ifdef EXTENTS_STATS
3033                        ", stats"
3034 #endif
3035                        "\n");
3036 #endif
3037 #ifdef EXTENTS_STATS
3038                 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
3039                 EXT4_SB(sb)->s_ext_min = 1 << 30;
3040                 EXT4_SB(sb)->s_ext_max = 0;
3041 #endif
3042         }
3043 }
3044 
3045 /*
3046  * called at umount time
3047  */
3048 void ext4_ext_release(struct super_block *sb)
3049 {
3050         if (!ext4_has_feature_extents(sb))
3051                 return;
3052 
3053 #ifdef EXTENTS_STATS
3054         if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
3055                 struct ext4_sb_info *sbi = EXT4_SB(sb);
3056                 printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
3057                         sbi->s_ext_blocks, sbi->s_ext_extents,
3058                         sbi->s_ext_blocks / sbi->s_ext_extents);
3059                 printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
3060                         sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
3061         }
3062 #endif
3063 }
3064 
3065 static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
3066 {
3067         ext4_lblk_t  ee_block;
3068         ext4_fsblk_t ee_pblock;
3069         unsigned int ee_len;
3070 
3071         ee_block  = le32_to_cpu(ex->ee_block);
3072         ee_len    = ext4_ext_get_actual_len(ex);
3073         ee_pblock = ext4_ext_pblock(ex);
3074 
3075         if (ee_len == 0)
3076                 return 0;
3077 
3078         return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
3079                                      EXTENT_STATUS_WRITTEN);
3080 }
3081 
3082 /* FIXME!! we need to try to merge to left or right after zero-out  */
3083 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
3084 {
3085         ext4_fsblk_t ee_pblock;
3086         unsigned int ee_len;
3087 
3088         ee_len    = ext4_ext_get_actual_len(ex);
3089         ee_pblock = ext4_ext_pblock(ex);
3090         return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock,
3091                                   ee_len);
3092 }
3093 
3094 /*
3095  * ext4_split_extent_at() splits an extent at a given block.
3096  *
3097  * @handle: the journal handle
3098  * @inode: the file inode
3099  * @path: the path to the extent
3100  * @split: the logical block where the extent is split.
3101  * @split_flag: indicates whether the extent can be zeroed out if the split
3102  *              fails, and the states (initialized or unwritten) of the new
3103  *              extents.
3104  * @flags: flags used to insert the new extent into the extent tree.
3105  *
3106  * Splits extent [a, b] into two extents [a, @split) and [@split, b], whose
3107  * states are determined by split_flag.
3108  *
3109  * There are two cases:
3110  *  a> the extent is split into two extents.
3111  *  b> no split is needed, and the extent is just marked.
3112  *
3113  * return 0 on success.
3114  */
3115 static int ext4_split_extent_at(handle_t *handle,
3116                              struct inode *inode,
3117                              struct ext4_ext_path **ppath,
3118                              ext4_lblk_t split,
3119                              int split_flag,
3120                              int flags)
3121 {
3122         struct ext4_ext_path *path = *ppath;
3123         ext4_fsblk_t newblock;
3124         ext4_lblk_t ee_block;
3125         struct ext4_extent *ex, newex, orig_ex, zero_ex;
3126         struct ext4_extent *ex2 = NULL;
3127         unsigned int ee_len, depth;
3128         int err = 0;
3129 
3130         BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
3131                (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
3132 
3133         ext_debug("ext4_split_extent_at: inode %lu, logical"
3134                 " block %llu\n", inode->i_ino, (unsigned long long)split);
3135 
3136         ext4_ext_show_leaf(inode, path);
3137 
3138         depth = ext_depth(inode);
3139         ex = path[depth].p_ext;
3140         ee_block = le32_to_cpu(ex->ee_block);
3141         ee_len = ext4_ext_get_actual_len(ex);
3142         newblock = split - ee_block + ext4_ext_pblock(ex);
3143 
3144         BUG_ON(split < ee_block || split >= (ee_block + ee_len));
3145         BUG_ON(!ext4_ext_is_unwritten(ex) &&
3146                split_flag & (EXT4_EXT_MAY_ZEROOUT |
3147                              EXT4_EXT_MARK_UNWRIT1 |
3148                              EXT4_EXT_MARK_UNWRIT2));
3149 
3150         err = ext4_ext_get_access(handle, inode, path + depth);
3151         if (err)
3152                 goto out;
3153 
3154         if (split == ee_block) {
3155                 /*
3156                  * case b: block @split is the block that the extent begins with
3157                  * then we just change the state of the extent, and splitting
3158                  * is not needed.
3159                  */
3160                 if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3161                         ext4_ext_mark_unwritten(ex);
3162                 else
3163                         ext4_ext_mark_initialized(ex);
3164 
3165                 if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
3166                         ext4_ext_try_to_merge(handle, inode, path, ex);
3167 
3168                 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3169                 goto out;
3170         }
3171 
3172         /* case a */
3173         memcpy(&orig_ex, ex, sizeof(orig_ex));
3174         ex->ee_len = cpu_to_le16(split - ee_block);
3175         if (split_flag & EXT4_EXT_MARK_UNWRIT1)
3176                 ext4_ext_mark_unwritten(ex);
3177 
3178         /*
3179          * path may lead to new leaf, not to original leaf any more
3180          * after ext4_ext_insert_extent() returns,
3181          */
3182         err = ext4_ext_dirty(handle, inode, path + depth);
3183         if (err)
3184                 goto fix_extent_len;
3185 
3186         ex2 = &newex;
3187         ex2->ee_block = cpu_to_le32(split);
3188         ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
3189         ext4_ext_store_pblock(ex2, newblock);
3190         if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3191                 ext4_ext_mark_unwritten(ex2);
3192 
3193         err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
3194         if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3195                 if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
3196                         if (split_flag & EXT4_EXT_DATA_VALID1) {
3197                                 err = ext4_ext_zeroout(inode, ex2);
3198                                 zero_ex.ee_block = ex2->ee_block;
3199                                 zero_ex.ee_len = cpu_to_le16(
3200                                                 ext4_ext_get_actual_len(ex2));
3201                                 ext4_ext_store_pblock(&zero_ex,
3202                                                       ext4_ext_pblock(ex2));
3203                         } else {
3204                                 err = ext4_ext_zeroout(inode, ex);
3205                                 zero_ex.ee_block = ex->ee_block;
3206                                 zero_ex.ee_len = cpu_to_le16(
3207                                                 ext4_ext_get_actual_len(ex));
3208                                 ext4_ext_store_pblock(&zero_ex,
3209                                                       ext4_ext_pblock(ex));
3210                         }
3211                 } else {
3212                         err = ext4_ext_zeroout(inode, &orig_ex);
3213                         zero_ex.ee_block = orig_ex.ee_block;
3214                         zero_ex.ee_len = cpu_to_le16(
3215                                                 ext4_ext_get_actual_len(&orig_ex));
3216                         ext4_ext_store_pblock(&zero_ex,
3217                                               ext4_ext_pblock(&orig_ex));
3218                 }
3219 
3220                 if (err)
3221                         goto fix_extent_len;
3222                 /* update the extent length and mark as initialized */
3223                 ex->ee_len = cpu_to_le16(ee_len);
3224                 ext4_ext_try_to_merge(handle, inode, path, ex);
3225                 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3226                 if (err)
3227                         goto fix_extent_len;
3228 
3229                 /* update extent status tree */
3230                 err = ext4_zeroout_es(inode, &zero_ex);
3231 
3232                 goto out;
3233         } else if (err)
3234                 goto fix_extent_len;
3235 
3236 out:
3237         ext4_ext_show_leaf(inode, path);
3238         return err;
3239 
3240 fix_extent_len:
3241         ex->ee_len = orig_ex.ee_len;
3242         ext4_ext_dirty(handle, inode, path + path->p_depth);
3243         return err;
3244 }
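
/*
 * Illustrative sketch, not in the original source: the arithmetic behind
 * "case a" of ext4_split_extent_at().  Splitting an extent at logical block
 * @split shrinks the first half to split - ee_block blocks and starts the
 * second half at the physical block shifted by the same amount; e.g. a
 * 16-block extent split 10 blocks in becomes extents of 10 and 6 blocks.
 * The helper name below is hypothetical.
 */
static void ext4_split_arith_sketch(struct ext4_extent *ex, ext4_lblk_t split,
                                    struct ext4_extent *second)
{
        ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
        unsigned int ee_len = ext4_ext_get_actual_len(ex);
        ext4_fsblk_t newblock = split - ee_block + ext4_ext_pblock(ex);

        /* first half keeps its start; its length ends at the split point */
        ex->ee_len = cpu_to_le16(split - ee_block);

        /* second half starts at @split both logically and physically */
        second->ee_block = cpu_to_le32(split);
        second->ee_len = cpu_to_le16(ee_len - (split - ee_block));
        ext4_ext_store_pblock(second, newblock);
}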
3245 
3246 /*
3247  * ext4_split_extent() splits an extent and marks the extent covered
3248  * by @map as split_flag indicates.
3249  *
3250  * It may result in splitting the extent into multiple extents (up to three).
3251  * There are three possibilities:
3252  *   a> There is no split required
3253  *   b> Splits into two extents: the split happens at either end of the extent
3254  *   c> Splits into three extents: someone is splitting in the middle of the extent
3255  *
3256  */
3257 static int ext4_split_extent(handle_t *handle,
3258                               struct inode *inode,
3259                               struct ext4_ext_path **ppath,
3260                               struct ext4_map_blocks *map,
3261                               int split_flag,
3262                               int flags)
3263 {
3264         struct ext4_ext_path *path = *ppath;
3265         ext4_lblk_t ee_block;
3266         struct ext4_extent *ex;
3267         unsigned int ee_len, depth;
3268         int err = 0;
3269         int unwritten;
3270         int split_flag1, flags1;
3271         int allocated = map->m_len;
3272 
3273         depth = ext_depth(inode);
3274         ex = path[depth].p_ext;
3275         ee_block = le32_to_cpu(ex->ee_block);
3276         ee_len = ext4_ext_get_actual_len(ex);
3277         unwritten = ext4_ext_is_unwritten(ex);
3278 
3279         if (map->m_lblk + map->m_len < ee_block + ee_len) {
3280                 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
3281                 flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
3282                 if (unwritten)
3283                         split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
3284                                        EXT4_EXT_MARK_UNWRIT2;
3285                 if (split_flag & EXT4_EXT_DATA_VALID2)
3286                         split_flag1 |= EXT4_EXT_DATA_VALID1;
3287                 err = ext4_split_extent_at(handle, inode, ppath,
3288                                 map->m_lblk + map->m_len, split_flag1, flags1);
3289                 if (err)
3290                         goto out;
3291         } else {
3292                 allocated = ee_len - (map->m_lblk - ee_block);
3293         }
3294         /*
3295          * Update path is required because previous ext4_split_extent_at() may
3296          * result in split of original leaf or extent zeroout.
3297          */
3298         path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
3299         if (IS_ERR(path))
3300                 return PTR_ERR(path);
3301         depth = ext_depth(inode);
3302         ex = path[depth].p_ext;
3303         if (!ex) {
3304                 EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3305                                  (unsigned long) map->m_lblk);
3306                 return -EFSCORRUPTED;
3307         }
3308         unwritten = ext4_ext_is_unwritten(ex);
3309         split_flag1 = 0;
3310 
3311         if (map->m_lblk >= ee_block) {
3312                 split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
3313                 if (unwritten) {
3314                         split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
3315                         split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
3316                                                      EXT4_EXT_MARK_UNWRIT2);
3317                 }
3318                 err = ext4_split_extent_at(handle, inode, ppath,
3319                                 map->m_lblk, split_flag1, flags);
3320                 if (err)
3321                         goto out;
3322         }
3323 
3324         ext4_ext_show_leaf(inode, path);
3325 out:
3326         return err ? err : allocated;
3327 }
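
/*
 * Illustrative sketch, not part of the original source: the two calls to
 * ext4_split_extent_at() above carve the region described by @map out of a
 * single extent, leaving at most three pieces.  For an extent covering
 * logical blocks 100-131 and a map of blocks 110-119, the head, middle and
 * tail pieces have lengths 10, 10 and 12.  The helper name is hypothetical.
 */
static void ext4_three_way_lengths_sketch(ext4_lblk_t ee_block,
                                          unsigned int ee_len,
                                          struct ext4_map_blocks *map,
                                          unsigned int len_out[3])
{
        len_out[0] = map->m_lblk - ee_block;                    /* head   */
        len_out[1] = map->m_len;                                /* middle */
        len_out[2] = ee_block + ee_len -
                     (map->m_lblk + map->m_len);                /* tail   */
}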
3328 
3329 /*
3330  * This function is called by ext4_ext_map_blocks() if someone tries to write
3331  * to an unwritten extent. It may result in splitting the unwritten
3332  * extent into multiple extents (up to three - one initialized and two
3333  * unwritten).
3334  * There are three possibilities:
3335  *   a> There is no split required: Entire extent should be initialized
3336  *   b> Splits in two extents: Write is happening at either end of the extent
3337  *   c> Splits in three extents: someone is writing in the middle of the extent
3338  *
3339  * Pre-conditions:
3340  *  - The extent pointed to by 'path' is unwritten.
3341  *  - The extent pointed to by 'path' contains a superset
3342  *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
3343  *
3344  * Post-conditions on success:
3345  *  - the returned value is the number of blocks beyond map->m_lblk
3346  *    that are allocated and initialized.
3347  *    It is guaranteed to be >= map->m_len.
3348  */
3349 static int ext4_ext_convert_to_initialized(handle_t *handle,
3350                                            struct inode *inode,
3351                                            struct ext4_map_blocks *map,
3352                                            struct ext4_ext_path **ppath,
3353                                            int flags)
3354 {
3355         struct ext4_ext_path *path = *ppath;
3356         struct ext4_sb_info *sbi;
3357         struct ext4_extent_header *eh;
3358         struct ext4_map_blocks split_map;
3359         struct ext4_extent zero_ex1, zero_ex2;
3360         struct ext4_extent *ex, *abut_ex;
3361         ext4_lblk_t ee_block, eof_block;
3362         unsigned int ee_len, depth, map_len = map->m_len;
3363         int allocated = 0, max_zeroout = 0;
3364         int err = 0;
3365         int split_flag = EXT4_EXT_DATA_VALID2;
3366 
3367         ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
3368                 " block %llu, max_blocks %u\n", inode->i_ino,
3369                 (unsigned long long)map->m_lblk, map_len);
3370 
3371         sbi = EXT4_SB(inode->i_sb);
3372         eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
3373                         >> inode->i_sb->s_blocksize_bits;
3374         if (eof_block < map->m_lblk + map_len)
3375                 eof_block = map->m_lblk + map_len;
3376 
3377         depth = ext_depth(inode);
3378         eh = path[depth].p_hdr;
3379         ex = path[depth].p_ext;
3380         ee_block = le32_to_cpu(ex->ee_block);
3381         ee_len = ext4_ext_get_actual_len(ex);
3382         zero_ex1.ee_len = 0;
3383         zero_ex2.ee_len = 0;
3384 
3385         trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
3386 
3387         /* Pre-conditions */
3388         BUG_ON(!ext4_ext_is_unwritten(ex));
3389         BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
3390 
3391         /*
3392          * Attempt to transfer newly initialized blocks from the currently
3393          * unwritten extent to its neighbor. This is much cheaper
3394          * than an insertion followed by a merge as those involve costly
3395          * memmove() calls. Transferring to the left is the common case in
3396          * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
3397          * followed by append writes.
3398          *
3399          * Limitations of the current logic:
3400          *  - L1: we do not deal with writes covering the whole extent.
3401          *    This would require removing the extent if the transfer
3402          *    is possible.
3403          *  - L2: we only attempt to merge with an extent stored in the
3404          *    same extent tree node.
3405          */
3406         if ((map->m_lblk == ee_block) &&
3407                 /* See if we can merge left */
3408                 (map_len < ee_len) &&           /*L1*/
3409                 (ex > EXT_FIRST_EXTENT(eh))) {  /*L2*/
3410                 ext4_lblk_t prev_lblk;
3411                 ext4_fsblk_t prev_pblk, ee_pblk;
3412                 unsigned int prev_len;
3413 
3414                 abut_ex = ex - 1;
3415                 prev_lblk = le32_to_cpu(abut_ex->ee_block);
3416                 prev_len = ext4_ext_get_actual_len(abut_ex);
3417                 prev_pblk = ext4_ext_pblock(abut_ex);
3418                 ee_pblk = ext4_ext_pblock(ex);
3419 
3420                 /*
3421                  * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3422                  * upon those conditions:
3423                  * - C1: abut_ex is initialized,
3424                  * - C2: abut_ex is logically abutting ex,
3425                  * - C3: abut_ex is physically abutting ex,
3426                  * - C4: abut_ex can receive the additional blocks without
3427                  *   overflowing the (initialized) length limit.
3428                  */
3429                 if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
3430                         ((prev_lblk + prev_len) == ee_block) &&         /*C2*/
3431                         ((prev_pblk + prev_len) == ee_pblk) &&          /*C3*/
3432                         (prev_len < (EXT_INIT_MAX_LEN - map_len))) {    /*C4*/
3433                         err = ext4_ext_get_access(handle, inode, path + depth);
3434                         if (err)
3435                                 goto out;
3436 
3437                         trace_ext4_ext_convert_to_initialized_fastpath(inode,
3438                                 map, ex, abut_ex);
3439 
3440                         /* Shift the start of ex by 'map_len' blocks */
3441                         ex->ee_block = cpu_to_le32(ee_block + map_len);
3442                         ext4_ext_store_pblock(ex, ee_pblk + map_len);
3443                         ex->ee_len = cpu_to_le16(ee_len - map_len);
3444                         ext4_ext_mark_unwritten(ex); /* Restore the flag */
3445 
3446                         /* Extend abut_ex by 'map_len' blocks */
3447                         abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
3448 
3449                         /* Result: number of initialized blocks past m_lblk */
3450                         allocated = map_len;
3451                 }
3452         } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
3453                    (map_len < ee_len) &&        /*L1*/
3454                    ex < EXT_LAST_EXTENT(eh)) {  /*L2*/
3455                 /* See if we can merge right */
3456                 ext4_lblk_t next_lblk;
3457                 ext4_fsblk_t next_pblk, ee_pblk;
3458                 unsigned int next_len;
3459 
3460                 abut_ex = ex + 1;
3461                 next_lblk = le32_to_cpu(abut_ex->ee_block);
3462                 next_len = ext4_ext_get_actual_len(abut_ex);
3463                 next_pblk = ext4_ext_pblock(abut_ex);
3464                 ee_pblk = ext4_ext_pblock(ex);
3465 
3466                 /*
3467                  * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3468                  * upon those conditions:
3469                  * - C1: abut_ex is initialized,
3470                  * - C2: abut_ex is logically abutting ex,
3471                  * - C3: abut_ex is physically abutting ex,
3472                  * - C4: abut_ex can receive the additional blocks without
3473                  *   overflowing the (initialized) length limit.
3474                  */
3475                 if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
3476                     ((map->m_lblk + map_len) == next_lblk) &&           /*C2*/
3477                     ((ee_pblk + ee_len) == next_pblk) &&                /*C3*/
3478                     (next_len < (EXT_INIT_MAX_LEN - map_len))) {        /*C4*/
3479                         err = ext4_ext_get_access(handle, inode, path + depth);
3480                         if (err)
3481                                 goto out;
3482 
3483                         trace_ext4_ext_convert_to_initialized_fastpath(inode,
3484                                 map, ex, abut_ex);
3485 
3486                         /* Shift the start of abut_ex by 'map_len' blocks */
3487                         abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
3488                         ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
3489                         ex->ee_len = cpu_to_le16(ee_len - map_len);
3490                         ext4_ext_mark_unwritten(ex); /* Restore the flag */
3491 
3492                         /* Extend abut_ex by 'map_len' blocks */
3493                         abut_ex->ee_len = cpu_to_le16(next_len + map_len);
3494 
3495                         /* Result: number of initialized blocks past m_lblk */
3496                         allocated = map_len;
3497                 }
3498         }
3499         if (allocated) {
3500                 /* Mark the block containing both extents as dirty */
3501                 ext4_ext_dirty(handle, inode, path + depth);
3502 
3503                 /* Update path to point to the right extent */
3504                 path[depth].p_ext = abut_ex;
3505                 goto out;
3506         } else
3507                 allocated = ee_len - (map->m_lblk - ee_block);
3508 
3509         WARN_ON(map->m_lblk < ee_block);
3510         /*
3511          * It is safe to convert extent to initialized via explicit
3512          * zeroout only if extent is fully inside i_size or new_size.
3513          */
3514         split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3515 
3516         if (EXT4_EXT_MAY_ZEROOUT & split_flag)
3517                 max_zeroout = sbi->s_extent_max_zeroout_kb >>
3518                         (inode->i_sb->s_blocksize_bits - 10);
3519 
3520         /*
3521          * five cases:
3522          * 1. split the extent into three extents.
3523          * 2. split the extent into two extents, zeroout the head of the first
3524          *    extent.
3525          * 3. split the extent into two extents, zeroout the tail of the second
3526          *    extent.
3527  * 4. split the extent into two extents without zeroout.
3528          * 5. no splitting needed, just possibly zeroout the head and / or the
3529          *    tail of the extent.
3530          */
3531         split_map.m_lblk = map->m_lblk;
3532         split_map.m_len = map->m_len;
3533 
3534         if (max_zeroout && (allocated > split_map.m_len)) {
3535                 if (allocated <= max_zeroout) {
3536                         /* case 3 or 5 */
3537                         zero_ex1.ee_block =
3538                                  cpu_to_le32(split_map.m_lblk +
3539                                              split_map.m_len);
3540                         zero_ex1.ee_len =
3541                                 cpu_to_le16(allocated - split_map.m_len);
3542                         ext4_ext_store_pblock(&zero_ex1,
3543                                 ext4_ext_pblock(ex) + split_map.m_lblk +
3544                                 split_map.m_len - ee_block);
3545                         err = ext4_ext_zeroout(inode, &zero_ex1);
3546                         if (err)
3547                                 goto out;
3548                         split_map.m_len = allocated;
3549                 }
3550                 if (split_map.m_lblk - ee_block + split_map.m_len <
3551                                                                 max_zeroout) {
3552                         /* case 2 or 5 */
3553                         if (split_map.m_lblk != ee_block) {
3554                                 zero_ex2.ee_block = ex->ee_block;
3555                                 zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk -
3556                                                         ee_block);
3557                                 ext4_ext_store_pblock(&zero_ex2,
3558                                                       ext4_ext_pblock(ex));
3559                                 err = ext4_ext_zeroout(inode, &zero_ex2);
3560                                 if (err)
3561                                         goto out;
3562                         }
3563 
3564                         split_map.m_len += split_map.m_lblk - ee_block;
3565                         split_map.m_lblk = ee_block;
3566                         allocated = map->m_len;
3567                 }
3568         }
3569 
3570         err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
3571                                 flags);
3572         if (err > 0)
3573                 err = 0;
3574 out:
3575         /* If we have gotten a failure, don't zero out status tree */
3576         if (!err) {
3577                 err = ext4_zeroout_es(inode, &zero_ex1);
3578                 if (!err)
3579                         err = ext4_zeroout_es(inode, &zero_ex2);
3580         }
3581         return err ? err : allocated;
3582 }
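
/*
 * Illustrative sketch, not in the original source: the max_zeroout value
 * used above converts the s_extent_max_zeroout_kb limit from KiB into file
 * system blocks.  For instance, with the limit set to 32 and a 4 KiB block
 * size (s_blocksize_bits == 12) this gives 32 >> (12 - 10) == 8 blocks.
 * The helper name below is hypothetical.
 */
static inline unsigned int ext4_max_zeroout_blocks_sketch(struct inode *inode)
{
        return EXT4_SB(inode->i_sb)->s_extent_max_zeroout_kb >>
                        (inode->i_sb->s_blocksize_bits - 10);
}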
3583 
3584 /*
3585  * This function is called by ext4_ext_map_blocks() from
3586  * ext4_get_blocks_dio_write() when DIO is used to write
3587  * to an unwritten extent.
3588  *
3589  * Writing to an unwritten extent may result in splitting the unwritten
3590  * extent into multiple initialized/unwritten extents (up to three)
3591  * There are three possibilities:
3592  *   a> There is no split required: Entire extent should be unwritten
3593  *   b> Splits in two extents: Write is happening at either end of the extent
3594  *   c> Splits in three extents: someone is writing in the middle of the extent
3595  *
3596  * This works the same way in the case of initialized -> unwritten conversion.
3597  *
3598  * One or more index blocks may be needed if the extent tree grows after
3599  * the unwritten extent is split. To prevent ENOSPC at IO completion
3600  * time, we need to split the unwritten extent before DIO submits
3601  * the IO. The unwritten extent handled at this time will be split
3602  * into at most three unwritten extents. After IO completes, the part
3603  * that was filled will be converted to initialized by the end_io callback
3604  * via ext4_convert_unwritten_extents().
3605  *
3606  * Returns the size of unwritten extent to be written on success.
3607  */
3608 static int ext4_split_convert_extents(handle_t *handle,
3609                                         struct inode *inode,
3610                                         struct ext4_map_blocks *map,
3611                                         struct ext4_ext_path **ppath,
3612                                         int flags)
3613 {
3614         struct ext4_ext_path *path = *ppath;
3615         ext4_lblk_t eof_block;
3616         ext4_lblk_t ee_block;
3617         struct ext4_extent *ex;
3618         unsigned int ee_len;
3619         int split_flag = 0, depth;
3620 
3621         ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
3622                   __func__, inode->i_ino,
3623                   (unsigned long long)map->m_lblk, map->m_len);
3624 
3625         eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
3626                         >> inode->i_sb->s_blocksize_bits;
3627         if (eof_block < map->m_lblk + map->m_len)
3628                 eof_block = map->m_lblk + map->m_len;
3629         /*
3630          * It is safe to convert extent to initialized via explicit
3631          * zeroout only if extent is fully inside i_size or new_size.
3632          */
3633         depth = ext_depth(inode);
3634         ex = path[depth].p_ext;
3635         ee_block = le32_to_cpu(ex->ee_block);
3636         ee_len = ext4_ext_get_actual_len(ex);
3637 
3638         /* Convert to unwritten */
3639         if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
3640                 split_flag |= EXT4_EXT_DATA_VALID1;
3641         /* Convert to initialized */
3642         } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
3643                 split_flag |= ee_block + ee_len <= eof_block ?
3644                               EXT4_EXT_MAY_ZEROOUT : 0;
3645                 split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
3646         }
3647         flags |= EXT4_GET_BLOCKS_PRE_IO;
3648         return ext4_split_extent(handle, inode, ppath, map, split_flag, flags);
3649 }
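
/*
 * Illustrative sketch, not part of the original source: the eof_block value
 * computed in the conversion helpers above is simply i_disksize rounded up
 * to whole blocks.  With a 4 KiB block size, an i_disksize of 10000 bytes
 * gives (10000 + 4095) >> 12 == 3, i.e. the file ends inside logical block
 * 2.  The helper name below is hypothetical.
 */
static inline ext4_lblk_t ext4_eof_block_sketch(struct inode *inode)
{
        return (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
                        >> inode->i_sb->s_blocksize_bits;
}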
3650 
3651 static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3652                                                 struct inode *inode,
3653                                                 struct ext4_map_blocks *map,
3654                                                 struct ext4_ext_path **ppath)
3655 {
3656         struct ext4_ext_path *path = *ppath;
3657         struct ext4_extent *ex;
3658         ext4_lblk_t ee_block;
3659         unsigned int ee_len;
3660         int depth;
3661         int err = 0;
3662 
3663         depth = ext_depth(inode);
3664         ex = path[depth].p_ext;
3665         ee_block = le32_to_cpu(ex->ee_block);
3666         ee_len = ext4_ext_get_actual_len(ex);
3667 
3668         ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
3669                 " block %llu, max_blocks %u\n", inode->i_ino,
3670                   (unsigned long long)ee_block, ee_len);
3671 
3672         /* If the extent is larger than requested, it is a clear sign that we
3673          * still have some extent state machine issues left, so an extent split
3674          * is still required.
3675          * TODO: Once all related issues are fixed, this situation should be
3676          * illegal.
3677          */
3678         if (ee_block != map->m_lblk || ee_len > map->m_len) {
3679 #ifdef CONFIG_EXT4_DEBUG
3680                 ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu,"
3681                              " len %u; IO logical block %llu, len %u",
3682                              inode->i_ino, (unsigned long long)ee_block, ee_len,
3683                              (unsigned long long)map->m_lblk, map->m_len);
3684 #endif
3685                 err = ext4_split_convert_extents(handle, inode, map, ppath,
3686                                                  EXT4_GET_BLOCKS_CONVERT);
3687                 if (err < 0)
3688                         return err;
3689                 path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
3690                 if (IS_ERR(path))
3691                         return PTR_ERR(path);
3692                 depth = ext_depth(inode);
3693                 ex = path[depth].p_ext;
3694         }
3695 
3696         err = ext4_ext_get_access(handle, inode, path + depth);
3697         if (err)
3698                 goto out;
3699         /* first mark the extent as initialized */
3700         ext4_ext_mark_initialized(ex);
3701 
3702         /* note: ext4_ext_correct_indexes() isn't needed here because
3703          * borders are not changed
3704          */
3705         ext4_ext_try_to_merge(handle, inode, path, ex);
3706 
3707         /* Mark modified extent as dirty */
3708         err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3709 out:
3710         ext4_ext_show_leaf(inode, path);
3711         return err;
3712 }
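
/*
 * Illustrative sketch, not in the original source: the unwritten state of an
 * extent lives in the top bit of ee_len, which is why the code above reads
 * lengths through ext4_ext_get_actual_len() rather than from ee_len
 * directly.  A re-statement of that accessor, with a hypothetical name:
 */
static inline unsigned int ext4_actual_len_sketch(struct ext4_extent *ex)
{
        unsigned int len = le16_to_cpu(ex->ee_len);

        /* lengths above EXT_INIT_MAX_LEN encode "unwritten" plus the length */
        return len <= EXT_INIT_MAX_LEN ? len : len - EXT_INIT_MAX_LEN;
}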
3713 
3714 static int
3715 convert_initialized_extent(handle_t *handle, struct inode *inode,
3716                            struct ext4_map_blocks *map,
3717                            struct ext4_ext_path **ppath,
3718                            unsigned int *allocated)
3719 {
3720         struct ext4_ext_path *path = *ppath;
3721         struct ext4_extent *ex;
3722         ext4_lblk_t ee_block;
3723         unsigned int ee_len;
3724         int depth;
3725         int err = 0;
3726 
3727         /*
3728          * Make sure that the extent is no bigger than we support with
3729          * an unwritten extent
3730          */
3731         if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
3732                 map->m_len = EXT_UNWRITTEN_MAX_LEN / 2;
3733 
3734         depth = ext_depth(inode);
3735         ex = path[depth].p_ext;
3736         ee_block = le32_to_cpu(ex->ee_block);
3737         ee_len = ext4_ext_get_actual_len(ex);
3738 
3739         ext_debug("%s: inode %lu, logical"
3740                 " block %llu, max_blocks %u\n", __func__, inode->i_ino,
3741                   (unsigned long long)ee_block, ee_len);
3742 
3743         if (ee_block != map->m_lblk || ee_len > map->m_len) {
3744                 err = ext4_split_convert_extents(handle, inode, map, ppath,
3745                                 EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
3746                 if (err < 0)
3747                         return err;
3748                 path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
3749                 if (IS_ERR(path))
3750                         return PTR_ERR(path);
3751                 depth = ext_depth(inode);
3752                 ex = path[depth].p_ext;
3753                 if (!ex) {
3754                         EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3755                                          (unsigned long) map->m_lblk);
3756                         return -EFSCORRUPTED;
3757                 }
3758         }
3759 
3760         err = ext4_ext_get_access(handle, inode, path + depth);
3761         if (err)
3762                 return err;
3763         /* first mark the extent as unwritten */
3764         ext4_ext_mark_unwritten(ex);
3765 
3766         /* note: ext4_ext_correct_indexes() isn't needed here because
3767          * borders are not changed
3768          */
3769         ext4_ext_try_to_merge(handle, inode, path, ex);
3770 
3771         /* Mark modified extent as dirty */
3772         err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3773         if (err)
3774                 return err;
3775         ext4_ext_show_leaf(inode, path);
3776 
3777         ext4_update_inode_fsync_trans(handle, inode, 1);
3778 
3779         map->m_flags |= EXT4_MAP_UNWRITTEN;
3780         if (*allocated > map->m_len)
3781                 *allocated = map->m_len;
3782         map->m_len = *allocated;
3783         return 0;
3784 }
3785 
3786 static int
3787 ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
3788                         struct ext4_map_blocks *map,
3789                         struct ext4_ext_path **ppath, int flags,
3790                         unsigned int allocated, ext4_fsblk_t newblock)
3791 {
3792 #ifdef EXT_DEBUG
3793         struct ext4_ext_path *path = *ppath;
3794 #endif
3795         int ret = 0;
3796         int err = 0;
3797 
3798         ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
3799                   "block %llu, max_blocks %u, flags %x, allocated %u\n",
3800                   inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
3801                   flags, allocated);
3802         ext4_ext_show_leaf(inode, path);
3803 
3804         /*
3805          * When writing into unwritten space, we should not fail to
3806          * allocate metadata blocks for the new extent block if needed.
3807          */
3808         flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
3809 
3810         trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
3811                                                     allocated, newblock);
3812 
3813         /* get_block() before submitting the IO, split the extent */
3814         if (flags & EXT4_GET_BLOCKS_PRE_IO) {
3815                 ret = ext4_split_convert_extents(handle, inode, map, ppath,
3816                                          flags | EXT4_GET_BLOCKS_CONVERT);
3817                 if (ret <= 0)
3818                         goto out;
3819                 map->m_flags |= EXT4_MAP_UNWRITTEN;
3820                 goto out;
3821         }
3822         /* IO end_io complete, convert the filled extent to written */
3823         if (flags & EXT4_GET_BLOCKS_CONVERT) {
3824                 if (flags & EXT4_GET_BLOCKS_ZERO) {
3825                         if (allocated > map->m_len)
3826                                 allocated = map->m_len;
3827                         err = ext4_issue_zeroout(inode, map->m_lblk, newblock,
3828                                                  allocated);
3829                         if (err < 0)
3830                                 goto out2;
3831                 }
3832                 ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
3833                                                            ppath);
3834                 if (ret >= 0)
3835                         ext4_update_inode_fsync_trans(handle, inode, 1);
3836                 else
3837                         err = ret;
3838                 map->m_flags |= EXT4_MAP_MAPPED;
3839                 map->m_pblk = newblock;
3840                 if (allocated > map->m_len)
3841                         allocated = map->m_len;
3842                 map->m_len = allocated;
3843                 goto out2;
3844         }
3845         /* buffered IO case */
3846         /*
3847          * repeated fallocate creation request:
3848          * we already have an unwritten extent
3849          */
3850         if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
3851                 map->m_flags |= EXT4_MAP_UNWRITTEN;
3852                 goto map_out;
3853         }
3854 
3855         /* buffered READ or buffered write_begin() lookup */
3856         if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3857                 /*
3858                  * We have blocks reserved already.  We
3859                  * return allocated blocks so that delalloc
3860                  * won't do block reservation for us.  But
3861                  * the buffer head will be unmapped so that
3862                  * a read from the block returns 0s.
3863                  */
3864                 map->m_flags |= EXT4_MAP_UNWRITTEN;
3865                 goto out1;
3866         }
3867 
3868         /* buffered write, writepage time, convert */
3869         ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags);
3870         if (ret >= 0)
3871                 ext4_update_inode_fsync_trans(handle, inode, 1);
3872 out:
3873         if (ret <= 0) {
3874                 err = ret;
3875                 goto out2;
3876         } else
3877                 allocated = ret;
3878         map->m_flags |= EXT4_MAP_NEW;
3879         if (allocated > map->m_len)
3880                 allocated = map->m_len;
3881         map->m_len = allocated;
3882 
3883 map_out:
3884         map->m_flags |= EXT4_MAP_MAPPED;
3885 out1:
3886         if (allocated > map->m_len)
3887                 allocated = map->m_len;
3888         ext4_ext_show_leaf(inode, path);
3889         map->m_pblk = newblock;
3890         map->m_len = allocated;
3891 out2:
3892         return err ? err : allocated;
3893 }
3894 
3895 /*
3896  * get_implied_cluster_alloc - check to see if the requested
3897  * allocation (in the map structure) overlaps with a cluster already
3898  * allocated in an extent.
3899  *      @sb     The filesystem superblock structure
3900  *      @map    The requested lblk->pblk mapping
3901  *      @ex     The extent structure which might contain an implied
3902  *                      cluster allocation
3903  *
3904  * This function is called by ext4_ext_map_blocks() after we failed to
3905  * find blocks that were already in the inode's extent tree.  Hence,
3906  * we know that the beginning of the requested region cannot overlap
3907  * the extent from the inode's extent tree.  There are three cases we
3908  * want to catch.  The first is this case:
3909  *
3910  *               |--- cluster # N--|
3911  *    |--- extent ---|  |---- requested region ---|
3912  *                      |==========|
3913  *
3914  * The second case that we need to test for is this one:
3915  *
3916  *   |--------- cluster # N ----------------|
3917  *         |--- requested region --|   |------- extent ----|
3918  *         |=======================|
3919  *
3920  * The third case is when the requested region lies between two extents
3921  * within the same cluster:
3922  *          |------------- cluster # N-------------|
3923  * |----- ex -----|                  |---- ex_right ----|
3924  *                  |------ requested region ------|
3925  *                  |================|
3926  *
3927  * In each of the above cases, we need to set map->m_pblk and
3928  * map->m_len so that they correspond to the extent labelled as
3929  * "|====|" from cluster #N, since it is already in use for data in
3930  * cluster EXT4_B2C(sbi, map->m_lblk).  We will then return 1 to
3931  * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
3932  * as a new "allocated" block region.  Otherwise, we will return 0 and
3933  * ext4_ext_map_blocks() will then allocate one or more new clusters
3934  * by calling ext4_mb_new_blocks().
3935  */
3936 static int get_implied_cluster_alloc(struct super_block *sb,
3937                                      struct ext4_map_blocks *map,
3938                                      struct ext4_extent *ex,
3939                                      struct ext4_ext_path *path)
3940 {
3941         struct ext4_sb_info *sbi = EXT4_SB(sb);
3942         ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
3943         ext4_lblk_t ex_cluster_start, ex_cluster_end;
3944         ext4_lblk_t rr_cluster_start;
3945         ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3946         ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3947         unsigned short ee_len = ext4_ext_get_actual_len(ex);
3948 
3949         /* The extent passed in that we are trying to match */
3950         ex_cluster_start = EXT4_B2C(sbi, ee_block);
3951         ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
3952 
3953         /* The requested region passed into ext4_map_blocks() */
3954         rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
3955 
3956         if ((rr_cluster_start == ex_cluster_end) ||
3957             (rr_cluster_start == ex_cluster_start)) {
3958                 if (rr_cluster_start == ex_cluster_end)
3959                         ee_start += ee_len - 1;
3960                 map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
3961                 map->m_len = min(map->m_len,
3962                                  (unsigned) sbi->s_cluster_ratio - c_offset);
3963                 /*
3964                  * Check for and handle this case:
3965                  *
3966                  *   |--------- cluster # N-------------|
3967                  *                     |------- extent ----|
3968                  *         |--- requested region ---|
3969                  *         |===========|
3970                  */
3971 
3972                 if (map->m_lblk < ee_block)
3973                         map->m_len = min(map->m_len, ee_block - map->m_lblk);
3974 
3975                 /*
3976                  * Check for the case where there is already another allocated
3977                  * block to the right of 'ex' but before the end of the cluster.
3978                  *
3979                  *          |------------- cluster # N-------------|
3980                  * |----- ex -----|                  |---- ex_right ----|
3981                  *                  |------ requested region ------|
3982                  *                  |================|
3983                  */
3984                 if (map->m_lblk > ee_block) {
3985                         ext4_lblk_t next = ext4_ext_next_allocated_block(path);
3986                         map->m_len = min(map->m_len, next - map->m_lblk);
3987                 }
3988 
3989                 trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
3990                 return 1;
3991         }
3992 
3993         trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
3994         return 0;
3995 }
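
/*
 * Illustration (not part of the kernel source): a minimal userspace-style
 * sketch of the cluster arithmetic used above, assuming a power-of-two
 * cluster ratio.  B2C(), COFF() and CMASK() are simplified stand-ins for
 * EXT4_B2C(), EXT4_LBLK_COFF() and EXT4_PBLK_CMASK(); the numbers are made
 * up for the example.
 *
 *        #define B2C(ratio, blk)   ((blk) / (ratio))
 *        #define COFF(ratio, blk)  ((blk) & ((ratio) - 1))
 *        #define CMASK(ratio, blk) ((blk) & ~(unsigned long long)((ratio) - 1))
 *
 *        unsigned int ratio = 16;                // blocks per cluster
 *        unsigned int lblk = 101;                // requested logical block
 *        unsigned int ee_block = 96;             // extent starts here ...
 *        unsigned long long ee_start = 2048;     // ... and maps to this pblk
 *
 *        // B2C(ratio, lblk) == B2C(ratio, ee_block) == 6, so the request
 *        // falls into a cluster that is already partially allocated; the
 *        // implied physical block is the cluster base plus the in-cluster
 *        // offset:
 *        unsigned long long pblk = CMASK(ratio, ee_start) + COFF(ratio, lblk);
 *        // pblk == 2048 + 5 == 2053, returned via map->m_pblk, with
 *        // map->m_len trimmed so the mapping does not cross the cluster
 *        // boundary.
 */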
3996 
3997 
3998 /*
3999  * Block allocation/map/preallocation routine for extents based files
4000  *
4001  *
4002  * Need to be called with
4003  * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
4004  * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
4005  *
4006  * return > 0, number of blocks already mapped/allocated
4007  *          if create == 0 and these are pre-allocated blocks
4008  *              buffer head is unmapped
4009  *          otherwise blocks are mapped
4010  *
4011  * return = 0, if plain look up failed (blocks have not been allocated)
4012  *          buffer head is unmapped
4013  *
4014  * return < 0, error case.
4015  */
4016 int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4017                         struct ext4_map_blocks *map, int flags)
4018 {
4019         struct ext4_ext_path *path = NULL;
4020         struct ext4_extent newex, *ex, *ex2;
4021         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4022         ext4_fsblk_t newblock = 0;
4023         int err = 0, depth, ret;
4024         unsigned int allocated = 0, offset = 0;
4025         unsigned int allocated_clusters = 0;
4026         struct ext4_allocation_request ar;
4027         ext4_lblk_t cluster_offset;
4028 
4029         ext_debug("blocks %u/%u requested for inode %lu\n",
4030                   map->m_lblk, map->m_len, inode->i_ino);
4031         trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
4032 
4033         /* find extent for this block */
4034         path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
4035         if (IS_ERR(path)) {
4036                 err = PTR_ERR(path);
4037                 path = NULL;
4038                 goto out2;
4039         }
4040 
4041         depth = ext_depth(inode);
4042 
4043         /*
4044          * a consistent leaf must not be empty;
4045          * this situation is possible, though, _during_ tree modification;
4046          * this is why the assert can't be put in ext4_find_extent()
4047          */
4048         if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
4049                 EXT4_ERROR_INODE(inode, "bad extent address "
4050                                  "lblock: %lu, depth: %d pblock %lld",
4051                                  (unsigned long) map->m_lblk, depth,
4052                                  path[depth].p_block);
4053                 err = -EFSCORRUPTED;
4054                 goto out2;
4055         }
4056 
4057         ex = path[depth].p_ext;
4058         if (ex) {
4059                 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
4060                 ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4061                 unsigned short ee_len;
4062 
4063 
4064                 /*
4065                  * unwritten extents are treated as holes, except that
4066                  * we split out initialized portions during a write.
4067                  */
4068                 ee_len = ext4_ext_get_actual_len(ex);
4069 
4070                 trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
4071 
4072                 /* if the found extent covers the block, simply return it */
4073                 if (in_range(map->m_lblk, ee_block, ee_len)) {
4074                         newblock = map->m_lblk - ee_block + ee_start;
4075                         /* number of remaining blocks in the extent */
4076                         allocated = ee_len - (map->m_lblk - ee_block);
4077                         ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
4078                                   ee_block, ee_len, newblock);
4079 
4080                         /*
4081                          * If the extent is initialized check whether the
4082                          * caller wants to convert it to unwritten.
4083                          */
4084                         if ((!ext4_ext_is_unwritten(ex)) &&
4085                             (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
4086                                 err = convert_initialized_extent(handle,
4087                                         inode, map, &path, &allocated);
4088                                 goto out2;
4089                         } else if (!ext4_ext_is_unwritten(ex)) {
4090                                 goto out;
4091                         }
4092 
4093                         ret = ext4_ext_handle_unwritten_extents(
4094                                 handle, inode, map, &path, flags,
4095                                 allocated, newblock);
4096                         if (ret < 0)
4097                                 err = ret;
4098                         else
4099                                 allocated = ret;
4100                         goto out2;
4101                 }
4102         }
4103 
4104         /*
4105          * requested block isn't allocated yet;
4106          * we shouldn't try to create blocks if the create flag is zero
4107          */
4108         if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4109                 ext4_lblk_t hole_start, hole_len;
4110 
4111                 hole_start = map->m_lblk;
4112                 hole_len = ext4_ext_determine_hole(inode, path, &hole_start);
4113                 /*
4114                  * put the just-found gap into the cache to speed up
4115                  * subsequent requests
4116                  */
4117                 ext4_ext_put_gap_in_cache(inode, hole_start, hole_len);
4118 
4119                 /* Update hole_len to reflect hole size after map->m_lblk */
4120                 if (hole_start != map->m_lblk)
4121                         hole_len -= map->m_lblk - hole_start;
4122                 map->m_pblk = 0;
4123                 map->m_len = min_t(unsigned int, map->m_len, hole_len);
4124 
4125                 goto out2;
4126         }
4127 
4128         /*
4129          * Okay, we need to do block allocation.
4130          */
4131         newex.ee_block = cpu_to_le32(map->m_lblk);
4132         cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4133 
4134         /*
4135          * If we are doing bigalloc, check to see if the extent returned
4136          * by ext4_find_extent() implies a cluster we can use.
4137          */
4138         if (cluster_offset && ex &&
4139             get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
4140                 ar.len = allocated = map->m_len;
4141                 newblock = map->m_pblk;
4142                 goto got_allocated_blocks;
4143         }
4144 
4145         /* find neighbour allocated blocks */
4146         ar.lleft = map->m_lblk;
4147         err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
4148         if (err)
4149                 goto out2;
4150         ar.lright = map->m_lblk;
4151         ex2 = NULL;
4152         err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
4153         if (err)
4154                 goto out2;
4155 
4156         /* Check if the extent after searching to the right implies a
4157          * cluster we can use. */
4158         if ((sbi->s_cluster_ratio > 1) && ex2 &&
4159             get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
4160                 ar.len = allocated = map->m_len;
4161                 newblock = map->m_pblk;
4162                 goto got_allocated_blocks;
4163         }
4164 
4165         /*
4166          * See if request is beyond maximum number of blocks we can have in
4167          * a single extent. For an initialized extent this limit is
4168          * EXT_INIT_MAX_LEN and for an unwritten extent this limit is
4169          * EXT_UNWRITTEN_MAX_LEN.
4170          */
4171         if (map->m_len > EXT_INIT_MAX_LEN &&
4172             !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4173                 map->m_len = EXT_INIT_MAX_LEN;
4174         else if (map->m_len > EXT_UNWRITTEN_MAX_LEN &&
4175                  (flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4176                 map->m_len = EXT_UNWRITTEN_MAX_LEN;
4177 
4178         /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
4179         newex.ee_len = cpu_to_le16(map->m_len);
4180         err = ext4_ext_check_overlap(sbi, inode, &newex, path);
4181         if (err)
4182                 allocated = ext4_ext_get_actual_len(&newex);
4183         else
4184                 allocated = map->m_len;
4185 
4186         /* allocate new block */
4187         ar.inode = inode;
4188         ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
4189         ar.logical = map->m_lblk;
4190         /*
4191          * We calculate the offset from the beginning of the cluster
4192          * for the logical block number, since when we allocate a
4193          * physical cluster, the physical block should start at the
4194          * same offset from the beginning of the cluster.  This is
4195          * needed so that future calls to get_implied_cluster_alloc()
4196          * work correctly.
4197          */
4198         offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4199         ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
4200         ar.goal -= offset;
4201         ar.logical -= offset;
4202         if (S_ISREG(inode->i_mode))
4203                 ar.flags = EXT4_MB_HINT_DATA;
4204         else
4205                 /* disable in-core preallocation for non-regular files */
4206                 ar.flags = 0;
4207         if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
4208                 ar.flags |= EXT4_MB_HINT_NOPREALLOC;
4209         if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
4210                 ar.flags |= EXT4_MB_DELALLOC_RESERVED;
4211         if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
4212                 ar.flags |= EXT4_MB_USE_RESERVED;
4213         newblock = ext4_mb_new_blocks(handle, &ar, &err);
4214         if (!newblock)
4215                 goto out2;
4216         ext_debug("allocate new block: goal %llu, found %llu/%u\n",
4217                   ar.goal, newblock, allocated);
4218         allocated_clusters = ar.len;
4219         ar.len = EXT4_C2B(sbi, ar.len) - offset;
4220         if (ar.len > allocated)
4221                 ar.len = allocated;
4222 
4223 got_allocated_blocks:
4224         /* try to insert new extent into found leaf and return */
4225         ext4_ext_store_pblock(&newex, newblock + offset);
4226         newex.ee_len = cpu_to_le16(ar.len);
4227         /* Mark unwritten */
4228         if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
4229                 ext4_ext_mark_unwritten(&newex);
4230                 map->m_flags |= EXT4_MAP_UNWRITTEN;
4231         }
4232 
4233         err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags);
4234         if (err) {
4235                 if (allocated_clusters) {
4236                         int fb_flags = 0;
4237 
4238                         /*
4239                          * Free the data blocks we just allocated.
4240                          * It's not a good idea to call discard here directly,
4241                          * but otherwise we'd need to call it at every free().
4242                          */
4243                         ext4_discard_preallocations(inode);
4244                         if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
4245                                 fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE;
4246                         ext4_free_blocks(handle, inode, NULL, newblock,
4247                                          EXT4_C2B(sbi, allocated_clusters),
4248                                          fb_flags);
4249                 }
4250                 goto out2;
4251         }
4252 
4253         /* the previous routine could have used the block we allocated */
4254         newblock = ext4_ext_pblock(&newex);
4255         allocated = ext4_ext_get_actual_len(&newex);
4256         if (allocated > map->m_len)
4257                 allocated = map->m_len;
4258         map->m_flags |= EXT4_MAP_NEW;
4259 
4260         /*
4261          * Reduce the reserved cluster count to reflect successful deferred
4262          * allocation of delayed allocated clusters or direct allocation of
4263          * clusters discovered to be delayed allocated.  Once allocated, a
4264          * cluster is not included in the reserved count.
4265          */
4266         if (test_opt(inode->i_sb, DELALLOC) && allocated_clusters) {
4267                 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4268                         /*
4269                          * When allocating delayed allocated clusters, simply
4270                          * reduce the reserved cluster count and claim quota
4271                          */
4272                         ext4_da_update_reserve_space(inode, allocated_clusters,
4273                                                         1);
4274                 } else {
4275                         ext4_lblk_t lblk, len;
4276                         unsigned int n;
4277 
4278                         /*
4279                          * When allocating non-delayed allocated clusters
4280                          * (from fallocate, filemap, DIO, or clusters
4281                          * allocated when delalloc has been disabled by
4282                          * ext4_nonda_switch), reduce the reserved cluster
4283                          * count by the number of allocated clusters that
4284                          * have previously been delayed allocated.  Quota
4285                          * has been claimed by ext4_mb_new_blocks() above,
4286                          * so release the quota reservations made for any
4287                          * previously delayed allocated clusters.
4288                          */
4289                         lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk);
4290                         len = allocated_clusters << sbi->s_cluster_bits;
4291                         n = ext4_es_delayed_clu(inode, lblk, len);
4292                         if (n > 0)
4293                                 ext4_da_update_reserve_space(inode, (int) n, 0);
4294                 }
4295         }
4296 
4297         /*
4298          * Cache the extent and update transaction to commit on fdatasync only
4299          * when it is _not_ an unwritten extent.
4300          */
4301         if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)
4302                 ext4_update_inode_fsync_trans(handle, inode, 1);
4303         else
4304                 ext4_update_inode_fsync_trans(handle, inode, 0);
4305 out:
4306         if (allocated > map->m_len)
4307                 allocated = map->m_len;
4308         ext4_ext_show_leaf(inode, path);
4309         map->m_flags |= EXT4_MAP_MAPPED;
4310         map->m_pblk = newblock;
4311         map->m_len = allocated;
4312 out2:
4313         ext4_ext_drop_refs(path);
4314         kfree(path);
4315 
4316         trace_ext4_ext_map_blocks_exit(inode, flags, map,
4317                                        err ? err : allocated);
4318         return err ? err : allocated;
4319 }
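
/*
 * Illustration (not part of the kernel source): a hedged sketch of how a
 * caller consumes the return convention documented above.  The variables
 * handle, inode, first_block and nr_blocks are assumed caller context, not
 * definitions from this file; flags == 0 means a plain lookup without
 * allocation.
 *
 *        struct ext4_map_blocks map = {
 *                .m_lblk = first_block,
 *                .m_len  = nr_blocks,
 *        };
 *        int ret = ext4_map_blocks(handle, inode, &map, 0);
 *
 *        if (ret < 0) {
 *                // error, e.g. -EFSCORRUPTED or -ENOMEM
 *        } else if (ret == 0) {
 *                // hole: nothing is mapped and map.m_len has been trimmed
 *                // to the size of the hole after map.m_lblk
 *        } else {
 *                // ret blocks are mapped starting at physical block
 *                // map.m_pblk; EXT4_MAP_UNWRITTEN is set if the extent is
 *                // still unwritten
 *        }
 */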
4320 
4321 int ext4_ext_truncate(handle_t *handle, struct inode *inode)
4322 {
4323         struct super_block *sb = inode->i_sb;
4324         ext4_lblk_t last_block;
4325         int err = 0;
4326 
4327         /*
4328          * TODO: optimization is possible here.
4329          * Probably we need not scan at all,
4330          * because page truncation is enough.
4331          */
4332 
4333         /* we have to know where to truncate from in crash case */
4334         EXT4_I(inode)->i_disksize = inode->i_size;
4335         err = ext4_mark_inode_dirty(handle, inode);
4336         if (err)
4337                 return err;
4338 
4339         last_block = (inode->i_size + sb->s_blocksize - 1)
4340                         >> EXT4_BLOCK_SIZE_BITS(sb);
4341 retry:
4342         err = ext4_es_remove_extent(inode, last_block,
4343                                     EXT_MAX_BLOCKS - last_block);
4344         if (err == -ENOMEM) {
4345                 cond_resched();
4346                 congestion_wait(BLK_RW_ASYNC, HZ/50);
4347                 goto retry;
4348         }
4349         if (err)
4350                 return err;
4351         return ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
4352 }
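
/*
 * Worked example (not from the source): with a 4096-byte block size,
 * EXT4_BLOCK_SIZE_BITS(sb) is 12, so ext4_ext_truncate() on an inode with
 * i_size == 10000 computes last_block = (10000 + 4095) >> 12 = 3; logical
 * blocks 3 .. EXT_MAX_BLOCKS are then dropped from the extent status tree
 * and removed from the extent tree.
 */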
4353 
4354 static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
4355                                   ext4_lblk_t len, loff_t new_size,
4356                                   int flags)
4357 {
4358         struct inode *inode = file_inode(file);
4359         handle_t *handle;
4360         int ret = 0;
4361         int ret2 = 0, ret3 = 0;
4362         int retries = 0;
4363         int depth = 0;
4364         struct ext4_map_blocks map;
4365         unsigned int credits;
4366         loff_t epos;
4367 
4368         BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
4369         map.m_lblk = offset;
4370         map.m_len = len;
4371         /*
4372          * Don't normalize the request if it can fit in one extent so
4373          * that it doesn't get unnecessarily split into multiple
4374          * extents.
4375          */
4376         if (len <= EXT_UNWRITTEN_MAX_LEN)
4377                 flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
4378 
4379         /*
4380          * credits to insert 1 extent into extent tree
4381          */
4382         credits = ext4_chunk_trans_blocks(inode, len);
4383         depth = ext_depth(inode);
4384 
4385 retry:
4386         while (ret >= 0 && len) {
4387                 /*
4388                  * Recalculate credits when extent tree depth changes.
4389                  */
4390                 if (depth != ext_depth(inode)) {
4391                         credits = ext4_chunk_trans_blocks(inode, len);
4392                         depth = ext_depth(inode);
4393                 }
4394 
4395                 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4396                                             credits);
4397                 if (IS_ERR(handle)) {
4398                         ret = PTR_ERR(handle);
4399                         break;
4400                 }
4401                 ret = ext4_map_blocks(handle, inode, &map, flags);
4402                 if (ret <= 0) {
4403                         ext4_debug("inode #%lu: block %u: len %u: "
4404                                    "ext4_ext_map_blocks returned %d",
4405                                    inode->i_ino, map.m_lblk,
4406                                    map.m_len, ret);
4407                         ext4_mark_inode_dirty(handle, inode);
4408                         ret2 = ext4_journal_stop(handle);
4409                         break;
4410                 }
4411                 map.m_lblk += ret;
4412                 map.m_len = len = len - ret;
4413                 epos = (loff_t)map.m_lblk << inode->i_blkbits;
4414                 inode->i_ctime = current_time(inode);
4415                 if (new_size) {
4416                         if (epos > new_size)
4417                                 epos = new_size;
4418                         if (ext4_update_inode_size(inode, epos) & 0x1)
4419                                 inode->i_mtime = inode->i_ctime;
4420                 }
4421                 ret2 = ext4_mark_inode_dirty(handle, inode);
4422                 ext4_update_inode_fsync_trans(handle, inode, 1);
4423                 ret3 = ext4_journal_stop(handle);
4424                 ret2 = ret3 ? ret3 : ret2;
4425                 if (unlikely(ret2))
4426                         break;
4427         }
4428         if (ret == -ENOSPC &&
4429                         ext4_should_retry_alloc(inode->i_sb, &retries)) {
4430                 ret = 0;
4431                 goto retry;
4432         }
4433 
4434         return ret > 0 ? ret2 : ret;
4435 }
4436 
4437 static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
4438 
4439 static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
4440 
4441 static long ext4_zero_range(struct file *file, loff_t offset,
4442                             loff_t len, int mode)
4443 {
4444         struct inode *inode = file_inode(file);
4445         handle_t *handle = NULL;
4446         unsigned int max_blocks;
4447         loff_t new_size = 0;
4448         int ret = 0;
4449         int flags;
4450         int credits;
4451         int partial_begin, partial_end;
4452         loff_t start, end;
4453         ext4_lblk_t lblk;
4454         unsigned int blkbits = inode->i_blkbits;
4455 
4456         trace_ext4_zero_range(inode, offset, len, mode);
4457 
4458         /* Call ext4_force_commit to flush all data in case of data=journal. */
4459         if (ext4_should_journal_data(inode)) {
4460                 ret = ext4_force_commit(inode->i_sb);
4461                 if (ret)
4462                         return ret;
4463         }
4464 
4465         /*
4466          * Round up offset. This is not fallocate, we need to zero out
4467          * blocks, so convert the interior block-aligned part of the range to
4468          * unwritten and possibly manually zero out unaligned parts of the
4469          * range.
4470          */
4471         start = round_up(offset, 1 << blkbits);
4472         end = round_down((offset + len), 1 << blkbits);
4473 
4474         if (start < offset || end > offset + len)
4475                 return -EINVAL;
4476         partial_begin = offset & ((1 << blkbits) - 1);
4477         partial_end = (offset + len) & ((1 << blkbits) - 1);
4478 
4479         lblk = start >> blkbits;
4480         max_blocks = (end >> blkbits);
4481         if (max_blocks < lblk)
4482                 max_blocks = 0;
4483         else
4484                 max_blocks -= lblk;
4485 
4486         inode_lock(inode);
4487 
4488         /*
4489          * Indirect files do not support unwritten extents
4490          */
4491         if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4492                 ret = -EOPNOTSUPP;
4493                 goto out_mutex;
4494         }
4495 
4496         if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4497             (offset + len > inode->i_size ||
4498              offset + len > EXT4_I(inode)->i_disksize)) {
4499                 new_size = offset + len;
4500                 ret = inode_newsize_ok(inode, new_size);
4501                 if (ret)
4502                         goto out_mutex;
4503         }
4504 
4505         flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
4506         if (mode & FALLOC_FL_KEEP_SIZE)
4507                 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4508 
4509         /* Wait for all existing dio workers; newcomers will block on i_mutex */
4510         inode_dio_wait(inode);
4511 
4512         /* Preallocate the range including the unaligned edges */
4513         if (partial_begin || partial_end) {
4514                 ret = ext4_alloc_file_blocks(file,
4515                                 round_down(offset, 1 << blkbits) >> blkbits,
4516                                 (round_up((offset + len), 1 << blkbits) -
4517                                  round_down(offset, 1 << blkbits)) >> blkbits,
4518                                 new_size, flags);
4519                 if (ret)
4520                         goto out_mutex;
4521 
4522         }
4523 
4524         /* Zero range excluding the unaligned edges */
4525         if (max_blocks > 0) {
4526                 flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
4527                           EXT4_EX_NOCACHE);
4528 
4529                 /*
4530                  * Prevent page faults from reinstantiating pages we have
4531                  * released from page cache.
4532                  */
4533                 down_write(&EXT4_I(inode)->i_mmap_sem);
4534 
4535                 ret = ext4_break_layouts(inode);
4536                 if (ret) {
4537                         up_write(&EXT4_I(inode)->i_mmap_sem);
4538                         goto out_mutex;
4539                 }
4540 
4541                 ret = ext4_update_disksize_before_punch(inode, offset, len);
4542                 if (ret) {
4543                         up_write(&EXT4_I(inode)->i_mmap_sem);
4544                         goto out_mutex;
4545                 }
4546                 /* Now release the pages and zero block aligned part of pages */
4547                 truncate_pagecache_range(inode, start, end - 1);
4548                 inode->i_mtime = inode->i_ctime = current_time(inode);
4549 
4550                 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
4551                                              flags);
4552                 up_write(&EXT4_I(inode)->i_mmap_sem);
4553                 if (ret)
4554                         goto out_mutex;
4555         }
4556         if (!partial_begin && !partial_end)
4557                 goto out_mutex;
4558 
4559         /*
4560          * In the worst case we have to write out two non-adjacent unwritten
4561          * blocks and update the inode
4562          */
4563         credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1;
4564         if (ext4_should_journal_data(inode))
4565                 credits += 2;
4566         handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
4567         if (IS_ERR(handle)) {
4568                 ret = PTR_ERR(handle);
4569                 ext4_std_error(inode->i_sb, ret);
4570                 goto out_mutex;
4571         }
4572 
4573         inode->i_mtime = inode->i_ctime = current_time(inode);
4574         if (new_size)
4575                 ext4_update_inode_size(inode, new_size);
4576         ret = ext4_mark_inode_dirty(handle, inode);
4577         if (unlikely(ret))
4578                 goto out_handle;
4579 
4580         /* Zero out partial block at the edges of the range */
4581         ret = ext4_zero_partial_blocks(handle, inode, offset, len);
4582         if (ret >= 0)
4583                 ext4_update_inode_fsync_trans(handle, inode, 1);
4584 
4585         if (file->f_flags & O_SYNC)
4586                 ext4_handle_sync(handle);
4587 
4588 out_handle:
4589         ext4_journal_stop(handle);
4590 out_mutex:
4591         inode_unlock(inode);
4592         return ret;
4593 }
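
/*
 * Illustration (not part of the kernel source): a minimal userspace sketch
 * that reaches ext4_zero_range() above through fallocate(2).  The path name
 * and byte values are arbitrary and error handling is reduced to a minimum.
 *
 *        #define _GNU_SOURCE
 *        #include <fcntl.h>
 *        #include <unistd.h>
 *        #include <linux/falloc.h>
 *
 *        int zero_middle(const char *path)
 *        {
 *                int fd = open(path, O_RDWR);
 *                int ret;
 *
 *                if (fd < 0)
 *                        return -1;
 *                // Zero roughly 1 MB starting at byte 1000 without changing
 *                // i_size.  Block-aligned parts become unwritten extents;
 *                // the unaligned head and tail (partial_begin/partial_end
 *                // above) are zeroed in place through the page cache.
 *                ret = fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
 *                                1000, 1000000);
 *                close(fd);
 *                return ret;
 *        }
 */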
4594 
4595 /*
4596  * Preallocate space for a file. This implements ext4's fallocate file
4597  * operation, which gets called from the sys_fallocate system call.
4598  * For block-mapped files, posix_fallocate should fall back to the method
4599  * of writing zeroes to the required new blocks (the same behavior that is
4600  * expected for file systems which do not support the fallocate() system call).
4601  */
4602 long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4603 {
4604         struct inode *inode = file_inode(file);
4605         loff_t new_size = 0;
4606         unsigned int max_blocks;
4607         int ret = 0;
4608         int flags;
4609         ext4_lblk_t lblk;
4610         unsigned int blkbits = inode->i_blkbits;
4611 
4612         /*
4613          * Encrypted inodes can't handle collapse range or insert
4614          * range since we would need to re-encrypt blocks with a
4615          * different IV or XTS tweak (which are based on the logical
4616          * block number).
4617          */
4618         if (IS_ENCRYPTED(inode) &&
4619             (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
4620                 return -EOPNOTSUPP;
4621 
4622         /* Return error if mode is not supported */
4623         if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
4624                      FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
4625                      FALLOC_FL_INSERT_RANGE))
4626                 return -EOPNOTSUPP;
4627 
4628         if (mode & FALLOC_FL_PUNCH_HOLE)
4629                 return ext4_punch_hole(inode, offset, len);
4630 
4631         ret = ext4_convert_inline_data(inode);
4632         if (ret)
4633                 return ret;
4634 
4635         if (mode & FALLOC_FL_COLLAPSE_RANGE)
4636                 return ext4_collapse_range(inode, offset, len);
4637 
4638         if (mode & FALLOC_FL_INSERT_RANGE)
4639                 return ext4_insert_range(inode, offset, len);
4640 
4641         if (mode & FALLOC_FL_ZERO_RANGE)
4642                 return ext4_zero_range(file, offset, len, mode);
4643 
4644         trace_ext4_fallocate_enter(inode, offset, len, mode);
4645         lblk = offset >> blkbits;
4646 
4647         max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
4648         flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
4649         if (mode & FALLOC_FL_KEEP_SIZE)
4650                 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4651 
4652         inode_lock(inode);
4653 
4654         /*
4655          * We only support preallocation for extent-based files
4656          */
4657         if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4658                 ret = -EOPNOTSUPP;
4659                 goto out;
4660         }
4661 
4662         if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4663             (offset + len > inode->i_size ||
4664              offset + len > EXT4_I(inode)->i_disksize)) {
4665                 new_size = offset + len;
4666                 ret = inode_newsize_ok(inode, new_size);
4667                 if (ret)
4668                         goto out;
4669         }
4670 
4671         /* Wait for all existing dio workers; newcomers will block on i_mutex */
4672         inode_dio_wait(inode);
4673 
4674         ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
4675         if (ret)
4676                 goto out;
4677 
4678         if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
4679                 ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
4680                                                 EXT4_I(inode)->i_sync_tid);
4681         }
4682 out:
4683         inode_unlock(inode);
4684         trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
4685         return ret;
4686 }
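
/*
 * Illustration (not part of the kernel source): the most common way
 * ext4_fallocate() is reached from userspace, preallocating space as
 * unwritten extents so that later writes cannot fail with ENOSPC.  The
 * file name and length are arbitrary.
 *
 *        #define _GNU_SOURCE
 *        #include <fcntl.h>
 *        #include <unistd.h>
 *        #include <linux/falloc.h>
 *
 *        int preallocate(const char *path, off_t bytes)
 *        {
 *                int fd = open(path, O_WRONLY | O_CREAT, 0644);
 *                int ret;
 *
 *                if (fd < 0)
 *                        return -1;
 *                // mode 0: allocate [0, bytes) and extend i_size if needed;
 *                // pass FALLOC_FL_KEEP_SIZE instead to leave i_size untouched.
 *                ret = fallocate(fd, 0, 0, bytes);
 *                close(fd);
 *                return ret;
 *        }
 */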
4687 
4688 /*
4689  * This function converts a range of blocks to written extents.
4690  * The caller of this function will pass the start offset and the size;
4691  * all unwritten extents within this range will be converted to
4692  * written extents.
4693  *
4694  * This function is called from the direct IO end io callback
4695  * function, to convert the fallocated extents after IO is completed.
4696  * Returns 0 on success.
4697  */
4698 int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
4699                                    loff_t offset, ssize_t len)
4700 {
4701         unsigned int max_blocks;
4702         int ret = 0, ret2 = 0, ret3 = 0;
4703         struct ext4_map_blocks map;
4704         unsigned int blkbits = inode->i_blkbits;
4705         unsigned int credits = 0;
4706 
4707         map.m_lblk = offset >> blkbits;
4708         max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
4709 
4710         if (!handle) {
4711                 /*
4712                  * credits to insert 1 extent into extent tree
4713                  */
4714                 credits = ext4_chunk_trans_blocks(inode, max_blocks);
4715         }
4716         while (ret >= 0 && ret < max_blocks) {
4717                 map.m_lblk += ret;
4718                 map.m_len = (max_blocks -= ret);
4719                 if (credits) {
4720                         handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4721                                                     credits);
4722                         if (IS_ERR(handle)) {
4723                                 ret = PTR_ERR(handle);
4724                                 break;
4725                         }
4726                 }
4727                 ret = ext4_map_blocks(handle, inode, &map,
4728                                       EXT4_GET_BLOCKS_IO_CONVERT_EXT);
4729                 if (ret <= 0)
4730                         ext4_warning(inode->i_sb,
4731                                      "inode #%lu: block %u: len %u: "
4732                                      "ext4_ext_map_blocks returned %d",
4733                                      inode->i_ino, map.m_lblk,
4734                                      map.m_len, ret);
4735                 ret2 = ext4_mark_inode_dirty(handle, inode);
4736                 if (credits) {
4737                         ret3 = ext4_journal_stop(handle);
4738                         if (unlikely(ret3))
4739                                 ret2 = ret3;
4740                 }
4741 
4742                 if (ret <= 0 || ret2)
4743                         break;
4744         }
4745         return ret > 0 ? ret2 : ret;
4746 }
4747 
4748 int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end)
4749 {
4750         int ret, err = 0;
4751         struct ext4_io_end_vec *io_end_vec;
4752 
4753         /*
4754          * This is somewhat ugly but the idea is clear: when a transaction is
4755          * reserved, everything goes into it. Otherwise we prefer to start several
4756          * smaller transactions, converting each extent separately.
4757          */
4758         if (handle) {
4759                 handle = ext4_journal_start_reserved(handle,
4760                                                      EXT4_HT_EXT_CONVERT);
4761                 if (IS_ERR(handle))
4762                         return PTR_ERR(handle);
4763         }
4764 
4765         list_for_each_entry(io_end_vec, &io_end->list_vec, list) {
4766                 ret = ext4_convert_unwritten_extents(handle, io_end->inode,
4767                                                      io_end_vec->offset,
4768                                                      io_end_vec->size);
4769                 if (ret)
4770                         break;
4771         }
4772 
4773         if (handle)
4774                 err = ext4_journal_stop(handle);
4775 
4776         return ret < 0 ? ret : err;
4777 }
4778 
4779 static int ext4_iomap_xattr_fiemap(struct inode *inode, struct iomap *iomap)
4780 {
4781         __u64 physical = 0;
4782         __u64 length = 0;
4783         int blockbits = inode->i_sb->s_blocksize_bits;
4784         int error = 0;
4785         u16 iomap_type;
4786 
4787         /* in-inode? */
4788         if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
4789                 struct ext4_iloc iloc;
4790                 int offset;     /* offset of xattr in inode */
4791 
4792                 error = ext4_get_inode_loc(inode, &iloc);
4793                 if (error)
4794                         return error;
4795                 physical = (__u64)iloc.bh->b_blocknr << blockbits;
4796                 offset = EXT4_GOOD_OLD_INODE_SIZE +
4797                                 EXT4_I(inode)->i_extra_isize;
4798                 physical += offset;
4799                 length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
4800                 brelse(iloc.bh);
4801                 iomap_type = IOMAP_INLINE;
4802         } else if (EXT4_I(inode)->i_file_acl) { /* external block */
4803                 physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
4804                 length = inode->i_sb->s_blocksize;
4805                 iomap_type = IOMAP_MAPPED;
4806         } else {
4807                 /* no in-inode or external block for xattr, so return -ENOENT */
4808                 error = -ENOENT;
4809                 goto out;
4810         }
4811 
4812         iomap->addr = physical;
4813         iomap->offset = 0;
4814         iomap->length = length;
4815         iomap->type = iomap_type;
4816         iomap->flags = 0;
4817 out:
4818         return error;
4819 }
4820 
4821 static int ext4_iomap_xattr_begin(struct inode *inode, loff_t offset,
4822                                   loff_t length, unsigned flags,
4823                                   struct iomap *iomap, struct iomap *srcmap)
4824 {
4825         int error;
4826 
4827         error = ext4_iomap_xattr_fiemap(inode, iomap);
4828         if (error == 0 && (offset >= iomap->length))
4829                 error = -ENOENT;
4830         return error;
4831 }
4832 
4833 static const struct iomap_ops ext4_iomap_xattr_ops = {
4834         .iomap_begin            = ext4_iomap_xattr_begin,
4835 };
4836 
4837 static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len)
4838 {
4839         u64 maxbytes;
4840 
4841         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
4842                 maxbytes = inode->i_sb->s_maxbytes;
4843         else
4844                 maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
4845 
4846         if (*len == 0)
4847                 return -EINVAL;
4848         if (start > maxbytes)
4849                 return -EFBIG;
4850 
4851         /*
4852          * Shrink request scope to what the fs can actually handle.
4853          */
4854         if (*len > maxbytes || (maxbytes - *len) < start)
4855                 *len = maxbytes - start;
4856         return 0;
4857 }
4858 
4859 static int _ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4860                         __u64 start, __u64 len, bool from_es_cache)
4861 {
4862         ext4_lblk_t start_blk;
4863         u32 ext4_fiemap_flags = FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR;
4864         int error = 0;
4865 
4866         if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
4867                 error = ext4_ext_precache(inode);
4868                 if (error)
4869                         return error;
4870                 fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
4871         }
4872 
4873         if (from_es_cache)
4874                 ext4_fiemap_flags &= FIEMAP_FLAG_XATTR;
4875 
4876         if (fiemap_check_flags(fieinfo, ext4_fiemap_flags))
4877                 return -EBADR;
4878 
4879         /*
4880          * For bitmap files the maximum size limit could be smaller than
4881          * s_maxbytes, so check len here manually instead of just relying on the
4882          * generic check.
4883          */
4884         error = ext4_fiemap_check_ranges(inode, start, &len);
4885         if (error)
4886                 return error;
4887 
4888         if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
4889                 fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR;
4890                 error = iomap_fiemap(inode, fieinfo, start, len,
4891                                      &ext4_iomap_xattr_ops);
4892         } else if (!from_es_cache) {
4893                 error = iomap_fiemap(inode, fieinfo, start, len,
4894                                      &ext4_iomap_report_ops);
4895         } else {
4896                 ext4_lblk_t len_blks;
4897                 __u64 last_blk;
4898 
4899                 start_blk = start >> inode->i_sb->s_blocksize_bits;
4900                 last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
4901                 if (last_blk >= EXT_MAX_BLOCKS)
4902                         last_blk = EXT_MAX_BLOCKS-1;
4903                 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
4904 
4905                 /*
4906                  * Walk the extent tree gathering extent information
4907                  * and pushing extents back to the user.
4908                  */
4909                 error = ext4_fill_es_cache_info(inode, start_blk, len_blks,
4910                                                 fieinfo);
4911         }
4912         return error;
4913 }
4914 
4915 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4916                 __u64 start, __u64 len)
4917 {
4918         return _ext4_fiemap(inode, fieinfo, start, len, false);
4919 }
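
/*
 * Illustration (not part of the kernel source): a userspace sketch that
 * reads the mapping reported by ext4_fiemap() above via the FIEMAP ioctl.
 * It asks for up to 32 extents in one call; a real tool would loop until
 * it sees FIEMAP_EXTENT_LAST.  Offsets and lengths are in bytes.
 *
 *        #include <fcntl.h>
 *        #include <stdio.h>
 *        #include <stdlib.h>
 *        #include <unistd.h>
 *        #include <sys/ioctl.h>
 *        #include <linux/fs.h>
 *        #include <linux/fiemap.h>
 *
 *        int dump_extents(const char *path)
 *        {
 *                struct fiemap *fm;
 *                unsigned int i;
 *                int fd, ret;
 *
 *                fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
 *                fd = open(path, O_RDONLY);
 *                if (!fm || fd < 0) {
 *                        if (fd >= 0)
 *                                close(fd);
 *                        free(fm);
 *                        return -1;
 *                }
 *                fm->fm_length = ~0ULL;                  // whole file
 *                fm->fm_flags = FIEMAP_FLAG_SYNC;        // flush dirty data first
 *                fm->fm_extent_count = 32;
 *
 *                ret = ioctl(fd, FS_IOC_FIEMAP, fm);
 *                if (ret == 0)
 *                        for (i = 0; i < fm->fm_mapped_extents; i++)
 *                                printf("logical %llu physical %llu length %llu flags %#x\n",
 *                                       (unsigned long long)fm->fm_extents[i].fe_logical,
 *                                       (unsigned long long)fm->fm_extents[i].fe_physical,
 *                                       (unsigned long long)fm->fm_extents[i].fe_length,
 *                                       fm->fm_extents[i].fe_flags);
 *                close(fd);
 *                free(fm);
 *                return ret;
 *        }
 */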
4920 
4921 int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
4922                       __u64 start, __u64 len)
4923 {
4924         if (ext4_has_inline_data(inode)) {
4925                 int has_inline;
4926 
4927                 down_read(&EXT4_I(inode)->xattr_sem);
4928                 has_inline = ext4_has_inline_data(inode);
4929                 up_read(&EXT4_I(inode)->xattr_sem);
4930                 if (has_inline)
4931                         return 0;
4932         }
4933 
4934         return _ext4_fiemap(inode, fieinfo, start, len, true);
4935 }
4936 
4937 
4938 /*
4939  * ext4_access_path:
4940  * Function to get access to the path buffer so it can be marked dirty.
4941  * It also checks if there are sufficient credits left in the journal handle
4942  * to update the path.
4943  */
4944 static int
4945 ext4_access_path(handle_t *handle, struct inode *inode,
4946                 struct ext4_ext_path *path)
4947 {
4948         int credits, err;
4949 
4950         if (!ext4_handle_valid(handle))
4951                 return 0;
4952 
4953         /*
4954          * Check if need to extend journal credits
4955          * 3 for leaf, sb, and inode plus 2 (bmap and group
4956          * descriptor) for each block group; assume two block
4957          * groups
4958          */
4959         credits = ext4_writepage_trans_blocks(inode);
4960         err = ext4_datasem_ensure_credits(handle, inode, 7, credits, 0);
4961         if (err < 0)
4962                 return err;
4963 
4964         err = ext4_ext_get_access(handle, inode, path);
4965         return err;
4966 }
4967 
4968 /*
4969  * ext4_ext_shift_path_extents:
4970  * Shift the extents of a path structure lying between path[depth].p_ext
4971  * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells
4972  * whether it is a right-shift or a left-shift operation.
4973  */
4974 static int
4975 ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
4976                             struct inode *inode, handle_t *handle,
4977                             enum SHIFT_DIRECTION SHIFT)
4978 {
4979         int depth, err = 0;
4980         struct ext4_extent *ex_start, *ex_last;
4981         bool update = false;
4982         depth = path->p_depth;
4983 
4984         while (depth >= 0) {
4985                 if (depth == path->p_depth) {
4986                         ex_start = path[depth].p_ext;
4987                         if (!ex_start)
4988                                 return -EFSCORRUPTED;
4989 
4990                         ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
4991 
4992                         err = ext4_access_path(handle, inode, path + depth);
4993                         if (err)
4994                                 goto out;
4995 
4996                         if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
4997                                 update = true;
4998 
4999                         while (ex_start <= ex_last) {
5000                                 if (SHIFT == SHIFT_LEFT) {
5001                                         le32_add_cpu(&ex_start->ee_block,
5002                                                 -shift);
5003                                         /* Try to merge to the left. */
5004                                         if ((ex_start >
5005                                             EXT_FIRST_EXTENT(path[depth].p_hdr))
5006                                             &&
5007                                             ext4_ext_try_to_merge_right(inode,
5008                                             path, ex_start - 1))
5009                                                 ex_last--;
5010                                         else
5011                                                 ex_start++;
5012                                 } else {
5013                                         le32_add_cpu(&ex_last->ee_block, shift);
5014                                         ext4_ext_try_to_merge_right(inode, path,
5015                                                 ex_last);
5016                                         ex_last--;
5017                                 }
5018                         }
5019                         err = ext4_ext_dirty(handle, inode, path + depth);
5020                         if (err)
5021                                 goto out;
5022 
5023                         if (--depth < 0 || !update)
5024                                 break;
5025                 }
5026 
5027                 /* Update index too */
5028                 err = ext4_access_path(handle, inode, path + depth);
5029                 if (err)
5030                         goto out;
5031 
5032                 if (SHIFT == SHIFT_LEFT)
5033                         le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
5034                 else
5035                         le32_add_cpu(&path[depth].p_idx->ei_block, shift);
5036                 err = ext4_ext_dirty(handle, inode, path + depth);
5037                 if (err)
5038                         goto out;
5039 
5040                 /* we are done if current index is not a starting index */
5041                 if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
5042                         break;
5043 
5044                 depth--;
5045         }
5046 
5047 out:
5048         return err;
5049 }
5050 
5051 /*
5052  * ext4_ext_shift_extents:
5053  * All the extents which lie in the range from @start to the last allocated
5054  * block for the @inode are shifted either towards the left or the right
5055  * (depending upon @SHIFT) by @shift blocks.
5056  * On success, 0 is returned, error otherwise.
5057  */
5058 static int
5059 ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
5060                        ext4_lblk_t start, ext4_lblk_t shift,
5061                        enum SHIFT_DIRECTION SHIFT)
5062 {
5063         struct ext4_ext_path *path;
5064         int ret = 0, depth;
5065         struct ext4_extent *extent;
5066         ext4_lblk_t stop, *iterator, ex_start, ex_end;
5067 
5068         /* Let path point to the last extent */
5069         path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
5070                                 EXT4_EX_NOCACHE);
5071         if (IS_ERR(path))
5072                 return PTR_ERR(path);
5073 
5074         depth = path->p_depth;
5075         extent = path[depth].p_ext;
5076         if (!extent)
5077                 goto out;
5078 
5079         stop = le32_to_cpu(extent->ee_block);
5080 
5081         /*
5082          * For left shifts, make sure the hole on the left is big enough to
5083          * accommodate the shift.  For right shifts, make sure the last extent
5084          * won't be shifted beyond EXT_MAX_BLOCKS.
5085          */
5086         if (SHIFT == SHIFT_LEFT) {
5087                 path = ext4_find_extent(inode, start - 1, &path,
5088                                         EXT4_EX_NOCACHE);
5089                 if (IS_ERR(path))
5090                         return PTR_ERR(path);
5091                 depth = path->p_depth;
5092                 extent =  path[depth].p_ext;
5093                 if (extent) {
5094                         ex_start = le32_to_cpu(extent->ee_block);
5095                         ex_end = le32_to_cpu(extent->ee_block) +
5096                                 ext4_ext_get_actual_len(extent);
5097                 } else {
5098                         ex_start = 0;
5099                         ex_end = 0;
5100                 }
5101 
5102                 if ((start == ex_start && shift > ex_start) ||
5103                     (shift > start - ex_end)) {
5104                         ret = -EINVAL;
5105                         goto out;
5106                 }
5107         } else {
5108                 if (shift > EXT_MAX_BLOCKS -
5109                     (stop + ext4_ext_get_actual_len(extent))) {
5110                         ret = -EINVAL;
5111                         goto out;
5112                 }
5113         }
5114 
5115         /*
5116          * In case of left shift, iterator points to start and it is increased
5117          * till we reach stop. In case of right shift, iterator points to stop
5118          * and it is decreased till we reach start.
5119          */
5120         if (SHIFT == SHIFT_LEFT)
5121                 iterator = &start;
5122         else
5123                 iterator = &stop;
5124 
5125         /*
5126          * It's safe to start updating extents.  Start and stop are unsigned, so
5127          * in the case of a right shift, once the extent with logical block 0 is
5128          * reached, iterator is set to NULL to indicate the end of the loop.
5129          */
5130         while (iterator && start <= stop) {
5131                 path = ext4_find_extent(inode, *iterator, &path,
5132                                         EXT4_EX_NOCACHE);
5133                 if (IS_ERR(path))
5134                         return PTR_ERR(path);
5135                 depth = path->p_depth;
5136                 extent = path[depth].p_ext;
5137                 if (!extent) {
5138                         EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
5139                                          (unsigned long) *iterator);
5140                         return -EFSCORRUPTED;
5141                 }
5142                 if (SHIFT == SHIFT_LEFT && *iterator >
5143                     le32_to_cpu(extent->ee_block)) {
5144                         /* Hole, move to the next extent */
5145                         if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
5146                                 path[depth].p_ext++;
5147                         } else {
5148                                 *iterator = ext4_ext_next_allocated_block(path);
5149                                 continue;
5150                         }
5151                 }
5152 
5153                 if (SHIFT == SHIFT_LEFT) {
5154                         extent = EXT_LAST_EXTENT(path[depth].p_hdr);
5155                         *iterator = le32_to_cpu(extent->ee_block) +
5156                                         ext4_ext_get_actual_len(extent);
5157                 } else {
5158                         extent = EXT_FIRST_EXTENT(path[depth].p_hdr);
5159                         if (le32_to_cpu(extent->ee_block) > 0)
5160                                 *iterator = le32_to_cpu(extent->ee_block) - 1;
5161                         else
5162                                 /* Beginning is reached, end of the loop */
5163                                 iterator = NULL;
5164                         /* Update path extent in case we need to stop */
5165                         while (le32_to_cpu(extent->ee_block) < start)
5166                                 extent++;
5167                         path[depth].p_ext = extent;
5168                 }
5169                 ret = ext4_ext_shift_path_extents(path, shift, inode,
5170                                 handle, SHIFT);
5171                 if (ret)
5172                         break;
5173         }
5174 out:
5175         ext4_ext_drop_refs(path);
5176         kfree(path);
5177         return ret;
5178 }
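
/*
 * Illustration (not part of the kernel source): a toy model of what a left
 * shift does to the extent records, ignoring on-disk endianness, index
 * blocks and extent merging.  struct toy_extent and toy_shift_left() are
 * invented for this sketch only.
 *
 *        struct toy_extent { unsigned int lblk, len; };
 *
 *        static void toy_shift_left(struct toy_extent *ex, int nr,
 *                                   unsigned int shift)
 *        {
 *                int i;
 *
 *                // Every extent at or beyond the collapse point has its
 *                // logical start moved down by shift blocks; the physical
 *                // mapping is untouched.  This mirrors the
 *                // le32_add_cpu(&ee_block, -shift) updates done above.
 *                for (i = 0; i < nr; i++)
 *                        ex[i].lblk -= shift;
 *        }
 */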
5179 
5180 /*
5181  * ext4_collapse_range:
5182  * This implements fallocate's collapse-range functionality for ext4.
5183  * Returns 0 on success, non-zero on error.
5184  */
5185 static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
5186 {
5187         struct super_block *sb = inode->i_sb;
5188         ext4_lblk_t punch_start, punch_stop;
5189         handle_t *handle;
5190         unsigned int credits;
5191         loff_t new_size, ioffset;
5192         int ret;
5193 
5194         /*
5195          * We need to test this early because xfstests assumes that a
5196          * collapse range of (0, 1) will return EOPNOTSUPP if the file
5197          * system does not support collapse range.
5198          */
5199         if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
5200                 return -EOPNOTSUPP;
5201 
5202         /* Collapse range works only on fs cluster size aligned regions. */
5203         if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
5204                 return -EINVAL;
5205 
5206         trace_ext4_collapse_range(inode, offset, len);
5207 
5208         punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5209         punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
5210 
5211         /* Call ext4_force_commit to flush all data in case of data=journal. */
5212         if (ext4_should_journal_data(inode)) {
5213                 ret = ext4_force_commit(inode->i_sb);
5214                 if (ret)
5215                         return ret;
5216         }
5217 
5218         inode_lock(inode);
5219         /*
5220          * There is no point in collapsing a range that reaches EOF, since
5221          * that would effectively be a truncate operation
5222          */
5223         if (offset + len >= inode->i_size) {
5224                 ret = -EINVAL;
5225                 goto out_mutex;
5226         }
5227 
5228         /* Currently just for extent based files */
5229         if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5230                 ret = -EOPNOTSUPP;
5231                 goto out_mutex;
5232         }
5233 
5234         /* Wait for existing dio to complete */
5235         inode_dio_wait(inode);
5236 
5237         /*
5238          * Prevent page faults from reinstantiating pages we have released from
5239          * page cache.
5240          */
5241         down_write(&EXT4_I(inode)->i_mmap_sem);
5242 
5243         ret = ext4_break_layouts(inode);
5244         if (ret)
5245                 goto out_mmap;
5246 
5247         /*
5248          * We need to round offset down to a page size boundary when the
5249          * page size is larger than the block size.
5250          */
5251         ioffset = round_down(offset, PAGE_SIZE);
5252         /*
5253          * Write tail of the last page before removed range since it will get
5254          * removed from the page cache below.
5255          */
5256         ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset);
5257         if (ret)
5258                 goto out_mmap;
5259         /*
5260          * Write out the data that will be shifted, to preserve it when
5261          * discarding the page cache below. We are also protected from pages
5262          * becoming dirty by i_mmap_sem.
5263          */
5264         ret = filemap_write_and_wait_range(inode->i_mapping, offset + len,
5265                                            LLONG_MAX);
5266         if (ret)
5267                 goto out_mmap;
5268         truncate_pagecache(inode, ioffset);
5269 
5270         credits = ext4_writepage_trans_blocks(inode);
5271         handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5272         if (IS_ERR(handle)) {
5273                 ret = PTR_ERR(handle);
5274                 goto out_mmap;
5275         }
5276 
5277         down_write(&EXT4_I(inode)->i_data_sem);
5278         ext4_discard_preallocations(inode);
5279 
5280         ret = ext4_es_remove_extent(inode, punch_start,
5281                                     EXT_MAX_BLOCKS - punch_start);
5282         if (ret) {
5283                 up_write(&EXT4_I(inode)->i_data_sem);
5284                 goto out_stop;
5285         }
5286 
5287         ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
5288         if (ret) {
5289                 up_write(&EXT4_I(inode)->i_data_sem);
5290                 goto out_stop;
5291         }
5292         ext4_discard_preallocations(inode);
5293 
5294         ret = ext4_ext_shift_extents(inode, handle, punch_stop,
5295                                      punch_stop - punch_start, SHIFT_LEFT);
5296         if (ret) {
5297                 up_write(&EXT4_I(inode)->i_data_sem);
5298                 goto out_stop;
5299         }
5300 
5301         new_size = inode->i_size - len;
5302         i_size_write(inode, new_size);
5303         EXT4_I(inode)->i_disksize = new_size;
5304 
5305         up_write(&EXT4_I(inode)->i_data_sem);
5306         if (IS_SYNC(inode))
5307                 ext4_handle_sync(handle);
5308         inode->i_mtime = inode->i_ctime = current_time(inode);
5309         ret = ext4_mark_inode_dirty(handle, inode);
5310         ext4_update_inode_fsync_trans(handle, inode, 1);
5311 
5312 out_stop:
5313         ext4_journal_stop(handle);
5314 out_mmap:
5315         up_write(&EXT4_I(inode)->i_mmap_sem);
5316 out_mutex:
5317         inode_unlock(inode);
5318         return ret;
5319 }
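
/*
 * [Editor's note, not part of extents.c] ext4_collapse_range() above is
 * reached from userspace through fallocate(2) with FALLOC_FL_COLLAPSE_RANGE.
 * A minimal illustrative caller is sketched below; the file name and the
 * 1 MiB / 64 KiB values are arbitrary, and both offset and length must be
 * aligned to the filesystem cluster size or the call fails with EINVAL.
 */
#if 0 /* illustrative userspace sketch only, never built with this file */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/falloc.h>

int main(void)
{
        int fd = open("test.img", O_RDWR);

        if (fd < 0)
                return 1;
        /* Remove 64 KiB at offset 1 MiB: later blocks shift left and
         * i_size shrinks by 64 KiB, mirroring the function above. */
        if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 1024 * 1024, 64 * 1024))
                perror("FALLOC_FL_COLLAPSE_RANGE");
        close(fd);
        return 0;
}
#endif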
5320 
5321 /*
5322  * ext4_insert_range:
5323  * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate.
5324  * The data blocks from @offset to EOF are shifted @len bytes to the
5325  * right to create a hole in @inode. The inode size is increased by
5326  * @len bytes.
5327  * Returns 0 on success, error otherwise.
5328  */
5329 static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
5330 {
5331         struct super_block *sb = inode->i_sb;
5332         handle_t *handle;
5333         struct ext4_ext_path *path;
5334         struct ext4_extent *extent;
5335         ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
5336         unsigned int credits, ee_len;
5337         int ret = 0, depth, split_flag = 0;
5338         loff_t ioffset;
5339 
5340         /*
5341          * We need to test this early because xfstests assumes that an
5342          * insert range of (0, 1) will return EOPNOTSUPP if the file
5343          * system does not support insert range.
5344          */
5345         if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
5346                 return -EOPNOTSUPP;
5347 
5348         /* Insert range works only on fs cluster size aligned regions. */
5349         if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
5350                 return -EINVAL;
5351 
5352         trace_ext4_insert_range(inode, offset, len);
5353 
5354         offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5355         len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);
5356 
5357         /* Call ext4_force_commit to flush all data in case of data=journal */
5358         if (ext4_should_journal_data(inode)) {
5359                 ret = ext4_force_commit(inode->i_sb);
5360                 if (ret)
5361                         return ret;
5362         }
5363 
5364         inode_lock(inode);
5365         /* Currently just for extent based files */
5366         if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5367                 ret = -EOPNOTSUPP;
5368                 goto out_mutex;
5369         }
5370 
5371         /* Check whether the maximum file size would be exceeded */
5372         if (len > inode->i_sb->s_maxbytes - inode->i_size) {
5373                 ret = -EFBIG;
5374                 goto out_mutex;
5375         }
5376 
5377         /* Offset must be less than i_size */
5378         if (offset >= inode->i_size) {
5379                 ret = -EINVAL;
5380                 goto out_mutex;
5381         }
5382 
5383         /* Wait for existing dio to complete */
5384         inode_dio_wait(inode);
5385 
5386         /*
5387          * Prevent page faults from reinstantiating pages we have released from
5388          * page cache.
5389          */
5390         down_write(&EXT4_I(inode)->i_mmap_sem);
5391 
5392         ret = ext4_break_layouts(inode);
5393         if (ret)
5394                 goto out_mmap;
5395 
5396         /*
5397          * The start offset needs to be rounded down to a page size
5398          * boundary, for the case where page size > block size.
5399          */
5400         ioffset = round_down(offset, PAGE_SIZE);
5401         /* Write out all dirty pages */
5402         ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
5403                         LLONG_MAX);
5404         if (ret)
5405                 goto out_mmap;
5406         truncate_pagecache(inode, ioffset);
5407 
5408         credits = ext4_writepage_trans_blocks(inode);
5409         handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5410         if (IS_ERR(handle)) {
5411                 ret = PTR_ERR(handle);
5412                 goto out_mmap;
5413         }
5414 
5415         /* Expand the file to avoid data loss if the shift fails midway */
5416         inode->i_size += len;
5417         EXT4_I(inode)->i_disksize += len;
5418         inode->i_mtime = inode->i_ctime = current_time(inode);
5419         ret = ext4_mark_inode_dirty(handle, inode);
5420         if (ret)
5421                 goto out_stop;
5422 
5423         down_write(&EXT4_I(inode)->i_data_sem);
5424         ext4_discard_preallocations(inode);
5425 
5426         path = ext4_find_extent(inode, offset_lblk, NULL, 0);
5427         if (IS_ERR(path)) {
5428                 up_write(&EXT4_I(inode)->i_data_sem);
5429                 goto out_stop;
5430         }
5431 
5432         depth = ext_depth(inode);
5433         extent = path[depth].p_ext;
5434         if (extent) {
5435                 ee_start_lblk = le32_to_cpu(extent->ee_block);
5436                 ee_len = ext4_ext_get_actual_len(extent);
5437 
5438                 /*
5439                  * If offset_lblk is not the starting block of the extent, split
5440                  * the extent at @offset_lblk
5441                  */
5442                 if ((offset_lblk > ee_start_lblk) &&
5443                                 (offset_lblk < (ee_start_lblk + ee_len))) {
5444                         if (ext4_ext_is_unwritten(extent))
5445                                 split_flag = EXT4_EXT_MARK_UNWRIT1 |
5446                                         EXT4_EXT_MARK_UNWRIT2;
5447                         ret = ext4_split_extent_at(handle, inode, &path,
5448                                         offset_lblk, split_flag,
5449                                         EXT4_EX_NOCACHE |
5450                                         EXT4_GET_BLOCKS_PRE_IO |
5451                                         EXT4_GET_BLOCKS_METADATA_NOFAIL);
5452                 }
5453 
5454                 ext4_ext_drop_refs(path);
5455                 kfree(path);
5456                 if (ret < 0) {
5457                         up_write(&EXT4_I(inode)->i_data_sem);
5458                         goto out_stop;
5459                 }
5460         } else {
5461                 ext4_ext_drop_refs(path);
5462                 kfree(path);
5463         }
5464 
5465         ret = ext4_es_remove_extent(inode, offset_lblk,
5466                         EXT_MAX_BLOCKS - offset_lblk);
5467         if (ret) {
5468                 up_write(&EXT4_I(inode)->i_data_sem);
5469                 goto out_stop;
5470         }
5471 
5472         /*
5473          * If offset_lblk lies in a hole at the start of the file, use
5474          * ee_start_lblk to shift the extents.
5475          */
5476         ret = ext4_ext_shift_extents(inode, handle,
5477                 ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk,
5478                 len_lblk, SHIFT_RIGHT);
5479 
5480         up_write(&EXT4_I(inode)->i_data_sem);
5481         if (IS_SYNC(inode))
5482                 ext4_handle_sync(handle);
5483         if (ret >= 0)
5484                 ext4_update_inode_fsync_trans(handle, inode, 1);
5485 
5486 out_stop:
5487         ext4_journal_stop(handle);
5488 out_mmap:
5489         up_write(&EXT4_I(inode)->i_mmap_sem);
5490 out_mutex:
5491         inode_unlock(inode);
5492         return ret;
5493 }
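
/*
 * [Editor's note, not part of extents.c] The insert counterpart above is
 * driven by fallocate(2) with FALLOC_FL_INSERT_RANGE.  The sketch below is
 * illustrative only (the helper name is made up); as checked in
 * ext4_insert_range(), the offset must lie below i_size, offset and length
 * must be cluster aligned (otherwise EINVAL), and i_size + length must not
 * exceed the maximum file size (otherwise EFBIG).
 */
#if 0 /* illustrative userspace sketch only, never built with this file */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>

/* Shift everything from @offset rightwards by @len and grow i_size by @len. */
static int insert_hole(int fd, off_t offset, off_t len)
{
        return fallocate(fd, FALLOC_FL_INSERT_RANGE, offset, len);
}
#endif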
5494 
5495 /**
5496  * ext4_swap_extents() - Swap extents between two inodes
5497  * @handle: handle for this transaction
5498  * @inode1:     First inode
5499  * @inode2:     Second inode
5500  * @lblk1:      Start block for first inode
5501  * @lblk2:      Start block for second inode
5502  * @count:      Number of blocks to swap
5503  * @unwritten: Mark second inode's extents as unwritten after swap
5504  * @erp:        Pointer to save error value
5505  *
5506  * This helper routine does exactly what its name promises: it swaps extents.
5507  * All other work, such as page-cache locking consistency, bh mapping
5508  * consistency, or copying the extents' data, must be performed by the caller.
5509  * Locking:
5510  *              i_mutex is held for both inodes
5511  *              i_data_sem is locked for write for both inodes
5512  * Assumptions:
5513  *              All pages from requested range are locked for both inodes
5514  */
5515 int
5516 ext4_swap_extents(handle_t *handle, struct inode *inode1,
5517                   struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
5518                   ext4_lblk_t count, int unwritten, int *erp)
5519 {
5520         struct ext4_ext_path *path1 = NULL;
5521         struct ext4_ext_path *path2 = NULL;
5522         int replaced_count = 0;
5523 
5524         BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
5525         BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
5526         BUG_ON(!inode_is_locked(inode1));
5527         BUG_ON(!inode_is_locked(inode2));
5528 
5529         *erp = ext4_es_remove_extent(inode1, lblk1, count);
5530         if (unlikely(*erp))
5531                 return 0;
5532         *erp = ext4_es_remove_extent(inode2, lblk2, count);
5533         if (unlikely(*erp))
5534                 return 0;
5535 
5536         while (count) {
5537                 struct ext4_extent *ex1, *ex2, tmp_ex;
5538                 ext4_lblk_t e1_blk, e2_blk;
5539                 int e1_len, e2_len, len;
5540                 int split = 0;
5541 
5542                 path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
5543                 if (IS_ERR(path1)) {
5544                         *erp = PTR_ERR(path1);
5545                         path1 = NULL;
5546                 finish:
5547                         count = 0;
5548                         goto repeat;
5549                 }
5550                 path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
5551                 if (IS_ERR(path2)) {
5552                         *erp = PTR_ERR(path2);
5553                         path2 = NULL;
5554                         goto finish;
5555                 }
5556                 ex1 = path1[path1->p_depth].p_ext;
5557                 ex2 = path2[path2->p_depth].p_ext;
5558                 /* Do we have something to swap? */
5559                 if (unlikely(!ex2 || !ex1))
5560                         goto finish;
5561 
5562                 e1_blk = le32_to_cpu(ex1->ee_block);
5563                 e2_blk = le32_to_cpu(ex2->ee_block);
5564                 e1_len = ext4_ext_get_actual_len(ex1);
5565                 e2_len = ext4_ext_get_actual_len(ex2);
5566 
5567                 /* Hole handling */
5568                 if (!in_range(lblk1, e1_blk, e1_len) ||
5569                     !in_range(lblk2, e2_blk, e2_len)) {
5570                         ext4_lblk_t next1, next2;
5571 
5572                         /* If the hole is after the extent, go to the next extent */
5573                         next1 = ext4_ext_next_allocated_block(path1);
5574                         next2 = ext4_ext_next_allocated_block(path2);
5575                         /* If hole before extent, then shift to that extent */
5576                         if (e1_blk > lblk1)
5577                                 next1 = e1_blk;
5578                         if (e2_blk > lblk2)
5579                                 next2 = e2_blk;
5580                         /* Do we have something to swap? */
5581                         if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
5582                                 goto finish;
5583                         /* Move to the rightest boundary */
5584                         len = next1 - lblk1;
5585                         if (len < next2 - lblk2)
5586                                 len = next2 - lblk2;
5587                         if (len > count)
5588                                 len = count;
5589                         lblk1 += len;
5590                         lblk2 += len;
5591                         count -= len;
5592                         goto repeat;
5593                 }
5594 
5595                 /* Prepare left boundary */
5596                 if (e1_blk < lblk1) {
5597                         split = 1;
5598                         *erp = ext4_force_split_extent_at(handle, inode1,
5599                                                 &path1, lblk1, 0);
5600                         if (unlikely(*erp))
5601                                 goto finish;
5602                 }
5603                 if (e2_blk < lblk2) {
5604                         split = 1;
5605                         *erp = ext4_force_split_extent_at(handle, inode2,
5606                                                 &path2,  lblk2, 0);
5607                         if (unlikely(*erp))
5608                                 goto finish;
5609                 }
5610                 /* ext4_split_extent_at() may result in a leaf extent split,
5611                  * so the path must be revalidated. */
5612                 if (split)
5613                         goto repeat;
5614 
5615                 /* Prepare right boundary */
5616                 len = count;
5617                 if (len > e1_blk + e1_len - lblk1)
5618                         len = e1_blk + e1_len - lblk1;
5619                 if (len > e2_blk + e2_len - lblk2)
5620                         len = e2_blk + e2_len - lblk2;
5621 
5622                 if (len != e1_len) {
5623                         split = 1;
5624                         *erp = ext4_force_split_extent_at(handle, inode1,
5625                                                 &path1, lblk1 + len, 0);
5626                         if (unlikely(*erp))
5627                                 goto finish;
5628                 }
5629                 if (len != e2_len) {
5630                         split = 1;
5631                         *erp = ext4_force_split_extent_at(handle, inode2,
5632                                                 &path2, lblk2 + len, 0);
5633                         if (*erp)
5634                                 goto finish;
5635                 }
5636                 /* ext4_split_extent_at() may result in a leaf extent split,
5637                  * so the path must be revalidated. */
5638                 if (split)
5639                         goto repeat;
5640 
5641                 BUG_ON(e2_len != e1_len);
5642                 *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
5643                 if (unlikely(*erp))
5644                         goto finish;
5645                 *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
5646                 if (unlikely(*erp))
5647                         goto finish;
5648 
5649                 /* Both extents are fully inside boundaries. Swap them now */
5650                 tmp_ex = *ex1;
5651                 ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
5652                 ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
5653                 ex1->ee_len = cpu_to_le16(e2_len);
5654                 ex2->ee_len = cpu_to_le16(e1_len);
5655                 if (unwritten)
5656                         ext4_ext_mark_unwritten(ex2);
5657                 if (ext4_ext_is_unwritten(&tmp_ex))
5658                         ext4_ext_mark_unwritten(ex1);
5659 
5660                 ext4_ext_try_to_merge(handle, inode2, path2, ex2);
5661                 ext4_ext_try_to_merge(handle, inode1, path1, ex1);
5662                 *erp = ext4_ext_dirty(handle, inode2, path2 +
5663                                       path2->p_depth);
5664                 if (unlikely(*erp))
5665                         goto finish;
5666                 *erp = ext4_ext_dirty(handle, inode1, path1 +
5667                                       path1->p_depth);
5668                 /*
5669                  * Looks scary, huh? The second inode already points to the new
5670                  * blocks, and it was successfully dirtied. But luckily an error
5671                  * can happen only due to a journal error, so the full transaction
5672                  * will be aborted anyway.
5673                  */
5674                 if (unlikely(*erp))
5675                         goto finish;
5676                 lblk1 += len;
5677                 lblk2 += len;
5678                 replaced_count += len;
5679                 count -= len;
5680 
5681         repeat:
5682                 ext4_ext_drop_refs(path1);
5683                 kfree(path1);
5684                 ext4_ext_drop_refs(path2);
5685                 kfree(path2);
5686                 path1 = path2 = NULL;
5687         }
5688         return replaced_count;
5689 }
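
/*
 * [Editor's note, not part of extents.c] A worked example of what
 * ext4_swap_extents() above does to the two mappings when asked to swap
 * count=3 blocks at lblk1=10 and lblk2=20:
 *
 *   before:  inode1 [10..12] -> pblk A..A+2   inode2 [20..22] -> pblk B..B+2
 *   after:   inode1 [10..12] -> pblk B..B+2   inode2 [20..22] -> pblk A..A+2
 *
 * Only the extent-tree mappings are exchanged; copying the data and keeping
 * the page cache and buffer heads consistent is the caller's job.  Its
 * in-tree caller is the EXT4_IOC_MOVE_EXT path in fs/ext4/move_extent.c
 * (used by e4defrag), which also takes the locks listed in the comment
 * before the function.
 */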
5690 
5691 /*
5692  * ext4_clu_mapped - determine whether any block in a logical cluster has
5693  *                   been mapped to a physical cluster
5694  *
5695  * @inode - file containing the logical cluster
5696  * @lclu - logical cluster of interest
5697  *
5698  * Returns 1 if any block in the logical cluster is mapped, signifying
5699  * that a physical cluster has been allocated for it.  Otherwise,
5700  * returns 0.  Can also return negative error codes.  Derived from
5701  * ext4_ext_map_blocks().
5702  */
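/*
 * [Editor's note, not part of extents.c] Worked example for the cluster
 * arithmetic used below: on a bigalloc filesystem with 4 KiB blocks and
 * 64 KiB clusters (s_cluster_bits == 4),
 *
 *   EXT4_C2B(sbi, 5)  == 5 << 4  == 80   (first block of logical cluster 5)
 *   EXT4_B2C(sbi, 95) == 95 >> 4 == 5    (cluster containing block 95)
 *
 * so ext4_clu_mapped(inode, 5) returns 1 if any extent maps any of the
 * logical blocks 80..95.  Without bigalloc the cluster size equals the
 * block size and the question reduces to "is this block mapped".
 */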
5703 int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
5704 {
5705         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5706         struct ext4_ext_path *path;
5707         int depth, mapped = 0, err = 0;
5708         struct ext4_extent *extent;
5709         ext4_lblk_t first_lblk, first_lclu, last_lclu;
5710 
5711         /* search for the extent closest to the first block in the cluster */
5712         path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
5713         if (IS_ERR(path)) {
5714                 err = PTR_ERR(path);
5715                 path = NULL;
5716                 goto out;
5717         }
5718 
5719         depth = ext_depth(inode);
5720 
5721         /*
5722          * A consistent leaf must not be empty.  This situation is possible,
5723          * though, _during_ tree modification, and it's why an assert can't
5724          * be put in ext4_find_extent().
5725          */
5726         if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
5727                 EXT4_ERROR_INODE(inode,
5728                     "bad extent address - lblock: %lu, depth: %d, pblock: %lld",
5729                                  (unsigned long) EXT4_C2B(sbi, lclu),
5730                                  depth, path[depth].p_block);
5731                 err = -EFSCORRUPTED;
5732                 goto out;
5733         }
5734 
5735         extent = path[depth].p_ext;
5736 
5737         /* can't be mapped if the extent tree is empty */
5738         if (extent == NULL)
5739                 goto out;
5740 
5741         first_lblk = le32_to_cpu(extent->ee_block);
5742         first_lclu = EXT4_B2C(sbi, first_lblk);
5743 
5744         /*
5745          * There are three possible outcomes at this point: the found extent
5746          * spans the target cluster, lies to the left of it, or lies to the
5747          * right of it.  The first two cases are handled here.
5748          * The last case indicates the target cluster is not mapped.
5749          */
5750         if (lclu >= first_lclu) {
5751                 last_lclu = EXT4_B2C(sbi, first_lblk +
5752                                      ext4_ext_get_actual_len(extent) - 1);
5753                 if (lclu <= last_lclu) {
5754                         mapped = 1;
5755                 } else {
5756                         first_lblk = ext4_ext_next_allocated_block(path);
5757                         first_lclu = EXT4_B2C(sbi, first_lblk);
5758                         if (lclu == first_lclu)
5759                                 mapped = 1;
5760                 }
5761         }
5762 
5763 out:
5764         ext4_ext_drop_refs(path);